1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
10 | // and generates target-independent LLVM-IR. |
11 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
12 | // of instructions in order to estimate the profitability of vectorization. |
13 | // |
14 | // The loop vectorizer combines consecutive loop iterations into a single |
15 | // 'wide' iteration. After this transformation the index is incremented |
16 | // by the SIMD vector width, and not by one. |
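// As an illustration (a hypothetical input, not taken from any particular
// test), a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each vector iteration processes VF
// elements at once: with VF = 4 the induction variable advances by 4 and the
// body operates on <4 x i32> values, while the final n % 4 iterations are
// handled by a scalar remainder loop (or folded into the vector body when
// tail folding is used).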
17 | // |
18 | // This pass has four parts: |
19 | // 1. The main loop pass that drives the different parts. |
20 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
21 | // of the vectorization. |
22 | // 3. InnerLoopVectorizer - A unit that performs the actual |
23 | // widening of instructions. |
24 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
25 | // of vectorization. It decides on the optimal vector width, which |
26 | // can be one, if vectorization is not profitable. |
27 | // |
28 | // There is a development effort going on to migrate the loop vectorizer to the |
29 | // VPlan infrastructure and to introduce outer loop vectorization support (see |
30 | // docs/VectorizationPlan.rst and |
31 | // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
32 | // purpose, we temporarily introduced the VPlan-native vectorization path: an |
33 | // alternative vectorization path that is natively implemented on top of the |
34 | // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | // |
38 | // The reduction-variable vectorization is based on the paper: |
39 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
40 | // |
41 | // Variable uniformity checks are inspired by: |
42 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
43 | // |
44 | // The interleaved access vectorization is based on the paper: |
45 | // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
46 | // Data for SIMD |
47 | // |
48 | // Other ideas/concepts are from: |
49 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
50 | // |
51 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
52 | // Vectorizing Compilers. |
53 | // |
54 | //===----------------------------------------------------------------------===// |
55 | |
56 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
57 | #include "LoopVectorizationPlanner.h" |
58 | #include "VPRecipeBuilder.h" |
59 | #include "VPlan.h" |
60 | #include "VPlanAnalysis.h" |
61 | #include "VPlanCFG.h" |
62 | #include "VPlanHelpers.h" |
63 | #include "VPlanPatternMatch.h" |
64 | #include "VPlanTransforms.h" |
65 | #include "VPlanUtils.h" |
66 | #include "VPlanVerifier.h" |
67 | #include "llvm/ADT/APInt.h" |
68 | #include "llvm/ADT/ArrayRef.h" |
69 | #include "llvm/ADT/DenseMap.h" |
70 | #include "llvm/ADT/DenseMapInfo.h" |
71 | #include "llvm/ADT/Hashing.h" |
72 | #include "llvm/ADT/MapVector.h" |
73 | #include "llvm/ADT/STLExtras.h" |
74 | #include "llvm/ADT/SmallPtrSet.h" |
75 | #include "llvm/ADT/SmallVector.h" |
76 | #include "llvm/ADT/Statistic.h" |
77 | #include "llvm/ADT/StringRef.h" |
78 | #include "llvm/ADT/Twine.h" |
79 | #include "llvm/ADT/TypeSwitch.h" |
80 | #include "llvm/ADT/iterator_range.h" |
81 | #include "llvm/Analysis/AssumptionCache.h" |
82 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
83 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
84 | #include "llvm/Analysis/CFG.h" |
85 | #include "llvm/Analysis/CodeMetrics.h" |
86 | #include "llvm/Analysis/DemandedBits.h" |
87 | #include "llvm/Analysis/GlobalsModRef.h" |
88 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
89 | #include "llvm/Analysis/LoopAnalysisManager.h" |
90 | #include "llvm/Analysis/LoopInfo.h" |
91 | #include "llvm/Analysis/LoopIterator.h" |
92 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
93 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
94 | #include "llvm/Analysis/ScalarEvolution.h" |
95 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
96 | #include "llvm/Analysis/TargetLibraryInfo.h" |
97 | #include "llvm/Analysis/TargetTransformInfo.h" |
98 | #include "llvm/Analysis/ValueTracking.h" |
99 | #include "llvm/Analysis/VectorUtils.h" |
100 | #include "llvm/IR/Attributes.h" |
101 | #include "llvm/IR/BasicBlock.h" |
102 | #include "llvm/IR/CFG.h" |
103 | #include "llvm/IR/Constant.h" |
104 | #include "llvm/IR/Constants.h" |
105 | #include "llvm/IR/DataLayout.h" |
106 | #include "llvm/IR/DebugInfo.h" |
107 | #include "llvm/IR/DebugLoc.h" |
108 | #include "llvm/IR/DerivedTypes.h" |
109 | #include "llvm/IR/DiagnosticInfo.h" |
110 | #include "llvm/IR/Dominators.h" |
111 | #include "llvm/IR/Function.h" |
112 | #include "llvm/IR/IRBuilder.h" |
113 | #include "llvm/IR/InstrTypes.h" |
114 | #include "llvm/IR/Instruction.h" |
115 | #include "llvm/IR/Instructions.h" |
116 | #include "llvm/IR/IntrinsicInst.h" |
117 | #include "llvm/IR/Intrinsics.h" |
118 | #include "llvm/IR/MDBuilder.h" |
119 | #include "llvm/IR/Metadata.h" |
120 | #include "llvm/IR/Module.h" |
121 | #include "llvm/IR/Operator.h" |
122 | #include "llvm/IR/PatternMatch.h" |
123 | #include "llvm/IR/ProfDataUtils.h" |
124 | #include "llvm/IR/Type.h" |
125 | #include "llvm/IR/Use.h" |
126 | #include "llvm/IR/User.h" |
127 | #include "llvm/IR/Value.h" |
128 | #include "llvm/IR/Verifier.h" |
129 | #include "llvm/Support/Casting.h" |
130 | #include "llvm/Support/CommandLine.h" |
131 | #include "llvm/Support/Debug.h" |
132 | #include "llvm/Support/ErrorHandling.h" |
133 | #include "llvm/Support/InstructionCost.h" |
134 | #include "llvm/Support/MathExtras.h" |
135 | #include "llvm/Support/NativeFormatting.h" |
136 | #include "llvm/Support/raw_ostream.h" |
137 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
138 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
139 | #include "llvm/Transforms/Utils/Local.h" |
140 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
141 | #include "llvm/Transforms/Utils/LoopUtils.h" |
142 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
143 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
144 | #include "llvm/Transforms/Utils/SizeOpts.h" |
145 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
146 | #include <algorithm> |
147 | #include <cassert> |
148 | #include <cstdint> |
149 | #include <functional> |
150 | #include <iterator> |
151 | #include <limits> |
152 | #include <memory> |
153 | #include <string> |
154 | #include <tuple> |
155 | #include <utility> |
156 | |
157 | using namespace llvm; |
158 | |
159 | #define LV_NAME "loop-vectorize" |
160 | #define DEBUG_TYPE LV_NAME |
161 | |
162 | #ifndef NDEBUG |
163 | const char VerboseDebug[] = DEBUG_TYPE "-verbose" ; |
164 | #endif |
165 | |
166 | /// @{ |
167 | /// Metadata attribute names |
168 | const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all" ; |
169 | const char LLVMLoopVectorizeFollowupVectorized[] = |
170 | "llvm.loop.vectorize.followup_vectorized" ; |
171 | const char LLVMLoopVectorizeFollowupEpilogue[] = |
172 | "llvm.loop.vectorize.followup_epilogue" ; |
173 | /// @} |
174 | |
175 | STATISTIC(LoopsVectorized, "Number of loops vectorized" ); |
176 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization" ); |
177 | STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized" ); |
178 | |
179 | static cl::opt<bool> EnableEpilogueVectorization( |
180 | "enable-epilogue-vectorization" , cl::init(Val: true), cl::Hidden, |
181 | cl::desc("Enable vectorization of epilogue loops." )); |
182 | |
183 | static cl::opt<unsigned> EpilogueVectorizationForceVF( |
184 | "epilogue-vectorization-force-VF" , cl::init(Val: 1), cl::Hidden, |
185 | cl::desc("When epilogue vectorization is enabled, and a value greater than " |
186 | "1 is specified, forces the given VF for all applicable epilogue " |
187 | "loops." )); |
188 | |
189 | static cl::opt<unsigned> EpilogueVectorizationMinVF( |
190 | "epilogue-vectorization-minimum-VF" , cl::Hidden, |
191 | cl::desc("Only loops with vectorization factor equal to or larger than " |
192 | "the specified value are considered for epilogue vectorization." )); |
193 | |
194 | /// Loops with a known constant trip count below this number are vectorized only |
195 | /// if no scalar iteration overheads are incurred. |
196 | static cl::opt<unsigned> TinyTripCountVectorThreshold( |
197 | "vectorizer-min-trip-count" , cl::init(Val: 16), cl::Hidden, |
198 | cl::desc("Loops with a constant trip count that is smaller than this " |
199 | "value are vectorized only if no scalar iteration overheads " |
200 | "are incurred." )); |
201 | |
202 | static cl::opt<unsigned> VectorizeMemoryCheckThreshold( |
203 | "vectorize-memory-check-threshold" , cl::init(Val: 128), cl::Hidden, |
204 | cl::desc("The maximum allowed number of runtime memory checks" )); |
205 | |
206 | // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired |
207 | // and that predication is preferred; the following lists all options. I.e., the |
208 | // vectorizer will try to fold the tail-loop (epilogue) into the vector body |
209 | // and predicate the instructions accordingly. If tail-folding fails, there are |
210 | // different fallback strategies depending on these values: |
211 | namespace PreferPredicateTy { |
212 | enum Option { |
213 | ScalarEpilogue = 0, |
214 | PredicateElseScalarEpilogue, |
215 | PredicateOrDontVectorize |
216 | }; |
217 | } // namespace PreferPredicateTy |
218 | |
219 | static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
220 | "prefer-predicate-over-epilogue" , |
221 | cl::init(Val: PreferPredicateTy::ScalarEpilogue), |
222 | cl::Hidden, |
223 | cl::desc("Tail-folding and predication preferences over creating a scalar " |
224 | "epilogue loop." ), |
225 | cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
226 | "scalar-epilogue" , |
227 | "Don't tail-predicate loops, create scalar epilogue" ), |
228 | clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
229 | "predicate-else-scalar-epilogue" , |
230 | "prefer tail-folding, create scalar epilogue if tail " |
231 | "folding fails." ), |
232 | clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
233 | "predicate-dont-vectorize" , |
234 | "prefers tail-folding, don't attempt vectorization if " |
235 | "tail-folding fails." ))); |
236 | |
237 | static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( |
238 | "force-tail-folding-style" , cl::desc("Force the tail folding style" ), |
239 | cl::init(Val: TailFoldingStyle::None), |
240 | cl::values( |
241 | clEnumValN(TailFoldingStyle::None, "none" , "Disable tail folding" ), |
242 | clEnumValN( |
243 | TailFoldingStyle::Data, "data" , |
244 | "Create lane mask for data only, using active.lane.mask intrinsic" ), |
245 | clEnumValN(TailFoldingStyle::DataWithoutLaneMask, |
246 | "data-without-lane-mask" , |
247 | "Create lane mask with compare/stepvector" ), |
248 | clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control" , |
249 | "Create lane mask using active.lane.mask intrinsic, and use " |
250 | "it for both data and control flow" ), |
251 | clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
252 | "data-and-control-without-rt-check" , |
253 | "Similar to data-and-control, but remove the runtime check" ), |
254 | clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl" , |
255 | "Use predicated EVL instructions for tail folding. If EVL " |
256 | "is unsupported, fallback to data-without-lane-mask." ))); |
257 | |
258 | static cl::opt<bool> MaximizeBandwidth( |
259 | "vectorizer-maximize-bandwidth" , cl::init(Val: false), cl::Hidden, |
260 | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
261 | "will be determined by the smallest type in loop." )); |
262 | |
263 | static cl::opt<bool> EnableInterleavedMemAccesses( |
264 | "enable-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
265 | cl::desc("Enable vectorization on interleaved memory accesses in a loop" )); |
266 | |
267 | /// An interleave-group may need masking if it resides in a block that needs |
268 | /// predication, or in order to mask away gaps. |
269 | static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
270 | "enable-masked-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
271 | cl::desc("Enable vectorization on masked interleaved memory accesses in a loop" )); |
272 | |
273 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
274 | "force-target-num-scalar-regs" , cl::init(Val: 0), cl::Hidden, |
275 | cl::desc("A flag that overrides the target's number of scalar registers." )); |
276 | |
277 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
278 | "force-target-num-vector-regs" , cl::init(Val: 0), cl::Hidden, |
279 | cl::desc("A flag that overrides the target's number of vector registers." )); |
280 | |
281 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
282 | "force-target-max-scalar-interleave" , cl::init(Val: 0), cl::Hidden, |
283 | cl::desc("A flag that overrides the target's max interleave factor for " |
284 | "scalar loops." )); |
285 | |
286 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
287 | "force-target-max-vector-interleave" , cl::init(Val: 0), cl::Hidden, |
288 | cl::desc("A flag that overrides the target's max interleave factor for " |
289 | "vectorized loops." )); |
290 | |
291 | cl::opt<unsigned> llvm::ForceTargetInstructionCost( |
292 | "force-target-instruction-cost" , cl::init(Val: 0), cl::Hidden, |
293 | cl::desc("A flag that overrides the target's expected cost for " |
294 | "an instruction to a single constant value. Mostly " |
295 | "useful for getting consistent testing." )); |
296 | |
297 | static cl::opt<bool> ForceTargetSupportsScalableVectors( |
298 | "force-target-supports-scalable-vectors" , cl::init(Val: false), cl::Hidden, |
299 | cl::desc( |
300 | "Pretend that scalable vectors are supported, even if the target does " |
301 | "not support them. This flag should only be used for testing." )); |
302 | |
303 | static cl::opt<unsigned> SmallLoopCost( |
304 | "small-loop-cost" , cl::init(Val: 20), cl::Hidden, |
305 | cl::desc( |
306 | "The cost of a loop that is considered 'small' by the interleaver." )); |
307 | |
308 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
309 | "loop-vectorize-with-block-frequency" , cl::init(Val: true), cl::Hidden, |
310 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
311 | "heuristics minimizing code growth in cold regions and being more " |
312 | "aggressive in hot regions." )); |
313 | |
314 | // Runtime interleave loops for load/store throughput. |
315 | static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
316 | "enable-loadstore-runtime-interleave" , cl::init(Val: true), cl::Hidden, |
317 | cl::desc( |
318 | "Enable runtime interleaving until load/store ports are saturated" )); |
319 | |
320 | /// The number of stores in a loop that are allowed to need predication. |
321 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
322 | "vectorize-num-stores-pred" , cl::init(Val: 1), cl::Hidden, |
323 | cl::desc("Max number of stores to be predicated behind an if." )); |
324 | |
325 | static cl::opt<bool> EnableIndVarRegisterHeur( |
326 | "enable-ind-var-reg-heur" , cl::init(Val: true), cl::Hidden, |
327 | cl::desc("Count the induction variable only once when interleaving" )); |
328 | |
329 | static cl::opt<bool> EnableCondStoresVectorization( |
330 | "enable-cond-stores-vec" , cl::init(Val: true), cl::Hidden, |
331 | cl::desc("Enable if predication of stores during vectorization." )); |
332 | |
333 | static cl::opt<unsigned> MaxNestedScalarReductionIC( |
334 | "max-nested-scalar-reduction-interleave" , cl::init(Val: 2), cl::Hidden, |
335 | cl::desc("The maximum interleave count to use when interleaving a scalar " |
336 | "reduction in a nested loop." )); |
337 | |
338 | static cl::opt<bool> |
339 | PreferInLoopReductions("prefer-inloop-reductions" , cl::init(Val: false), |
340 | cl::Hidden, |
341 | cl::desc("Prefer in-loop vector reductions, " |
342 | "overriding the targets preference." )); |
343 | |
344 | static cl::opt<bool> ForceOrderedReductions( |
345 | "force-ordered-reductions" , cl::init(Val: false), cl::Hidden, |
346 | cl::desc("Enable the vectorisation of loops with in-order (strict) " |
347 | "FP reductions" )); |
348 | |
349 | static cl::opt<bool> PreferPredicatedReductionSelect( |
350 | "prefer-predicated-reduction-select" , cl::init(Val: false), cl::Hidden, |
351 | cl::desc( |
352 | "Prefer predicating a reduction operation over an after loop select." )); |
353 | |
354 | cl::opt<bool> llvm::EnableVPlanNativePath( |
355 | "enable-vplan-native-path" , cl::Hidden, |
356 | cl::desc("Enable VPlan-native vectorization path with " |
357 | "support for outer loop vectorization." )); |
358 | |
359 | cl::opt<bool> |
360 | llvm::VerifyEachVPlan("vplan-verify-each" , |
361 | #ifdef EXPENSIVE_CHECKS |
362 | cl::init(true), |
363 | #else |
364 | cl::init(Val: false), |
365 | #endif |
366 | cl::Hidden, |
367 | cl::desc("Verfiy VPlans after VPlan transforms." )); |
368 | |
369 | // This flag enables the stress testing of the VPlan H-CFG construction in the |
370 | // VPlan-native vectorization path. It must be used in conjunction with |
371 | // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
372 | // verification of the H-CFGs built. |
373 | static cl::opt<bool> VPlanBuildStressTest( |
374 | "vplan-build-stress-test" , cl::init(Val: false), cl::Hidden, |
375 | cl::desc( |
376 | "Build VPlan for every supported loop nest in the function and bail " |
377 | "out right after the build (stress test the VPlan H-CFG construction " |
378 | "in the VPlan-native vectorization path)." )); |
379 | |
380 | cl::opt<bool> llvm::EnableLoopInterleaving( |
381 | "interleave-loops" , cl::init(Val: true), cl::Hidden, |
382 | cl::desc("Enable loop interleaving in Loop vectorization passes" )); |
383 | cl::opt<bool> llvm::EnableLoopVectorization( |
384 | "vectorize-loops" , cl::init(Val: true), cl::Hidden, |
385 | cl::desc("Run the Loop vectorization passes" )); |
386 | |
387 | static cl::opt<cl::boolOrDefault> ForceSafeDivisor( |
388 | "force-widen-divrem-via-safe-divisor" , cl::Hidden, |
389 | cl::desc( |
390 | "Override cost based safe divisor widening for div/rem instructions" )); |
391 | |
392 | static cl::opt<bool> UseWiderVFIfCallVariantsPresent( |
393 | "vectorizer-maximize-bandwidth-for-vector-calls" , cl::init(Val: true), |
394 | cl::Hidden, |
395 | cl::desc("Try wider VFs if they enable the use of vector variants" )); |
396 | |
397 | static cl::opt<bool> EnableEarlyExitVectorization( |
398 | "enable-early-exit-vectorization" , cl::init(Val: true), cl::Hidden, |
399 | cl::desc( |
400 | "Enable vectorization of early exit loops with uncountable exits." )); |
401 | |
402 | // Likelihood of bypassing the vectorized loop because assumptions about SCEV |
403 | // variables not overflowing do not hold. See `emitSCEVChecks`. |
404 | static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127}; |
405 | // Likelihood of bypassing the vectorized loop because pointers overlap. See |
406 | // `emitMemRuntimeChecks`. |
407 | static constexpr uint32_t MemCheckBypassWeights[] = {1, 127}; |
408 | // Likelihood of bypassing the vectorized loop because there are zero trips left |
409 | // after prolog. See `emitIterationCountCheck`. |
410 | static constexpr uint32_t MinItersBypassWeights[] = {1, 127}; |
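// With branch weights of {1, 127}, the bypass edge is treated as taken with
// probability roughly 1/(1+127), i.e. the checks are expected to pass in the
// overwhelmingly common case.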
411 | |
412 | /// A helper function that returns true if the given type is irregular. The |
413 | /// type is irregular if its allocated size doesn't equal the store size of an |
414 | /// element of the corresponding vector type. |
415 | static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
416 | // Determine if an array of N elements of type Ty is "bitcast compatible" |
417 | // with a <N x Ty> vector. |
418 | // This is only true if there is no padding between the array elements. |
419 | return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
420 | } |
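// Illustrative examples, assuming a typical data layout: i1 is irregular
// because it carries a single bit of data but is allocated a full byte, and
// x86_fp80 is irregular because its 80 data bits are padded to a larger
// allocation size; i32 and float are regular.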
421 | |
422 | /// A version of ScalarEvolution::getSmallConstantTripCount that returns an |
423 | /// ElementCount to include loops whose trip count is a function of vscale. |
424 | static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, |
425 | const Loop *L) { |
426 | return ElementCount::getFixed(MinVal: SE->getSmallConstantTripCount(L)); |
427 | } |
428 | |
429 | /// Returns "best known" trip count, which is either a valid positive trip count |
430 | /// or std::nullopt when an estimate cannot be made (including when the trip |
431 | /// count would overflow), for the specified loop \p L as defined by the |
432 | /// following procedure: |
433 | /// 1) Returns exact trip count if it is known. |
434 | /// 2) Returns expected trip count according to profile data if any. |
435 | /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. |
436 | /// 4) Returns std::nullopt if all of the above failed. |
437 | static std::optional<ElementCount> |
438 | getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, |
439 | bool CanUseConstantMax = true) { |
440 | // Check if exact trip count is known. |
441 | if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L)) |
442 | return ExpectedTC; |
443 | |
444 | // Check if there is an expected trip count available from profile data. |
445 | if (LoopVectorizeWithBlockFrequency) |
446 | if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
447 | return ElementCount::getFixed(MinVal: *EstimatedTC); |
448 | |
449 | if (!CanUseConstantMax) |
450 | return std::nullopt; |
451 | |
452 | // Check if upper bound estimate is known. |
453 | if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount()) |
454 | return ElementCount::getFixed(MinVal: ExpectedTC); |
455 | |
456 | return std::nullopt; |
457 | } |
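// Illustrative behaviour on hypothetical loops: a loop with a compile-time
// trip count of 100 returns 100 via step 1; a loop whose count is only known
// from profile data (say an estimate of 80 iterations) returns 80 via step 2;
// and a loop where only an upper bound (e.g. "at most 64 iterations") can be
// proven returns that bound via step 3 when CanUseConstantMax is true.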
458 | |
459 | namespace { |
460 | // Forward declare GeneratedRTChecks. |
461 | class GeneratedRTChecks; |
462 | |
463 | using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; |
464 | } // namespace |
465 | |
466 | namespace llvm { |
467 | |
468 | AnalysisKey ShouldRunExtraVectorPasses::Key; |
469 | |
470 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
471 | /// block to a specified vectorization factor (VF). |
472 | /// This class performs the widening of scalars into vectors, or multiple |
473 | /// scalars. This class also implements the following features: |
474 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
475 | /// counts that are known to be a multiple of the vectorization factor. |
476 | /// * It handles the code generation for reduction variables. |
477 | /// * Scalarization (implementation using scalars) of un-vectorizable |
478 | /// instructions. |
479 | /// InnerLoopVectorizer does not perform any vectorization-legality |
480 | /// checks, and relies on the caller to check for the different legality |
481 | /// aspects. The InnerLoopVectorizer relies on the |
482 | /// LoopVectorizationLegality class to provide information about the induction |
483 | /// and reduction variables that were found, for a given vectorization factor. |
484 | class InnerLoopVectorizer { |
485 | public: |
486 | InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
487 | LoopInfo *LI, DominatorTree *DT, |
488 | const TargetLibraryInfo *TLI, |
489 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
490 | OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
491 | ElementCount MinProfitableTripCount, |
492 | unsigned UnrollFactor, LoopVectorizationCostModel *CM, |
493 | BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, |
494 | GeneratedRTChecks &RTChecks, VPlan &Plan) |
495 | : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
496 | AC(AC), ORE(ORE), VF(VecWidth), |
497 | MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor), |
498 | Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI), |
499 | RTChecks(RTChecks), Plan(Plan), |
500 | VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {} |
501 | |
502 | virtual ~InnerLoopVectorizer() = default; |
503 | |
504 | /// Create a new empty loop that will contain vectorized instructions later |
505 | /// on, while the old loop will be used as the scalar remainder. Control flow |
506 | /// is generated around the vectorized (and scalar epilogue) loops consisting |
507 | /// of various checks and bypasses. Return the pre-header block of the new |
508 | /// loop. In the case of epilogue vectorization, this function is overridden to |
509 | /// handle the more complex control flow around the loops. |
510 | virtual BasicBlock *createVectorizedLoopSkeleton(); |
511 | |
512 | /// Fix the vectorized code, taking care of header phi's, and more. |
513 | void fixVectorizedLoop(VPTransformState &State); |
514 | |
515 | /// Fix the non-induction PHIs in \p Plan. |
516 | void fixNonInductionPHIs(VPTransformState &State); |
517 | |
518 | /// Returns the original loop trip count. |
519 | Value *getTripCount() const { return TripCount; } |
520 | |
521 | /// Used to set the trip count after ILV's construction and after the |
522 | /// preheader block has been executed. Note that this always holds the trip |
523 | /// count of the original loop for both main loop and epilogue vectorization. |
524 | void setTripCount(Value *TC) { TripCount = TC; } |
525 | |
526 | /// Return the additional bypass block which targets the scalar loop by |
527 | /// skipping the epilogue loop after completing the main loop. |
528 | BasicBlock *getAdditionalBypassBlock() const { |
529 | assert(AdditionalBypassBlock && |
530 | "Trying to access AdditionalBypassBlock but it has not been set" ); |
531 | return AdditionalBypassBlock; |
532 | } |
533 | |
534 | protected: |
535 | friend class LoopVectorizationPlanner; |
536 | |
537 | /// Returns (and creates if needed) the trip count of the widened loop. |
538 | Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); |
539 | |
540 | // Create a check to see if the vector loop should be executed |
541 | Value *createIterationCountCheck(ElementCount VF, unsigned UF) const; |
542 | |
543 | /// Emit a bypass check to see if the vector trip count is zero, including if |
544 | /// it overflows. |
545 | void emitIterationCountCheck(BasicBlock *Bypass); |
546 | |
547 | /// Emit a bypass check to see if all of the SCEV assumptions we've |
548 | /// had to make are correct. Returns the block containing the checks or |
549 | /// nullptr if no checks have been added. |
550 | BasicBlock *emitSCEVChecks(BasicBlock *Bypass); |
551 | |
552 | /// Emit bypass checks to check any memory assumptions we may have made. |
553 | /// Returns the block containing the checks or nullptr if no checks have been |
554 | /// added. |
555 | BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass); |
556 | |
557 | /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, |
558 | /// vector loop preheader, middle block and scalar preheader. |
559 | void createVectorLoopSkeleton(StringRef Prefix); |
560 | |
561 | /// Allow subclasses to override and print debug traces before/after vplan |
562 | /// execution, when trace information is requested. |
563 | virtual void printDebugTracesAtStart() {} |
564 | virtual void printDebugTracesAtEnd() {} |
565 | |
566 | /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the |
567 | /// vector preheader and its predecessor, also connecting the new block to the |
568 | /// scalar preheader. |
569 | void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB); |
570 | |
571 | /// The original loop. |
572 | Loop *OrigLoop; |
573 | |
574 | /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies |
575 | /// dynamic knowledge to simplify SCEV expressions and converts them to a |
576 | /// more usable form. |
577 | PredicatedScalarEvolution &PSE; |
578 | |
579 | /// Loop Info. |
580 | LoopInfo *LI; |
581 | |
582 | /// Dominator Tree. |
583 | DominatorTree *DT; |
584 | |
585 | /// Target Library Info. |
586 | const TargetLibraryInfo *TLI; |
587 | |
588 | /// Target Transform Info. |
589 | const TargetTransformInfo *TTI; |
590 | |
591 | /// Assumption Cache. |
592 | AssumptionCache *AC; |
593 | |
594 | /// Interface to emit optimization remarks. |
595 | OptimizationRemarkEmitter *ORE; |
596 | |
597 | /// The vectorization SIMD factor to use. Each vector will have this many |
598 | /// vector elements. |
599 | ElementCount VF; |
600 | |
601 | ElementCount MinProfitableTripCount; |
602 | |
603 | /// The vectorization unroll factor to use. Each scalar is vectorized to this |
604 | /// many different vector instructions. |
605 | unsigned UF; |
606 | |
607 | /// The builder that we use |
608 | IRBuilder<> Builder; |
609 | |
610 | // --- Vectorization state --- |
611 | |
612 | /// The vector-loop preheader. |
613 | BasicBlock *LoopVectorPreHeader = nullptr; |
614 | |
615 | /// The scalar-loop preheader. |
616 | BasicBlock *LoopScalarPreHeader = nullptr; |
617 | |
618 | /// Middle Block between the vector and the scalar. |
619 | BasicBlock *LoopMiddleBlock = nullptr; |
620 | |
621 | /// Trip count of the original loop. |
622 | Value *TripCount = nullptr; |
623 | |
624 | /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) |
625 | Value *VectorTripCount = nullptr; |
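// For example, with an original TripCount of 17, VF = 4 and UF = 2, the
// vector loop covers VectorTripCount = 17 - (17 % 8) = 16 iterations and the
// remaining iteration is left to the scalar epilogue.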
626 | |
627 | /// The profitability analysis. |
628 | LoopVectorizationCostModel *Cost; |
629 | |
630 | /// BFI and PSI are used to check for profile guided size optimizations. |
631 | BlockFrequencyInfo *BFI; |
632 | ProfileSummaryInfo *PSI; |
633 | |
634 | /// Structure to hold information about generated runtime checks, responsible |
635 | /// for cleaning the checks, if vectorization turns out unprofitable. |
636 | GeneratedRTChecks &RTChecks; |
637 | |
638 | /// The additional bypass block which conditionally skips over the epilogue |
639 | /// loop after executing the main loop. Needed to resume inductions and |
640 | /// reductions during epilogue vectorization. |
641 | BasicBlock *AdditionalBypassBlock = nullptr; |
642 | |
643 | VPlan &Plan; |
644 | |
645 | /// The vector preheader block of \p Plan, used as target for check blocks |
646 | /// introduced during skeleton creation. |
647 | VPBlockBase *VectorPHVPB; |
648 | }; |
649 | |
650 | /// Encapsulate information regarding vectorization of a loop and its epilogue. |
651 | /// This information is meant to be updated and used across two stages of |
652 | /// epilogue vectorization. |
653 | struct EpilogueLoopVectorizationInfo { |
654 | ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0); |
655 | unsigned MainLoopUF = 0; |
656 | ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0); |
657 | unsigned EpilogueUF = 0; |
658 | BasicBlock *MainLoopIterationCountCheck = nullptr; |
659 | BasicBlock *EpilogueIterationCountCheck = nullptr; |
660 | BasicBlock *SCEVSafetyCheck = nullptr; |
661 | BasicBlock *MemSafetyCheck = nullptr; |
662 | Value *TripCount = nullptr; |
663 | Value *VectorTripCount = nullptr; |
664 | VPlan &EpiloguePlan; |
665 | |
666 | EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, |
667 | ElementCount EVF, unsigned EUF, |
668 | VPlan &EpiloguePlan) |
669 | : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), |
670 | EpiloguePlan(EpiloguePlan) { |
671 | assert(EUF == 1 && |
672 | "A high UF for the epilogue loop is likely not beneficial." ); |
673 | } |
674 | }; |
675 | |
676 | /// An extension of the inner loop vectorizer that creates a skeleton for a |
677 | /// vectorized loop that has its epilogue (residual) also vectorized. |
678 | /// The idea is to run the vplan on a given loop twice, firstly to setup the |
679 | /// skeleton and vectorize the main loop, and secondly to complete the skeleton |
680 | /// from the first step and vectorize the epilogue. This is achieved by |
681 | /// deriving two concrete strategy classes from this base class and invoking |
682 | /// them in succession from the loop vectorizer planner. |
683 | class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer { |
684 | public: |
685 | InnerLoopAndEpilogueVectorizer( |
686 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
687 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
688 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
689 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
690 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
691 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) |
692 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
693 | EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM, |
694 | BFI, PSI, Checks, Plan), |
695 | EPI(EPI) {} |
696 | |
697 | // Override this function to handle the more complex control flow around the |
698 | // three loops. |
699 | BasicBlock *createVectorizedLoopSkeleton() final { |
700 | return createEpilogueVectorizedLoopSkeleton(); |
701 | } |
702 | |
703 | /// The interface for creating a vectorized skeleton using one of two |
704 | /// different strategies, each corresponding to one execution of the vplan |
705 | /// as described above. |
706 | virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; |
707 | |
708 | /// Holds and updates state information required to vectorize the main loop |
709 | /// and its epilogue in two separate passes. This setup helps us avoid |
710 | /// regenerating and recomputing runtime safety checks. It also helps us to |
711 | /// shorten the iteration-count-check path length for the cases where the |
712 | /// iteration count of the loop is so small that the main vector loop is |
713 | /// completely skipped. |
714 | EpilogueLoopVectorizationInfo &EPI; |
715 | }; |
716 | |
717 | /// A specialized derived class of inner loop vectorizer that performs |
718 | /// vectorization of *main* loops in the process of vectorizing loops and their |
719 | /// epilogues. |
720 | class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { |
721 | public: |
722 | EpilogueVectorizerMainLoop( |
723 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
724 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
725 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
726 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
727 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
728 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan) |
729 | : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
730 | EPI, CM, BFI, PSI, Check, Plan) {} |
731 | /// Implements the interface for creating a vectorized skeleton using the |
732 | /// *main loop* strategy (ie the first pass of vplan execution). |
733 | BasicBlock *createEpilogueVectorizedLoopSkeleton() final; |
734 | |
735 | protected: |
736 | /// Emits an iteration count bypass check once for the main loop (when \p |
737 | /// ForEpilogue is false) and once for the epilogue loop (when \p |
738 | /// ForEpilogue is true). |
739 | BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue); |
740 | void printDebugTracesAtStart() override; |
741 | void printDebugTracesAtEnd() override; |
742 | }; |
743 | |
744 | // A specialized derived class of inner loop vectorizer that performs |
745 | // vectorization of *epilogue* loops in the process of vectorizing loops and |
746 | // their epilogues. |
747 | class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { |
748 | public: |
749 | EpilogueVectorizerEpilogueLoop( |
750 | Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, |
751 | DominatorTree *DT, const TargetLibraryInfo *TLI, |
752 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
753 | OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, |
754 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
755 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan) |
756 | : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
757 | EPI, CM, BFI, PSI, Checks, Plan) { |
758 | TripCount = EPI.TripCount; |
759 | } |
760 | /// Implements the interface for creating a vectorized skeleton using the |
761 | /// *epilogue loop* strategy (ie the second pass of vplan execution). |
762 | BasicBlock *createEpilogueVectorizedLoopSkeleton() final; |
763 | |
764 | protected: |
765 | /// Emits an iteration count bypass check after the main vector loop has |
766 | /// finished to see if there are any iterations left to execute by either |
767 | /// the vector epilogue or the scalar epilogue. |
768 | BasicBlock *emitMinimumVectorEpilogueIterCountCheck( |
769 | BasicBlock *Bypass, |
770 | BasicBlock *Insert); |
771 | void printDebugTracesAtStart() override; |
772 | void printDebugTracesAtEnd() override; |
773 | }; |
774 | } // end namespace llvm |
775 | |
776 | /// Look for a meaningful debug location on the instruction or its operands. |
777 | static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { |
778 | if (!I) |
779 | return DebugLoc::getUnknown(); |
780 | |
781 | DebugLoc Empty; |
782 | if (I->getDebugLoc() != Empty) |
783 | return I->getDebugLoc(); |
784 | |
785 | for (Use &Op : I->operands()) { |
786 | if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op)) |
787 | if (OpInst->getDebugLoc() != Empty) |
788 | return OpInst->getDebugLoc(); |
789 | } |
790 | |
791 | return I->getDebugLoc(); |
792 | } |
793 | |
794 | /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
795 | /// is passed, the message relates to that particular instruction. |
796 | #ifndef NDEBUG |
797 | static void debugVectorizationMessage(const StringRef Prefix, |
798 | const StringRef DebugMsg, |
799 | Instruction *I) { |
800 | dbgs() << "LV: " << Prefix << DebugMsg; |
801 | if (I != nullptr) |
802 | dbgs() << " " << *I; |
803 | else |
804 | dbgs() << '.'; |
805 | dbgs() << '\n'; |
806 | } |
807 | #endif |
808 | |
809 | /// Create an analysis remark that explains why vectorization failed |
810 | /// |
811 | /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
812 | /// RemarkName is the identifier for the remark. If \p I is passed it is an |
813 | /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
814 | /// the location of the remark. If \p DL is passed, use it as debug location for |
815 | /// the remark. \return the remark object that can be streamed to. |
816 | static OptimizationRemarkAnalysis |
817 | createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, |
818 | Instruction *I, DebugLoc DL = {}) { |
819 | BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader(); |
820 | // If debug location is attached to the instruction, use it. Otherwise if DL |
821 | // was not provided, use the loop's. |
822 | if (I && I->getDebugLoc()) |
823 | DL = I->getDebugLoc(); |
824 | else if (!DL) |
825 | DL = TheLoop->getStartLoc(); |
826 | |
827 | return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); |
828 | } |
829 | |
830 | namespace llvm { |
831 | |
832 | /// Return a value for Step multiplied by VF. |
833 | Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, |
834 | int64_t Step) { |
835 | assert(Ty->isIntegerTy() && "Expected an integer step" ); |
836 | return B.CreateElementCount(Ty, EC: VF.multiplyCoefficientBy(RHS: Step)); |
837 | } |
838 | |
839 | /// Return the runtime value for VF. |
840 | Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { |
841 | return B.CreateElementCount(Ty, EC: VF); |
842 | } |
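// As an illustrative example: with Ty = i64 and a scalable VF of
// <vscale x 4 x ...>, createStepForVF(B, Ty, VF, 2) emits a value equal to
// 8 * vscale, while for a fixed VF of 4 it simply yields the constant 8;
// getRuntimeVF returns the element count itself (4 * vscale, or 4).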
843 | |
844 | void reportVectorizationFailure(const StringRef DebugMsg, |
845 | const StringRef OREMsg, const StringRef ORETag, |
846 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
847 | Instruction *I) { |
848 | LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: " , DebugMsg, I)); |
849 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
850 | ORE->emit( |
851 | OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
852 | << "loop not vectorized: " << OREMsg); |
853 | } |
854 | |
855 | /// Reports an informative message: print \p Msg for debugging purposes as well |
856 | /// as an optimization remark. Uses either \p I as location of the remark, or |
857 | /// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the |
858 | /// remark. |
859 | static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, |
860 | OptimizationRemarkEmitter *ORE, |
861 | Loop *TheLoop, Instruction *I = nullptr, |
862 | DebugLoc DL = {}) { |
863 | LLVM_DEBUG(debugVectorizationMessage("" , Msg, I)); |
864 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
865 | ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, |
866 | I, DL) |
867 | << Msg); |
868 | } |
869 | |
870 | /// Report successful vectorization of the loop. In case an outer loop is |
871 | /// vectorized, prepend "outer" to the vectorization remark. |
872 | static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
873 | VectorizationFactor VF, unsigned IC) { |
874 | LLVM_DEBUG(debugVectorizationMessage( |
875 | "Vectorizing: " , TheLoop->isInnermost() ? "innermost loop" : "outer loop" , |
876 | nullptr)); |
877 | StringRef LoopType = TheLoop->isInnermost() ? "" : "outer " ; |
878 | ORE->emit(RemarkBuilder: [&]() { |
879 | return OptimizationRemark(LV_NAME, "Vectorized" , TheLoop->getStartLoc(), |
880 | TheLoop->getHeader()) |
881 | << "vectorized " << LoopType << "loop (vectorization width: " |
882 | << ore::NV("VectorizationFactor" , VF.Width) |
883 | << ", interleaved count: " << ore::NV("InterleaveCount" , IC) << ")" ; |
884 | }); |
885 | } |
886 | |
887 | } // end namespace llvm |
888 | |
889 | namespace llvm { |
890 | |
891 | // Loop vectorization cost-model hints how the scalar epilogue loop should be |
892 | // lowered. |
893 | enum ScalarEpilogueLowering { |
894 | |
895 | // The default: allowing scalar epilogues. |
896 | CM_ScalarEpilogueAllowed, |
897 | |
898 | // Vectorization with OptForSize: don't allow epilogues. |
899 | CM_ScalarEpilogueNotAllowedOptSize, |
900 | |
901 | // A special case of vectorisation with OptForSize: loops with a very small |
902 | // trip count are considered for vectorization under OptForSize, thereby |
903 | // making sure the cost of their loop body is dominant, free of runtime |
904 | // guards and scalar iteration overheads. |
905 | CM_ScalarEpilogueNotAllowedLowTripLoop, |
906 | |
907 | // Loop hint predicate indicating an epilogue is undesired. |
908 | CM_ScalarEpilogueNotNeededUsePredicate, |
909 | |
910 | // Directive indicating we must either tail fold or not vectorize |
911 | CM_ScalarEpilogueNotAllowedUsePredicate |
912 | }; |
913 | |
914 | /// LoopVectorizationCostModel - estimates the expected speedups due to |
915 | /// vectorization. |
916 | /// In many cases vectorization is not profitable. This can happen because of |
917 | /// a number of reasons. In this class we mainly attempt to predict the |
918 | /// expected speedup/slowdowns due to the supported instruction set. We use the |
919 | /// TargetTransformInfo to query the different backends for the cost of |
920 | /// different operations. |
921 | class LoopVectorizationCostModel { |
922 | friend class LoopVectorizationPlanner; |
923 | |
924 | public: |
925 | LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, |
926 | PredicatedScalarEvolution &PSE, LoopInfo *LI, |
927 | LoopVectorizationLegality *Legal, |
928 | const TargetTransformInfo &TTI, |
929 | const TargetLibraryInfo *TLI, DemandedBits *DB, |
930 | AssumptionCache *AC, |
931 | OptimizationRemarkEmitter *ORE, const Function *F, |
932 | const LoopVectorizeHints *Hints, |
933 | InterleavedAccessInfo &IAI, |
934 | ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) |
935 | : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), |
936 | TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), |
937 | Hints(Hints), InterleaveInfo(IAI) { |
938 | if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors) |
939 | initializeVScaleForTuning(); |
940 | CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput; |
941 | // Query this against the original loop and save it here because the profile |
942 | // of the original loop header may change as the transformation happens. |
943 | OptForSize = llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI, |
944 | QueryType: PGSOQueryType::IRPass); |
945 | } |
946 | |
947 | /// \return An upper bound for the vectorization factors (both fixed and |
948 | /// scalable). If the factors are 0, vectorization and interleaving should be |
949 | /// avoided up front. |
950 | FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC); |
951 | |
952 | /// \return True if runtime checks are required for vectorization, and false |
953 | /// otherwise. |
954 | bool runtimeChecksRequired(); |
955 | |
956 | /// Setup cost-based decisions for user vectorization factor. |
957 | /// \return true if the UserVF is a feasible VF to be chosen. |
958 | bool selectUserVectorizationFactor(ElementCount UserVF) { |
959 | collectNonVectorizedAndSetWideningDecisions(VF: UserVF); |
960 | return expectedCost(VF: UserVF).isValid(); |
961 | } |
962 | |
963 | /// \return True if maximizing vector bandwidth is enabled by the target or |
964 | /// user options, for the given register kind. |
965 | bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); |
966 | |
967 | /// \return True if maximizing vector bandwidth is enabled by the target or |
968 | /// user options, for the given vector factor. |
969 | bool useMaxBandwidth(ElementCount VF); |
970 | |
971 | /// \return The size (in bits) of the smallest and widest types in the code |
972 | /// that needs to be vectorized. We ignore values that remain scalar such as |
973 | /// 64 bit loop indices. |
974 | std::pair<unsigned, unsigned> getSmallestAndWidestTypes(); |
975 | |
976 | /// \return The desired interleave count. |
977 | /// If interleave count has been specified by metadata it will be returned. |
978 | /// Otherwise, the interleave count is computed and returned. VF and LoopCost |
979 | /// are the selected vectorization factor and the cost of the selected VF. |
980 | unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF, |
981 | InstructionCost LoopCost); |
982 | |
983 | /// Memory access instruction may be vectorized in more than one way. |
984 | /// Form of instruction after vectorization depends on cost. |
985 | /// This function takes cost-based decisions for Load/Store instructions |
986 | /// and collects them in a map. This decisions map is used for building |
987 | /// the lists of loop-uniform and loop-scalar instructions. |
988 | /// The calculated cost is saved with widening decision in order to |
989 | /// avoid redundant calculations. |
990 | void setCostBasedWideningDecision(ElementCount VF); |
991 | |
992 | /// A call may be vectorized in different ways depending on whether we have |
993 | /// vectorized variants available and whether the target supports masking. |
994 | /// This function analyzes all calls in the function at the supplied VF, |
995 | /// makes a decision based on the costs of available options, and stores that |
996 | /// decision in a map for use in planning and plan execution. |
997 | void setVectorizedCallDecision(ElementCount VF); |
998 | |
999 | /// Collect values we want to ignore in the cost model. |
1000 | void collectValuesToIgnore(); |
1001 | |
1002 | /// Collect all element types in the loop for which widening is needed. |
1003 | void collectElementTypesForWidening(); |
1004 | |
1005 | /// Split reductions into those that happen in the loop, and those that happen |
1006 | /// outside. In-loop reductions are collected into InLoopReductions. |
1007 | void collectInLoopReductions(); |
1008 | |
1009 | /// Returns true if we should use strict in-order reductions for the given |
1010 | /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, |
1011 | /// the IsOrdered flag of RdxDesc is set and we do not allow reordering |
1012 | /// of FP operations. |
1013 | bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { |
1014 | return !Hints->allowReordering() && RdxDesc.isOrdered(); |
1015 | } |
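// For example, for a strict floating-point reduction "s += a[i]" that may not
// be reassociated, an ordered reduction preserves the sequential evaluation
// (((s + a[0]) + a[1]) + ...), whereas an unordered reduction may add vector
// lanes in a different association, which can change the rounded result.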
1016 | |
1017 | /// \returns The smallest bitwidth each instruction can be represented with. |
1018 | /// The vector equivalents of these instructions should be truncated to this |
1019 | /// type. |
1020 | const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const { |
1021 | return MinBWs; |
1022 | } |
1023 | |
1024 | /// \returns True if it is more profitable to scalarize instruction \p I for |
1025 | /// vectorization factor \p VF. |
1026 | bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
1027 | assert(VF.isVector() && |
1028 | "Profitable to scalarize relevant only for VF > 1." ); |
1029 | assert( |
1030 | TheLoop->isInnermost() && |
1031 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1032 | |
1033 | auto Scalars = InstsToScalarize.find(Val: VF); |
1034 | assert(Scalars != InstsToScalarize.end() && |
1035 | "VF not yet analyzed for scalarization profitability" ); |
1036 | return Scalars->second.contains(Val: I); |
1037 | } |
1038 | |
1039 | /// Returns true if \p I is known to be uniform after vectorization. |
1040 | bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
1041 | assert( |
1042 | TheLoop->isInnermost() && |
1043 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1044 | // Pseudo probe needs to be duplicated for each unrolled iteration and |
1045 | // vector lane so that profiled loop trip count can be accurately |
1046 | // accumulated instead of being undercounted. |
1047 | if (isa<PseudoProbeInst>(Val: I)) |
1048 | return false; |
1049 | |
1050 | if (VF.isScalar()) |
1051 | return true; |
1052 | |
1053 | auto UniformsPerVF = Uniforms.find(Val: VF); |
1054 | assert(UniformsPerVF != Uniforms.end() && |
1055 | "VF not yet analyzed for uniformity" ); |
1056 | return UniformsPerVF->second.count(Ptr: I); |
1057 | } |
1058 | |
1059 | /// Returns true if \p I is known to be scalar after vectorization. |
1060 | bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
1061 | assert( |
1062 | TheLoop->isInnermost() && |
1063 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1064 | if (VF.isScalar()) |
1065 | return true; |
1066 | |
1067 | auto ScalarsPerVF = Scalars.find(Val: VF); |
1068 | assert(ScalarsPerVF != Scalars.end() && |
1069 | "Scalar values are not calculated for VF" ); |
1070 | return ScalarsPerVF->second.count(Ptr: I); |
1071 | } |
1072 | |
1073 | /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
1074 | /// for vectorization factor \p VF. |
1075 | bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
1076 | return VF.isVector() && MinBWs.contains(Key: I) && |
1077 | !isProfitableToScalarize(I, VF) && |
1078 | !isScalarAfterVectorization(I, VF); |
1079 | } |
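// For example (hypothetical): if DemandedBits shows that an i32 add only ever
// feeds uses that need its low 8 bits, MinBWs may record 8 for it, and the
// widened add can be performed on <VF x i8> and extended where needed, as long
// as the instruction is neither scalar after vectorization nor profitable to
// scalarize.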
1080 | |
1081 | /// Decision that was taken during cost calculation for memory instruction. |
1082 | enum InstWidening { |
1083 | CM_Unknown, |
1084 | CM_Widen, // For consecutive accesses with stride +1. |
1085 | CM_Widen_Reverse, // For consecutive accesses with stride -1. |
1086 | CM_Interleave, |
1087 | CM_GatherScatter, |
1088 | CM_Scalarize, |
1089 | CM_VectorCall, |
1090 | CM_IntrinsicCall |
1091 | }; |
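// Illustrative mapping for hypothetical accesses: a load of a[i] (stride +1)
// is typically CM_Widen, a load of a[n - i] (stride -1) is CM_Widen_Reverse,
// a load of a[b[i]] with no computable stride is a candidate for
// CM_GatherScatter, members of a group like a[2*i] / a[2*i+1] may be
// CM_Interleave, and an access that is cheapest as per-lane scalar code is
// CM_Scalarize.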
1092 | |
1093 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1094 | /// instruction \p I and vector width \p VF. |
1095 | void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
1096 | InstructionCost Cost) { |
1097 | assert(VF.isVector() && "Expected VF >=2" ); |
1098 | WideningDecisions[{I, VF}] = {W, Cost}; |
1099 | } |
1100 | |
1101 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1102 | /// interleaving group \p Grp and vector width \p VF. |
1103 | void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
1104 | ElementCount VF, InstWidening W, |
1105 | InstructionCost Cost) { |
1106 | assert(VF.isVector() && "Expected VF >=2" ); |
1107 | /// Broadcast this decision to all instructions inside the group. |
1108 | /// When interleaving, the cost will only be assigned to one instruction: the |
1109 | /// insert position. For other cases, add the appropriate fraction of the |
1110 | /// total cost to each instruction. This ensures accurate costs are used, |
1111 | /// even if the insert position instruction is not used. |
1112 | InstructionCost InsertPosCost = Cost; |
1113 | InstructionCost OtherMemberCost = 0; |
1114 | if (W != CM_Interleave) |
1115 | OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers(); |
1117 | for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) { |
1118 | if (auto *I = Grp->getMember(Index: Idx)) { |
1119 | if (Grp->getInsertPos() == I) |
1120 | WideningDecisions[{I, VF}] = {W, InsertPosCost}; |
1121 | else |
1122 | WideningDecisions[{I, VF}] = {W, OtherMemberCost}; |
1123 | } |
1124 | } |
1125 | } |
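// Worked example: for a group with 4 members and a total cost of 8, a
// CM_Interleave decision assigns the full cost of 8 to the insert-position
// member and 0 to the others, whereas any other decision assigns each member
// 8 / 4 = 2.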
1126 | |
1127 | /// Return the cost model decision for the given instruction \p I and vector |
1128 | /// width \p VF. Return CM_Unknown if this instruction did not pass |
1129 | /// through the cost modeling. |
1130 | InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
1131 | assert(VF.isVector() && "Expected VF to be a vector VF" ); |
1132 | assert( |
1133 | TheLoop->isInnermost() && |
1134 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1135 | |
1136 | std::pair<Instruction *, ElementCount> InstOnVF(I, VF); |
1137 | auto Itr = WideningDecisions.find(Val: InstOnVF); |
1138 | if (Itr == WideningDecisions.end()) |
1139 | return CM_Unknown; |
1140 | return Itr->second.first; |
1141 | } |
1142 | |
1143 | /// Return the vectorization cost for the given instruction \p I and vector |
1144 | /// width \p VF. |
1145 | InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
1146 | assert(VF.isVector() && "Expected VF >=2" ); |
1147 | std::pair<Instruction *, ElementCount> InstOnVF(I, VF); |
1148 | assert(WideningDecisions.contains(InstOnVF) && |
1149 | "The cost is not calculated" ); |
1150 | return WideningDecisions[InstOnVF].second; |
1151 | } |
1152 | |
1153 | struct CallWideningDecision { |
1154 | InstWidening Kind; |
1155 | Function *Variant; |
1156 | Intrinsic::ID IID; |
1157 | std::optional<unsigned> MaskPos; |
1158 | InstructionCost Cost; |
1159 | }; |
1160 | |
1161 | void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, |
1162 | Function *Variant, Intrinsic::ID IID, |
1163 | std::optional<unsigned> MaskPos, |
1164 | InstructionCost Cost) { |
1165 | assert(!VF.isScalar() && "Expected vector VF" ); |
1166 | CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost}; |
1167 | } |
1168 | |
1169 | CallWideningDecision getCallWideningDecision(CallInst *CI, |
1170 | ElementCount VF) const { |
1171 | assert(!VF.isScalar() && "Expected vector VF" ); |
1172 | return CallWideningDecisions.at(Val: {CI, VF}); |
1173 | } |
1174 | |
1175 | /// Return True if instruction \p I is an optimizable truncate whose operand |
1176 | /// is an induction variable. Such a truncate will be removed by adding a new |
1177 | /// induction variable with the destination type. |
1178 | bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
1179 | // If the instruction is not a truncate, return false. |
1180 | auto *Trunc = dyn_cast<TruncInst>(Val: I); |
1181 | if (!Trunc) |
1182 | return false; |
1183 | |
1184 | // Get the source and destination types of the truncate. |
1185 | Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF); |
1186 | Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF); |
1187 | |
1188 | // If the truncate is free for the given types, return false. Replacing a |
1189 | // free truncate with an induction variable would add an induction variable |
1190 | // update instruction to each iteration of the loop. We exclude from this |
1191 | // check the primary induction variable since it will need an update |
1192 | // instruction regardless. |
1193 | Value *Op = Trunc->getOperand(i_nocapture: 0); |
1194 | if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy)) |
1195 | return false; |
1196 | |
1197 | // If the truncated value is not an induction variable, return false. |
1198 | return Legal->isInductionPhi(V: Op); |
1199 | } |
1200 | |
1201 | /// Collects the instructions to scalarize for each predicated instruction in |
1202 | /// the loop. |
1203 | void collectInstsToScalarize(ElementCount VF); |
1204 | |
1205 | /// Collect values that will not be widened, including Uniforms, Scalars, and |
1206 | /// Instructions to Scalarize for the given \p VF. |
1207 | /// The sets depend on CM decision for Load/Store instructions |
1208 | /// that may be vectorized as interleave, gather-scatter or scalarized. |
1209 | /// Also make a decision on what to do about call instructions in the loop |
1210 | /// at that VF -- scalarize, call a known vector routine, or call a |
1211 | /// vector intrinsic. |
1212 | void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) { |
1213 | // Do the analysis once. |
1214 | if (VF.isScalar() || Uniforms.contains(Val: VF)) |
1215 | return; |
1216 | setCostBasedWideningDecision(VF); |
1217 | collectLoopUniforms(VF); |
1218 | setVectorizedCallDecision(VF); |
1219 | collectLoopScalars(VF); |
1220 | collectInstsToScalarize(VF); |
1221 | } |
1222 | |
1223 | /// Returns true if the target machine supports masked store operation |
1224 | /// for the given \p DataType and kind of access to \p Ptr. |
1225 | bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, |
1226 | unsigned AddressSpace) const { |
1227 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1228 | TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace); |
1229 | } |
1230 | |
1231 | /// Returns true if the target machine supports masked load operation |
1232 | /// for the given \p DataType and kind of access to \p Ptr. |
1233 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, |
1234 | unsigned AddressSpace) const { |
1235 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1236 | TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace); |
1237 | } |
1238 | |
1239 | /// Returns true if the target machine can represent \p V as a masked gather |
1240 | /// or scatter operation. |
1241 | bool isLegalGatherOrScatter(Value *V, ElementCount VF) { |
1242 | bool LI = isa<LoadInst>(Val: V); |
1243 | bool SI = isa<StoreInst>(Val: V); |
1244 | if (!LI && !SI) |
1245 | return false; |
1246 | auto *Ty = getLoadStoreType(I: V); |
1247 | Align Align = getLoadStoreAlignment(I: V); |
1248 | if (VF.isVector()) |
1249 | Ty = VectorType::get(ElementType: Ty, EC: VF); |
1250 | return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) || |
1251 | (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align)); |
1252 | } |
1253 | |
1254 | /// Returns true if the target machine supports all of the reduction |
1255 | /// variables found for the given VF. |
1256 | bool canVectorizeReductions(ElementCount VF) const { |
1257 | return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
1258 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
1259 | return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
1260 | })); |
1261 | } |
1262 | |
1263 | /// Given costs for both strategies, return true if the scalar predication |
1264 | /// lowering should be used for div/rem. This incorporates an override |
1265 | /// option so it is not simply a cost comparison. |
1266 | bool isDivRemScalarWithPredication(InstructionCost ScalarCost, |
1267 | InstructionCost SafeDivisorCost) const { |
1268 | switch (ForceSafeDivisor) { |
1269 | case cl::BOU_UNSET: |
1270 | return ScalarCost < SafeDivisorCost; |
1271 | case cl::BOU_TRUE: |
1272 | return false; |
1273 | case cl::BOU_FALSE: |
1274 | return true; |
1275 | } |
1276 | llvm_unreachable("impossible case value" ); |
1277 | } |
1278 | |
1279 | /// Returns true if \p I is an instruction which requires predication and |
1280 | /// for which our chosen predication strategy is scalarization (i.e. we |
1281 | /// don't have an alternate strategy such as masking available). |
1282 | /// \p VF is the vectorization factor that will be used to vectorize \p I. |
1283 | bool isScalarWithPredication(Instruction *I, ElementCount VF) const; |
1284 | |
1285 | /// Returns true if \p I is an instruction that needs to be predicated |
1286 | /// at runtime. The result is independent of the predication mechanism. |
1287 | /// Superset of instructions that return true for isScalarWithPredication. |
1288 | bool isPredicatedInst(Instruction *I) const; |
1289 | |
1290 | /// Return the costs for our two available strategies for lowering a |
1291 | /// div/rem operation which requires speculating at least one lane. |
1292 | /// First result is for scalarization (will be invalid for scalable |
1293 | /// vectors); second is for the safe-divisor strategy. |
1294 | std::pair<InstructionCost, InstructionCost> |
1295 | getDivRemSpeculationCost(Instruction *I, |
1296 | ElementCount VF) const; |
1297 | |
1298 | /// Returns true if \p I is a memory instruction with consecutive memory |
1299 | /// access that can be widened. |
1300 | bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); |
1301 | |
1302 | /// Returns true if \p I is a memory instruction in an interleaved-group |
1303 | /// of memory accesses that can be vectorized with wide vector loads/stores |
1304 | /// and shuffles. |
1305 | bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const; |
1306 | |
1307 | /// Check if \p Instr belongs to any interleaved access group. |
1308 | bool isAccessInterleaved(Instruction *Instr) const { |
1309 | return InterleaveInfo.isInterleaved(Instr); |
1310 | } |
1311 | |
1312 | /// Get the interleaved access group that \p Instr belongs to. |
1313 | const InterleaveGroup<Instruction> * |
1314 | getInterleavedAccessGroup(Instruction *Instr) const { |
1315 | return InterleaveInfo.getInterleaveGroup(Instr); |
1316 | } |
1317 | |
1318 | /// Returns true if we're required to use a scalar epilogue for at least |
1319 | /// the final iteration of the original loop. |
1320 | bool requiresScalarEpilogue(bool IsVectorizing) const { |
1321 | if (!isScalarEpilogueAllowed()) { |
1322 | LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n" ); |
1323 | return false; |
1324 | } |
1325 | // If we might exit from anywhere but the latch and early exit vectorization |
1326 | // is disabled, we must run the exiting iteration in scalar form. |
1327 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && |
1328 | !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) { |
1329 | LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting " |
1330 | "from latch block\n" ); |
1331 | return true; |
1332 | } |
1333 | if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) { |
1334 | LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: " |
1335 | "interleaved group requires scalar epilogue\n" ); |
1336 | return true; |
1337 | } |
1338 | LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n" ); |
1339 | return false; |
1340 | } |
1341 | |
1342 | /// Returns true if a scalar epilogue is allowed, i.e. not disallowed due to |
1343 | /// optsize or a loop hint annotation. |
1344 | bool isScalarEpilogueAllowed() const { |
1345 | return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; |
1346 | } |
1347 | |
1348 | /// Returns the TailFoldingStyle that is best for the current loop. |
1349 | TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { |
1350 | if (!ChosenTailFoldingStyle) |
1351 | return TailFoldingStyle::None; |
1352 | return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first |
1353 | : ChosenTailFoldingStyle->second; |
1354 | } |
1355 | |
1356 | /// Selects and saves the TailFoldingStyle for two cases: whether or not |
1357 | /// the IV update may overflow. |
1358 | /// \param IsScalableVF true if scalable vector factors enabled. |
1359 | /// \param UserIC User specific interleave count. |
1360 | void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { |
1361 | assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet." ); |
1362 | if (!Legal->canFoldTailByMasking()) { |
1363 | ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None}; |
1364 | return; |
1365 | } |
1366 | |
1367 | // Default to TTI preference, but allow command line override. |
1368 | ChosenTailFoldingStyle = { |
1369 | TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), |
1370 | TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)}; |
1371 | if (ForceTailFoldingStyle.getNumOccurrences()) |
1372 | ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(), |
1373 | ForceTailFoldingStyle.getValue()}; |
1374 | |
1375 | if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) |
1376 | return; |
1377 | // Override forced styles if needed. |
1378 | // FIXME: Investigate opportunity for fixed vector factor. |
1379 | bool EVLIsLegal = UserIC <= 1 && IsScalableVF && |
1380 | TTI.hasActiveVectorLength() && !EnableVPlanNativePath; |
1381 | if (EVLIsLegal) |
1382 | return; |
1383 | // If for some reason EVL mode is unsupported, fallback to |
1384 | // DataWithoutLaneMask to try to vectorize the loop with folded tail |
1385 | // in a generic way. |
1386 | ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask, |
1387 | TailFoldingStyle::DataWithoutLaneMask}; |
1388 | LLVM_DEBUG( |
1389 | dbgs() << "LV: Preference for VP intrinsics indicated. Will " |
1390 | "not try to generate VP Intrinsics " |
1391 | << (UserIC > 1 |
1392 | ? "since interleave count specified is greater than 1.\n" |
1393 | : "due to non-interleaving reasons.\n" )); |
1394 | } |
1395 | |
1396 | /// Returns true if all loop blocks should be masked to fold tail loop. |
1397 | bool foldTailByMasking() const { |
1398 | // TODO: check if it is possible to check for None style independent of |
1399 | // IVUpdateMayOverflow flag in getTailFoldingStyle. |
1400 | return getTailFoldingStyle() != TailFoldingStyle::None; |
1401 | } |
1402 | |
1403 | /// Return maximum safe number of elements to be processed per vector |
1404 | /// iteration, which do not prevent store-load forwarding and are safe with |
1405 | /// regard to the memory dependencies. Required for EVL-based VPlans to |
1406 | /// correctly calculate AVL (application vector length) as min(remaining AVL, |
1407 | /// MaxSafeElements). |
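| /// For example (illustrative): with MaxSafeElements = 32, an EVL-based plan |
| /// computes AVL = min(remaining trip count, 32) on each vector iteration. |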
1408 | /// TODO: need to consider adjusting cost model to use this value as a |
1409 | /// vectorization factor for EVL-based vectorization. |
1410 | std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; } |
1411 | |
1412 | /// Returns true if the instructions in this block require predication |
1413 | /// for any reason, e.g. because tail folding now requires a predicate |
1414 | /// or because the block in the original loop was predicated. |
1415 | bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { |
1416 | return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
1417 | } |
1418 | |
1419 | /// Returns true if VP intrinsics with explicit vector length support should |
1420 | /// be generated in the tail folded loop. |
1421 | bool foldTailWithEVL() const { |
1422 | return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL; |
1423 | } |
1424 | |
1425 | /// Returns true if the Phi is part of an inloop reduction. |
1426 | bool isInLoopReduction(PHINode *Phi) const { |
1427 | return InLoopReductions.contains(Ptr: Phi); |
1428 | } |
1429 | |
1430 | /// Returns true if the predicated reduction select should be used to set the |
1431 | /// incoming value for the reduction phi. |
1432 | bool usePredicatedReductionSelect() const { |
1433 | // Force to use predicated reduction select since the EVL of the |
1434 | // second-to-last iteration might not be VF*UF. |
1435 | if (foldTailWithEVL()) |
1436 | return true; |
1437 | return PreferPredicatedReductionSelect || |
1438 | TTI.preferPredicatedReductionSelect(); |
1439 | } |
1440 | |
1441 | /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
1442 | /// with factor VF. Return the cost of the instruction, including |
1443 | /// scalarization overhead if it's needed. |
1444 | InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
1445 | |
1446 | /// Estimate cost of a call instruction CI if it were vectorized with factor |
1447 | /// VF. Return the cost of the instruction, including scalarization overhead |
1448 | /// if it's needed. |
1449 | InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; |
1450 | |
1451 | /// Invalidates decisions already taken by the cost model. |
1452 | void invalidateCostModelingDecisions() { |
1453 | WideningDecisions.clear(); |
1454 | CallWideningDecisions.clear(); |
1455 | Uniforms.clear(); |
1456 | Scalars.clear(); |
1457 | } |
1458 | |
1459 | /// Returns the expected execution cost. The unit of the cost does |
1460 | /// not matter because we use the 'cost' units to compare different |
1461 | /// vector widths. The cost that is returned is *not* normalized by |
1462 | /// the factor width. |
1463 | InstructionCost expectedCost(ElementCount VF); |
1464 | |
1465 | bool hasPredStores() const { return NumPredStores > 0; } |
1466 | |
1467 | /// Returns true if epilogue vectorization is considered profitable, and |
1468 | /// false otherwise. |
1469 | /// \p VF is the vectorization factor chosen for the original loop. |
1470 | /// \p IC is the interleave count, an additional scaling factor applied to VF |
1471 | /// before comparing to EpilogueVectorizationMinVF. |
1472 | bool isEpilogueVectorizationProfitable(const ElementCount VF, |
1473 | const unsigned IC) const; |
1474 | |
1475 | /// Returns the execution time cost of an instruction for a given vector |
1476 | /// width. Vector width of one means scalar. |
1477 | InstructionCost getInstructionCost(Instruction *I, ElementCount VF); |
1478 | |
1479 | /// Return the cost of instructions in an inloop reduction pattern, if I is |
1480 | /// part of that pattern. |
1481 | std::optional<InstructionCost> getReductionPatternCost(Instruction *I, |
1482 | ElementCount VF, |
1483 | Type *VectorTy) const; |
1484 | |
1485 | /// Returns true if \p Op should be considered invariant and if it is |
1486 | /// trivially hoistable. |
1487 | bool shouldConsiderInvariant(Value *Op); |
1488 | |
1489 | /// Return the value of vscale used for tuning the cost model. |
1490 | std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; } |
1491 | |
1492 | private: |
1493 | unsigned NumPredStores = 0; |
1494 | |
1495 | /// Used to store the value of vscale used for tuning the cost model. It is |
1496 | /// initialized during object construction. |
1497 | std::optional<unsigned> VScaleForTuning; |
1498 | |
1499 | /// Initializes the value of vscale used for tuning the cost model. If |
1500 | /// vscale_range.min == vscale_range.max then return vscale_range.max, else |
1501 | /// return the value returned by the corresponding TTI method. |
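| /// For example (illustrative): a function with vscale_range(2,2) yields |
| /// VScaleForTuning = 2; with vscale_range(1,16) the TTI hook is used instead. |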
1502 | void initializeVScaleForTuning() { |
1503 | const Function *Fn = TheLoop->getHeader()->getParent(); |
1504 | if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) { |
1505 | auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange); |
1506 | auto Min = Attr.getVScaleRangeMin(); |
1507 | auto Max = Attr.getVScaleRangeMax(); |
1508 | if (Max && Min == Max) { |
1509 | VScaleForTuning = Max; |
1510 | return; |
1511 | } |
1512 | } |
1513 | |
1514 | VScaleForTuning = TTI.getVScaleForTuning(); |
1515 | } |
1516 | |
1517 | /// \return An upper bound for the vectorization factors for both |
1518 | /// fixed and scalable vectorization, where the minimum-known number of |
1519 | /// elements is a power-of-2 larger than zero. If scalable vectorization is |
1520 | /// disabled or unsupported, then the scalable part will be equal to |
1521 | /// ElementCount::getScalable(0). |
1522 | FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, |
1523 | ElementCount UserVF, |
1524 | bool FoldTailByMasking); |
1525 | |
1526 | /// \return the maximized element count based on the target's vector |
1527 | /// registers and the loop trip-count, but limited to a maximum safe VF. |
1528 | /// This is a helper function of computeFeasibleMaxVF. |
1529 | ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, |
1530 | unsigned SmallestType, |
1531 | unsigned WidestType, |
1532 | ElementCount MaxSafeVF, |
1533 | bool FoldTailByMasking); |
1534 | |
1535 | /// Checks if scalable vectorization is supported and enabled. Caches the |
1536 | /// result to avoid repeated debug dumps for repeated queries. |
1537 | bool isScalableVectorizationAllowed(); |
1538 | |
1539 | /// \return the maximum legal scalable VF, based on the safe max number |
1540 | /// of elements. |
1541 | ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); |
1542 | |
1543 | /// Calculate vectorization cost of memory instruction \p I. |
1544 | InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
1545 | |
1546 | /// The cost computation for a scalarized memory instruction. |
1547 | InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
1548 | |
1549 | /// The cost computation for an interleave group of memory instructions. |
1550 | InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
1551 | |
1552 | /// The cost computation for a Gather/Scatter instruction. |
1553 | InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
1554 | |
1555 | /// The cost computation for widening instruction \p I with consecutive |
1556 | /// memory access. |
1557 | InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
1558 | |
1559 | /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
1560 | /// Load: scalar load + broadcast. |
1561 | /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
1562 | /// element) |
1563 | InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
1564 | |
1565 | /// Estimate the overhead of scalarizing an instruction. This is a |
1566 | /// convenience wrapper for the type-based getScalarizationOverhead API. |
1567 | InstructionCost getScalarizationOverhead(Instruction *I, |
1568 | ElementCount VF) const; |
1569 | |
1570 | /// Returns true if an artificially high cost for emulated masked memrefs |
1571 | /// should be used. |
1572 | bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); |
1573 | |
1574 | /// Map of scalar integer values to the smallest bitwidth they can be legally |
1575 | /// represented as. The vector equivalents of these values should be truncated |
1576 | /// to this type. |
1577 | MapVector<Instruction *, uint64_t> MinBWs; |
1578 | |
1579 | /// A type representing the costs for instructions if they were to be |
1580 | /// scalarized rather than vectorized. The entries are Instruction-Cost |
1581 | /// pairs. |
1582 | using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
1583 | |
1584 | /// A set containing all BasicBlocks that are known to be present after |
1585 | /// vectorization as a predicated block, collected per VF. |
1586 | DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> |
1587 | PredicatedBBsAfterVectorization; |
1588 | |
1589 | /// Records whether it is allowed to have the original scalar loop execute at |
1590 | /// least once. This may be needed as a fallback loop in case runtime |
1591 | /// aliasing/dependence checks fail, or to handle the tail/remainder |
1592 | /// iterations when the trip count is unknown or not a multiple of the VF, |
1593 | /// or as a peel-loop to handle gaps in interleave-groups. |
1594 | /// Under optsize and when the trip count is very small we don't allow any |
1595 | /// iterations to execute in the scalar loop. |
1596 | ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
1597 | |
1598 | /// Control finally chosen tail folding style. The first element is used if |
1599 | /// the IV update may overflow, the second element - if it does not. |
1600 | std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> |
1601 | ChosenTailFoldingStyle; |
1602 | |
1603 | /// true if scalable vectorization is supported and enabled. |
1604 | std::optional<bool> IsScalableVectorizationAllowed; |
1605 | |
1606 | /// Maximum safe number of elements to be processed per vector iteration, |
1607 | /// which do not prevent store-load forwarding and are safe with regard to the |
1608 | /// memory dependencies. Required for EVL-based vectorization, where this |
1609 | /// value is used as the upper bound of the safe AVL. |
1610 | std::optional<unsigned> MaxSafeElements; |
1611 | |
1612 | /// A map holding scalar costs for different vectorization factors. The |
1613 | /// presence of a cost for an instruction in the mapping indicates that the |
1614 | /// instruction will be scalarized when vectorizing with the associated |
1615 | /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
1616 | DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
1617 | |
1618 | /// Holds the instructions known to be uniform after vectorization. |
1619 | /// The data is collected per VF. |
1620 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
1621 | |
1622 | /// Holds the instructions known to be scalar after vectorization. |
1623 | /// The data is collected per VF. |
1624 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
1625 | |
1626 | /// Holds the instructions (address computations) that are forced to be |
1627 | /// scalarized. |
1628 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
1629 | |
1630 | /// PHINodes of the reductions that should be expanded in-loop. |
1631 | SmallPtrSet<PHINode *, 4> InLoopReductions; |
1632 | |
1633 | /// A Map of inloop reduction operations and their immediate chain operand. |
1634 | /// FIXME: This can be removed once reductions can be costed correctly in |
1635 | /// VPlan. This was added to allow quick lookup of the inloop operations. |
1636 | DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
1637 | |
1638 | /// Returns the expected difference in cost from scalarizing the expression |
1639 | /// feeding a predicated instruction \p PredInst. The instructions to |
1640 | /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
1641 | /// non-negative return value implies the expression will be scalarized. |
1642 | /// Currently, only single-use chains are considered for scalarization. |
1643 | InstructionCost computePredInstDiscount(Instruction *PredInst, |
1644 | ScalarCostsTy &ScalarCosts, |
1645 | ElementCount VF); |
1646 | |
1647 | /// Collect the instructions that are uniform after vectorization. An |
1648 | /// instruction is uniform if we represent it with a single scalar value in |
1649 | /// the vectorized loop corresponding to each vector iteration. Examples of |
1650 | /// uniform instructions include pointer operands of consecutive or |
1651 | /// interleaved memory accesses. Note that although uniformity implies an |
1652 | /// instruction will be scalar, the reverse is not true. In general, a |
1653 | /// scalarized instruction will be represented by VF scalar values in the |
1654 | /// vectorized loop, each corresponding to an iteration of the original |
1655 | /// scalar loop. |
1656 | void collectLoopUniforms(ElementCount VF); |
1657 | |
1658 | /// Collect the instructions that are scalar after vectorization. An |
1659 | /// instruction is scalar if it is known to be uniform or will be scalarized |
1660 | /// during vectorization. collectLoopScalars should only add non-uniform nodes |
1661 | /// to the list if they are used by a load/store instruction that is marked as |
1662 | /// CM_Scalarize. Non-uniform scalarized instructions will be represented by |
1663 | /// VF values in the vectorized loop, each corresponding to an iteration of |
1664 | /// the original scalar loop. |
1665 | void collectLoopScalars(ElementCount VF); |
1666 | |
1667 | /// Keeps cost model vectorization decision and cost for instructions. |
1668 | /// Right now it is used for memory instructions only. |
1669 | using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
1670 | std::pair<InstWidening, InstructionCost>>; |
1671 | |
1672 | DecisionList WideningDecisions; |
1673 | |
1674 | using CallDecisionList = |
1675 | DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; |
1676 | |
1677 | CallDecisionList CallWideningDecisions; |
1678 | |
1679 | /// Returns true if \p V is expected to be vectorized and it needs to be |
1680 | /// extracted. |
1681 | bool needsExtract(Value *V, ElementCount VF) const { |
1682 | Instruction *I = dyn_cast<Instruction>(Val: V); |
1683 | if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) || |
1684 | TheLoop->isLoopInvariant(V: I) || |
1685 | getWideningDecision(I, VF) == CM_Scalarize) |
1686 | return false; |
1687 | |
1688 | // Assume we can vectorize V (and hence we need extraction) if the |
1689 | // scalars are not computed yet. This can happen, because it is called |
1690 | // via getScalarizationOverhead from setCostBasedWideningDecision, before |
1691 | // the scalars are collected. That should be a safe assumption in most |
1692 | // cases, because we check if the operands have vectorizable types |
1693 | // beforehand in LoopVectorizationLegality. |
1694 | return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF); |
1695 | }; |
1696 | |
1697 | /// Returns a range containing only operands needing to be extracted. |
1698 | SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
1699 | ElementCount VF) const { |
1700 | return SmallVector<Value *, 4>(make_filter_range( |
1701 | Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
1702 | } |
1703 | |
1704 | public: |
1705 | /// The loop that we evaluate. |
1706 | Loop *TheLoop; |
1707 | |
1708 | /// Predicated scalar evolution analysis. |
1709 | PredicatedScalarEvolution &PSE; |
1710 | |
1711 | /// Loop Info analysis. |
1712 | LoopInfo *LI; |
1713 | |
1714 | /// Vectorization legality. |
1715 | LoopVectorizationLegality *Legal; |
1716 | |
1717 | /// Vector target information. |
1718 | const TargetTransformInfo &TTI; |
1719 | |
1720 | /// Target Library Info. |
1721 | const TargetLibraryInfo *TLI; |
1722 | |
1723 | /// Demanded bits analysis. |
1724 | DemandedBits *DB; |
1725 | |
1726 | /// Assumption cache. |
1727 | AssumptionCache *AC; |
1728 | |
1729 | /// Interface to emit optimization remarks. |
1730 | OptimizationRemarkEmitter *ORE; |
1731 | |
1732 | const Function *TheFunction; |
1733 | |
1734 | /// Loop Vectorize Hint. |
1735 | const LoopVectorizeHints *Hints; |
1736 | |
1737 | /// The interleave access information contains groups of interleaved accesses |
1738 | /// with the same stride and close to each other. |
1739 | InterleavedAccessInfo &InterleaveInfo; |
1740 | |
1741 | /// Values to ignore in the cost model. |
1742 | SmallPtrSet<const Value *, 16> ValuesToIgnore; |
1743 | |
1744 | /// Values to ignore in the cost model when VF > 1. |
1745 | SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
1746 | |
1747 | /// All element types found in the loop. |
1748 | SmallPtrSet<Type *, 16> ElementTypesInLoop; |
1749 | |
1750 | /// The kind of cost that we are calculating |
1751 | TTI::TargetCostKind CostKind; |
1752 | |
1753 | /// Whether this loop should be optimized for size based on function attribute |
1754 | /// or profile information. |
1755 | bool OptForSize; |
1756 | }; |
1757 | } // end namespace llvm |
1758 | |
1759 | namespace { |
1760 | /// Helper struct to manage generating runtime checks for vectorization. |
1761 | /// |
1762 | /// The runtime checks are created up-front in temporary blocks, un-linked from |
1763 | /// the existing IR, to allow better estimation of their cost. After deciding to |
1764 | /// vectorize, the checks are moved back. If deciding not to vectorize, the |
1765 | /// temporary blocks are completely removed. |
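| /// Typical usage (illustrative sketch, based on the members declared below): |
| /// call create() to expand the checks into temporary blocks, query getCost() |
| /// when deciding whether to vectorize, then emitSCEVChecks() and |
| /// emitMemRuntimeChecks() to wire the blocks into the final CFG; the |
| /// destructor erases any blocks that were never hooked up. |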
1766 | class GeneratedRTChecks { |
1767 | /// Basic block which contains the generated SCEV checks, if any. |
1768 | BasicBlock *SCEVCheckBlock = nullptr; |
1769 | |
1770 | /// The value representing the result of the generated SCEV checks. If it is |
1771 | /// nullptr no SCEV checks have been generated. |
1772 | Value *SCEVCheckCond = nullptr; |
1773 | |
1774 | /// Basic block which contains the generated memory runtime checks, if any. |
1775 | BasicBlock *MemCheckBlock = nullptr; |
1776 | |
1777 | /// The value representing the result of the generated memory runtime checks. |
1778 | /// If it is nullptr no memory runtime checks have been generated. |
1779 | Value *MemRuntimeCheckCond = nullptr; |
1780 | |
1781 | /// True if any checks have been added. |
1782 | bool AddedAnyChecks = false; |
1783 | |
1784 | DominatorTree *DT; |
1785 | LoopInfo *LI; |
1786 | TargetTransformInfo *TTI; |
1787 | |
1788 | SCEVExpander SCEVExp; |
1789 | SCEVExpander MemCheckExp; |
1790 | |
1791 | bool CostTooHigh = false; |
1792 | const bool AddBranchWeights; |
1793 | |
1794 | Loop *OuterLoop = nullptr; |
1795 | |
1796 | PredicatedScalarEvolution &PSE; |
1797 | |
1798 | /// The kind of cost that we are calculating |
1799 | TTI::TargetCostKind CostKind; |
1800 | |
1801 | public: |
1802 | GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, |
1803 | LoopInfo *LI, TargetTransformInfo *TTI, |
1804 | const DataLayout &DL, bool AddBranchWeights, |
1805 | TTI::TargetCostKind CostKind) |
1806 | : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check" ), |
1807 | MemCheckExp(*PSE.getSE(), DL, "scev.check" ), |
1808 | AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {} |
1809 | |
1810 | /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
1811 | /// accurately estimate the cost of the runtime checks. The blocks are |
1812 | /// un-linked from the IR and are added back during vector code generation. If |
1813 | /// there is no vector code generation, the check blocks are removed |
1814 | /// completely. |
1815 | void create(Loop *L, const LoopAccessInfo &LAI, |
1816 | const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { |
1817 | |
1818 | // Hard cutoff to limit compile-time increase in case a very large number of |
1819 | // runtime checks needs to be generated. |
1820 | // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to |
1821 | // profile info. |
1822 | CostTooHigh = |
1823 | LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; |
1824 | if (CostTooHigh) |
1825 | return; |
1826 | |
1827 | BasicBlock *LoopHeader = L->getHeader(); |
1828 | BasicBlock *Preheader = L->getLoopPreheader(); |
1829 | |
1830 | // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
1831 | // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
1832 | // may be used by SCEVExpander. The blocks will be un-linked from their |
1833 | // predecessors and removed from LI & DT at the end of the function. |
1834 | if (!UnionPred.isAlwaysTrue()) { |
1835 | SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI, |
1836 | MSSAU: nullptr, BBName: "vector.scevcheck" ); |
1837 | |
1838 | SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
1839 | Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator()); |
1840 | if (isa<Constant>(Val: SCEVCheckCond)) { |
1841 | // Clean up directly after expanding the predicate to a constant, to |
1842 | // avoid further expansions re-using anything left over from SCEVExp. |
1843 | SCEVExpanderCleaner SCEVCleaner(SCEVExp); |
1844 | SCEVCleaner.cleanup(); |
1845 | } |
1846 | } |
1847 | |
1848 | const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
1849 | if (RtPtrChecking.Need) { |
1850 | auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
1851 | MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr, |
1852 | BBName: "vector.memcheck" ); |
1853 | |
1854 | auto DiffChecks = RtPtrChecking.getDiffChecks(); |
1855 | if (DiffChecks) { |
1856 | Value *RuntimeVF = nullptr; |
1857 | MemRuntimeCheckCond = addDiffRuntimeChecks( |
1858 | Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp, |
1859 | GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { |
1860 | if (!RuntimeVF) |
1861 | RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF); |
1862 | return RuntimeVF; |
1863 | }, |
1864 | IC); |
1865 | } else { |
1866 | MemRuntimeCheckCond = addRuntimeChecks( |
1867 | Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(), |
1868 | Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks); |
1869 | } |
1870 | assert(MemRuntimeCheckCond && |
1871 | "no RT checks generated although RtPtrChecking " |
1872 | "claimed checks are required" ); |
1873 | } |
1874 | |
1875 | if (!MemCheckBlock && !SCEVCheckBlock) |
1876 | return; |
1877 | |
1878 | // Unhook the temporary block with the checks, update various places |
1879 | // accordingly. |
1880 | if (SCEVCheckBlock) |
1881 | SCEVCheckBlock->replaceAllUsesWith(V: Preheader); |
1882 | if (MemCheckBlock) |
1883 | MemCheckBlock->replaceAllUsesWith(V: Preheader); |
1884 | |
1885 | if (SCEVCheckBlock) { |
1886 | SCEVCheckBlock->getTerminator()->moveBefore( |
1887 | InsertPos: Preheader->getTerminator()->getIterator()); |
1888 | auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
1889 | UI->setDebugLoc(DebugLoc::getTemporary()); |
1890 | Preheader->getTerminator()->eraseFromParent(); |
1891 | } |
1892 | if (MemCheckBlock) { |
1893 | MemCheckBlock->getTerminator()->moveBefore( |
1894 | InsertPos: Preheader->getTerminator()->getIterator()); |
1895 | auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
1896 | UI->setDebugLoc(DebugLoc::getTemporary()); |
1897 | Preheader->getTerminator()->eraseFromParent(); |
1898 | } |
1899 | |
1900 | DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader); |
1901 | if (MemCheckBlock) { |
1902 | DT->eraseNode(BB: MemCheckBlock); |
1903 | LI->removeBlock(BB: MemCheckBlock); |
1904 | } |
1905 | if (SCEVCheckBlock) { |
1906 | DT->eraseNode(BB: SCEVCheckBlock); |
1907 | LI->removeBlock(BB: SCEVCheckBlock); |
1908 | } |
1909 | |
1910 | // Outer loop is used as part of the later cost calculations. |
1911 | OuterLoop = L->getParentLoop(); |
1912 | } |
1913 | |
1914 | InstructionCost getCost() { |
1915 | if (SCEVCheckBlock || MemCheckBlock) |
1916 | LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n" ); |
1917 | |
1918 | if (CostTooHigh) { |
1919 | InstructionCost Cost; |
1920 | Cost.setInvalid(); |
1921 | LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n" ); |
1922 | return Cost; |
1923 | } |
1924 | |
1925 | InstructionCost RTCheckCost = 0; |
1926 | if (SCEVCheckBlock) |
1927 | for (Instruction &I : *SCEVCheckBlock) { |
1928 | if (SCEVCheckBlock->getTerminator() == &I) |
1929 | continue; |
1930 | InstructionCost C = TTI->getInstructionCost(U: &I, CostKind); |
1931 | LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" ); |
1932 | RTCheckCost += C; |
1933 | } |
1934 | if (MemCheckBlock) { |
1935 | InstructionCost MemCheckCost = 0; |
1936 | for (Instruction &I : *MemCheckBlock) { |
1937 | if (MemCheckBlock->getTerminator() == &I) |
1938 | continue; |
1939 | InstructionCost C = TTI->getInstructionCost(U: &I, CostKind); |
1940 | LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" ); |
1941 | MemCheckCost += C; |
1942 | } |
1943 | |
1944 | // If the runtime memory checks are being created inside an outer loop |
1945 | // we should find out if these checks are outer loop invariant. If so, |
1946 | // the checks will likely be hoisted out and so the effective cost will |
1947 | // reduce according to the outer loop trip count. |
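| // Illustrative example (not from the original source): with MemCheckCost = 20 |
| // and an estimated outer-loop trip count of 4, the effective cost becomes |
| // 20 / 4 = 5 (clamped to a minimum of 1 below). |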
1948 | if (OuterLoop) { |
1949 | ScalarEvolution *SE = MemCheckExp.getSE(); |
1950 | // TODO: If profitable, we could refine this further by analysing every |
1951 | // individual memory check, since there could be a mixture of loop |
1952 | // variant and invariant checks that mean the final condition is |
1953 | // variant. |
1954 | const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond); |
1955 | if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) { |
1956 | // It seems reasonable to assume that we can reduce the effective |
1957 | // cost of the checks even when we know nothing about the trip |
1958 | // count. Assume that the outer loop executes at least twice. |
1959 | unsigned BestTripCount = 2; |
1960 | |
1961 | // Get the best known TC estimate. |
1962 | if (auto EstimatedTC = getSmallBestKnownTC( |
1963 | PSE, L: OuterLoop, /* CanUseConstantMax = */ false)) |
1964 | if (EstimatedTC->isFixed()) |
1965 | BestTripCount = EstimatedTC->getFixedValue(); |
1966 | |
1967 | InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount; |
1968 | |
1969 | // Let's ensure the cost is always at least 1. |
1970 | NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(), |
1971 | b: (InstructionCost::CostType)1); |
1972 | |
1973 | if (BestTripCount > 1) |
1974 | LLVM_DEBUG(dbgs() |
1975 | << "We expect runtime memory checks to be hoisted " |
1976 | << "out of the outer loop. Cost reduced from " |
1977 | << MemCheckCost << " to " << NewMemCheckCost << '\n'); |
1978 | |
1979 | MemCheckCost = NewMemCheckCost; |
1980 | } |
1981 | } |
1982 | |
1983 | RTCheckCost += MemCheckCost; |
1984 | } |
1985 | |
1986 | if (SCEVCheckBlock || MemCheckBlock) |
1987 | LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost |
1988 | << "\n" ); |
1989 | |
1990 | return RTCheckCost; |
1991 | } |
1992 | |
1993 | /// Remove the created SCEV & memory runtime check blocks & instructions, if |
1994 | /// unused. |
1995 | ~GeneratedRTChecks() { |
1996 | SCEVExpanderCleaner SCEVCleaner(SCEVExp); |
1997 | SCEVExpanderCleaner MemCheckCleaner(MemCheckExp); |
1998 | bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock); |
1999 | bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock); |
2000 | if (SCEVChecksUsed) |
2001 | SCEVCleaner.markResultUsed(); |
2002 | |
2003 | if (MemChecksUsed) { |
2004 | MemCheckCleaner.markResultUsed(); |
2005 | } else { |
2006 | auto &SE = *MemCheckExp.getSE(); |
2007 | // Memory runtime check generation creates compares that use expanded |
2008 | // values. Remove them before running the SCEVExpanderCleaners. |
2009 | for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) { |
2010 | if (MemCheckExp.isInsertedInstruction(I: &I)) |
2011 | continue; |
2012 | SE.forgetValue(V: &I); |
2013 | I.eraseFromParent(); |
2014 | } |
2015 | } |
2016 | MemCheckCleaner.cleanup(); |
2017 | SCEVCleaner.cleanup(); |
2018 | |
2019 | if (!SCEVChecksUsed) |
2020 | SCEVCheckBlock->eraseFromParent(); |
2021 | if (!MemChecksUsed) |
2022 | MemCheckBlock->eraseFromParent(); |
2023 | } |
2024 | |
2025 | /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and |
2026 | /// adjusts the branches to branch to the vector preheader or \p Bypass, |
2027 | /// depending on the generated condition. |
2028 | BasicBlock *emitSCEVChecks(BasicBlock *Bypass, |
2029 | BasicBlock *LoopVectorPreHeader) { |
2030 | using namespace llvm::PatternMatch; |
2031 | if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt())) |
2032 | return nullptr; |
2033 | |
2034 | auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
2035 | BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertBefore: SCEVCheckBlock); |
2036 | |
2037 | SCEVCheckBlock->getTerminator()->eraseFromParent(); |
2038 | SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader); |
2039 | Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader, |
2040 | NewBB: SCEVCheckBlock); |
2041 | |
2042 | BranchInst &BI = |
2043 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: SCEVCheckCond); |
2044 | if (AddBranchWeights) |
2045 | setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights, /*IsExpected=*/false); |
2046 | ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI); |
2047 | AddedAnyChecks = true; |
2048 | return SCEVCheckBlock; |
2049 | } |
2050 | |
2051 | /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts |
2052 | /// the branches to branch to the vector preheader or \p Bypass, depending on |
2053 | /// the generated condition. |
2054 | BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass, |
2055 | BasicBlock *LoopVectorPreHeader) { |
2056 | // Check if we generated code that checks in runtime if arrays overlap. |
2057 | if (!MemRuntimeCheckCond) |
2058 | return nullptr; |
2059 | |
2060 | auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); |
2061 | Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader, |
2062 | NewBB: MemCheckBlock); |
2063 | |
2064 | MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader); |
2065 | |
2066 | BranchInst &BI = |
2067 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond); |
2068 | if (AddBranchWeights) { |
2069 | setBranchWeights(I&: BI, Weights: MemCheckBypassWeights, /*IsExpected=*/false); |
2070 | } |
2071 | ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI); |
2072 | MemCheckBlock->getTerminator()->setDebugLoc( |
2073 | Pred->getTerminator()->getDebugLoc()); |
2074 | |
2075 | AddedAnyChecks = true; |
2076 | return MemCheckBlock; |
2077 | } |
2078 | |
2079 | /// Return true if any runtime checks have been added. |
2080 | bool hasChecks() const { return AddedAnyChecks; } |
2081 | }; |
2082 | } // namespace |
2083 | |
2084 | static bool useActiveLaneMask(TailFoldingStyle Style) { |
2085 | return Style == TailFoldingStyle::Data || |
2086 | Style == TailFoldingStyle::DataAndControlFlow || |
2087 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2088 | } |
2089 | |
2090 | static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { |
2091 | return Style == TailFoldingStyle::DataAndControlFlow || |
2092 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2093 | } |
2094 | |
2095 | // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
2096 | // vectorization. The loop needs to be annotated with #pragma omp simd |
2097 | // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the |
2098 | // vector length information is not provided, vectorization is not considered |
2099 | // explicit. Interleave hints are not allowed either. These limitations will be |
2100 | // relaxed in the future. |
2101 | // Please note that we are currently forced to abuse the pragma 'clang |
2102 | // vectorize' semantics. This pragma provides *auto-vectorization hints* |
2103 | // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
2104 | // provides *explicit vectorization hints* (LV can bypass legal checks and |
2105 | // assume that vectorization is legal). However, both hints are implemented |
2106 | // using the same metadata (llvm.loop.vectorize, processed by |
2107 | // LoopVectorizeHints). This will be fixed in the future when the native IR |
2108 | // representation for pragma 'omp simd' is introduced. |
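| // An annotation that satisfies the above (illustrative example): |
| //   #pragma clang loop vectorize(enable) vectorize_width(4) |
| // placed on the outer loop, with no interleave hint. |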
2109 | static bool isExplicitVecOuterLoop(Loop *OuterLp, |
2110 | OptimizationRemarkEmitter *ORE) { |
2111 | assert(!OuterLp->isInnermost() && "This is not an outer loop" ); |
2112 | LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
2113 | |
2114 | // Only outer loops with an explicit vectorization hint are supported. |
2115 | // Unannotated outer loops are ignored. |
2116 | if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
2117 | return false; |
2118 | |
2119 | Function *Fn = OuterLp->getHeader()->getParent(); |
2120 | if (!Hints.allowVectorization(F: Fn, L: OuterLp, |
2121 | VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) { |
2122 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n" ); |
2123 | return false; |
2124 | } |
2125 | |
2126 | if (Hints.getInterleave() > 1) { |
2127 | // TODO: Interleave support is future work. |
2128 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
2129 | "outer loops.\n" ); |
2130 | Hints.emitRemarkWithHints(); |
2131 | return false; |
2132 | } |
2133 | |
2134 | return true; |
2135 | } |
2136 | |
2137 | static void collectSupportedLoops(Loop &L, LoopInfo *LI, |
2138 | OptimizationRemarkEmitter *ORE, |
2139 | SmallVectorImpl<Loop *> &V) { |
2140 | // Collect inner loops and outer loops without irreducible control flow. For |
2141 | // now, only collect outer loops that have explicit vectorization hints. If we |
2142 | // are stress testing the VPlan H-CFG construction, we collect the outermost |
2143 | // loop of every loop nest. |
2144 | if (L.isInnermost() || VPlanBuildStressTest || |
2145 | (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) { |
2146 | LoopBlocksRPO RPOT(&L); |
2147 | RPOT.perform(LI); |
2148 | if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) { |
2149 | V.push_back(Elt: &L); |
2150 | // TODO: Collect inner loops inside marked outer loops in case |
2151 | // vectorization fails for the outer loop. Do not invoke |
2152 | // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
2153 | // already known to be reducible. We can use an inherited attribute for |
2154 | // that. |
2155 | return; |
2156 | } |
2157 | } |
2158 | for (Loop *InnerL : L) |
2159 | collectSupportedLoops(L&: *InnerL, LI, ORE, V); |
2160 | } |
2161 | |
2162 | //===----------------------------------------------------------------------===// |
2163 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
2164 | // LoopVectorizationCostModel and LoopVectorizationPlanner. |
2165 | //===----------------------------------------------------------------------===// |
2166 | |
2167 | /// Compute the transformed value of Index at offset StartValue using step |
2168 | /// StepValue. |
2169 | /// For integer induction, returns StartValue + Index * StepValue. |
2170 | /// For pointer induction, returns StartValue[Index * StepValue]. |
2171 | /// FIXME: The newly created binary instructions should contain nsw/nuw |
2172 | /// flags, which can be found from the original scalar operations. |
2173 | static Value * |
2174 | emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, |
2175 | Value *Step, |
2176 | InductionDescriptor::InductionKind InductionKind, |
2177 | const BinaryOperator *InductionBinOp) { |
2178 | using namespace llvm::PatternMatch; |
2179 | Type *StepTy = Step->getType(); |
2180 | Value *CastedIndex = StepTy->isIntegerTy() |
2181 | ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy) |
2182 | : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy); |
2183 | if (CastedIndex != Index) { |
2184 | CastedIndex->setName(CastedIndex->getName() + ".cast" ); |
2185 | Index = CastedIndex; |
2186 | } |
2187 | |
2188 | // Note: the IR at this point is broken. We cannot use SE to create any new |
2189 | // SCEV and then expand it, hoping that SCEV's simplification will give us |
2190 | // more optimal code. Unfortunately, attempting to do so on invalid IR may |
2191 | // lead to various SCEV crashes. So all we can do is use the builder and rely |
2192 | // on InstCombine for future simplifications. Here we handle some trivial |
2193 | // cases only. |
2194 | auto CreateAdd = [&B](Value *X, Value *Y) { |
2195 | assert(X->getType() == Y->getType() && "Types don't match!" ); |
2196 | if (match(V: X, P: m_ZeroInt())) |
2197 | return Y; |
2198 | if (match(V: Y, P: m_ZeroInt())) |
2199 | return X; |
2200 | return B.CreateAdd(LHS: X, RHS: Y); |
2201 | }; |
2202 | |
2203 | // We allow X to be a vector type, in which case Y will potentially be |
2204 | // splatted into a vector with the same element count. |
2205 | auto CreateMul = [&B](Value *X, Value *Y) { |
2206 | assert(X->getType()->getScalarType() == Y->getType() && |
2207 | "Types don't match!" ); |
2208 | if (match(V: X, P: m_One())) |
2209 | return Y; |
2210 | if (match(V: Y, P: m_One())) |
2211 | return X; |
2212 | VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType()); |
2213 | if (XVTy && !isa<VectorType>(Val: Y->getType())) |
2214 | Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y); |
2215 | return B.CreateMul(LHS: X, RHS: Y); |
2216 | }; |
2217 | |
2218 | switch (InductionKind) { |
2219 | case InductionDescriptor::IK_IntInduction: { |
2220 | assert(!isa<VectorType>(Index->getType()) && |
2221 | "Vector indices not supported for integer inductions yet" ); |
2222 | assert(Index->getType() == StartValue->getType() && |
2223 | "Index type does not match StartValue type" ); |
2224 | if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne()) |
2225 | return B.CreateSub(LHS: StartValue, RHS: Index); |
2226 | auto *Offset = CreateMul(Index, Step); |
2227 | return CreateAdd(StartValue, Offset); |
2228 | } |
2229 | case InductionDescriptor::IK_PtrInduction: |
2230 | return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step)); |
2231 | case InductionDescriptor::IK_FpInduction: { |
2232 | assert(!isa<VectorType>(Index->getType()) && |
2233 | "Vector indices not supported for FP inductions yet" ); |
2234 | assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value" ); |
2235 | assert(InductionBinOp && |
2236 | (InductionBinOp->getOpcode() == Instruction::FAdd || |
2237 | InductionBinOp->getOpcode() == Instruction::FSub) && |
2238 | "Original bin op should be defined for FP induction" ); |
2239 | |
2240 | Value *MulExp = B.CreateFMul(L: Step, R: Index); |
2241 | return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp, |
2242 | Name: "induction" ); |
2243 | } |
2244 | case InductionDescriptor::IK_NoInduction: |
2245 | return nullptr; |
2246 | } |
2247 | llvm_unreachable("invalid enum" ); |
2248 | } |
2249 | |
2250 | static std::optional<unsigned> getMaxVScale(const Function &F, |
2251 | const TargetTransformInfo &TTI) { |
2252 | if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) |
2253 | return MaxVScale; |
2254 | |
2255 | if (F.hasFnAttribute(Kind: Attribute::VScaleRange)) |
2256 | return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax(); |
2257 | |
2258 | return std::nullopt; |
2259 | } |
2260 | |
2261 | /// For the given VF and UF and maximum trip count computed for the loop, return |
2262 | /// whether the induction variable might overflow in the vectorized loop. If not, |
2263 | /// then we know a runtime overflow check always evaluates to false and can be |
2264 | /// removed. |
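| /// Illustrative example: for an i8 induction type (max value 255), a known |
| /// max trip count of 200 and VF * UF = 16, 200 + 16 <= 255 so the overflow |
| /// check is known false and can be dropped; with VF * UF = 64 it cannot. |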
2265 | static bool isIndvarOverflowCheckKnownFalse( |
2266 | const LoopVectorizationCostModel *Cost, |
2267 | ElementCount VF, std::optional<unsigned> UF = std::nullopt) { |
2268 | // Always be conservative if we don't know the exact unroll factor. |
2269 | unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF); |
2270 | |
2271 | IntegerType *IdxTy = Cost->Legal->getWidestInductionType(); |
2272 | APInt MaxUIntTripCount = IdxTy->getMask(); |
2273 | |
2274 | // We know the runtime overflow check is known false iff the (max) trip-count |
2275 | // is known and (max) trip-count + (VF * UF) does not overflow in the type of |
2276 | // the vector loop induction variable. |
2277 | if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { |
2278 | uint64_t MaxVF = VF.getKnownMinValue(); |
2279 | if (VF.isScalable()) { |
2280 | std::optional<unsigned> MaxVScale = |
2281 | getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI); |
2282 | if (!MaxVScale) |
2283 | return false; |
2284 | MaxVF *= *MaxVScale; |
2285 | } |
2286 | |
2287 | return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF); |
2288 | } |
2289 | |
2290 | return false; |
2291 | } |
2292 | |
2293 | // Return whether we allow using masked interleave-groups (for dealing with |
2294 | // strided loads/stores that reside in predicated blocks, or for dealing |
2295 | // with gaps). |
2296 | static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
2297 | // If an override option has been passed in for interleaved accesses, use it. |
2298 | if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
2299 | return EnableMaskedInterleavedMemAccesses; |
2300 | |
2301 | return TTI.enableMaskedInterleavedAccessVectorization(); |
2302 | } |
2303 | |
2304 | Value * |
2305 | InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { |
2306 | if (VectorTripCount) |
2307 | return VectorTripCount; |
2308 | |
2309 | Value *TC = getTripCount(); |
2310 | IRBuilder<> Builder(InsertBlock->getTerminator()); |
2311 | |
2312 | Type *Ty = TC->getType(); |
2313 | // This is where we can make the step a runtime constant. |
2314 | Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF); |
2315 | |
2316 | // If the tail is to be folded by masking, round the number of iterations N |
2317 | // up to a multiple of Step instead of rounding down. This is done by first |
2318 | // adding Step-1 and then rounding down. Note that it's ok if this addition |
2319 | // overflows: the vector induction variable will eventually wrap to zero given |
2320 | // that it starts at zero and its Step is a power of two; the loop will then |
2321 | // exit, with the last early-exit vector comparison also producing all-true. |
2322 | // For scalable vectors the VF is not guaranteed to be a power of 2, but this |
2323 | // is accounted for in emitIterationCountCheck that adds an overflow check. |
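| // Worked example (illustrative): with a trip count of 10 and Step = VF * UF |
| // = 4, folding the tail rounds 10 up to 13 and yields n.vec = 13 - (13 % 4) |
| // = 12, so three masked vector iterations cover all 10 iterations; without |
| // folding, n.vec = 10 - (10 % 4) = 8 and 2 iterations remain for the scalar |
| // epilogue. |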
2324 | if (Cost->foldTailByMasking()) { |
2325 | assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && |
2326 | "VF*UF must be a power of 2 when folding tail by masking" ); |
2327 | TC = Builder.CreateAdd(LHS: TC, RHS: Builder.CreateSub(LHS: Step, RHS: ConstantInt::get(Ty, V: 1)), |
2328 | Name: "n.rnd.up" ); |
2329 | } |
2330 | |
2331 | // Now we need to generate the expression for the part of the loop that the |
2332 | // vectorized body will execute. This is equal to N - (N % Step) if scalar |
2333 | // iterations are not required for correctness, or N - Step, otherwise. Step |
2334 | // is equal to the vectorization factor (number of SIMD elements) times the |
2335 | // unroll factor (number of SIMD instructions). |
2336 | Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf" ); |
2337 | |
2338 | // There are cases where we *must* run at least one iteration in the remainder |
2339 | // loop. See the cost model for when this can happen. If the step evenly |
2340 | // divides the trip count, we set the remainder to be equal to the step. If |
2341 | // the step does not evenly divide the trip count, no adjustment is necessary |
2342 | // since there will already be scalar iterations. Note that the minimum |
2343 | // iterations check ensures that N >= Step. |
2344 | if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) { |
2345 | auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0)); |
2346 | R = Builder.CreateSelect(C: IsZero, True: Step, False: R); |
2347 | } |
2348 | |
2349 | VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec" ); |
2350 | |
2351 | return VectorTripCount; |
2352 | } |
2353 | |
2354 | void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) { |
2355 | // Note: The block with the minimum trip-count check is already connected |
2356 | // during earlier VPlan construction. |
2357 | VPBlockBase *ScalarPH = Plan.getScalarPreheader(); |
2358 | VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor(); |
2359 | assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors" ); |
2360 | assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor" ); |
2361 | VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB); |
2362 | VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPB, BlockPtr: CheckVPIRBB); |
2363 | PreVectorPH = CheckVPIRBB; |
2364 | VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH); |
2365 | PreVectorPH->swapSuccessors(); |
2366 | |
2367 | // We just connected a new block to the scalar preheader. Update all |
2368 | // VPPhis by adding an incoming value for it, replicating the last value. |
2369 | unsigned NumPredecessors = ScalarPH->getNumPredecessors(); |
2370 | for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) { |
2371 | assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi" ); |
2372 | assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 && |
2373 | "must have incoming values for all operands" ); |
2374 | R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2)); |
2375 | } |
2376 | } |
2377 | |
2378 | Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF, |
2379 | unsigned UF) const { |
2380 | // Generate code to check if the loop's trip count is less than VF * UF, or |
2381 | // equal to it in case a scalar epilogue is required; this implies that the |
2382 | // vector trip count is zero. This check also covers the case where adding one |
2383 | // to the backedge-taken count overflowed leading to an incorrect trip count |
2384 | // of zero. In this case we will also jump to the scalar loop. |
2385 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE |
2386 | : ICmpInst::ICMP_ULT; |
2387 | |
2388 | // Reuse existing vector loop preheader for TC checks. |
2389 | // Note that new preheader block is generated for vector loop. |
2390 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
2391 | IRBuilder<> Builder(TCCheckBlock->getTerminator()); |
2392 | |
2393 | // If tail is to be folded, vector loop takes care of all iterations. |
2394 | Value *Count = getTripCount(); |
2395 | Type *CountTy = Count->getType(); |
2396 | Value *CheckMinIters = Builder.getFalse(); |
2397 | auto CreateStep = [&]() -> Value * { |
    // Create step with max(MinProfitableTripCount, UF * VF).
2399 | if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue()) |
2400 | return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF); |
2401 | |
2402 | Value *MinProfTC = |
2403 | createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1); |
2404 | if (!VF.isScalable()) |
2405 | return MinProfTC; |
2406 | return Builder.CreateBinaryIntrinsic( |
2407 | ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF)); |
2408 | }; |
2409 | |
2410 | TailFoldingStyle Style = Cost->getTailFoldingStyle(); |
2411 | if (Style == TailFoldingStyle::None) { |
2412 | Value *Step = CreateStep(); |
2413 | ScalarEvolution &SE = *PSE.getSE(); |
2414 | // TODO: Emit unconditional branch to vector preheader instead of |
2415 | // conditional branch with known condition. |
2416 | const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop); |
2417 | // Check if the trip count is < the step. |
2418 | if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) { |
2419 | // TODO: Ensure step is at most the trip count when determining max VF and |
2420 | // UF, w/o tail folding. |
2421 | CheckMinIters = Builder.getTrue(); |
2422 | } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P), |
2423 | LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) { |
2424 | // Generate the minimum iteration check only if we cannot prove the |
2425 | // check is known to be true, or known to be false. |
2426 | CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check" ); |
2427 | } // else step known to be < trip count, use CheckMinIters preset to false. |
2428 | } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() && |
2429 | !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) && |
2430 | Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { |
2431 | // vscale is not necessarily a power-of-2, which means we cannot guarantee |
2432 | // an overflow to zero when updating induction variables and so an |
2433 | // additional overflow check is required before entering the vector loop. |
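    // For illustration (using an artificially narrow i8 trip count): with a
    // trip count of 250 and a runtime step of 24, UMax - n = 255 - 250 = 5,
    // which is < 24, so the bypass to the scalar loop is taken.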
2434 | |
2435 | // Get the maximum unsigned value for the type. |
2436 | Value *MaxUIntTripCount = |
2437 | ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask()); |
2438 | Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count); |
2439 | |
2440 | // Don't execute the vector loop if (UMax - n) < (VF * UF). |
2441 | CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep()); |
2442 | } |
2443 | return CheckMinIters; |
2444 | } |
2445 | |
2446 | void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { |
2447 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
2448 | Value *CheckMinIters = createIterationCountCheck(VF, UF); |
2449 | // Create new preheader for vector loop. |
2450 | LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), |
2451 | DT: static_cast<DominatorTree *>(nullptr), LI, |
2452 | MSSAU: nullptr, BBName: "vector.ph" ); |
2453 | |
2454 | BranchInst &BI = |
2455 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
2456 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) |
2457 | setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false); |
2458 | ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI); |
2459 | |
2460 | assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() == |
2461 | TCCheckBlock && |
2462 | "Plan's entry must be TCCCheckBlock" ); |
2463 | } |
2464 | |
2465 | BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { |
2466 | BasicBlock *const SCEVCheckBlock = |
2467 | RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader); |
2468 | if (!SCEVCheckBlock) |
2469 | return nullptr; |
2470 | |
2471 | assert((!Cost->OptForSize || |
2472 | Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) && |
2473 | "Cannot SCEV check stride or overflow when optimizing for size" ); |
2474 | |
2475 | introduceCheckBlockInVPlan(CheckIRBB: SCEVCheckBlock); |
2476 | return SCEVCheckBlock; |
2477 | } |
2478 | |
2479 | BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { |
2480 | BasicBlock *const MemCheckBlock = |
2481 | RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader); |
2482 | |
2483 | // Check if we generated code that checks in runtime if arrays overlap. We put |
2484 | // the checks into a separate block to make the more common case of few |
2485 | // elements faster. |
2486 | if (!MemCheckBlock) |
2487 | return nullptr; |
2488 | |
2489 | // VPlan-native path does not do any analysis for runtime checks currently. |
2490 | assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) && |
2491 | "Runtime checks are not supported for outer loops yet" ); |
2492 | |
2493 | if (Cost->OptForSize) { |
2494 | assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && |
2495 | "Cannot emit memory checks when optimizing for size, unless forced " |
2496 | "to vectorize." ); |
2497 | ORE->emit(RemarkBuilder: [&]() { |
2498 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize" , |
2499 | OrigLoop->getStartLoc(), |
2500 | OrigLoop->getHeader()) |
2501 | << "Code-size may be reduced by not forcing " |
2502 | "vectorization, or by source-code modifications " |
2503 | "eliminating the need for runtime checks " |
2504 | "(e.g., adding 'restrict')." ; |
2505 | }); |
2506 | } |
2507 | |
2508 | introduceCheckBlockInVPlan(CheckIRBB: MemCheckBlock); |
2509 | return MemCheckBlock; |
2510 | } |
2511 | |
2512 | /// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p |
2513 | /// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must |
2514 | /// have a single predecessor, which is rewired to the new VPIRBasicBlock. All |
2515 | /// successors of VPBB, if any, are rewired to the new VPIRBasicBlock. |
2516 | static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) { |
2517 | VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB); |
2518 | for (auto &R : make_early_inc_range(Range&: *VPBB)) { |
2519 | assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) && |
2520 | "Tried to move phi recipe after a non-phi recipe" ); |
2521 | R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end()); |
2522 | } |
2523 | |
2524 | VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB); |
2525 | // VPBB is now dead and will be cleaned up when the plan gets destroyed. |
2526 | } |
2527 | |
2528 | void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { |
2529 | LoopVectorPreHeader = OrigLoop->getLoopPreheader(); |
2530 | assert(LoopVectorPreHeader && "Invalid loop structure" ); |
2531 | assert((OrigLoop->getUniqueLatchExitBlock() || |
2532 | Cost->requiresScalarEpilogue(VF.isVector())) && |
2533 | "loops not exiting via the latch without required epilogue?" ); |
2534 | |
2535 | LoopScalarPreHeader = |
2536 | SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT, |
2537 | LI, MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph" ); |
2538 | // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock |
2539 | // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar |
2540 | // preheader may be unreachable at this point. Instead it is replaced in |
2541 | // createVectorizedLoopSkeleton. |
2542 | } |
2543 | |
2544 | /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV |
2545 | /// expansion results. |
2546 | static Value *getExpandedStep(const InductionDescriptor &ID, |
2547 | const SCEV2ValueTy &ExpandedSCEVs) { |
2548 | const SCEV *Step = ID.getStep(); |
2549 | if (auto *C = dyn_cast<SCEVConstant>(Val: Step)) |
2550 | return C->getValue(); |
2551 | if (auto *U = dyn_cast<SCEVUnknown>(Val: Step)) |
2552 | return U->getValue(); |
2553 | Value *V = ExpandedSCEVs.lookup(Val: Step); |
2554 | assert(V && "SCEV must be expanded at this point" ); |
2555 | return V; |
2556 | } |
2557 | |
2558 | /// Knowing that loop \p L executes a single vector iteration, add instructions |
2559 | /// that will get simplified and thus should not have any cost to \p |
2560 | /// InstsToIgnore. |
2561 | static void addFullyUnrolledInstructionsToIgnore( |
2562 | Loop *L, const LoopVectorizationLegality::InductionList &IL, |
2563 | SmallPtrSetImpl<Instruction *> &InstsToIgnore) { |
2564 | auto *Cmp = L->getLatchCmpInst(); |
2565 | if (Cmp) |
2566 | InstsToIgnore.insert(Ptr: Cmp); |
2567 | for (const auto &KV : IL) { |
2568 | // Extract the key by hand so that it can be used in the lambda below. Note |
2569 | // that captured structured bindings are a C++20 extension. |
2570 | const PHINode *IV = KV.first; |
2571 | |
2572 | // Get next iteration value of the induction variable. |
2573 | Instruction *IVInst = |
2574 | cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch())); |
2575 | if (all_of(Range: IVInst->users(), |
2576 | P: [&](const User *U) { return U == IV || U == Cmp; })) |
2577 | InstsToIgnore.insert(Ptr: IVInst); |
2578 | } |
2579 | } |
2580 | |
2581 | BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { |
2582 | /* |
2583 | In this function we generate a new loop. The new loop will contain |
2584 | the vectorized instructions while the old loop will continue to run the |
2585 | scalar remainder. |
2586 | |
2587 | [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's |
2588 | / | preheader are expanded here. Eventually all required SCEV |
2589 | / | expansion should happen here. |
2590 | / v |
2591 | | [ ] <-- vector loop bypass (may consist of multiple blocks). |
2592 | | / | |
2593 | | / v |
2594 | || [ ] <-- vector pre header. |
2595 | |/ | |
2596 | | v |
2597 | | [ ] \ |
2598 | | [ ]_| <-- vector loop (created during VPlan execution). |
2599 | | | |
2600 | | v |
2601 | \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to |
2602 | | | successors created during VPlan execution) |
2603 | \/ | |
2604 | /\ v |
2605 | | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). |
2606 | | | |
2607 | (opt) v <-- edge from middle to exit iff epilogue is not required. |
2608 | | [ ] \ |
2609 | | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header |
2610 | | | wrapped in VPIRBasicBlock). |
2611 | \ | |
2612 | \ v |
2613 | >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) |
2614 | ... |
2615 | */ |
2616 | |
2617 | // Create an empty vector loop, and prepare basic blocks for the runtime |
2618 | // checks. |
2619 | createVectorLoopSkeleton(Prefix: "" ); |
2620 | |
2621 | // Now, compare the new count to zero. If it is zero skip the vector loop and |
2622 | // jump to the scalar loop. This check also covers the case where the |
2623 | // backedge-taken count is uint##_max: adding one to it will overflow leading |
2624 | // to an incorrect trip count of zero. In this (rare) case we will also jump |
2625 | // to the scalar loop. |
2626 | emitIterationCountCheck(Bypass: LoopScalarPreHeader); |
2627 | |
2628 | // Generate the code to check any assumptions that we've made for SCEV |
2629 | // expressions. |
2630 | emitSCEVChecks(Bypass: LoopScalarPreHeader); |
2631 | |
2632 | // Generate the code that checks in runtime if arrays overlap. We put the |
2633 | // checks into a separate block to make the more common case of few elements |
2634 | // faster. |
2635 | emitMemRuntimeChecks(Bypass: LoopScalarPreHeader); |
2636 | |
2637 | replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader); |
2638 | return LoopVectorPreHeader; |
2639 | } |
2640 | |
2641 | namespace { |
2642 | |
2643 | struct CSEDenseMapInfo { |
2644 | static bool canHandle(const Instruction *I) { |
2645 | return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) || |
2646 | isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I); |
2647 | } |
2648 | |
2649 | static inline Instruction *getEmptyKey() { |
2650 | return DenseMapInfo<Instruction *>::getEmptyKey(); |
2651 | } |
2652 | |
2653 | static inline Instruction *getTombstoneKey() { |
2654 | return DenseMapInfo<Instruction *>::getTombstoneKey(); |
2655 | } |
2656 | |
2657 | static unsigned getHashValue(const Instruction *I) { |
2658 | assert(canHandle(I) && "Unknown instruction!" ); |
2659 | return hash_combine(args: I->getOpcode(), |
2660 | args: hash_combine_range(R: I->operand_values())); |
2661 | } |
2662 | |
2663 | static bool isEqual(const Instruction *LHS, const Instruction *RHS) { |
2664 | if (LHS == getEmptyKey() || RHS == getEmptyKey() || |
2665 | LHS == getTombstoneKey() || RHS == getTombstoneKey()) |
2666 | return LHS == RHS; |
2667 | return LHS->isIdenticalTo(I: RHS); |
2668 | } |
2669 | }; |
2670 | |
2671 | } // end anonymous namespace |
2672 | |
/// Perform common subexpression elimination (CSE) of induction variable
/// instructions.
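/// For example (illustrative): two identical getelementptr or extractelement
/// instructions in the block are folded into one, with uses of the later copy
/// rewritten to the earlier one.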
2674 | static void cse(BasicBlock *BB) { |
2675 | // Perform simple cse. |
2676 | SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap; |
2677 | for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) { |
2678 | if (!CSEDenseMapInfo::canHandle(I: &In)) |
2679 | continue; |
2680 | |
2681 | // Check if we can replace this instruction with any of the |
2682 | // visited instructions. |
2683 | if (Instruction *V = CSEMap.lookup(Val: &In)) { |
2684 | In.replaceAllUsesWith(V); |
2685 | In.eraseFromParent(); |
2686 | continue; |
2687 | } |
2688 | |
2689 | CSEMap[&In] = &In; |
2690 | } |
2691 | } |
2692 | |
2693 | /// This function attempts to return a value that represents the vectorization |
2694 | /// factor at runtime. For fixed-width VFs we know this precisely at compile |
2695 | /// time, but for scalable VFs we calculate it based on an estimate of the |
2696 | /// vscale value. |
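/// For example (illustrative): a scalable VF of <vscale x 4> with a tuning
/// vscale of 2 yields an estimate of 8 lanes, while a fixed VF of 4 is
/// returned unchanged.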
2697 | static unsigned getEstimatedRuntimeVF(ElementCount VF, |
2698 | std::optional<unsigned> VScale) { |
2699 | unsigned EstimatedVF = VF.getKnownMinValue(); |
2700 | if (VF.isScalable()) |
2701 | if (VScale) |
2702 | EstimatedVF *= *VScale; |
2703 | assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1" ); |
2704 | return EstimatedVF; |
2705 | } |
2706 | |
2707 | InstructionCost |
2708 | LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, |
2709 | ElementCount VF) const { |
2710 | // We only need to calculate a cost if the VF is scalar; for actual vectors |
2711 | // we should already have a pre-calculated cost at each VF. |
2712 | if (!VF.isScalar()) |
2713 | return getCallWideningDecision(CI, VF).Cost; |
2714 | |
2715 | Type *RetTy = CI->getType(); |
2716 | if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI)) |
2717 | if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy)) |
2718 | return *RedCost; |
2719 | |
2720 | SmallVector<Type *, 4> Tys; |
2721 | for (auto &ArgOp : CI->args()) |
2722 | Tys.push_back(Elt: ArgOp->getType()); |
2723 | |
2724 | InstructionCost ScalarCallCost = |
2725 | TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind); |
2726 | |
2727 | // If this is an intrinsic we may have a lower cost for it. |
2728 | if (getVectorIntrinsicIDForCall(CI, TLI)) { |
2729 | InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); |
2730 | return std::min(a: ScalarCallCost, b: IntrinsicCost); |
2731 | } |
2732 | return ScalarCallCost; |
2733 | } |
2734 | |
2735 | static Type *maybeVectorizeType(Type *Ty, ElementCount VF) { |
2736 | if (VF.isScalar() || !canVectorizeTy(Ty)) |
2737 | return Ty; |
2738 | return toVectorizedTy(Ty, EC: VF); |
2739 | } |
2740 | |
2741 | InstructionCost |
2742 | LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, |
2743 | ElementCount VF) const { |
2744 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
2745 | assert(ID && "Expected intrinsic call!" ); |
2746 | Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF); |
2747 | FastMathFlags FMF; |
2748 | if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI)) |
2749 | FMF = FPMO->getFastMathFlags(); |
2750 | |
2751 | SmallVector<const Value *> Arguments(CI->args()); |
2752 | FunctionType *FTy = CI->getCalledFunction()->getFunctionType(); |
2753 | SmallVector<Type *> ParamTys; |
2754 | std::transform(first: FTy->param_begin(), last: FTy->param_end(), |
2755 | result: std::back_inserter(x&: ParamTys), |
2756 | unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); }); |
2757 | |
2758 | IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF, |
2759 | dyn_cast<IntrinsicInst>(Val: CI), |
2760 | InstructionCost::getInvalid(), TLI); |
2761 | return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind); |
2762 | } |
2763 | |
2764 | void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { |
2765 | // Fix widened non-induction PHIs by setting up the PHI operands. |
2766 | fixNonInductionPHIs(State); |
2767 | |
2768 | // After vectorization, the exit blocks of the original loop will have |
2769 | // additional predecessors. Invalidate SCEVs for the exit phis in case SE |
2770 | // looked through single-entry phis. |
2771 | SmallVector<BasicBlock *> ExitBlocks; |
2772 | OrigLoop->getExitBlocks(ExitBlocks); |
2773 | for (BasicBlock *Exit : ExitBlocks) |
2774 | for (PHINode &PN : Exit->phis()) |
2775 | PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN); |
2776 | |
2777 | // Forget the original basic block. |
2778 | PSE.getSE()->forgetLoop(L: OrigLoop); |
2779 | PSE.getSE()->forgetBlockAndLoopDispositions(); |
2780 | |
2781 | // Don't apply optimizations below when no (vector) loop remains, as they all |
2782 | // require one at the moment. |
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
2785 | if (!HeaderVPBB) |
2786 | return; |
2787 | |
  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2789 | |
2790 | // Remove redundant induction instructions. |
2791 | cse(BB: HeaderBB); |
2792 | |
2793 | // Set/update profile weights for the vector and remainder loops as original |
2794 | // loop iterations are now distributed among them. Note that original loop |
2795 | // becomes the scalar remainder loop after vectorization. |
2796 | // |
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
2799 | // profile is not inherently precise anyway. Note also possible bypass of |
2800 | // vector code caused by legality checks is ignored, assigning all the weight |
2801 | // to the vector loop, optimistically. |
2802 | // |
2803 | // For scalable vectorization we can't know at compile time how many |
2804 | // iterations of the loop are handled in one vector iteration, so instead |
2805 | // use the value of vscale used for tuning. |
2806 | Loop *VectorLoop = LI->getLoopFor(BB: HeaderBB); |
2807 | unsigned EstimatedVFxUF = |
2808 | getEstimatedRuntimeVF(VF: VF * UF, VScale: Cost->getVScaleForTuning()); |
2809 | setProfileInfoAfterUnrolling(OrigLoop, UnrolledLoop: VectorLoop, RemainderLoop: OrigLoop, UF: EstimatedVFxUF); |
2810 | } |
2811 | |
2812 | void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { |
2813 | auto Iter = vp_depth_first_shallow(G: Plan.getEntry()); |
2814 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
2815 | for (VPRecipeBase &P : VPBB->phis()) { |
2816 | VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P); |
2817 | if (!VPPhi) |
2818 | continue; |
2819 | PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi)); |
2820 | // Make sure the builder has a valid insert point. |
2821 | Builder.SetInsertPoint(NewPhi); |
2822 | for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) { |
2823 | VPValue *Inc = VPPhi->getIncomingValue(Idx); |
2824 | const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx); |
2825 | NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]); |
2826 | } |
2827 | } |
2828 | } |
2829 | } |
2830 | |
2831 | void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { |
2832 | // We should not collect Scalars more than once per VF. Right now, this |
2833 | // function is called from collectUniformsAndScalars(), which already does |
2834 | // this check. Collecting Scalars for VF=1 does not make any sense. |
2835 | assert(VF.isVector() && !Scalars.contains(VF) && |
2836 | "This function should not be visited twice for the same VF" ); |
2837 | |
2838 | // This avoids any chances of creating a REPLICATE recipe during planning |
2839 | // since that would result in generation of scalarized code during execution, |
2840 | // which is not supported for scalable vectors. |
2841 | if (VF.isScalable()) { |
2842 | Scalars[VF].insert_range(R&: Uniforms[VF]); |
2843 | return; |
2844 | } |
2845 | |
2846 | SmallSetVector<Instruction *, 8> Worklist; |
2847 | |
2848 | // These sets are used to seed the analysis with pointers used by memory |
2849 | // accesses that will remain scalar. |
2850 | SmallSetVector<Instruction *, 8> ScalarPtrs; |
2851 | SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs; |
2852 | auto *Latch = TheLoop->getLoopLatch(); |
2853 | |
2854 | // A helper that returns true if the use of Ptr by MemAccess will be scalar. |
2855 | // The pointer operands of loads and stores will be scalar as long as the |
2856 | // memory access is not a gather or scatter operation. The value operand of a |
2857 | // store will remain scalar if the store is scalarized. |
2858 | auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) { |
2859 | InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF); |
2860 | assert(WideningDecision != CM_Unknown && |
2861 | "Widening decision should be ready at this moment" ); |
2862 | if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess)) |
2863 | if (Ptr == Store->getValueOperand()) |
2864 | return WideningDecision == CM_Scalarize; |
2865 | assert(Ptr == getLoadStorePointerOperand(MemAccess) && |
2866 | "Ptr is neither a value or pointer operand" ); |
2867 | return WideningDecision != CM_GatherScatter; |
2868 | }; |
2869 | |
2870 | // A helper that returns true if the given value is a getelementptr |
2871 | // instruction contained in the loop. |
2872 | auto IsLoopVaryingGEP = [&](Value *V) { |
2873 | return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V); |
2874 | }; |
2875 | |
2876 | // A helper that evaluates a memory access's use of a pointer. If the use will |
2877 | // be a scalar use and the pointer is only used by memory accesses, we place |
2878 | // the pointer in ScalarPtrs. Otherwise, the pointer is placed in |
2879 | // PossibleNonScalarPtrs. |
2880 | auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) { |
2881 | // We only care about bitcast and getelementptr instructions contained in |
2882 | // the loop. |
2883 | if (!IsLoopVaryingGEP(Ptr)) |
2884 | return; |
2885 | |
2886 | // If the pointer has already been identified as scalar (e.g., if it was |
2887 | // also identified as uniform), there's nothing to do. |
2888 | auto *I = cast<Instruction>(Val: Ptr); |
2889 | if (Worklist.count(key: I)) |
2890 | return; |
2891 | |
2892 | // If the use of the pointer will be a scalar use, and all users of the |
2893 | // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, |
2894 | // place the pointer in PossibleNonScalarPtrs. |
2895 | if (IsScalarUse(MemAccess, Ptr) && |
2896 | all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>)) |
2897 | ScalarPtrs.insert(X: I); |
2898 | else |
2899 | PossibleNonScalarPtrs.insert(Ptr: I); |
2900 | }; |
2901 | |
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
2906 | // |
2907 | // (1) Add to the worklist all instructions that have been identified as |
2908 | // uniform-after-vectorization. |
2909 | Worklist.insert_range(R&: Uniforms[VF]); |
2910 | |
2911 | // (2) Add to the worklist all bitcast and getelementptr instructions used by |
2912 | // memory accesses requiring a scalar use. The pointer operands of loads and |
2913 | // stores will be scalar unless the operation is a gather or scatter. |
2914 | // The value operand of a store will remain scalar if the store is scalarized. |
2915 | for (auto *BB : TheLoop->blocks()) |
2916 | for (auto &I : *BB) { |
2917 | if (auto *Load = dyn_cast<LoadInst>(Val: &I)) { |
2918 | EvaluatePtrUse(Load, Load->getPointerOperand()); |
2919 | } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) { |
2920 | EvaluatePtrUse(Store, Store->getPointerOperand()); |
2921 | EvaluatePtrUse(Store, Store->getValueOperand()); |
2922 | } |
2923 | } |
2924 | for (auto *I : ScalarPtrs) |
2925 | if (!PossibleNonScalarPtrs.count(Ptr: I)) { |
2926 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n" ); |
2927 | Worklist.insert(X: I); |
2928 | } |
2929 | |
2930 | // Insert the forced scalars. |
2931 | // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector |
2932 | // induction variable when the PHI user is scalarized. |
2933 | auto ForcedScalar = ForcedScalars.find(Val: VF); |
2934 | if (ForcedScalar != ForcedScalars.end()) |
2935 | for (auto *I : ForcedScalar->second) { |
2936 | LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n" ); |
2937 | Worklist.insert(X: I); |
2938 | } |
2939 | |
2940 | // Expand the worklist by looking through any bitcasts and getelementptr |
2941 | // instructions we've already identified as scalar. This is similar to the |
2942 | // expansion step in collectLoopUniforms(); however, here we're only |
2943 | // expanding to include additional bitcasts and getelementptr instructions. |
2944 | unsigned Idx = 0; |
2945 | while (Idx != Worklist.size()) { |
2946 | Instruction *Dst = Worklist[Idx++]; |
2947 | if (!IsLoopVaryingGEP(Dst->getOperand(i: 0))) |
2948 | continue; |
2949 | auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0)); |
2950 | if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool { |
2951 | auto *J = cast<Instruction>(Val: U); |
2952 | return !TheLoop->contains(Inst: J) || Worklist.count(key: J) || |
2953 | ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) && |
2954 | IsScalarUse(J, Src)); |
2955 | })) { |
2956 | Worklist.insert(X: Src); |
2957 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n" ); |
2958 | } |
2959 | } |
2960 | |
2961 | // An induction variable will remain scalar if all users of the induction |
2962 | // variable and induction variable update remain scalar. |
2963 | for (const auto &Induction : Legal->getInductionVars()) { |
2964 | auto *Ind = Induction.first; |
2965 | auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch)); |
2966 | |
2967 | // If tail-folding is applied, the primary induction variable will be used |
2968 | // to feed a vector compare. |
2969 | if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) |
2970 | continue; |
2971 | |
2972 | // Returns true if \p Indvar is a pointer induction that is used directly by |
2973 | // load/store instruction \p I. |
2974 | auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar, |
2975 | Instruction *I) { |
2976 | return Induction.second.getKind() == |
2977 | InductionDescriptor::IK_PtrInduction && |
2978 | (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) && |
2979 | Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar); |
2980 | }; |
2981 | |
2982 | // Determine if all users of the induction variable are scalar after |
2983 | // vectorization. |
2984 | bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool { |
2985 | auto *I = cast<Instruction>(Val: U); |
2986 | return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
2987 | IsDirectLoadStoreFromPtrIndvar(Ind, I); |
2988 | }); |
2989 | if (!ScalarInd) |
2990 | continue; |
2991 | |
2992 | // If the induction variable update is a fixed-order recurrence, neither the |
2993 | // induction variable or its update should be marked scalar after |
2994 | // vectorization. |
2995 | auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate); |
2996 | if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi)) |
2997 | continue; |
2998 | |
2999 | // Determine if all users of the induction variable update instruction are |
3000 | // scalar after vectorization. |
3001 | bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool { |
3002 | auto *I = cast<Instruction>(Val: U); |
3003 | return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
3004 | IsDirectLoadStoreFromPtrIndvar(IndUpdate, I); |
3005 | }); |
3006 | if (!ScalarIndUpdate) |
3007 | continue; |
3008 | |
3009 | // The induction variable and its update instruction will remain scalar. |
3010 | Worklist.insert(X: Ind); |
3011 | Worklist.insert(X: IndUpdate); |
3012 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n" ); |
3013 | LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate |
3014 | << "\n" ); |
3015 | } |
3016 | |
3017 | Scalars[VF].insert_range(R&: Worklist); |
3018 | } |
3019 | |
3020 | bool LoopVectorizationCostModel::isScalarWithPredication( |
3021 | Instruction *I, ElementCount VF) const { |
3022 | if (!isPredicatedInst(I)) |
3023 | return false; |
3024 | |
3025 | // Do we have a non-scalar lowering for this predicated |
3026 | // instruction? No - it is scalar with predication. |
3027 | switch(I->getOpcode()) { |
3028 | default: |
3029 | return true; |
3030 | case Instruction::Call: |
3031 | if (VF.isScalar()) |
3032 | return true; |
3033 | return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize; |
3034 | case Instruction::Load: |
3035 | case Instruction::Store: { |
3036 | auto *Ptr = getLoadStorePointerOperand(V: I); |
3037 | auto *Ty = getLoadStoreType(I); |
3038 | unsigned AS = getLoadStoreAddressSpace(I); |
3039 | Type *VTy = Ty; |
3040 | if (VF.isVector()) |
3041 | VTy = VectorType::get(ElementType: Ty, EC: VF); |
3042 | const Align Alignment = getLoadStoreAlignment(I); |
3043 | return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) || |
3044 | TTI.isLegalMaskedGather(DataType: VTy, Alignment)) |
3045 | : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) || |
3046 | TTI.isLegalMaskedScatter(DataType: VTy, Alignment)); |
3047 | } |
3048 | case Instruction::UDiv: |
3049 | case Instruction::SDiv: |
3050 | case Instruction::SRem: |
3051 | case Instruction::URem: { |
3052 | // We have the option to use the safe-divisor idiom to avoid predication. |
3053 | // The cost based decision here will always select safe-divisor for |
3054 | // scalable vectors as scalarization isn't legal. |
3055 | const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); |
3056 | return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); |
3057 | } |
3058 | } |
3059 | } |
3060 | |
3061 | // TODO: Fold into LoopVectorizationLegality::isMaskRequired. |
3062 | bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { |
3063 | // TODO: We can use the loop-preheader as context point here and get |
3064 | // context sensitive reasoning for isSafeToSpeculativelyExecute. |
3065 | if (isSafeToSpeculativelyExecute(I) || |
3066 | (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) || |
3067 | isa<BranchInst, SwitchInst, PHINode, AllocaInst>(Val: I)) |
3068 | return false; |
3069 | |
3070 | // If the instruction was executed conditionally in the original scalar loop, |
3071 | // predication is needed with a mask whose lanes are all possibly inactive. |
3072 | if (Legal->blockNeedsPredication(BB: I->getParent())) |
3073 | return true; |
3074 | |
3075 | // If we're not folding the tail by masking, predication is unnecessary. |
3076 | if (!foldTailByMasking()) |
3077 | return false; |
3078 | |
3079 | // All that remain are instructions with side-effects originally executed in |
3080 | // the loop unconditionally, but now execute under a tail-fold mask (only) |
3081 | // having at least one active lane (the first). If the side-effects of the |
3082 | // instruction are invariant, executing it w/o (the tail-folding) mask is safe |
3083 | // - it will cause the same side-effects as when masked. |
3084 | switch(I->getOpcode()) { |
3085 | default: |
3086 | llvm_unreachable( |
3087 | "instruction should have been considered by earlier checks" ); |
3088 | case Instruction::Call: |
3089 | // Side-effects of a Call are assumed to be non-invariant, needing a |
3090 | // (fold-tail) mask. |
3091 | assert(Legal->isMaskRequired(I) && |
3092 | "should have returned earlier for calls not needing a mask" ); |
3093 | return true; |
3094 | case Instruction::Load: |
3095 | // If the address is loop invariant no predication is needed. |
3096 | return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I)); |
3097 | case Instruction::Store: { |
3098 | // For stores, we need to prove both speculation safety (which follows from |
3099 | // the same argument as loads), but also must prove the value being stored |
    // is correct. The easiest form of the latter is to require that all values
3101 | // stored are the same. |
3102 | return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) && |
3103 | Legal->isInvariant(V: cast<StoreInst>(Val: I)->getValueOperand())); |
3104 | } |
3105 | case Instruction::UDiv: |
3106 | case Instruction::SDiv: |
3107 | case Instruction::SRem: |
3108 | case Instruction::URem: |
3109 | // If the divisor is loop-invariant no predication is needed. |
3110 | return !Legal->isInvariant(V: I->getOperand(i: 1)); |
3111 | } |
3112 | } |
3113 | |
3114 | std::pair<InstructionCost, InstructionCost> |
3115 | LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, |
3116 | ElementCount VF) const { |
3117 | assert(I->getOpcode() == Instruction::UDiv || |
3118 | I->getOpcode() == Instruction::SDiv || |
3119 | I->getOpcode() == Instruction::SRem || |
3120 | I->getOpcode() == Instruction::URem); |
3121 | assert(!isSafeToSpeculativelyExecute(I)); |
3122 | |
3123 | // Scalarization isn't legal for scalable vector types |
3124 | InstructionCost ScalarizationCost = InstructionCost::getInvalid(); |
3125 | if (!VF.isScalable()) { |
3126 | // Get the scalarization cost and scale this amount by the probability of |
3127 | // executing the predicated block. If the instruction is not predicated, |
3128 | // we fall through to the next case. |
3129 | ScalarizationCost = 0; |
3130 | |
3131 | // These instructions have a non-void type, so account for the phi nodes |
3132 | // that we will create. This cost is likely to be zero. The phi node |
3133 | // cost, if any, should be scaled by the block probability because it |
3134 | // models a copy at the end of each predicated block. |
3135 | ScalarizationCost += |
3136 | VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind); |
3137 | |
3138 | // The cost of the non-predicated instruction. |
3139 | ScalarizationCost += |
3140 | VF.getFixedValue() * |
3141 | TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind); |
3142 | |
3143 | // The cost of insertelement and extractelement instructions needed for |
3144 | // scalarization. |
3145 | ScalarizationCost += getScalarizationOverhead(I, VF); |
3146 | |
3147 | // Scale the cost by the probability of executing the predicated blocks. |
3148 | // This assumes the predicated block for each vector lane is equally |
3149 | // likely. |
3150 | ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind); |
3151 | } |
3152 | InstructionCost SafeDivisorCost = 0; |
3153 | |
3154 | auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF); |
3155 | |
3156 | // The cost of the select guard to ensure all lanes are well defined |
3157 | // after we speculate above any internal control flow. |
3158 | SafeDivisorCost += |
3159 | TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, |
3160 | CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF), |
3161 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
3162 | |
3163 | // Certain instructions can be cheaper to vectorize if they have a constant |
3164 | // second vector operand. One example of this are shifts on x86. |
3165 | Value *Op2 = I->getOperand(i: 1); |
3166 | auto Op2Info = TTI.getOperandInfo(V: Op2); |
3167 | if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && |
3168 | Legal->isInvariant(V: Op2)) |
3169 | Op2Info.Kind = TargetTransformInfo::OK_UniformValue; |
3170 | |
3171 | SmallVector<const Value *, 4> Operands(I->operand_values()); |
3172 | SafeDivisorCost += TTI.getArithmeticInstrCost( |
3173 | Opcode: I->getOpcode(), Ty: VecTy, CostKind, |
3174 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
3175 | Opd2Info: Op2Info, Args: Operands, CxtI: I); |
3176 | return {ScalarizationCost, SafeDivisorCost}; |
3177 | } |
3178 | |
3179 | bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( |
3180 | Instruction *I, ElementCount VF) const { |
3181 | assert(isAccessInterleaved(I) && "Expecting interleaved access." ); |
3182 | assert(getWideningDecision(I, VF) == CM_Unknown && |
3183 | "Decision should not be set yet." ); |
3184 | auto *Group = getInterleavedAccessGroup(Instr: I); |
3185 | assert(Group && "Must have a group." ); |
3186 | unsigned InterleaveFactor = Group->getFactor(); |
3187 | |
3188 | // If the instruction's allocated size doesn't equal its type size, it |
3189 | // requires padding and will be scalarized. |
3190 | auto &DL = I->getDataLayout(); |
3191 | auto *ScalarTy = getLoadStoreType(I); |
3192 | if (hasIrregularType(Ty: ScalarTy, DL)) |
3193 | return false; |
3194 | |
3195 | // For scalable vectors, the interleave factors must be <= 8 since we require |
3196 | // the (de)interleaveN intrinsics instead of shufflevectors. |
3197 | if (VF.isScalable() && InterleaveFactor > 8) |
3198 | return false; |
3199 | |
3200 | // If the group involves a non-integral pointer, we may not be able to |
3201 | // losslessly cast all values to a common type. |
3202 | bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy); |
3203 | for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) { |
3204 | Instruction *Member = Group->getMember(Index: Idx); |
3205 | if (!Member) |
3206 | continue; |
3207 | auto *MemberTy = getLoadStoreType(I: Member); |
3208 | bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy); |
3209 | // Don't coerce non-integral pointers to integers or vice versa. |
3210 | if (MemberNI != ScalarNI) |
3211 | // TODO: Consider adding special nullptr value case here |
3212 | return false; |
3213 | if (MemberNI && ScalarNI && |
3214 | ScalarTy->getPointerAddressSpace() != |
3215 | MemberTy->getPointerAddressSpace()) |
3216 | return false; |
3217 | } |
3218 | |
3219 | // Check if masking is required. |
3220 | // A Group may need masking for one of two reasons: it resides in a block that |
3221 | // needs predication, or it was decided to use masking to deal with gaps |
3222 | // (either a gap at the end of a load-access that may result in a speculative |
3223 | // load, or any gaps in a store-access). |
3224 | bool PredicatedAccessRequiresMasking = |
3225 | blockNeedsPredicationForAnyReason(BB: I->getParent()) && |
3226 | Legal->isMaskRequired(I); |
3227 | bool LoadAccessWithGapsRequiresEpilogMasking = |
3228 | isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() && |
3229 | !isScalarEpilogueAllowed(); |
3230 | bool StoreAccessWithGapsRequiresMasking = |
3231 | isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()); |
3232 | if (!PredicatedAccessRequiresMasking && |
3233 | !LoadAccessWithGapsRequiresEpilogMasking && |
3234 | !StoreAccessWithGapsRequiresMasking) |
3235 | return true; |
3236 | |
3237 | // If masked interleaving is required, we expect that the user/target had |
3238 | // enabled it, because otherwise it either wouldn't have been created or |
3239 | // it should have been invalidated by the CostModel. |
3240 | assert(useMaskedInterleavedAccesses(TTI) && |
3241 | "Masked interleave-groups for predicated accesses are not enabled." ); |
3242 | |
3243 | if (Group->isReverse()) |
3244 | return false; |
3245 | |
3246 | auto *Ty = getLoadStoreType(I); |
3247 | const Align Alignment = getLoadStoreAlignment(I); |
3248 | unsigned AS = getLoadStoreAddressSpace(I); |
3249 | return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS) |
3250 | : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS); |
3251 | } |
3252 | |
3253 | bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( |
3254 | Instruction *I, ElementCount VF) { |
3255 | // Get and ensure we have a valid memory instruction. |
3256 | assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction" ); |
3257 | |
3258 | auto *Ptr = getLoadStorePointerOperand(V: I); |
3259 | auto *ScalarTy = getLoadStoreType(I); |
3260 | |
3261 | // In order to be widened, the pointer should be consecutive, first of all. |
3262 | if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr)) |
3263 | return false; |
3264 | |
3265 | // If the instruction is a store located in a predicated block, it will be |
3266 | // scalarized. |
3267 | if (isScalarWithPredication(I, VF)) |
3268 | return false; |
3269 | |
  // If the instruction's allocated size doesn't equal its type size, it
3271 | // requires padding and will be scalarized. |
3272 | auto &DL = I->getDataLayout(); |
3273 | if (hasIrregularType(Ty: ScalarTy, DL)) |
3274 | return false; |
3275 | |
3276 | return true; |
3277 | } |
3278 | |
3279 | void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { |
3280 | // We should not collect Uniforms more than once per VF. Right now, |
3281 | // this function is called from collectUniformsAndScalars(), which |
3282 | // already does this check. Collecting Uniforms for VF=1 does not make any |
3283 | // sense. |
3284 | |
3285 | assert(VF.isVector() && !Uniforms.contains(VF) && |
3286 | "This function should not be visited twice for the same VF" ); |
3287 | |
  // Initialize the entry for this VF so that, even if we find no uniform
  // value, we won't analyze again: Uniforms.count(VF) will return 1.
3290 | Uniforms[VF].clear(); |
3291 | |
3292 | // Now we know that the loop is vectorizable! |
3293 | // Collect instructions inside the loop that will remain uniform after |
3294 | // vectorization. |
3295 | |
3296 | // Global values, params and instructions outside of current loop are out of |
3297 | // scope. |
3298 | auto IsOutOfScope = [&](Value *V) -> bool { |
3299 | Instruction *I = dyn_cast<Instruction>(Val: V); |
3300 | return (!I || !TheLoop->contains(Inst: I)); |
3301 | }; |
3302 | |
3303 | // Worklist containing uniform instructions demanding lane 0. |
3304 | SetVector<Instruction *> Worklist; |
3305 | |
3306 | // Add uniform instructions demanding lane 0 to the worklist. Instructions |
3307 | // that require predication must not be considered uniform after |
3308 | // vectorization, because that would create an erroneous replicating region |
3309 | // where only a single instance out of VF should be formed. |
3310 | auto AddToWorklistIfAllowed = [&](Instruction *I) -> void { |
3311 | if (IsOutOfScope(I)) { |
3312 | LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: " |
3313 | << *I << "\n" ); |
3314 | return; |
3315 | } |
3316 | if (isPredicatedInst(I)) { |
3317 | LLVM_DEBUG( |
3318 | dbgs() << "LV: Found not uniform due to requiring predication: " << *I |
3319 | << "\n" ); |
3320 | return; |
3321 | } |
3322 | LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n" ); |
3323 | Worklist.insert(X: I); |
3324 | }; |
3325 | |
3326 | // Start with the conditional branches exiting the loop. If the branch |
3327 | // condition is an instruction contained in the loop that is only used by the |
3328 | // branch, it is uniform. Note conditions from uncountable early exits are not |
3329 | // uniform. |
3330 | SmallVector<BasicBlock *> Exiting; |
3331 | TheLoop->getExitingBlocks(ExitingBlocks&: Exiting); |
3332 | for (BasicBlock *E : Exiting) { |
3333 | if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E) |
3334 | continue; |
3335 | auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0)); |
3336 | if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse()) |
3337 | AddToWorklistIfAllowed(Cmp); |
3338 | } |
3339 | |
3340 | auto PrevVF = VF.divideCoefficientBy(RHS: 2); |
3341 | // Return true if all lanes perform the same memory operation, and we can |
3342 | // thus choose to execute only one. |
3343 | auto IsUniformMemOpUse = [&](Instruction *I) { |
3344 | // If the value was already known to not be uniform for the previous |
3345 | // (smaller VF), it cannot be uniform for the larger VF. |
3346 | if (PrevVF.isVector()) { |
3347 | auto Iter = Uniforms.find(Val: PrevVF); |
3348 | if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I)) |
3349 | return false; |
3350 | } |
3351 | if (!Legal->isUniformMemOp(I&: *I, VF)) |
3352 | return false; |
3353 | if (isa<LoadInst>(Val: I)) |
3354 | // Loading the same address always produces the same result - at least |
3355 | // assuming aliasing and ordering which have already been checked. |
3356 | return true; |
3357 | // Storing the same value on every iteration. |
3358 | return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()); |
3359 | }; |
3360 | |
3361 | auto IsUniformDecision = [&](Instruction *I, ElementCount VF) { |
3362 | InstWidening WideningDecision = getWideningDecision(I, VF); |
3363 | assert(WideningDecision != CM_Unknown && |
3364 | "Widening decision should be ready at this moment" ); |
3365 | |
3366 | if (IsUniformMemOpUse(I)) |
3367 | return true; |
3368 | |
3369 | return (WideningDecision == CM_Widen || |
3370 | WideningDecision == CM_Widen_Reverse || |
3371 | WideningDecision == CM_Interleave); |
3372 | }; |
3373 | |
3374 | // Returns true if Ptr is the pointer operand of a memory access instruction |
3375 | // I, I is known to not require scalarization, and the pointer is not also |
3376 | // stored. |
3377 | auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { |
3378 | if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr) |
3379 | return false; |
3380 | return getLoadStorePointerOperand(V: I) == Ptr && |
3381 | (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr)); |
3382 | }; |
3383 | |
3384 | // Holds a list of values which are known to have at least one uniform use. |
3385 | // Note that there may be other uses which aren't uniform. A "uniform use" |
3386 | // here is something which only demands lane 0 of the unrolled iterations; |
3387 | // it does not imply that all lanes produce the same value (e.g. this is not |
3388 | // the usual meaning of uniform) |
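  // For example (illustrative): the pointer operand of a load or store that
  // will be widened as a consecutive access counts as a uniform use of that
  // pointer, since only lane 0 of the address is needed to form the wide
  // access, even though the address value may also have non-uniform uses.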
3389 | SetVector<Value *> HasUniformUse; |
3390 | |
3391 | // Scan the loop for instructions which are either a) known to have only |
3392 | // lane 0 demanded or b) are uses which demand only lane 0 of their operand. |
3393 | for (auto *BB : TheLoop->blocks()) |
3394 | for (auto &I : *BB) { |
3395 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) { |
3396 | switch (II->getIntrinsicID()) { |
3397 | case Intrinsic::sideeffect: |
3398 | case Intrinsic::experimental_noalias_scope_decl: |
3399 | case Intrinsic::assume: |
3400 | case Intrinsic::lifetime_start: |
3401 | case Intrinsic::lifetime_end: |
3402 | if (TheLoop->hasLoopInvariantOperands(I: &I)) |
3403 | AddToWorklistIfAllowed(&I); |
3404 | break; |
3405 | default: |
3406 | break; |
3407 | } |
3408 | } |
3409 | |
3410 | if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) { |
3411 | if (IsOutOfScope(EVI->getAggregateOperand())) { |
3412 | AddToWorklistIfAllowed(EVI); |
3413 | continue; |
3414 | } |
3415 | // Only ExtractValue instructions where the aggregate value comes from a |
3416 | // call are allowed to be non-uniform. |
3417 | assert(isa<CallInst>(EVI->getAggregateOperand()) && |
3418 | "Expected aggregate value to be call return value" ); |
3419 | } |
3420 | |
3421 | // If there's no pointer operand, there's nothing to do. |
3422 | auto *Ptr = getLoadStorePointerOperand(V: &I); |
3423 | if (!Ptr) |
3424 | continue; |
3425 | |
3426 | if (IsUniformMemOpUse(&I)) |
3427 | AddToWorklistIfAllowed(&I); |
3428 | |
3429 | if (IsVectorizedMemAccessUse(&I, Ptr)) |
3430 | HasUniformUse.insert(X: Ptr); |
3431 | } |
3432 | |
3433 | // Add to the worklist any operands which have *only* uniform (e.g. lane 0 |
3434 | // demanding) users. Since loops are assumed to be in LCSSA form, this |
3435 | // disallows uses outside the loop as well. |
3436 | for (auto *V : HasUniformUse) { |
3437 | if (IsOutOfScope(V)) |
3438 | continue; |
3439 | auto *I = cast<Instruction>(Val: V); |
3440 | bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool { |
3441 | auto *UI = cast<Instruction>(Val: U); |
3442 | return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V); |
3443 | }); |
3444 | if (UsersAreMemAccesses) |
3445 | AddToWorklistIfAllowed(I); |
3446 | } |
3447 | |
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
3451 | unsigned Idx = 0; |
3452 | while (Idx != Worklist.size()) { |
3453 | Instruction *I = Worklist[Idx++]; |
3454 | |
3455 | for (auto *OV : I->operand_values()) { |
3456 | // isOutOfScope operands cannot be uniform instructions. |
3457 | if (IsOutOfScope(OV)) |
3458 | continue; |
3459 | // First order recurrence Phi's should typically be considered |
3460 | // non-uniform. |
3461 | auto *OP = dyn_cast<PHINode>(Val: OV); |
3462 | if (OP && Legal->isFixedOrderRecurrence(Phi: OP)) |
3463 | continue; |
3464 | // If all the users of the operand are uniform, then add the |
3465 | // operand into the uniform worklist. |
3466 | auto *OI = cast<Instruction>(Val: OV); |
3467 | if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool { |
3468 | auto *J = cast<Instruction>(Val: U); |
3469 | return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI); |
3470 | })) |
3471 | AddToWorklistIfAllowed(OI); |
3472 | } |
3473 | } |
3474 | |
3475 | // For an instruction to be added into Worklist above, all its users inside |
3476 | // the loop should also be in Worklist. However, this condition cannot be |
3477 | // true for phi nodes that form a cyclic dependence. We must process phi |
3478 | // nodes separately. An induction variable will remain uniform if all users |
3479 | // of the induction variable and induction variable update remain uniform. |
3480 | // The code below handles both pointer and non-pointer induction variables. |
3481 | BasicBlock *Latch = TheLoop->getLoopLatch(); |
3482 | for (const auto &Induction : Legal->getInductionVars()) { |
3483 | auto *Ind = Induction.first; |
3484 | auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch)); |
3485 | |
3486 | // Determine if all users of the induction variable are uniform after |
3487 | // vectorization. |
3488 | bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool { |
3489 | auto *I = cast<Instruction>(Val: U); |
3490 | return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) || |
3491 | IsVectorizedMemAccessUse(I, Ind); |
3492 | }); |
3493 | if (!UniformInd) |
3494 | continue; |
3495 | |
3496 | // Determine if all users of the induction variable update instruction are |
3497 | // uniform after vectorization. |
3498 | bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool { |
3499 | auto *I = cast<Instruction>(Val: U); |
3500 | return I == Ind || Worklist.count(key: I) || |
3501 | IsVectorizedMemAccessUse(I, IndUpdate); |
3502 | }); |
3503 | if (!UniformIndUpdate) |
3504 | continue; |
3505 | |
3506 | // The induction variable and its update instruction will remain uniform. |
3507 | AddToWorklistIfAllowed(Ind); |
3508 | AddToWorklistIfAllowed(IndUpdate); |
3509 | } |
3510 | |
3511 | Uniforms[VF].insert_range(R&: Worklist); |
3512 | } |
3513 | |
3514 | bool LoopVectorizationCostModel::runtimeChecksRequired() { |
3515 | LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n" ); |
3516 | |
3517 | if (Legal->getRuntimePointerChecking()->Need) { |
3518 | reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz" , |
3519 | OREMsg: "runtime pointer checks needed. Enable vectorization of this " |
3520 | "loop with '#pragma clang loop vectorize(enable)' when " |
3521 | "compiling with -Os/-Oz" , |
3522 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
3523 | return true; |
3524 | } |
3525 | |
3526 | if (!PSE.getPredicate().isAlwaysTrue()) { |
3527 | reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz" , |
3528 | OREMsg: "runtime SCEV checks needed. Enable vectorization of this " |
3529 | "loop with '#pragma clang loop vectorize(enable)' when " |
3530 | "compiling with -Os/-Oz" , |
3531 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
3532 | return true; |
3533 | } |
3534 | |
3535 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
3536 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
3537 | reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count" , |
3538 | OREMsg: "runtime stride == 1 checks needed. Enable vectorization of " |
3539 | "this loop without such check by compiling with -Os/-Oz" , |
3540 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
3541 | return true; |
3542 | } |
3543 | |
3544 | return false; |
3545 | } |
3546 | |
3547 | bool LoopVectorizationCostModel::isScalableVectorizationAllowed() { |
3548 | if (IsScalableVectorizationAllowed) |
3549 | return *IsScalableVectorizationAllowed; |
3550 | |
3551 | IsScalableVectorizationAllowed = false; |
3552 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) |
3553 | return false; |
3554 | |
3555 | if (Hints->isScalableVectorizationDisabled()) { |
3556 | reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled" , |
3557 | ORETag: "ScalableVectorizationDisabled" , ORE, TheLoop); |
3558 | return false; |
3559 | } |
3560 | |
3561 | LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n" ); |
3562 | |
3563 | auto MaxScalableVF = ElementCount::getScalable( |
3564 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
3565 | |
3566 | // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
3567 | // FIXME: While for scalable vectors this is currently sufficient, this should |
3568 | // be replaced by a more detailed mechanism that filters out specific VFs, |
3569 | // instead of invalidating vectorization for a whole set of VFs based on the |
3570 | // MaxVF. |
3571 | |
3572 | // Disable scalable vectorization if the loop contains unsupported reductions. |
3573 | if (!canVectorizeReductions(VF: MaxScalableVF)) { |
3574 | reportVectorizationInfo( |
3575 | Msg: "Scalable vectorization not supported for the reduction " |
3576 | "operations found in this loop." , |
3577 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3578 | return false; |
3579 | } |
3580 | |
3581 | // Disable scalable vectorization if the loop contains any instructions |
3582 | // with element types not supported for scalable vectors. |
3583 | if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) { |
3584 | return !Ty->isVoidTy() && |
3585 | !this->TTI.isElementTypeLegalForScalableVector(Ty); |
3586 | })) { |
3587 | reportVectorizationInfo(Msg: "Scalable vectorization is not supported " |
3588 | "for all element types found in this loop." , |
3589 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3590 | return false; |
3591 | } |
3592 | |
3593 | if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) { |
3594 | reportVectorizationInfo(Msg: "The target does not provide maximum vscale value " |
3595 | "for safe distance analysis." , |
3596 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3597 | return false; |
3598 | } |
3599 | |
3600 | IsScalableVectorizationAllowed = true; |
3601 | return true; |
3602 | } |
3603 | |
3604 | ElementCount |
3605 | LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
3606 | if (!isScalableVectorizationAllowed()) |
3607 | return ElementCount::getScalable(MinVal: 0); |
3608 | |
3609 | auto MaxScalableVF = ElementCount::getScalable( |
3610 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
3611 | if (Legal->isSafeForAnyVectorWidth()) |
3612 | return MaxScalableVF; |
3613 | |
3614 | std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI); |
3615 | // Limit MaxScalableVF by the maximum safe dependence distance. |
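  // For example, MaxSafeElements = 32 with a maximum vscale of 16 yields
  // ElementCount::getScalable(2), i.e. vscale x 2.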
3616 | MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale); |
3617 | |
3618 | if (!MaxScalableVF) |
3619 | reportVectorizationInfo( |
3620 | Msg: "Max legal vector width too small, scalable vectorization " |
3621 | "unfeasible." , |
3622 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
3623 | |
3624 | return MaxScalableVF; |
3625 | } |
3626 | |
3627 | FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( |
3628 | unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) { |
3629 | MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI); |
3630 | unsigned SmallestType, WidestType; |
3631 | std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes(); |
3632 | |
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
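  // For example, a 256-bit max safe vector width with a widest type of 32 bits
  // gives bit_floor(256 / 32) = 8 safe elements.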
3637 | unsigned MaxSafeElementsPowerOf2 = |
3638 | bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType); |
3639 | if (!Legal->isSafeForAnyStoreLoadForwardDistances()) { |
3640 | unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits(); |
3641 | MaxSafeElementsPowerOf2 = |
3642 | std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType); |
3643 | } |
3644 | auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2); |
3645 | auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2); |
3646 | |
3647 | if (!Legal->isSafeForAnyVectorWidth()) |
3648 | this->MaxSafeElements = MaxSafeElementsPowerOf2; |
3649 | |
3650 | LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF |
3651 | << ".\n" ); |
3652 | LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF |
3653 | << ".\n" ); |
3654 | |
3655 | // First analyze the UserVF, fall back if the UserVF should be ignored. |
3656 | if (UserVF) { |
3657 | auto MaxSafeUserVF = |
3658 | UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; |
3659 | |
3660 | if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) { |
3661 | // If `VF=vscale x N` is safe, then so is `VF=N` |
3662 | if (UserVF.isScalable()) |
3663 | return FixedScalableVFPair( |
3664 | ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF); |
3665 | |
3666 | return UserVF; |
3667 | } |
3668 | |
3669 | assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); |
3670 | |
3671 | // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it |
3672 | // is better to ignore the hint and let the compiler choose a suitable VF. |
3673 | if (!UserVF.isScalable()) { |
3674 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
3675 | << " is unsafe, clamping to max safe VF=" |
3676 | << MaxSafeFixedVF << ".\n" ); |
3677 | ORE->emit(RemarkBuilder: [&]() { |
3678 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
3679 | TheLoop->getStartLoc(), |
3680 | TheLoop->getHeader()) |
3681 | << "User-specified vectorization factor " |
3682 | << ore::NV("UserVectorizationFactor" , UserVF) |
3683 | << " is unsafe, clamping to maximum safe vectorization factor " |
3684 | << ore::NV("VectorizationFactor" , MaxSafeFixedVF); |
3685 | }); |
3686 | return MaxSafeFixedVF; |
3687 | } |
3688 | |
3689 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) { |
3690 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
3691 | << " is ignored because scalable vectors are not " |
3692 | "available.\n" ); |
3693 | ORE->emit(RemarkBuilder: [&]() { |
3694 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
3695 | TheLoop->getStartLoc(), |
3696 | TheLoop->getHeader()) |
3697 | << "User-specified vectorization factor " |
3698 | << ore::NV("UserVectorizationFactor" , UserVF) |
3699 | << " is ignored because the target does not support scalable " |
3700 | "vectors. The compiler will pick a more suitable value." ; |
3701 | }); |
3702 | } else { |
3703 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
3704 | << " is unsafe. Ignoring scalable UserVF.\n" ); |
3705 | ORE->emit(RemarkBuilder: [&]() { |
3706 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
3707 | TheLoop->getStartLoc(), |
3708 | TheLoop->getHeader()) |
3709 | << "User-specified vectorization factor " |
3710 | << ore::NV("UserVectorizationFactor" , UserVF) |
3711 | << " is unsafe. Ignoring the hint to let the compiler pick a " |
3712 | "more suitable value." ; |
3713 | }); |
3714 | } |
3715 | } |
3716 | |
3717 | LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType |
3718 | << " / " << WidestType << " bits.\n" ); |
3719 | |
3720 | FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1), |
3721 | ElementCount::getScalable(MinVal: 0)); |
3722 | if (auto MaxVF = |
3723 | getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, |
3724 | MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking)) |
3725 | Result.FixedVF = MaxVF; |
3726 | |
3727 | if (auto MaxVF = |
3728 | getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType, |
3729 | MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking)) |
3730 | if (MaxVF.isScalable()) { |
3731 | Result.ScalableVF = MaxVF; |
3732 | LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF |
3733 | << "\n" ); |
3734 | } |
3735 | |
3736 | return Result; |
3737 | } |
3738 | |
3739 | FixedScalableVFPair |
3740 | LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { |
3741 | if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { |
    // TODO: It may be useful to do this, since the runtime check is still
    // likely to be dynamically uniform if the target can skip it.
3744 | reportVectorizationFailure( |
3745 | DebugMsg: "Not inserting runtime ptr check for divergent target" , |
3746 | OREMsg: "runtime pointer checks needed. Not enabled for divergent target" , |
3747 | ORETag: "CantVersionLoopWithDivergentTarget" , ORE, TheLoop); |
3748 | return FixedScalableVFPair::getNone(); |
3749 | } |
3750 | |
3751 | ScalarEvolution *SE = PSE.getSE(); |
3752 | ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop); |
3753 | unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); |
3754 | LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); |
3755 | if (TC != ElementCount::getFixed(MinVal: MaxTC)) |
3756 | LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); |
3757 | if (TC.isScalar()) { |
3758 | reportVectorizationFailure(DebugMsg: "Single iteration (non) loop" , |
3759 | OREMsg: "loop trip count is one, irrelevant for vectorization" , |
3760 | ORETag: "SingleIterationLoop" , ORE, TheLoop); |
3761 | return FixedScalableVFPair::getNone(); |
3762 | } |
3763 | |
3764 | // If BTC matches the widest induction type and is -1 then the trip count |
3765 | // computation will wrap to 0 and the vector trip count will be 0. Do not try |
3766 | // to vectorize. |
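  // For example, with an i64 widest induction type, a backedge-taken count of
  // UINT64_MAX means the trip count BTC + 1 wraps to 0.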
3767 | const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop); |
3768 | if (!isa<SCEVCouldNotCompute>(Val: BTC) && |
3769 | BTC->getType()->getScalarSizeInBits() >= |
3770 | Legal->getWidestInductionType()->getScalarSizeInBits() && |
3771 | SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC, |
3772 | RHS: SE->getMinusOne(Ty: BTC->getType()))) { |
3773 | reportVectorizationFailure( |
3774 | DebugMsg: "Trip count computation wrapped" , |
3775 | OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0" , |
3776 | ORETag: "TripCountWrapped" , ORE, TheLoop); |
3777 | return FixedScalableVFPair::getNone(); |
3778 | } |
3779 | |
3780 | switch (ScalarEpilogueStatus) { |
3781 | case CM_ScalarEpilogueAllowed: |
3782 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
3783 | case CM_ScalarEpilogueNotAllowedUsePredicate: |
3784 | [[fallthrough]]; |
3785 | case CM_ScalarEpilogueNotNeededUsePredicate: |
3786 | LLVM_DEBUG( |
3787 | dbgs() << "LV: vector predicate hint/switch found.\n" |
3788 | << "LV: Not allowing scalar epilogue, creating predicated " |
3789 | << "vector loop.\n" ); |
3790 | break; |
3791 | case CM_ScalarEpilogueNotAllowedLowTripLoop: |
3792 | // fallthrough as a special case of OptForSize |
3793 | case CM_ScalarEpilogueNotAllowedOptSize: |
3794 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) |
3795 | LLVM_DEBUG( |
3796 | dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n" ); |
3797 | else |
3798 | LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " |
3799 | << "count.\n" ); |
3800 | |
3801 | // Bail if runtime checks are required, which are not good when optimising |
3802 | // for size. |
3803 | if (runtimeChecksRequired()) |
3804 | return FixedScalableVFPair::getNone(); |
3805 | |
3806 | break; |
3807 | } |
3808 | |
3809 | // Now try the tail folding |
3810 | |
3811 | // Invalidate interleave groups that require an epilogue if we can't mask |
3812 | // the interleave-group. |
3813 | if (!useMaskedInterleavedAccesses(TTI)) { |
3814 | assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && |
3815 | "No decisions should have been taken at this point" ); |
3816 | // Note: There is no need to invalidate any cost modeling decisions here, as |
3817 | // none were taken so far. |
3818 | InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); |
3819 | } |
3820 | |
3821 | FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true); |
3822 | |
3823 | // Avoid tail folding if the trip count is known to be a multiple of any VF |
3824 | // we choose. |
3825 | std::optional<unsigned> MaxPowerOf2RuntimeVF = |
3826 | MaxFactors.FixedVF.getFixedValue(); |
3827 | if (MaxFactors.ScalableVF) { |
3828 | std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI); |
3829 | if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { |
3830 | MaxPowerOf2RuntimeVF = std::max<unsigned>( |
3831 | a: *MaxPowerOf2RuntimeVF, |
3832 | b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); |
3833 | } else |
3834 | MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. |
3835 | } |
3836 | |
3837 | auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) { |
3838 | // Return false if the loop is neither a single-latch-exit loop nor an |
3839 | // early-exit loop as tail-folding is not supported in that case. |
3840 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() && |
3841 | !Legal->hasUncountableEarlyExit()) |
3842 | return false; |
3843 | unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF; |
3844 | ScalarEvolution *SE = PSE.getSE(); |
3845 | // Calling getSymbolicMaxBackedgeTakenCount enables support for loops |
3846 | // with uncountable exits. For countable loops, the symbolic maximum must |
3847 | // remain identical to the known back-edge taken count. |
3848 | const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount(); |
3849 | assert((Legal->hasUncountableEarlyExit() || |
3850 | BackedgeTakenCount == PSE.getBackedgeTakenCount()) && |
3851 | "Invalid loop count" ); |
3852 | const SCEV *ExitCount = SE->getAddExpr( |
3853 | LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType())); |
3854 | const SCEV *Rem = SE->getURemExpr( |
3855 | LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop), |
3856 | RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC)); |
3857 | return Rem->isZero(); |
3858 | }; |
3859 | |
3860 | if (MaxPowerOf2RuntimeVF > 0u) { |
3861 | assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && |
3862 | "MaxFixedVF must be a power of 2" ); |
3863 | if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) { |
3864 | // Accept MaxFixedVF if we do not have a tail. |
3865 | LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n" ); |
3866 | return MaxFactors; |
3867 | } |
3868 | } |
3869 | |
3870 | auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop); |
3871 | if (ExpectedTC && ExpectedTC->isFixed() && |
3872 | ExpectedTC->getFixedValue() <= |
3873 | TTI.getMinTripCountTailFoldingThreshold()) { |
3874 | if (MaxPowerOf2RuntimeVF > 0u) { |
3875 | // If we have a low-trip-count, and the fixed-width VF is known to divide |
3876 | // the trip count but the scalable factor does not, use the fixed-width |
3877 | // factor in preference to allow the generation of a non-predicated loop. |
3878 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop && |
3879 | NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) { |
3880 | LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will " |
3881 | "remain for any chosen VF.\n" ); |
3882 | MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0); |
3883 | return MaxFactors; |
3884 | } |
3885 | } |
3886 | |
3887 | reportVectorizationFailure( |
3888 | DebugMsg: "The trip count is below the minial threshold value." , |
3889 | OREMsg: "loop trip count is too low, avoiding vectorization" , ORETag: "LowTripCount" , |
3890 | ORE, TheLoop); |
3891 | return FixedScalableVFPair::getNone(); |
3892 | } |
3893 | |
3894 | // If we don't know the precise trip count, or if the trip count that we |
3895 | // found modulo the vectorization factor is not zero, try to fold the tail |
3896 | // by masking. |
3897 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking. |
3898 | bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero(); |
3899 | setTailFoldingStyles(IsScalableVF: ContainsScalableVF, UserIC); |
3900 | if (foldTailByMasking()) { |
3901 | if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { |
3902 | LLVM_DEBUG( |
3903 | dbgs() |
3904 | << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " |
3905 | "try to generate VP Intrinsics with scalable vector " |
3906 | "factors only.\n" ); |
3907 | // Tail folded loop using VP intrinsics restricts the VF to be scalable |
3908 | // for now. |
3909 | // TODO: extend it for fixed vectors, if required. |
3910 | assert(ContainsScalableVF && "Expected scalable vector factor." ); |
3911 | |
3912 | MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1); |
3913 | } |
3914 | return MaxFactors; |
3915 | } |
3916 | |
3917 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
3918 | // masking, fallback to a vectorization with a scalar epilogue. |
3919 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
3920 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
3921 | "scalar epilogue instead.\n" ); |
3922 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
3923 | return MaxFactors; |
3924 | } |
3925 | |
3926 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { |
3927 | LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n" ); |
3928 | return FixedScalableVFPair::getNone(); |
3929 | } |
3930 | |
3931 | if (TC.isZero()) { |
3932 | reportVectorizationFailure( |
3933 | DebugMsg: "unable to calculate the loop count due to complex control flow" , |
3934 | ORETag: "UnknownLoopCountComplexCFG" , ORE, TheLoop); |
3935 | return FixedScalableVFPair::getNone(); |
3936 | } |
3937 | |
3938 | reportVectorizationFailure( |
3939 | DebugMsg: "Cannot optimize for size and vectorize at the same time." , |
3940 | OREMsg: "cannot optimize for size and vectorize at the same time. " |
3941 | "Enable vectorization of this loop with '#pragma clang loop " |
3942 | "vectorize(enable)' when compiling with -Os/-Oz" , |
3943 | ORETag: "NoTailLoopWithOptForSize" , ORE, TheLoop); |
3944 | return FixedScalableVFPair::getNone(); |
3945 | } |
3946 | |
3947 | bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) { |
3948 | return useMaxBandwidth(RegKind: VF.isScalable() |
3949 | ? TargetTransformInfo::RGK_ScalableVector |
3950 | : TargetTransformInfo::RGK_FixedWidthVector); |
3951 | } |
3952 | |
3953 | bool LoopVectorizationCostModel::useMaxBandwidth( |
3954 | TargetTransformInfo::RegisterKind RegKind) { |
3955 | return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && |
3956 | (TTI.shouldMaximizeVectorBandwidth(K: RegKind) || |
3957 | (UseWiderVFIfCallVariantsPresent && |
3958 | Legal->hasVectorCallVariants()))); |
3959 | } |
3960 | |
3961 | ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( |
3962 | unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, |
3963 | ElementCount MaxSafeVF, bool FoldTailByMasking) { |
3964 | bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); |
3965 | const TypeSize WidestRegister = TTI.getRegisterBitWidth( |
3966 | K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
3967 | : TargetTransformInfo::RGK_FixedWidthVector); |
3968 | |
3969 | // Convenience function to return the minimum of two ElementCounts. |
3970 | auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { |
3971 | assert((LHS.isScalable() == RHS.isScalable()) && |
3972 | "Scalable flags must match" ); |
3973 | return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; |
3974 | }; |
3975 | |
3976 | // Ensure MaxVF is a power of 2; the dependence distance bound may not be. |
  // Note that both WidestRegister and WidestType may not be powers of 2.
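  // For example, 128-bit wide registers with a widest type of 32 bits allow at
  // most bit_floor(128 / 32) = 4 elements per vector.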
3978 | auto MaxVectorElementCount = ElementCount::get( |
3979 | MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType), |
3980 | Scalable: ComputeScalableMaxVF); |
3981 | MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); |
3982 | LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " |
3983 | << (MaxVectorElementCount * WidestType) << " bits.\n" ); |
3984 | |
3985 | if (!MaxVectorElementCount) { |
3986 | LLVM_DEBUG(dbgs() << "LV: The target has no " |
3987 | << (ComputeScalableMaxVF ? "scalable" : "fixed" ) |
3988 | << " vector registers.\n" ); |
3989 | return ElementCount::getFixed(MinVal: 1); |
3990 | } |
3991 | |
3992 | unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue(); |
3993 | if (MaxVectorElementCount.isScalable() && |
3994 | TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) { |
3995 | auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange); |
3996 | auto Min = Attr.getVScaleRangeMin(); |
3997 | WidestRegisterMinEC *= Min; |
3998 | } |
3999 | |
4000 | // When a scalar epilogue is required, at least one iteration of the scalar |
4001 | // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a |
4002 | // max VF that results in a dead vector loop. |
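  // For example, with MaxTripCount == 8 and an 8-element VF the vector loop
  // would never execute once one iteration is reserved for the scalar epilogue.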
4003 | if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true)) |
4004 | MaxTripCount -= 1; |
4005 | |
4006 | if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC && |
4007 | (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) { |
4008 | // If upper bound loop trip count (TC) is known at compile time there is no |
4009 | // point in choosing VF greater than TC (as done in the loop below). Select |
4010 | // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is |
4011 | // scalable, we only fall back on a fixed VF when the TC is less than or |
4012 | // equal to the known number of lanes. |
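    // For example, a maximum trip count of 5 clamps the VF to bit_floor(5) = 4.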
4013 | auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount); |
4014 | LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " |
4015 | "exceeding the constant trip count: " |
4016 | << ClampedUpperTripCount << "\n" ); |
4017 | return ElementCount::get( |
4018 | MinVal: ClampedUpperTripCount, |
4019 | Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false); |
4020 | } |
4021 | |
4022 | TargetTransformInfo::RegisterKind RegKind = |
4023 | ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
4024 | : TargetTransformInfo::RGK_FixedWidthVector; |
4025 | ElementCount MaxVF = MaxVectorElementCount; |
4026 | if (useMaxBandwidth(RegKind)) { |
4027 | auto MaxVectorElementCountMaxBW = ElementCount::get( |
4028 | MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType), |
4029 | Scalable: ComputeScalableMaxVF); |
4030 | MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); |
4031 | |
4032 | if (ElementCount MinVF = |
4033 | TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) { |
4034 | if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) { |
4035 | LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF |
4036 | << ") with target's minimum: " << MinVF << '\n'); |
4037 | MaxVF = MinVF; |
4038 | } |
4039 | } |
4040 | |
4041 | // Invalidate any widening decisions we might have made, in case the loop |
4042 | // requires prediction (decided later), but we have already made some |
4043 | // load/store widening decisions. |
4044 | invalidateCostModelingDecisions(); |
4045 | } |
4046 | return MaxVF; |
4047 | } |
4048 | |
4049 | bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, |
4050 | const VectorizationFactor &B, |
4051 | const unsigned MaxTripCount, |
4052 | bool HasTail) const { |
4053 | InstructionCost CostA = A.Cost; |
4054 | InstructionCost CostB = B.Cost; |
4055 | |
4056 | // Improve estimate for the vector width if it is scalable. |
4057 | unsigned EstimatedWidthA = A.Width.getKnownMinValue(); |
4058 | unsigned EstimatedWidthB = B.Width.getKnownMinValue(); |
4059 | if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) { |
4060 | if (A.Width.isScalable()) |
4061 | EstimatedWidthA *= *VScale; |
4062 | if (B.Width.isScalable()) |
4063 | EstimatedWidthB *= *VScale; |
4064 | } |
4065 | |
4066 | // When optimizing for size choose whichever is smallest, which will be the |
4067 | // one with the smallest cost for the whole loop. On a tie pick the larger |
4068 | // vector width, on the assumption that throughput will be greater. |
4069 | if (CM.CostKind == TTI::TCK_CodeSize) |
4070 | return CostA < CostB || |
4071 | (CostA == CostB && EstimatedWidthA > EstimatedWidthB); |
4072 | |
4073 | // Assume vscale may be larger than 1 (or the value being tuned for), |
4074 | // so that scalable vectorization is slightly favorable over fixed-width |
4075 | // vectorization. |
4076 | bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && |
4077 | A.Width.isScalable() && !B.Width.isScalable(); |
4078 | |
4079 | auto CmpFn = [PreferScalable](const InstructionCost &LHS, |
4080 | const InstructionCost &RHS) { |
4081 | return PreferScalable ? LHS <= RHS : LHS < RHS; |
4082 | }; |
4083 | |
4084 | // To avoid the need for FP division: |
4085 | // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) |
4086 | // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) |
4087 | if (!MaxTripCount) |
4088 | return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); |
4089 | |
4090 | auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF, |
4091 | InstructionCost VectorCost, |
4092 | InstructionCost ScalarCost) { |
4093 | // If the trip count is a known (possibly small) constant, the trip count |
4094 | // will be rounded up to an integer number of iterations under |
4095 | // FoldTailByMasking. The total cost in that case will be |
4096 | // VecCost*ceil(TripCount/VF). When not folding the tail, the total |
4097 | // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be |
4098 | // some extra overheads, but for the purpose of comparing the costs of |
4099 | // different VFs we can use this to compare the total loop-body cost |
4100 | // expected after vectorization. |
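    // For example, MaxTripCount = 10, VF = 4, VectorCost = 20, ScalarCost = 4:
    // with a tail, 20 * 2 + 4 * 2 = 48; with tail folding, 20 * ceil(10/4) = 60.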
4101 | if (HasTail) |
4102 | return VectorCost * (MaxTripCount / VF) + |
4103 | ScalarCost * (MaxTripCount % VF); |
4104 | return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF); |
4105 | }; |
4106 | |
4107 | auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); |
4108 | auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); |
4109 | return CmpFn(RTCostA, RTCostB); |
4110 | } |
4111 | |
4112 | bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, |
4113 | const VectorizationFactor &B, |
4114 | bool HasTail) const { |
4115 | const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); |
4116 | return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, |
4117 | HasTail); |
4118 | } |
4119 | |
void LoopVectorizationPlanner::emitInvalidCostRemarks(
    OptimizationRemarkEmitter *ORE) {
4122 | using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>; |
4123 | SmallVector<RecipeVFPair> InvalidCosts; |
4124 | for (const auto &Plan : VPlans) { |
4125 | for (ElementCount VF : Plan->vectorFactors()) { |
      // The VPlan-based cost model is designed for computing vector cost.
      // Querying the VPlan-based cost model with a scalar VF will cause some
      // errors because we expect the VF to be a vector for most of the widen
      // recipes.
4130 | if (VF.isScalar()) |
4131 | continue; |
4132 | |
4133 | VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), |
4134 | CM, CM.CostKind); |
4135 | precomputeCosts(Plan&: *Plan, VF, CostCtx); |
4136 | auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry()); |
4137 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { |
4138 | for (auto &R : *VPBB) { |
4139 | if (!R.cost(VF, Ctx&: CostCtx).isValid()) |
4140 | InvalidCosts.emplace_back(Args: &R, Args&: VF); |
4141 | } |
4142 | } |
4143 | } |
4144 | } |
4145 | if (InvalidCosts.empty()) |
4146 | return; |
4147 | |
4148 | // Emit a report of VFs with invalid costs in the loop. |
4149 | |
4150 | // Group the remarks per recipe, keeping the recipe order from InvalidCosts. |
4151 | DenseMap<VPRecipeBase *, unsigned> Numbering; |
4152 | unsigned I = 0; |
4153 | for (auto &Pair : InvalidCosts) |
4154 | if (Numbering.try_emplace(Key: Pair.first, Args&: I).second) |
4155 | ++I; |
4156 | |
4157 | // Sort the list, first on recipe(number) then on VF. |
4158 | sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) { |
4159 | unsigned NA = Numbering[A.first]; |
4160 | unsigned NB = Numbering[B.first]; |
4161 | if (NA != NB) |
4162 | return NA < NB; |
4163 | return ElementCount::isKnownLT(LHS: A.second, RHS: B.second); |
4164 | }); |
4165 | |
4166 | // For a list of ordered recipe-VF pairs: |
4167 | // [(load, VF1), (load, VF2), (store, VF1)] |
4168 | // group the recipes together to emit separate remarks for: |
4169 | // load (VF1, VF2) |
4170 | // store (VF1) |
4171 | auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts); |
4172 | auto Subset = ArrayRef<RecipeVFPair>(); |
4173 | do { |
4174 | if (Subset.empty()) |
4175 | Subset = Tail.take_front(N: 1); |
4176 | |
4177 | VPRecipeBase *R = Subset.front().first; |
4178 | |
4179 | unsigned Opcode = |
4180 | TypeSwitch<const VPRecipeBase *, unsigned>(R) |
4181 | .Case<VPHeaderPHIRecipe>( |
4182 | caseFn: [](const auto *R) { return Instruction::PHI; }) |
4183 | .Case<VPWidenSelectRecipe>( |
4184 | caseFn: [](const auto *R) { return Instruction::Select; }) |
4185 | .Case<VPWidenStoreRecipe>( |
4186 | caseFn: [](const auto *R) { return Instruction::Store; }) |
4187 | .Case<VPWidenLoadRecipe>( |
4188 | caseFn: [](const auto *R) { return Instruction::Load; }) |
4189 | .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>( |
4190 | caseFn: [](const auto *R) { return Instruction::Call; }) |
4191 | .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe, |
4192 | VPWidenCastRecipe>( |
4193 | caseFn: [](const auto *R) { return R->getOpcode(); }) |
4194 | .Case<VPInterleaveRecipe>(caseFn: [](const VPInterleaveRecipe *R) { |
4195 | return R->getStoredValues().empty() ? Instruction::Load |
4196 | : Instruction::Store; |
4197 | }); |
4198 | |
4199 | // If the next recipe is different, or if there are no other pairs, |
4200 | // emit a remark for the collated subset. e.g. |
4201 | // [(load, VF1), (load, VF2))] |
4202 | // to emit: |
4203 | // remark: invalid costs for 'load' at VF=(VF1, VF2) |
4204 | if (Subset == Tail || Tail[Subset.size()].first != R) { |
4205 | std::string OutString; |
4206 | raw_string_ostream OS(OutString); |
4207 | assert(!Subset.empty() && "Unexpected empty range" ); |
4208 | OS << "Recipe with invalid costs prevented vectorization at VF=(" ; |
4209 | for (const auto &Pair : Subset) |
4210 | OS << (Pair.second == Subset.front().second ? "" : ", " ) << Pair.second; |
4211 | OS << "):" ; |
4212 | if (Opcode == Instruction::Call) { |
4213 | StringRef Name = "" ; |
4214 | if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) { |
4215 | Name = Int->getIntrinsicName(); |
4216 | } else { |
4217 | auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R); |
4218 | Function *CalledFn = |
4219 | WidenCall ? WidenCall->getCalledScalarFunction() |
4220 | : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1) |
4221 | ->getLiveInIRValue()); |
4222 | Name = CalledFn->getName(); |
4223 | } |
4224 | OS << " call to " << Name; |
4225 | } else |
4226 | OS << " " << Instruction::getOpcodeName(Opcode); |
4227 | reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost" , ORE, TheLoop: OrigLoop, I: nullptr, |
4228 | DL: R->getDebugLoc()); |
4229 | Tail = Tail.drop_front(N: Subset.size()); |
4230 | Subset = {}; |
4231 | } else |
4232 | // Grow the subset by one element |
4233 | Subset = Tail.take_front(N: Subset.size() + 1); |
4234 | } while (!Tail.empty()); |
4235 | } |
4236 | |
4237 | /// Check if any recipe of \p Plan will generate a vector value, which will be |
4238 | /// assigned a vector register. |
4239 | static bool willGenerateVectors(VPlan &Plan, ElementCount VF, |
4240 | const TargetTransformInfo &TTI) { |
4241 | assert(VF.isVector() && "Checking a scalar VF?" ); |
4242 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); |
4243 | DenseSet<VPRecipeBase *> EphemeralRecipes; |
4244 | collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes); |
4245 | // Set of already visited types. |
4246 | DenseSet<Type *> Visited; |
4247 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
4248 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) { |
4249 | for (VPRecipeBase &R : *VPBB) { |
4250 | if (EphemeralRecipes.contains(V: &R)) |
4251 | continue; |
4252 | // Continue early if the recipe is considered to not produce a vector |
4253 | // result. Note that this includes VPInstruction where some opcodes may |
4254 | // produce a vector, to preserve existing behavior as VPInstructions model |
4255 | // aspects not directly mapped to existing IR instructions. |
4256 | switch (R.getVPDefID()) { |
4257 | case VPDef::VPDerivedIVSC: |
4258 | case VPDef::VPScalarIVStepsSC: |
4259 | case VPDef::VPReplicateSC: |
4260 | case VPDef::VPInstructionSC: |
4261 | case VPDef::VPCanonicalIVPHISC: |
4262 | case VPDef::VPVectorPointerSC: |
4263 | case VPDef::VPVectorEndPointerSC: |
4264 | case VPDef::VPExpandSCEVSC: |
4265 | case VPDef::VPEVLBasedIVPHISC: |
4266 | case VPDef::VPPredInstPHISC: |
4267 | case VPDef::VPBranchOnMaskSC: |
4268 | continue; |
4269 | case VPDef::VPReductionSC: |
4270 | case VPDef::VPActiveLaneMaskPHISC: |
4271 | case VPDef::VPWidenCallSC: |
4272 | case VPDef::VPWidenCanonicalIVSC: |
4273 | case VPDef::VPWidenCastSC: |
4274 | case VPDef::VPWidenGEPSC: |
4275 | case VPDef::VPWidenIntrinsicSC: |
4276 | case VPDef::VPWidenSC: |
4277 | case VPDef::VPWidenSelectSC: |
4278 | case VPDef::VPBlendSC: |
4279 | case VPDef::VPFirstOrderRecurrencePHISC: |
4280 | case VPDef::VPHistogramSC: |
4281 | case VPDef::VPWidenPHISC: |
4282 | case VPDef::VPWidenIntOrFpInductionSC: |
4283 | case VPDef::VPWidenPointerInductionSC: |
4284 | case VPDef::VPReductionPHISC: |
4285 | case VPDef::VPInterleaveSC: |
4286 | case VPDef::VPWidenLoadEVLSC: |
4287 | case VPDef::VPWidenLoadSC: |
4288 | case VPDef::VPWidenStoreEVLSC: |
4289 | case VPDef::VPWidenStoreSC: |
4290 | break; |
4291 | default: |
4292 | llvm_unreachable("unhandled recipe" ); |
4293 | } |
4294 | |
4295 | auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) { |
4296 | unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy); |
4297 | if (!NumLegalParts) |
4298 | return false; |
4299 | if (VF.isScalable()) { |
4300 | // <vscale x 1 x iN> is assumed to be profitable over iN because |
4301 | // scalable registers are a distinct register class from scalar |
4302 | // ones. If we ever find a target which wants to lower scalable |
4303 | // vectors back to scalars, we'll need to update this code to |
4304 | // explicitly ask TTI about the register class uses for each part. |
4305 | return NumLegalParts <= VF.getKnownMinValue(); |
4306 | } |
4307 | // Two or more elements that share a register - are vectorized. |
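        // For example, <8 x i32> with 128-bit vector registers has 2 legal
        // parts, and 2 < 8, so the value is considered vectorized.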
4308 | return NumLegalParts < VF.getFixedValue(); |
4309 | }; |
4310 | |
      // If the recipe has no defs and is not a store (e.g., a branch), continue
      // - there is no value to check.
4312 | if (R.getNumDefinedValues() == 0 && |
4313 | !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>( |
4314 | Val: &R)) |
4315 | continue; |
      // For multi-def recipes (currently only interleaved loads), it suffices
      // to check the first def only.
      // For stores, check their stored value; for interleaved stores, checking
      // the first stored value suffices. In all cases this is the second
      // operand.
4321 | VPValue *ToCheck = |
4322 | R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1); |
4323 | Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck); |
4324 | if (!Visited.insert(V: {ScalarTy}).second) |
4325 | continue; |
4326 | Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF); |
4327 | if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors)) |
4328 | return true; |
4329 | } |
4330 | } |
4331 | |
4332 | return false; |
4333 | } |
4334 | |
4335 | static bool hasReplicatorRegion(VPlan &Plan) { |
4336 | return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow( |
4337 | G: Plan.getVectorLoopRegion()->getEntry())), |
4338 | P: [](auto *VPRB) { return VPRB->isReplicator(); }); |
4339 | } |
4340 | |
4341 | #ifndef NDEBUG |
4342 | VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { |
4343 | InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); |
4344 | LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n" ); |
4345 | assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop" ); |
4346 | assert( |
4347 | any_of(VPlans, |
4348 | [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) && |
4349 | "Expected Scalar VF to be a candidate" ); |
4350 | |
4351 | const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, |
4352 | ExpectedCost); |
4353 | VectorizationFactor ChosenFactor = ScalarCost; |
4354 | |
4355 | bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
4356 | if (ForceVectorization && |
4357 | (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) { |
4358 | // Ignore scalar width, because the user explicitly wants vectorization. |
4359 | // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
4360 | // evaluation. |
4361 | ChosenFactor.Cost = InstructionCost::getMax(); |
4362 | } |
4363 | |
4364 | for (auto &P : VPlans) { |
4365 | ArrayRef<ElementCount> VFs(P->vectorFactors().begin(), |
4366 | P->vectorFactors().end()); |
4367 | |
4368 | SmallVector<VPRegisterUsage, 8> RUs; |
4369 | if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) || |
4370 | CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector)) |
4371 | RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); |
4372 | |
4373 | for (unsigned I = 0; I < VFs.size(); I++) { |
4374 | ElementCount VF = VFs[I]; |
4375 | // The cost for scalar VF=1 is already calculated, so ignore it. |
4376 | if (VF.isScalar()) |
4377 | continue; |
4378 | |
      // Don't consider the VF if it exceeds the number of registers for the
      // target.
4381 | if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) |
4382 | continue; |
4383 | |
4384 | InstructionCost C = CM.expectedCost(VF); |
4385 | |
4386 | // Add on other costs that are modelled in VPlan, but not in the legacy |
4387 | // cost model. |
4388 | VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), |
4389 | CM, CM.CostKind); |
4390 | VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); |
4391 | assert(VectorRegion && "Expected to have a vector region!" ); |
4392 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
4393 | vp_depth_first_shallow(VectorRegion->getEntry()))) { |
4394 | for (VPRecipeBase &R : *VPBB) { |
4395 | auto *VPI = dyn_cast<VPInstruction>(&R); |
4396 | if (!VPI) |
4397 | continue; |
4398 | switch (VPI->getOpcode()) { |
4399 | case VPInstruction::ActiveLaneMask: |
4400 | case VPInstruction::ExplicitVectorLength: |
4401 | C += VPI->cost(VF, CostCtx); |
4402 | break; |
4403 | default: |
4404 | break; |
4405 | } |
4406 | } |
4407 | } |
4408 | |
4409 | VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); |
4410 | unsigned Width = |
4411 | getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); |
4412 | LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF |
4413 | << " costs: " << (Candidate.Cost / Width)); |
4414 | if (VF.isScalable()) |
4415 | LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " |
4416 | << CM.getVScaleForTuning().value_or(1) << ")" ); |
4417 | LLVM_DEBUG(dbgs() << ".\n" ); |
4418 | |
4419 | if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { |
4420 | LLVM_DEBUG( |
4421 | dbgs() |
4422 | << "LV: Not considering vector loop of width " << VF |
4423 | << " because it will not generate any vector instructions.\n" ); |
4424 | continue; |
4425 | } |
4426 | |
4427 | if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) { |
4428 | LLVM_DEBUG( |
4429 | dbgs() |
4430 | << "LV: Not considering vector loop of width " << VF |
4431 | << " because it would cause replicated blocks to be generated," |
4432 | << " which isn't allowed when optimizing for size.\n" ); |
4433 | continue; |
4434 | } |
4435 | |
4436 | if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail())) |
4437 | ChosenFactor = Candidate; |
4438 | } |
4439 | } |
4440 | |
4441 | if (!EnableCondStoresVectorization && CM.hasPredStores()) { |
4442 | reportVectorizationFailure( |
4443 | "There are conditional stores." , |
4444 | "store that is conditionally executed prevents vectorization" , |
4445 | "ConditionalStore" , ORE, OrigLoop); |
4446 | ChosenFactor = ScalarCost; |
4447 | } |
4448 | |
4449 | LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && |
4450 | !isMoreProfitable(ChosenFactor, ScalarCost, |
4451 | !CM.foldTailByMasking())) dbgs() |
4452 | << "LV: Vectorization seems to be not beneficial, " |
4453 | << "but was forced by a user.\n" ); |
4454 | return ChosenFactor; |
4455 | } |
4456 | #endif |
4457 | |
4458 | bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( |
4459 | ElementCount VF) const { |
4460 | // Cross iteration phis such as reductions need special handling and are |
4461 | // currently unsupported. |
4462 | if (any_of(Range: OrigLoop->getHeader()->phis(), |
4463 | P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); })) |
4464 | return false; |
4465 | |
4466 | // Phis with uses outside of the loop require special handling and are |
4467 | // currently unsupported. |
4468 | for (const auto &Entry : Legal->getInductionVars()) { |
4469 | // Look for uses of the value of the induction at the last iteration. |
4470 | Value *PostInc = |
4471 | Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()); |
4472 | for (User *U : PostInc->users()) |
4473 | if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U))) |
4474 | return false; |
4475 | // Look for uses of penultimate value of the induction. |
4476 | for (User *U : Entry.first->users()) |
4477 | if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U))) |
4478 | return false; |
4479 | } |
4480 | |
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
4484 | // TODO: Add support for loops with an early exit. |
4485 | if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) |
4486 | return false; |
4487 | |
4488 | return true; |
4489 | } |
4490 | |
4491 | bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( |
4492 | const ElementCount VF, const unsigned IC) const { |
4493 | // FIXME: We need a much better cost-model to take different parameters such |
4494 | // as register pressure, code size increase and cost of extra branches into |
4495 | // account. For now we apply a very crude heuristic and only consider loops |
4496 | // with vectorization factors larger than a certain value. |
4497 | |
4498 | // Allow the target to opt out entirely. |
4499 | if (!TTI.preferEpilogueVectorization()) |
4500 | return false; |
4501 | |
  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g., MVE).
4504 | if (TTI.getMaxInterleaveFactor(VF) <= 1) |
4505 | return false; |
4506 | |
4507 | // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable |
4508 | // VFs when deciding profitability. |
4509 | // See related "TODO: extend to support scalable VFs." in |
4510 | // selectEpilogueVectorizationFactor. |
4511 | unsigned Multiplier = VF.isFixed() ? IC : 1; |
4512 | unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 |
4513 | ? EpilogueVectorizationMinVF |
4514 | : TTI.getEpilogueVectorizationMinVF(); |
4515 | return getEstimatedRuntimeVF(VF: VF * Multiplier, VScale: VScaleForTuning) >= |
4516 | MinVFThreshold; |
4517 | } |
4518 | |
4519 | VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( |
4520 | const ElementCount MainLoopVF, unsigned IC) { |
4521 | VectorizationFactor Result = VectorizationFactor::Disabled(); |
4522 | if (!EnableEpilogueVectorization) { |
4523 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n" ); |
4524 | return Result; |
4525 | } |
4526 | |
4527 | if (!CM.isScalarEpilogueAllowed()) { |
4528 | LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " |
4529 | "epilogue is allowed.\n" ); |
4530 | return Result; |
4531 | } |
4532 | |
4533 | // Not really a cost consideration, but check for unsupported cases here to |
4534 | // simplify the logic. |
4535 | if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) { |
4536 | LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " |
4537 | "is not a supported candidate.\n" ); |
4538 | return Result; |
4539 | } |
4540 | |
4541 | if (EpilogueVectorizationForceVF > 1) { |
4542 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n" ); |
4543 | ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF); |
4544 | if (hasPlanWithVF(VF: ForcedEC)) |
4545 | return {ForcedEC, 0, 0}; |
4546 | |
4547 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " |
4548 | "viable.\n" ); |
4549 | return Result; |
4550 | } |
4551 | |
4552 | if (OrigLoop->getHeader()->getParent()->hasOptSize()) { |
4553 | LLVM_DEBUG( |
4554 | dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n" ); |
4555 | return Result; |
4556 | } |
4557 | |
4558 | if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) { |
4559 | LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " |
4560 | "this loop\n" ); |
4561 | return Result; |
4562 | } |
4563 | |
4564 | // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know |
4565 | // the main loop handles 8 lanes per iteration. We could still benefit from |
4566 | // vectorizing the epilogue loop with VF=4. |
4567 | ElementCount EstimatedRuntimeVF = ElementCount::getFixed( |
4568 | MinVal: getEstimatedRuntimeVF(VF: MainLoopVF, VScale: CM.getVScaleForTuning())); |
4569 | |
4570 | ScalarEvolution &SE = *PSE.getSE(); |
4571 | Type *TCType = Legal->getWidestInductionType(); |
4572 | const SCEV *RemainingIterations = nullptr; |
4573 | unsigned MaxTripCount = 0; |
4574 | for (auto &NextVF : ProfitableVFs) { |
4575 | // Skip candidate VFs without a corresponding VPlan. |
4576 | if (!hasPlanWithVF(VF: NextVF.Width)) |
4577 | continue; |
4578 | |
4579 | // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable |
4580 | // vectors) or > the VF of the main loop (fixed vectors). |
4581 | if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && |
4582 | ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) || |
4583 | (NextVF.Width.isScalable() && |
4584 | ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) || |
4585 | (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && |
4586 | ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF))) |
4587 | continue; |
4588 | |
4589 | // If NextVF is greater than the number of remaining iterations, the |
4590 | // epilogue loop would be dead. Skip such factors. |
4591 | if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { |
4592 | // TODO: extend to support scalable VFs. |
4593 | if (!RemainingIterations) { |
4594 | const SCEV *TC = vputils::getSCEVExprForVPValue( |
4595 | V: getPlanFor(VF: NextVF.Width).getTripCount(), SE); |
4596 | assert(!isa<SCEVCouldNotCompute>(TC) && |
4597 | "Trip count SCEV must be computable" ); |
4598 | RemainingIterations = SE.getURemExpr( |
4599 | LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getFixedValue() * IC)); |
4600 | MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; |
4601 | if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations, |
4602 | RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) { |
4603 | MaxTripCount = |
4604 | SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue(); |
4605 | } |
4606 | LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " |
4607 | << MaxTripCount << "\n" ); |
4608 | } |
4609 | if (SE.isKnownPredicate( |
4610 | Pred: CmpInst::ICMP_UGT, |
4611 | LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getFixedValue()), |
4612 | RHS: RemainingIterations)) |
4613 | continue; |
4614 | } |
4615 | |
4616 | if (Result.Width.isScalar() || |
4617 | isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking())) |
4618 | Result = NextVF; |
4619 | } |
4620 | |
4621 | if (Result != VectorizationFactor::Disabled()) |
4622 | LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " |
4623 | << Result.Width << "\n" ); |
4624 | return Result; |
4625 | } |
4626 | |
4627 | std::pair<unsigned, unsigned> |
4628 | LoopVectorizationCostModel::getSmallestAndWidestTypes() { |
4629 | unsigned MinWidth = -1U; |
4630 | unsigned MaxWidth = 8; |
4631 | const DataLayout &DL = TheFunction->getDataLayout(); |
4632 | // For in-loop reductions, no element types are added to ElementTypesInLoop |
4633 | // if there are no loads/stores in the loop. In this case, check through the |
4634 | // reduction variables to determine the maximum width. |
4635 | if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { |
4636 | for (const auto &PhiDescriptorPair : Legal->getReductionVars()) { |
4637 | const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second; |
4638 | // When finding the min width used by the recurrence we need to account |
4639 | // for casts on the input operands of the recurrence. |
4640 | MinWidth = std::min( |
4641 | a: MinWidth, |
4642 | b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), |
4643 | b: RdxDesc.getRecurrenceType()->getScalarSizeInBits())); |
4644 | MaxWidth = std::max(a: MaxWidth, |
4645 | b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()); |
4646 | } |
4647 | } else { |
4648 | for (Type *T : ElementTypesInLoop) { |
4649 | MinWidth = std::min<unsigned>( |
4650 | a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue()); |
4651 | MaxWidth = std::max<unsigned>( |
4652 | a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue()); |
4653 | } |
4654 | } |
4655 | return {MinWidth, MaxWidth}; |
4656 | } |
4657 | |
4658 | void LoopVectorizationCostModel::collectElementTypesForWidening() { |
4659 | ElementTypesInLoop.clear(); |
4660 | // For each block. |
4661 | for (BasicBlock *BB : TheLoop->blocks()) { |
4662 | // For each instruction in the loop. |
4663 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
4664 | Type *T = I.getType(); |
4665 | |
4666 | // Skip ignored values. |
4667 | if (ValuesToIgnore.count(Ptr: &I)) |
4668 | continue; |
4669 | |
4670 | // Only examine Loads, Stores and PHINodes. |
4671 | if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I)) |
4672 | continue; |
4673 | |
4674 | // Examine PHI nodes that are reduction variables. Update the type to |
4675 | // account for the recurrence type. |
4676 | if (auto *PN = dyn_cast<PHINode>(Val: &I)) { |
4677 | if (!Legal->isReductionVariable(PN)) |
4678 | continue; |
4679 | const RecurrenceDescriptor &RdxDesc = |
4680 | Legal->getReductionVars().find(Key: PN)->second; |
4681 | if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || |
4682 | TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(), |
4683 | Ty: RdxDesc.getRecurrenceType())) |
4684 | continue; |
4685 | T = RdxDesc.getRecurrenceType(); |
4686 | } |
4687 | |
4688 | // Examine the stored values. |
4689 | if (auto *ST = dyn_cast<StoreInst>(Val: &I)) |
4690 | T = ST->getValueOperand()->getType(); |
4691 | |
4692 | assert(T->isSized() && |
4693 | "Expected the load/store/recurrence type to be sized" ); |
4694 | |
4695 | ElementTypesInLoop.insert(Ptr: T); |
4696 | } |
4697 | } |
4698 | } |
4699 | |
4700 | unsigned |
4701 | LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, |
4702 | InstructionCost LoopCost) { |
4703 | // -- The interleave heuristics -- |
4704 | // We interleave the loop in order to expose ILP and reduce the loop overhead. |
4705 | // There are many micro-architectural considerations that we can't predict |
4706 | // at this level. For example, frontend pressure (on decode or fetch) due to |
4707 | // code size, or the number and capabilities of the execution ports. |
4708 | // |
4709 | // We use the following heuristics to select the interleave count: |
4710 | // 1. If the code has reductions, then we interleave to break the cross |
4711 | // iteration dependency. |
4712 | // 2. If the loop is really small, then we interleave to reduce the loop |
4713 | // overhead. |
4714 | // 3. We don't interleave if we think that we will spill registers to memory |
4715 | // due to the increased register pressure. |
4716 | |
4717 | if (!isScalarEpilogueAllowed()) |
4718 | return 1; |
4719 | |
4720 | // Do not interleave if EVL is preferred and no User IC is specified. |
4721 | if (foldTailWithEVL()) { |
4722 | LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " |
4723 | "Unroll factor forced to be 1.\n" ); |
4724 | return 1; |
4725 | } |
4726 | |
  // The maximum safe dependence distance already constrains the VF;
  // interleaving would effectively widen the access pattern further, so do not
  // interleave.
4728 | if (!Legal->isSafeForAnyVectorWidth()) |
4729 | return 1; |
4730 | |
4731 | // We don't attempt to perform interleaving for loops with uncountable early |
4732 | // exits because the VPInstruction::AnyOf code cannot currently handle |
4733 | // multiple parts. |
4734 | if (Legal->hasUncountableEarlyExit()) |
4735 | return 1; |
4736 | |
4737 | const bool HasReductions = !Legal->getReductionVars().empty(); |
4738 | |
4739 | // If we did not calculate the cost for VF (because the user selected the VF) |
4740 | // then we calculate the cost of VF here. |
4741 | if (LoopCost == 0) { |
4742 | LoopCost = expectedCost(VF); |
4743 | assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost" ); |
4744 | |
4745 | // Loop body is free and there is no need for interleaving. |
4746 | if (LoopCost == 0) |
4747 | return 1; |
4748 | } |
4749 | |
4750 | VPRegisterUsage R = |
4751 | calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore)[0]; |
4752 | // We divide by these constants so assume that we have at least one |
4753 | // instruction that uses at least one register. |
4754 | for (auto &Pair : R.MaxLocalUsers) { |
4755 | Pair.second = std::max(a: Pair.second, b: 1U); |
4756 | } |
4757 | |
4758 | // We calculate the interleave count using the following formula. |
4759 | // Subtract the number of loop invariants from the number of available |
4760 | // registers. These registers are used by all of the interleaved instances. |
4761 | // Next, divide the remaining registers by the number of registers that is |
4762 | // required by the loop, in order to estimate how many parallel instances |
4763 | // fit without causing spills. All of this is rounded down if necessary to be |
4764 | // a power of two. We want power of two interleave count to simplify any |
4765 | // addressing operations or alignment considerations. |
4766 | // We also want power of two interleave counts to ensure that the induction |
4767 | // variable of the vector loop wraps to zero, when tail is folded by masking; |
4768 | // this currently happens when OptForSize, in which case IC is set to 1 above. |
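  // For example, with 32 registers available, 2 loop-invariant values and a
  // maximum local usage of 6 registers, IC = bit_floor((32 - 2) / 6) = 4.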
4769 | unsigned IC = UINT_MAX; |
4770 | |
4771 | for (const auto &Pair : R.MaxLocalUsers) { |
4772 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first); |
4773 | LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters |
4774 | << " registers of " |
4775 | << TTI.getRegisterClassName(Pair.first) |
4776 | << " register class\n" ); |
4777 | if (VF.isScalar()) { |
4778 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
4779 | TargetNumRegisters = ForceTargetNumScalarRegs; |
4780 | } else { |
4781 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
4782 | TargetNumRegisters = ForceTargetNumVectorRegs; |
4783 | } |
4784 | unsigned MaxLocalUsers = Pair.second; |
4785 | unsigned LoopInvariantRegs = 0; |
4786 | if (R.LoopInvariantRegs.contains(Key: Pair.first)) |
4787 | LoopInvariantRegs = R.LoopInvariantRegs[Pair.first]; |
4788 | |
4789 | unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) / |
4790 | MaxLocalUsers); |
4791 | // Don't count the induction variable as interleaved. |
4792 | if (EnableIndVarRegisterHeur) { |
4793 | TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) / |
4794 | std::max(a: 1U, b: (MaxLocalUsers - 1))); |
4795 | } |
4796 | |
4797 | IC = std::min(a: IC, b: TmpIC); |
4798 | } |
4799 | |
4800 | // Clamp the interleave ranges to reasonable counts. |
4801 | unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); |
4802 | |
4803 | // Check if the user has overridden the max. |
4804 | if (VF.isScalar()) { |
4805 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
4806 | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; |
4807 | } else { |
4808 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
4809 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
4810 | } |
4811 | |
4812 | unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScale: VScaleForTuning); |
4813 | |
  // Try to get the exact trip count, or, failing that, an estimate based on
  // profiling data or the ConstantMax from PSE.
4816 | if (auto BestKnownTC = getSmallBestKnownTC(PSE, L: TheLoop)) { |
    // When a scalar epilogue is required, at least one iteration must remain
    // scalar, so the maximum number of iterations available for interleaving
    // is one less.
4819 | unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector()) |
4820 | ? BestKnownTC->getFixedValue() - 1 |
4821 | : BestKnownTC->getFixedValue(); |
4822 | |
4823 | unsigned InterleaveCountLB = bit_floor(Value: std::max( |
4824 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
4825 | |
4826 | if (getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop).isNonZero()) { |
4827 | // If the best known trip count is exact, we select between two |
4828 | // prospective ICs, where |
4829 | // |
4830 | // 1) the aggressive IC is capped by the trip count divided by VF |
4831 | // 2) the conservative IC is capped by the trip count divided by (VF * 2) |
4832 | // |
4833 | // The final IC is selected in a way that the epilogue loop trip count is |
4834 | // minimized while maximizing the IC itself, so that we either run the |
4835 | // vector loop at least once if it generates a small epilogue loop, or |
4836 | // else we run the vector loop at least twice. |
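      //
      // Illustrative example with hypothetical values: for AvailableTC = 32,
      // EstimatedVF = 4 and MaxInterleaveCount = 8, InterleaveCountLB =
      // bit_floor(min(32 / 8, 8)) = 4 and InterleaveCountUB =
      // bit_floor(min(32 / 4, 8)) = 8. Both leave a scalar tail of
      // 32 % 32 == 32 % 16 == 0 iterations, so the upper bound (8) is chosen.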
4837 | |
4838 | unsigned InterleaveCountUB = bit_floor(Value: std::max( |
4839 | a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount))); |
4840 | MaxInterleaveCount = InterleaveCountLB; |
4841 | |
4842 | if (InterleaveCountUB != InterleaveCountLB) { |
4843 | unsigned TailTripCountUB = |
4844 | (AvailableTC % (EstimatedVF * InterleaveCountUB)); |
4845 | unsigned TailTripCountLB = |
4846 | (AvailableTC % (EstimatedVF * InterleaveCountLB)); |
        // If both produce the same scalar tail, maximize the IC to do the same
        // work in fewer vector loop iterations.
4849 | if (TailTripCountUB == TailTripCountLB) |
4850 | MaxInterleaveCount = InterleaveCountUB; |
4851 | } |
4852 | } else { |
      // If the trip count is only an estimated compile-time constant, cap the
      // IC by the trip count divided by VF * 2, so that the vector loop runs
      // at least twice; this makes interleaving seem profitable when there is
      // an epilogue loop present. Since the exact trip count is not known, we
      // choose to be conservative in our IC estimate.
4858 | MaxInterleaveCount = InterleaveCountLB; |
4859 | } |
4860 | } |
4861 | |
4862 | assert(MaxInterleaveCount > 0 && |
4863 | "Maximum interleave count must be greater than 0" ); |
4864 | |
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
4867 | if (IC > MaxInterleaveCount) |
4868 | IC = MaxInterleaveCount; |
4869 | else |
4870 | // Make sure IC is greater than 0. |
4871 | IC = std::max(a: 1u, b: IC); |
4872 | |
4873 | assert(IC > 0 && "Interleave count must be greater than 0." ); |
4874 | |
4875 | // Interleave if we vectorized this loop and there is a reduction that could |
4876 | // benefit from interleaving. |
4877 | if (VF.isVector() && HasReductions) { |
4878 | LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n" ); |
4879 | return IC; |
4880 | } |
4881 | |
4882 | // For any scalar loop that either requires runtime checks or predication we |
4883 | // are better off leaving this to the unroller. Note that if we've already |
4884 | // vectorized the loop we will have done the runtime check and so interleaving |
4885 | // won't require further checks. |
4886 | bool ScalarInterleavingRequiresPredication = |
4887 | (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) { |
4888 | return Legal->blockNeedsPredication(BB); |
4889 | })); |
4890 | bool ScalarInterleavingRequiresRuntimePointerCheck = |
4891 | (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); |
4892 | |
4893 | // We want to interleave small loops in order to reduce the loop overhead and |
4894 | // potentially expose ILP opportunities. |
4895 | LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' |
4896 | << "LV: IC is " << IC << '\n' |
4897 | << "LV: VF is " << VF << '\n'); |
4898 | const bool AggressivelyInterleaveReductions = |
4899 | TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions); |
4900 | if (!ScalarInterleavingRequiresRuntimePointerCheck && |
4901 | !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { |
4902 | // We assume that the cost overhead is 1 and we use the cost model |
4903 | // to estimate the cost of the loop and interleave until the cost of the |
4904 | // loop overhead is about 5% of the cost of the loop. |
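    //
    // For example (illustrative values only): if SmallLoopCost were 20 and
    // LoopCost were 6, SmallIC would be min(IC, bit_floor(20 / 6)) =
    // min(IC, 2).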
4905 | unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>( |
4906 | Value: SmallLoopCost / LoopCost.getValue())); |
4907 | |
4908 | // Interleave until store/load ports (estimated by max interleave count) are |
4909 | // saturated. |
4910 | unsigned NumStores = Legal->getNumStores(); |
4911 | unsigned NumLoads = Legal->getNumLoads(); |
4912 | unsigned StoresIC = IC / (NumStores ? NumStores : 1); |
4913 | unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); |
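    //
    // For example (illustrative counts): with IC = 8, 2 stores and 1 load,
    // StoresIC = 8 / 2 = 4 and LoadsIC = 8 / 1 = 8; if load/store runtime
    // interleaving is enabled and max(StoresIC, LoadsIC) = 8 exceeds SmallIC,
    // we interleave by 8 below.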
4914 | |
4915 | // There is little point in interleaving for reductions containing selects |
4916 | // and compares when VF=1 since it may just create more overhead than it's |
4917 | // worth for loops with small trip counts. This is because we still have to |
4918 | // do the final reduction after the loop. |
4919 | bool HasSelectCmpReductions = |
4920 | HasReductions && |
4921 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
4922 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
4923 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
4924 | return RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) || |
4925 | RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK); |
4926 | }); |
4927 | if (HasSelectCmpReductions) { |
4928 | LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n" ); |
4929 | return 1; |
4930 | } |
4931 | |
4932 | // If we have a scalar reduction (vector reductions are already dealt with |
4933 | // by this point), we can increase the critical path length if the loop |
4934 | // we're interleaving is inside another loop. For tree-wise reductions |
4935 | // set the limit to 2, and for ordered reductions it's best to disable |
4936 | // interleaving entirely. |
4937 | if (HasReductions && TheLoop->getLoopDepth() > 1) { |
4938 | bool HasOrderedReductions = |
4939 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
4940 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
4941 | return RdxDesc.isOrdered(); |
4942 | }); |
4943 | if (HasOrderedReductions) { |
4944 | LLVM_DEBUG( |
4945 | dbgs() << "LV: Not interleaving scalar ordered reductions.\n" ); |
4946 | return 1; |
4947 | } |
4948 | |
4949 | unsigned F = MaxNestedScalarReductionIC; |
4950 | SmallIC = std::min(a: SmallIC, b: F); |
4951 | StoresIC = std::min(a: StoresIC, b: F); |
4952 | LoadsIC = std::min(a: LoadsIC, b: F); |
4953 | } |
4954 | |
4955 | if (EnableLoadStoreRuntimeInterleave && |
4956 | std::max(a: StoresIC, b: LoadsIC) > SmallIC) { |
4957 | LLVM_DEBUG( |
4958 | dbgs() << "LV: Interleaving to saturate store or load ports.\n" ); |
4959 | return std::max(a: StoresIC, b: LoadsIC); |
4960 | } |
4961 | |
4962 | // If there are scalar reductions and TTI has enabled aggressive |
4963 | // interleaving for reductions, we will interleave to expose ILP. |
4964 | if (VF.isScalar() && AggressivelyInterleaveReductions) { |
4965 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
4966 | // Interleave no less than SmallIC but not as aggressive as the normal IC |
4967 | // to satisfy the rare situation when resources are too limited. |
4968 | return std::max(a: IC / 2, b: SmallIC); |
4969 | } |
4970 | |
4971 | LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n" ); |
4972 | return SmallIC; |
4973 | } |
4974 | |
4975 | // Interleave if this is a large loop (small loops are already dealt with by |
4976 | // this point) that could benefit from interleaving. |
4977 | if (AggressivelyInterleaveReductions) { |
4978 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
4979 | return IC; |
4980 | } |
4981 | |
4982 | LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n" ); |
4983 | return 1; |
4984 | } |
4985 | |
4986 | bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, |
4987 | ElementCount VF) { |
4988 | // TODO: Cost model for emulated masked load/store is completely |
4989 | // broken. This hack guides the cost model to use an artificially |
4990 | // high enough value to practically disable vectorization with such |
4991 | // operations, except where previously deployed legality hack allowed |
4992 | // using very low cost values. This is to avoid regressions coming simply |
4993 | // from moving "masked load/store" check from legality to cost model. |
  // Emulation of masked loads/gathers was previously never allowed.
  // A limited number of emulated masked stores/scatters was allowed.
4996 | assert((isPredicatedInst(I)) && |
4997 | "Expecting a scalar emulated instruction" ); |
4998 | return isa<LoadInst>(Val: I) || |
4999 | (isa<StoreInst>(Val: I) && |
5000 | NumPredStores > NumberOfStoresToPredicate); |
5001 | } |
5002 | |
5003 | void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { |
5004 | assert(VF.isVector() && "Expected VF >= 2" ); |
5005 | |
5006 | // If we've already collected the instructions to scalarize or the predicated |
5007 | // BBs after vectorization, there's nothing to do. Collection may already have |
5008 | // occurred if we have a user-selected VF and are now computing the expected |
5009 | // cost for interleaving. |
5010 | if (InstsToScalarize.contains(Val: VF) || |
5011 | PredicatedBBsAfterVectorization.contains(Val: VF)) |
5012 | return; |
5013 | |
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5015 | // not profitable to scalarize any instructions, the presence of VF in the |
5016 | // map will indicate that we've analyzed it already. |
5017 | ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; |
5018 | |
5019 | // Find all the instructions that are scalar with predication in the loop and |
5020 | // determine if it would be better to not if-convert the blocks they are in. |
5021 | // If so, we also record the instructions to scalarize. |
5022 | for (BasicBlock *BB : TheLoop->blocks()) { |
5023 | if (!blockNeedsPredicationForAnyReason(BB)) |
5024 | continue; |
5025 | for (Instruction &I : *BB) |
5026 | if (isScalarWithPredication(I: &I, VF)) { |
5027 | ScalarCostsTy ScalarCosts; |
5028 | // Do not apply discount logic for: |
5029 | // 1. Scalars after vectorization, as there will only be a single copy |
5030 | // of the instruction. |
5031 | // 2. Scalable VF, as that would lead to invalid scalarization costs. |
5032 | // 3. Emulated masked memrefs, if a hacked cost is needed. |
5033 | if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() && |
5034 | !useEmulatedMaskMemRefHack(I: &I, VF) && |
5035 | computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) { |
5036 | ScalarCostsVF.insert_range(R&: ScalarCosts); |
5037 | // Check if we decided to scalarize a call. If so, update the widening |
5038 | // decision of the call to CM_Scalarize with the computed scalar cost. |
5039 | for (const auto &[I, Cost] : ScalarCosts) { |
5040 | auto *CI = dyn_cast<CallInst>(Val: I); |
5041 | if (!CI || !CallWideningDecisions.contains(Val: {CI, VF})) |
5042 | continue; |
5043 | CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize; |
5044 | CallWideningDecisions[{CI, VF}].Cost = Cost; |
5045 | } |
5046 | } |
5047 | // Remember that BB will remain after vectorization. |
5048 | PredicatedBBsAfterVectorization[VF].insert(Ptr: BB); |
5049 | for (auto *Pred : predecessors(BB)) { |
5050 | if (Pred->getSingleSuccessor() == BB) |
5051 | PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred); |
5052 | } |
5053 | } |
5054 | } |
5055 | } |
5056 | |
5057 | InstructionCost LoopVectorizationCostModel::computePredInstDiscount( |
5058 | Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { |
5059 | assert(!isUniformAfterVectorization(PredInst, VF) && |
5060 | "Instruction marked uniform-after-vectorization will be predicated" ); |
5061 | |
5062 | // Initialize the discount to zero, meaning that the scalar version and the |
5063 | // vector version cost the same. |
5064 | InstructionCost Discount = 0; |
5065 | |
5066 | // Holds instructions to analyze. The instructions we visit are mapped in |
5067 | // ScalarCosts. Those instructions are the ones that would be scalarized if |
5068 | // we find that the scalar version costs less. |
5069 | SmallVector<Instruction *, 8> Worklist; |
5070 | |
5071 | // Returns true if the given instruction can be scalarized. |
5072 | auto CanBeScalarized = [&](Instruction *I) -> bool { |
5073 | // We only attempt to scalarize instructions forming a single-use chain |
5074 | // from the original predicated block that would otherwise be vectorized. |
5075 | // Although not strictly necessary, we give up on instructions we know will |
5076 | // already be scalar to avoid traversing chains that are unlikely to be |
5077 | // beneficial. |
5078 | if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || |
5079 | isScalarAfterVectorization(I, VF)) |
5080 | return false; |
5081 | |
5082 | // If the instruction is scalar with predication, it will be analyzed |
5083 | // separately. We ignore it within the context of PredInst. |
5084 | if (isScalarWithPredication(I, VF)) |
5085 | return false; |
5086 | |
5087 | // If any of the instruction's operands are uniform after vectorization, |
5088 | // the instruction cannot be scalarized. This prevents, for example, a |
5089 | // masked load from being scalarized. |
5090 | // |
5091 | // We assume we will only emit a value for lane zero of an instruction |
5092 | // marked uniform after vectorization, rather than VF identical values. |
5093 | // Thus, if we scalarize an instruction that uses a uniform, we would |
5094 | // create uses of values corresponding to the lanes we aren't emitting code |
5095 | // for. This behavior can be changed by allowing getScalarValue to clone |
5096 | // the lane zero values for uniforms rather than asserting. |
5097 | for (Use &U : I->operands()) |
5098 | if (auto *J = dyn_cast<Instruction>(Val: U.get())) |
5099 | if (isUniformAfterVectorization(I: J, VF)) |
5100 | return false; |
5101 | |
5102 | // Otherwise, we can scalarize the instruction. |
5103 | return true; |
5104 | }; |
5105 | |
5106 | // Compute the expected cost discount from scalarizing the entire expression |
5107 | // feeding the predicated instruction. We currently only consider expressions |
5108 | // that are single-use instruction chains. |
5109 | Worklist.push_back(Elt: PredInst); |
5110 | while (!Worklist.empty()) { |
5111 | Instruction *I = Worklist.pop_back_val(); |
5112 | |
5113 | // If we've already analyzed the instruction, there's nothing to do. |
5114 | if (ScalarCosts.contains(Val: I)) |
5115 | continue; |
5116 | |
5117 | // Cannot scalarize fixed-order recurrence phis at the moment. |
5118 | if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I))) |
5119 | continue; |
5120 | |
5121 | // Compute the cost of the vector instruction. Note that this cost already |
5122 | // includes the scalarization overhead of the predicated instruction. |
5123 | InstructionCost VectorCost = getInstructionCost(I, VF); |
5124 | |
5125 | // Compute the cost of the scalarized instruction. This cost is the cost of |
5126 | // the instruction as if it wasn't if-converted and instead remained in the |
5127 | // predicated block. We will scale this cost by block probability after |
5128 | // computing the scalarization overhead. |
5129 | InstructionCost ScalarCost = |
5130 | VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)); |
5131 | |
5132 | // Compute the scalarization overhead of needed insertelement instructions |
5133 | // and phi nodes. |
5134 | if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { |
5135 | Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF); |
5136 | for (Type *VectorTy : getContainedTypes(Ty: WideTy)) { |
5137 | ScalarCost += TTI.getScalarizationOverhead( |
5138 | Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
5139 | /*Insert=*/true, |
5140 | /*Extract=*/false, CostKind); |
5141 | } |
5142 | ScalarCost += |
5143 | VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind); |
5144 | } |
5145 | |
5146 | // Compute the scalarization overhead of needed extractelement |
5147 | // instructions. For each of the instruction's operands, if the operand can |
5148 | // be scalarized, add it to the worklist; otherwise, account for the |
5149 | // overhead. |
5150 | for (Use &U : I->operands()) |
5151 | if (auto *J = dyn_cast<Instruction>(Val: U.get())) { |
5152 | assert(canVectorizeTy(J->getType()) && |
5153 | "Instruction has non-scalar type" ); |
5154 | if (CanBeScalarized(J)) |
5155 | Worklist.push_back(Elt: J); |
5156 | else if (needsExtract(V: J, VF)) { |
5157 | Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF); |
5158 | for (Type *VectorTy : getContainedTypes(Ty: WideTy)) { |
5159 | ScalarCost += TTI.getScalarizationOverhead( |
5160 | Ty: cast<VectorType>(Val: VectorTy), |
5161 | DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false, |
5162 | /*Extract*/ true, CostKind); |
5163 | } |
5164 | } |
5165 | } |
5166 | |
5167 | // Scale the total scalar cost by block probability. |
5168 | ScalarCost /= getPredBlockCostDivisor(CostKind); |
5169 | |
5170 | // Compute the discount. A non-negative discount means the vector version |
5171 | // of the instruction costs more, and scalarizing would be beneficial. |
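    //
    // For example (illustrative costs): with VectorCost = 10 and a scaled
    // ScalarCost = 4, this instruction contributes 10 - 4 = 6 to the discount,
    // i.e. scalarizing it is expected to be cheaper than keeping it vectorized.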
5172 | Discount += VectorCost - ScalarCost; |
5173 | ScalarCosts[I] = ScalarCost; |
5174 | } |
5175 | |
5176 | return Discount; |
5177 | } |
5178 | |
5179 | InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { |
5180 | InstructionCost Cost; |
5181 | |
5182 | // If the vector loop gets executed exactly once with the given VF, ignore the |
5183 | // costs of comparison and induction instructions, as they'll get simplified |
5184 | // away. |
5185 | SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF; |
5186 | auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop); |
5187 | if (TC == VF && !foldTailByMasking()) |
5188 | addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(), |
5189 | InstsToIgnore&: ValuesToIgnoreForVF); |
5190 | |
5191 | // For each block. |
5192 | for (BasicBlock *BB : TheLoop->blocks()) { |
5193 | InstructionCost BlockCost; |
5194 | |
5195 | // For each instruction in the old loop. |
5196 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5197 | // Skip ignored values. |
5198 | if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) || |
5199 | (VF.isVector() && VecValuesToIgnore.count(Ptr: &I))) |
5200 | continue; |
5201 | |
5202 | InstructionCost C = getInstructionCost(I: &I, VF); |
5203 | |
5204 | // Check if we should override the cost. |
5205 | if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) |
5206 | C = InstructionCost(ForceTargetInstructionCost); |
5207 | |
5208 | BlockCost += C; |
5209 | LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " |
5210 | << VF << " For instruction: " << I << '\n'); |
5211 | } |
5212 | |
5213 | // If we are vectorizing a predicated block, it will have been |
5214 | // if-converted. This means that the block's instructions (aside from |
5215 | // stores and instructions that may divide by zero) will now be |
5216 | // unconditionally executed. For the scalar case, we may not always execute |
5217 | // the predicated block, if it is an if-else block. Thus, scale the block's |
5218 | // cost by the probability of executing it. blockNeedsPredication from |
5219 | // Legal is used so as to not include all blocks in tail folded loops. |
5220 | if (VF.isScalar() && Legal->blockNeedsPredication(BB)) |
5221 | BlockCost /= getPredBlockCostDivisor(CostKind); |
5222 | |
5223 | Cost += BlockCost; |
5224 | } |
5225 | |
5226 | return Cost; |
5227 | } |
5228 | |
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
5231 | /// |
5232 | /// This SCEV can be sent to the Target in order to estimate the address |
5233 | /// calculation cost. |
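///
/// For example, for a pointer such as
///   getelementptr inbounds float, ptr %base, i64 %iv
/// where %base is loop invariant and %iv is an induction variable, the SCEV
/// of the pointer is returned; if any index is neither loop invariant nor an
/// induction variable, nullptr is returned instead (illustrative IR, the
/// names are placeholders).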
5234 | static const SCEV *getAddressAccessSCEV( |
5235 | Value *Ptr, |
5236 | LoopVectorizationLegality *Legal, |
5237 | PredicatedScalarEvolution &PSE, |
5238 | const Loop *TheLoop) { |
5239 | |
5240 | auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr); |
5241 | if (!Gep) |
5242 | return nullptr; |
5243 | |
5244 | // We are looking for a gep with all loop invariant indices except for one |
5245 | // which should be an induction variable. |
5246 | auto *SE = PSE.getSE(); |
5247 | unsigned NumOperands = Gep->getNumOperands(); |
5248 | for (unsigned Idx = 1; Idx < NumOperands; ++Idx) { |
5249 | Value *Opd = Gep->getOperand(i_nocapture: Idx); |
5250 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) && |
5251 | !Legal->isInductionVariable(V: Opd)) |
5252 | return nullptr; |
5253 | } |
5254 | |
5255 | // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. |
5256 | return PSE.getSCEV(V: Ptr); |
5257 | } |
5258 | |
5259 | InstructionCost |
5260 | LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, |
5261 | ElementCount VF) { |
5262 | assert(VF.isVector() && |
5263 | "Scalarization cost of instruction implies vectorization." ); |
5264 | if (VF.isScalable()) |
5265 | return InstructionCost::getInvalid(); |
5266 | |
5267 | Type *ValTy = getLoadStoreType(I); |
5268 | auto *SE = PSE.getSE(); |
5269 | |
5270 | unsigned AS = getLoadStoreAddressSpace(I); |
5271 | Value *Ptr = getLoadStorePointerOperand(V: I); |
5272 | Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF); |
5273 | // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost` |
5274 | // that it is being called from this specific place. |
5275 | |
5276 | // Figure out whether the access is strided and get the stride value |
  // if it's known at compile time.
5278 | const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); |
5279 | |
5280 | // Get the cost of the scalar memory instruction and address computation. |
5281 | InstructionCost Cost = |
5282 | VF.getFixedValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV); |
5283 | |
5284 | // Don't pass *I here, since it is scalar but will actually be part of a |
5285 | // vectorized loop where the user of it is a vectorized instruction. |
5286 | const Align Alignment = getLoadStoreAlignment(I); |
5287 | Cost += VF.getFixedValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(), |
5288 | Src: ValTy->getScalarType(), |
5289 | Alignment, AddressSpace: AS, CostKind); |
5290 | |
5291 | // Get the overhead of the extractelement and insertelement instructions |
5292 | // we might create due to scalarization. |
5293 | Cost += getScalarizationOverhead(I, VF); |
5294 | |
5295 | // If we have a predicated load/store, it will need extra i1 extracts and |
5296 | // conditional branches, but may not be executed for each vector lane. Scale |
5297 | // the cost by the probability of executing the predicated block. |
5298 | if (isPredicatedInst(I)) { |
5299 | Cost /= getPredBlockCostDivisor(CostKind); |
5300 | |
5301 | // Add the cost of an i1 extract and a branch |
5302 | auto *VecI1Ty = |
5303 | VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF); |
5304 | Cost += TTI.getScalarizationOverhead( |
5305 | Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
5306 | /*Insert=*/false, /*Extract=*/true, CostKind); |
5307 | Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind); |
5308 | |
5309 | if (useEmulatedMaskMemRefHack(I, VF)) |
5310 | // Artificially setting to a high enough value to practically disable |
5311 | // vectorization with such operations. |
5312 | Cost = 3000000; |
5313 | } |
5314 | |
5315 | return Cost; |
5316 | } |
5317 | |
5318 | InstructionCost |
5319 | LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, |
5320 | ElementCount VF) { |
5321 | Type *ValTy = getLoadStoreType(I); |
5322 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
5323 | Value *Ptr = getLoadStorePointerOperand(V: I); |
5324 | unsigned AS = getLoadStoreAddressSpace(I); |
5325 | int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr); |
5326 | |
5327 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
5328 | "Stride should be 1 or -1 for consecutive memory access" ); |
5329 | const Align Alignment = getLoadStoreAlignment(I); |
5330 | InstructionCost Cost = 0; |
5331 | if (Legal->isMaskRequired(I)) { |
5332 | Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
5333 | CostKind); |
5334 | } else { |
5335 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
5336 | Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
5337 | CostKind, OpdInfo: OpInfo, I); |
5338 | } |
5339 | |
5340 | bool Reverse = ConsecutiveStride < 0; |
5341 | if (Reverse) |
5342 | Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy, |
5343 | SrcTy: VectorTy, Mask: {}, CostKind, Index: 0); |
5344 | return Cost; |
5345 | } |
5346 | |
5347 | InstructionCost |
5348 | LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, |
5349 | ElementCount VF) { |
5350 | assert(Legal->isUniformMemOp(*I, VF)); |
5351 | |
5352 | Type *ValTy = getLoadStoreType(I); |
5353 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
5354 | const Align Alignment = getLoadStoreAlignment(I); |
5355 | unsigned AS = getLoadStoreAddressSpace(I); |
5356 | if (isa<LoadInst>(Val: I)) { |
5357 | return TTI.getAddressComputationCost(Ty: ValTy) + |
5358 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS, |
5359 | CostKind) + |
5360 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy, |
5361 | SrcTy: VectorTy, Mask: {}, CostKind); |
5362 | } |
5363 | StoreInst *SI = cast<StoreInst>(Val: I); |
5364 | |
5365 | bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand()); |
5366 | // TODO: We have existing tests that request the cost of extracting element |
5367 | // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent |
5368 | // the actual generated code, which involves extracting the last element of |
5369 | // a scalable vector where the lane to extract is unknown at compile time. |
5370 | return TTI.getAddressComputationCost(Ty: ValTy) + |
5371 | TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, |
5372 | CostKind) + |
5373 | (IsLoopInvariantStoreValue |
5374 | ? 0 |
5375 | : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy, |
5376 | CostKind, Index: VF.getKnownMinValue() - 1)); |
5377 | } |
5378 | |
5379 | InstructionCost |
5380 | LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, |
5381 | ElementCount VF) { |
5382 | Type *ValTy = getLoadStoreType(I); |
5383 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
5384 | const Align Alignment = getLoadStoreAlignment(I); |
5385 | const Value *Ptr = getLoadStorePointerOperand(V: I); |
5386 | |
5387 | return TTI.getAddressComputationCost(Ty: VectorTy) + |
5388 | TTI.getGatherScatterOpCost(Opcode: I->getOpcode(), DataTy: VectorTy, Ptr, |
5389 | VariableMask: Legal->isMaskRequired(I), Alignment, |
5390 | CostKind, I); |
5391 | } |
5392 | |
5393 | InstructionCost |
5394 | LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, |
5395 | ElementCount VF) { |
5396 | const auto *Group = getInterleavedAccessGroup(Instr: I); |
5397 | assert(Group && "Fail to get an interleaved access group." ); |
5398 | |
5399 | Instruction *InsertPos = Group->getInsertPos(); |
5400 | Type *ValTy = getLoadStoreType(I: InsertPos); |
5401 | auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF)); |
5402 | unsigned AS = getLoadStoreAddressSpace(I: InsertPos); |
5403 | |
5404 | unsigned InterleaveFactor = Group->getFactor(); |
5405 | auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor); |
5406 | |
5407 | // Holds the indices of existing members in the interleaved group. |
5408 | SmallVector<unsigned, 4> Indices; |
5409 | for (unsigned IF = 0; IF < InterleaveFactor; IF++) |
5410 | if (Group->getMember(Index: IF)) |
5411 | Indices.push_back(Elt: IF); |
5412 | |
5413 | // Calculate the cost of the whole interleaved group. |
5414 | bool UseMaskForGaps = |
5415 | (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) || |
5416 | (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor())); |
5417 | InstructionCost Cost = TTI.getInterleavedMemoryOpCost( |
5418 | Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices, |
5419 | Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I), |
5420 | UseMaskForGaps); |
5421 | |
5422 | if (Group->isReverse()) { |
5423 | // TODO: Add support for reversed masked interleaved access. |
5424 | assert(!Legal->isMaskRequired(I) && |
5425 | "Reverse masked interleaved access not supported." ); |
5426 | Cost += Group->getNumMembers() * |
5427 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy, |
5428 | SrcTy: VectorTy, Mask: {}, CostKind, Index: 0); |
5429 | } |
5430 | return Cost; |
5431 | } |
5432 | |
5433 | std::optional<InstructionCost> |
5434 | LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, |
5435 | ElementCount VF, |
5436 | Type *Ty) const { |
5437 | using namespace llvm::PatternMatch; |
5438 | // Early exit for no inloop reductions |
5439 | if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty)) |
5440 | return std::nullopt; |
5441 | auto *VectorTy = cast<VectorType>(Val: Ty); |
5442 | |
5443 | // We are looking for a pattern of, and finding the minimal acceptable cost: |
5444 | // reduce(mul(ext(A), ext(B))) or |
5445 | // reduce(mul(A, B)) or |
5446 | // reduce(ext(A)) or |
5447 | // reduce(A). |
5448 | // The basic idea is that we walk down the tree to do that, finding the root |
5449 | // reduction instruction in InLoopReductionImmediateChains. From there we find |
5450 | // the pattern of mul/ext and test the cost of the entire pattern vs the cost |
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying that the original cost
  // method should be used.
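  //
  // As an illustrative IR sketch (placeholder names), a chain such as
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul   = mul <16 x i32> %a.ext, %b.ext
  //   %sum   = add <16 x i32> %mul, %phi
  // feeding an in-loop add reduction may be costed as a single multiply-
  // accumulate reduction (TTI.getMulAccReductionCost) instead of summing the
  // costs of the extends, the multiply and the add individually.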
5455 | Instruction *RetI = I; |
5456 | if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) { |
5457 | if (!RetI->hasOneUser()) |
5458 | return std::nullopt; |
5459 | RetI = RetI->user_back(); |
5460 | } |
5461 | |
5462 | if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) && |
5463 | RetI->user_back()->getOpcode() == Instruction::Add) { |
5464 | RetI = RetI->user_back(); |
5465 | } |
5466 | |
5467 | // Test if the found instruction is a reduction, and if not return an invalid |
5468 | // cost specifying the parent to use the original cost modelling. |
5469 | Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI); |
5470 | if (!LastChain) |
5471 | return std::nullopt; |
5472 | |
5473 | // Find the reduction this chain is a part of and calculate the basic cost of |
5474 | // the reduction on its own. |
5475 | Instruction *ReductionPhi = LastChain; |
5476 | while (!isa<PHINode>(Val: ReductionPhi)) |
5477 | ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi); |
5478 | |
5479 | const RecurrenceDescriptor &RdxDesc = |
5480 | Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second; |
5481 | |
5482 | InstructionCost BaseCost; |
5483 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
5484 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) { |
5485 | Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK); |
5486 | BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy, |
5487 | FMF: RdxDesc.getFastMathFlags(), CostKind); |
5488 | } else { |
5489 | BaseCost = TTI.getArithmeticReductionCost( |
5490 | Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind); |
5491 | } |
5492 | |
5493 | // For a call to the llvm.fmuladd intrinsic we need to add the cost of a |
5494 | // normal fmul instruction to the cost of the fadd reduction. |
5495 | if (RK == RecurKind::FMulAdd) |
5496 | BaseCost += |
5497 | TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind); |
5498 | |
5499 | // If we're using ordered reductions then we can just return the base cost |
5500 | // here, since getArithmeticReductionCost calculates the full ordered |
5501 | // reduction cost when FP reassociation is not allowed. |
5502 | if (useOrderedReductions(RdxDesc)) |
5503 | return BaseCost; |
5504 | |
5505 | // Get the operand that was not the reduction chain and match it to one of the |
5506 | // patterns, returning the better cost if it is found. |
5507 | Instruction *RedOp = RetI->getOperand(i: 1) == LastChain |
5508 | ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0)) |
5509 | : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1)); |
5510 | |
5511 | VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy); |
5512 | |
5513 | Instruction *Op0, *Op1; |
5514 | if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
5515 | match(V: RedOp, |
5516 | P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) && |
5517 | match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
5518 | Op0->getOpcode() == Op1->getOpcode() && |
5519 | Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() && |
5520 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) && |
5521 | (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { |
5522 | |
5523 | // Matched reduce.add(ext(mul(ext(A), ext(B))) |
5524 | // Note that the extend opcodes need to all match, or if A==B they will have |
5525 | // been converted to zext(mul(sext(A), sext(A))) as it is known positive, |
5526 | // which is equally fine. |
5527 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
5528 | auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy); |
5529 | auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy); |
5530 | |
5531 | InstructionCost ExtCost = |
5532 | TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType, |
5533 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
5534 | InstructionCost MulCost = |
5535 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind); |
5536 | InstructionCost Ext2Cost = |
5537 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType, |
5538 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
5539 | |
5540 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
5541 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
5542 | |
5543 | if (RedCost.isValid() && |
5544 | RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) |
5545 | return I == RetI ? RedCost : 0; |
5546 | } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) && |
5547 | !TheLoop->isLoopInvariant(V: RedOp)) { |
5548 | // Matched reduce(ext(A)) |
5549 | bool IsUnsigned = isa<ZExtInst>(Val: RedOp); |
5550 | auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy); |
5551 | InstructionCost RedCost = TTI.getExtendedReductionCost( |
5552 | Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, |
5553 | FMF: RdxDesc.getFastMathFlags(), CostKind); |
5554 | |
5555 | InstructionCost ExtCost = |
5556 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType, |
5557 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
5558 | if (RedCost.isValid() && RedCost < BaseCost + ExtCost) |
5559 | return I == RetI ? RedCost : 0; |
5560 | } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
5561 | match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) { |
5562 | if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
5563 | Op0->getOpcode() == Op1->getOpcode() && |
5564 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) { |
5565 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
5566 | Type *Op0Ty = Op0->getOperand(i: 0)->getType(); |
5567 | Type *Op1Ty = Op1->getOperand(i: 0)->getType(); |
5568 | Type *LargestOpTy = |
5569 | Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty |
5570 | : Op0Ty; |
5571 | auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy); |
5572 | |
5573 | // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of |
5574 | // different sizes. We take the largest type as the ext to reduce, and add |
5575 | // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). |
5576 | InstructionCost ExtCost0 = TTI.getCastInstrCost( |
5577 | Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy), |
5578 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
5579 | InstructionCost ExtCost1 = TTI.getCastInstrCost( |
5580 | Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy), |
5581 | CCH: TTI::CastContextHint::None, CostKind, I: Op1); |
5582 | InstructionCost MulCost = |
5583 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
5584 | |
5585 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
5586 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5590 | ExtraExtCost = TTI.getCastInstrCost( |
5591 | Opcode: ExtraExtOp->getOpcode(), Dst: ExtType, |
5592 | Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy), |
5593 | CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp); |
5594 | } |
5595 | |
5596 | if (RedCost.isValid() && |
5597 | (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) |
5598 | return I == RetI ? RedCost : 0; |
5599 | } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) { |
5600 | // Matched reduce.add(mul()) |
5601 | InstructionCost MulCost = |
5602 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
5603 | |
5604 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
5605 | IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind); |
5606 | |
5607 | if (RedCost.isValid() && RedCost < MulCost + BaseCost) |
5608 | return I == RetI ? RedCost : 0; |
5609 | } |
5610 | } |
5611 | |
5612 | return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; |
5613 | } |
5614 | |
5615 | InstructionCost |
5616 | LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, |
5617 | ElementCount VF) { |
5618 | // Calculate scalar cost only. Vectorization cost should be ready at this |
5619 | // moment. |
5620 | if (VF.isScalar()) { |
5621 | Type *ValTy = getLoadStoreType(I); |
5622 | const Align Alignment = getLoadStoreAlignment(I); |
5623 | unsigned AS = getLoadStoreAddressSpace(I); |
5624 | |
5625 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
5626 | return TTI.getAddressComputationCost(Ty: ValTy) + |
5627 | TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind, |
5628 | OpdInfo: OpInfo, I); |
5629 | } |
5630 | return getWideningCost(I, VF); |
5631 | } |
5632 | |
5633 | InstructionCost |
5634 | LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, |
5635 | ElementCount VF) const { |
5636 | |
5637 | // There is no mechanism yet to create a scalable scalarization loop, |
5638 | // so this is currently Invalid. |
5639 | if (VF.isScalable()) |
5640 | return InstructionCost::getInvalid(); |
5641 | |
5642 | if (VF.isScalar()) |
5643 | return 0; |
5644 | |
5645 | InstructionCost Cost = 0; |
5646 | Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF); |
5647 | if (!RetTy->isVoidTy() && |
5648 | (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) { |
5649 | |
5650 | for (Type *VectorTy : getContainedTypes(Ty: RetTy)) { |
5651 | Cost += TTI.getScalarizationOverhead( |
5652 | Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
5653 | /*Insert=*/true, |
5654 | /*Extract=*/false, CostKind); |
5655 | } |
5656 | } |
5657 | |
5658 | // Some targets keep addresses scalar. |
5659 | if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing()) |
5660 | return Cost; |
5661 | |
5662 | // Some targets support efficient element stores. |
5663 | if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore()) |
5664 | return Cost; |
5665 | |
5666 | // Collect operands to consider. |
5667 | CallInst *CI = dyn_cast<CallInst>(Val: I); |
5668 | Instruction::op_range Ops = CI ? CI->args() : I->operands(); |
5669 | |
5670 | // Skip operands that do not require extraction/scalarization and do not incur |
5671 | // any overhead. |
5672 | SmallVector<Type *> Tys; |
5673 | for (auto *V : filterExtractingOperands(Ops, VF)) |
5674 | Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF)); |
5675 | return Cost + TTI.getOperandsScalarizationOverhead( |
5676 | Args: filterExtractingOperands(Ops, VF), Tys, CostKind); |
5677 | } |
5678 | |
5679 | void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { |
5680 | if (VF.isScalar()) |
5681 | return; |
5682 | NumPredStores = 0; |
5683 | for (BasicBlock *BB : TheLoop->blocks()) { |
5684 | // For each instruction in the old loop. |
5685 | for (Instruction &I : *BB) { |
5686 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
5687 | if (!Ptr) |
5688 | continue; |
5689 | |
5690 | // TODO: We should generate better code and update the cost model for |
5691 | // predicated uniform stores. Today they are treated as any other |
5692 | // predicated store (see added test cases in |
5693 | // invariant-store-vectorization.ll). |
5694 | if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF)) |
5695 | NumPredStores++; |
5696 | |
5697 | if (Legal->isUniformMemOp(I, VF)) { |
5698 | auto IsLegalToScalarize = [&]() { |
5699 | if (!VF.isScalable()) |
5700 | // Scalarization of fixed length vectors "just works". |
5701 | return true; |
5702 | |
5703 | // We have dedicated lowering for unpredicated uniform loads and |
5704 | // stores. Note that even with tail folding we know that at least |
5705 | // one lane is active (i.e. generalized predication is not possible |
5706 | // here), and the logic below depends on this fact. |
5707 | if (!foldTailByMasking()) |
5708 | return true; |
5709 | |
5710 | // For scalable vectors, a uniform memop load is always |
5711 | // uniform-by-parts and we know how to scalarize that. |
5712 | if (isa<LoadInst>(Val: I)) |
5713 | return true; |
5714 | |
          // A uniform store isn't necessarily uniform-by-part,
          // and we can't assume scalarization.
5717 | auto &SI = cast<StoreInst>(Val&: I); |
5718 | return TheLoop->isLoopInvariant(V: SI.getValueOperand()); |
5719 | }; |
5720 | |
5721 | const InstructionCost GatherScatterCost = |
5722 | isLegalGatherOrScatter(V: &I, VF) ? |
5723 | getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid(); |
5724 | |
5725 | // Load: Scalar load + broadcast |
5726 | // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract |
5727 | // FIXME: This cost is a significant under-estimate for tail folded |
5728 | // memory ops. |
5729 | const InstructionCost ScalarizationCost = |
5730 | IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF) |
5731 | : InstructionCost::getInvalid(); |
5732 | |
        // Choose the better solution for the current VF. Note that invalid
        // costs compare as maximally large. If both are invalid, we get an
        // invalid cost, which signals a failure and a vectorization abort.
5736 | if (GatherScatterCost < ScalarizationCost) |
5737 | setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost); |
5738 | else |
5739 | setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost); |
5740 | continue; |
5741 | } |
5742 | |
5743 | // We assume that widening is the best solution when possible. |
5744 | if (memoryInstructionCanBeWidened(I: &I, VF)) { |
5745 | InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF); |
5746 | int ConsecutiveStride = Legal->isConsecutivePtr( |
5747 | AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I)); |
5748 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
5749 | "Expected consecutive stride." ); |
5750 | InstWidening Decision = |
5751 | ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; |
5752 | setWideningDecision(I: &I, VF, W: Decision, Cost); |
5753 | continue; |
5754 | } |
5755 | |
5756 | // Choose between Interleaving, Gather/Scatter or Scalarization. |
5757 | InstructionCost InterleaveCost = InstructionCost::getInvalid(); |
5758 | unsigned NumAccesses = 1; |
5759 | if (isAccessInterleaved(Instr: &I)) { |
5760 | const auto *Group = getInterleavedAccessGroup(Instr: &I); |
5761 | assert(Group && "Fail to get an interleaved access group." ); |
5762 | |
5763 | // Make one decision for the whole group. |
5764 | if (getWideningDecision(I: &I, VF) != CM_Unknown) |
5765 | continue; |
5766 | |
5767 | NumAccesses = Group->getNumMembers(); |
5768 | if (interleavedAccessCanBeWidened(I: &I, VF)) |
5769 | InterleaveCost = getInterleaveGroupCost(I: &I, VF); |
5770 | } |
5771 | |
5772 | InstructionCost GatherScatterCost = |
5773 | isLegalGatherOrScatter(V: &I, VF) |
5774 | ? getGatherScatterCost(I: &I, VF) * NumAccesses |
5775 | : InstructionCost::getInvalid(); |
5776 | |
5777 | InstructionCost ScalarizationCost = |
5778 | getMemInstScalarizationCost(I: &I, VF) * NumAccesses; |
5779 | |
      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
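      //
      // For example (illustrative costs): InterleaveCost = 12,
      // GatherScatterCost = 20 and ScalarizationCost = 16 selects
      // CM_Interleave; if the interleave cost were invalid, the comparison
      // GatherScatterCost < ScalarizationCost (20 < 16) would fail and we
      // would scalarize.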
5782 | InstructionCost Cost; |
5783 | InstWidening Decision; |
5784 | if (InterleaveCost <= GatherScatterCost && |
5785 | InterleaveCost < ScalarizationCost) { |
5786 | Decision = CM_Interleave; |
5787 | Cost = InterleaveCost; |
5788 | } else if (GatherScatterCost < ScalarizationCost) { |
5789 | Decision = CM_GatherScatter; |
5790 | Cost = GatherScatterCost; |
5791 | } else { |
5792 | Decision = CM_Scalarize; |
5793 | Cost = ScalarizationCost; |
5794 | } |
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to one instruction.
5798 | if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) |
5799 | setWideningDecision(Grp: Group, VF, W: Decision, Cost); |
5800 | else |
5801 | setWideningDecision(I: &I, VF, W: Decision, Cost); |
5802 | } |
5803 | } |
5804 | |
5805 | // Make sure that any load of address and any other address computation |
5806 | // remains scalar unless there is gather/scatter support. This avoids |
5807 | // inevitable extracts into address registers, and also has the benefit of |
5808 | // activating LSR more, since that pass can't optimize vectorized |
5809 | // addresses. |
5810 | if (TTI.prefersVectorizedAddressing()) |
5811 | return; |
5812 | |
5813 | // Start with all scalar pointer uses. |
5814 | SmallPtrSet<Instruction *, 8> AddrDefs; |
5815 | for (BasicBlock *BB : TheLoop->blocks()) |
5816 | for (Instruction &I : *BB) { |
5817 | Instruction *PtrDef = |
5818 | dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I)); |
5819 | if (PtrDef && TheLoop->contains(Inst: PtrDef) && |
5820 | getWideningDecision(I: &I, VF) != CM_GatherScatter) |
5821 | AddrDefs.insert(Ptr: PtrDef); |
5822 | } |
5823 | |
5824 | // Add all instructions used to generate the addresses. |
5825 | SmallVector<Instruction *, 4> Worklist; |
5826 | append_range(C&: Worklist, R&: AddrDefs); |
5827 | while (!Worklist.empty()) { |
5828 | Instruction *I = Worklist.pop_back_val(); |
5829 | for (auto &Op : I->operands()) |
5830 | if (auto *InstOp = dyn_cast<Instruction>(Val&: Op)) |
5831 | if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) && |
5832 | AddrDefs.insert(Ptr: InstOp).second) |
5833 | Worklist.push_back(Elt: InstOp); |
5834 | } |
5835 | |
5836 | for (auto *I : AddrDefs) { |
5837 | if (isa<LoadInst>(Val: I)) { |
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
5840 | // if the loaded register is involved in an address computation, it is |
5841 | // instead changed here when we know this is the case. |
5842 | InstWidening Decision = getWideningDecision(I, VF); |
5843 | if (Decision == CM_Widen || Decision == CM_Widen_Reverse) |
5844 | // Scalarize a widened load of address. |
5845 | setWideningDecision( |
5846 | I, VF, W: CM_Scalarize, |
5847 | Cost: (VF.getKnownMinValue() * |
5848 | getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)))); |
5849 | else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) { |
5850 | // Scalarize an interleave group of address loads. |
5851 | for (unsigned I = 0; I < Group->getFactor(); ++I) { |
5852 | if (Instruction *Member = Group->getMember(Index: I)) |
5853 | setWideningDecision( |
5854 | I: Member, VF, W: CM_Scalarize, |
5855 | Cost: (VF.getKnownMinValue() * |
5856 | getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1)))); |
5857 | } |
5858 | } |
5859 | } else { |
5860 | // Cannot scalarize fixed-order recurrence phis at the moment. |
5861 | if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I))) |
5862 | continue; |
5863 | |
5864 | // Make sure I gets scalarized and a cost estimate without |
5865 | // scalarization overhead. |
5866 | ForcedScalars[VF].insert(Ptr: I); |
5867 | } |
5868 | } |
5869 | } |
5870 | |
5871 | void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { |
5872 | assert(!VF.isScalar() && |
5873 | "Trying to set a vectorization decision for a scalar VF" ); |
5874 | |
5875 | auto ForcedScalar = ForcedScalars.find(Val: VF); |
5876 | for (BasicBlock *BB : TheLoop->blocks()) { |
5877 | // For each instruction in the old loop. |
5878 | for (Instruction &I : *BB) { |
5879 | CallInst *CI = dyn_cast<CallInst>(Val: &I); |
5880 | |
5881 | if (!CI) |
5882 | continue; |
5883 | |
5884 | InstructionCost ScalarCost = InstructionCost::getInvalid(); |
5885 | InstructionCost VectorCost = InstructionCost::getInvalid(); |
5886 | InstructionCost IntrinsicCost = InstructionCost::getInvalid(); |
5887 | Function *ScalarFunc = CI->getCalledFunction(); |
5888 | Type *ScalarRetTy = CI->getType(); |
5889 | SmallVector<Type *, 4> Tys, ScalarTys; |
5890 | for (auto &ArgOp : CI->args()) |
5891 | ScalarTys.push_back(Elt: ArgOp->getType()); |
5892 | |
5893 | // Estimate cost of scalarized vector call. The source operands are |
5894 | // assumed to be vectors, so we need to extract individual elements from |
5895 | // there, execute VF scalar calls, and then gather the result into the |
5896 | // vector return value. |
5897 | InstructionCost ScalarCallCost = |
5898 | TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind); |
5899 | |
5900 | // Compute costs of unpacking argument values for the scalar calls and |
5901 | // packing the return values to a vector. |
5902 | InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF); |
5903 | |
5904 | ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; |
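      //
      // For example (illustrative costs): with VF = 4 and a scalar call cost
      // of 10, the scalarized estimate is 4 * 10 plus the extract/insert
      // overhead returned by getScalarizationOverhead above.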
5905 | // Honor ForcedScalars and UniformAfterVectorization decisions. |
5906 | // TODO: For calls, it might still be more profitable to widen. Use |
5907 | // VPlan-based cost model to compare different options. |
5908 | if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() && |
5909 | ForcedScalar->second.contains(Ptr: CI)) || |
5910 | isUniformAfterVectorization(I: CI, VF))) { |
5911 | setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr, |
5912 | IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt, |
5913 | Cost: ScalarCost); |
5914 | continue; |
5915 | } |
5916 | |
5917 | bool MaskRequired = Legal->isMaskRequired(I: CI); |
5918 | // Compute corresponding vector type for return value and arguments. |
5919 | Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF); |
5920 | for (Type *ScalarTy : ScalarTys) |
5921 | Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF)); |
5922 | |
5923 | // An in-loop reduction using an fmuladd intrinsic is a special case; |
5924 | // we don't want the normal cost for that intrinsic. |
5925 | if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI)) |
5926 | if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) { |
5927 | setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr, |
5928 | IID: getVectorIntrinsicIDForCall(CI, TLI), |
5929 | MaskPos: std::nullopt, Cost: *RedCost); |
5930 | continue; |
5931 | } |
5932 | |
5933 | // Find the cost of vectorizing the call, if we can find a suitable |
5934 | // vector variant of the function. |
5935 | VFInfo FuncInfo; |
5936 | Function *VecFunc = nullptr; |
5937 | // Search through any available variants for one we can use at this VF. |
5938 | for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) { |
5939 | // Must match requested VF. |
5940 | if (Info.Shape.VF != VF) |
5941 | continue; |
5942 | |
5943 | // Must take a mask argument if one is required |
5944 | if (MaskRequired && !Info.isMasked()) |
5945 | continue; |
5946 | |
5947 | // Check that all parameter kinds are supported |
5948 | bool ParamsOk = true; |
5949 | for (VFParameter Param : Info.Shape.Parameters) { |
5950 | switch (Param.ParamKind) { |
5951 | case VFParamKind::Vector: |
5952 | break; |
5953 | case VFParamKind::OMP_Uniform: { |
5954 | Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos); |
5955 | // Make sure the scalar parameter in the loop is invariant. |
5956 | if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam), |
5957 | L: TheLoop)) |
5958 | ParamsOk = false; |
5959 | break; |
5960 | } |
5961 | case VFParamKind::OMP_Linear: { |
5962 | Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos); |
5963 | // Find the stride for the scalar parameter in this loop and see if |
5964 | // it matches the stride for the variant. |
5965 | // TODO: do we need to figure out the cost of an extract to get the |
5966 | // first lane? Or do we hope that it will be folded away? |
5967 | ScalarEvolution *SE = PSE.getSE(); |
5968 | const auto *SAR = |
5969 | dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam)); |
5970 | |
5971 | if (!SAR || SAR->getLoop() != TheLoop) { |
5972 | ParamsOk = false; |
5973 | break; |
5974 | } |
5975 | |
5976 | const SCEVConstant *Step = |
5977 | dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE)); |
5978 | |
5979 | if (!Step || |
5980 | Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) |
5981 | ParamsOk = false; |
5982 | |
5983 | break; |
5984 | } |
5985 | case VFParamKind::GlobalPredicate: |
5986 | break; |
5987 | default: |
5988 | ParamsOk = false; |
5989 | break; |
5990 | } |
5991 | } |
5992 | |
5993 | if (!ParamsOk) |
5994 | continue; |
5995 | |
5996 | // Found a suitable candidate, stop here. |
5997 | VecFunc = CI->getModule()->getFunction(Name: Info.VectorName); |
5998 | FuncInfo = Info; |
5999 | break; |
6000 | } |
6001 | |
6002 | if (TLI && VecFunc && !CI->isNoBuiltin()) |
6003 | VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind); |
6004 | |
6005 | // Find the cost of an intrinsic; some targets may have instructions that |
6006 | // perform the operation without needing an actual call. |
6007 | Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI); |
6008 | if (IID != Intrinsic::not_intrinsic) |
6009 | IntrinsicCost = getVectorIntrinsicCost(CI, VF); |
6010 | |
6011 | InstructionCost Cost = ScalarCost; |
6012 | InstWidening Decision = CM_Scalarize; |
6013 | |
6014 | if (VectorCost <= Cost) { |
6015 | Cost = VectorCost; |
6016 | Decision = CM_VectorCall; |
6017 | } |
6018 | |
6019 | if (IntrinsicCost <= Cost) { |
6020 | Cost = IntrinsicCost; |
6021 | Decision = CM_IntrinsicCall; |
6022 | } |
6023 | |
6024 | setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID, |
6025 | MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost); |
6026 | } |
6027 | } |
6028 | } |
6029 | |
6030 | bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) { |
6031 | if (!Legal->isInvariant(V: Op)) |
6032 | return false; |
  // Consider Op invariant only if neither it nor its operands are predicated
  // instructions in the loop; otherwise it is not trivially hoistable.
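  // For example (illustrative): a loop-invariant 'udiv' sitting in a
  // predicated block is not treated as invariant here, because hoisting it
  // out of the loop could execute a division the original code guards against.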
6035 | auto *OpI = dyn_cast<Instruction>(Val: Op); |
6036 | return !OpI || !TheLoop->contains(Inst: OpI) || |
6037 | (!isPredicatedInst(I: OpI) && |
6038 | (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) && |
6039 | all_of(Range: OpI->operands(), |
6040 | P: [this](Value *Op) { return shouldConsiderInvariant(Op); })); |
6041 | } |
6042 | |
6043 | InstructionCost |
6044 | LoopVectorizationCostModel::getInstructionCost(Instruction *I, |
6045 | ElementCount VF) { |
6046 | // If we know that this instruction will remain uniform, check the cost of |
6047 | // the scalar version. |
6048 | if (isUniformAfterVectorization(I, VF)) |
6049 | VF = ElementCount::getFixed(MinVal: 1); |
6050 | |
6051 | if (VF.isVector() && isProfitableToScalarize(I, VF)) |
6052 | return InstsToScalarize[VF][I]; |
6053 | |
6054 | // Forced scalars do not have any scalarization overhead. |
6055 | auto ForcedScalar = ForcedScalars.find(Val: VF); |
6056 | if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { |
6057 | auto InstSet = ForcedScalar->second; |
6058 | if (InstSet.count(Ptr: I)) |
6059 | return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) * |
6060 | VF.getKnownMinValue(); |
6061 | } |
6062 | |
6063 | Type *RetTy = I->getType(); |
6064 | if (canTruncateToMinimalBitwidth(I, VF)) |
6065 | RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]); |
6066 | auto *SE = PSE.getSE(); |
6067 | |
6068 | Type *VectorTy; |
6069 | if (isScalarAfterVectorization(I, VF)) { |
6070 | [[maybe_unused]] auto HasSingleCopyAfterVectorization = |
6071 | [this](Instruction *I, ElementCount VF) -> bool { |
6072 | if (VF.isScalar()) |
6073 | return true; |
6074 | |
6075 | auto Scalarized = InstsToScalarize.find(Val: VF); |
6076 | assert(Scalarized != InstsToScalarize.end() && |
6077 | "VF not yet analyzed for scalarization profitability" ); |
6078 | return !Scalarized->second.count(Val: I) && |
6079 | llvm::all_of(Range: I->users(), P: [&](User *U) { |
6080 | auto *UI = cast<Instruction>(Val: U); |
6081 | return !Scalarized->second.count(Val: UI); |
6082 | }); |
6083 | }; |
6084 | |
6085 | // With the exception of GEPs and PHIs, after scalarization there should |
6086 | // only be one copy of the instruction generated in the loop. This is |
6087 | // because the VF is either 1, or any instructions that need scalarizing |
  // have already been dealt with by the time we get here. As a result, we
  // don't have to multiply the instruction cost by VF.
6090 | assert(I->getOpcode() == Instruction::GetElementPtr || |
6091 | I->getOpcode() == Instruction::PHI || |
6092 | (I->getOpcode() == Instruction::BitCast && |
6093 | I->getType()->isPointerTy()) || |
6094 | HasSingleCopyAfterVectorization(I, VF)); |
6095 | VectorTy = RetTy; |
6096 | } else |
6097 | VectorTy = toVectorizedTy(Ty: RetTy, EC: VF); |
6098 | |
6099 | if (VF.isVector() && VectorTy->isVectorTy() && |
6100 | !TTI.getNumberOfParts(Tp: VectorTy)) |
6101 | return InstructionCost::getInvalid(); |
6102 | |
6103 | // TODO: We need to estimate the cost of intrinsic calls. |
6104 | switch (I->getOpcode()) { |
6105 | case Instruction::GetElementPtr: |
6106 | // We mark this instruction as zero-cost because the cost of GEPs in |
6107 | // vectorized code depends on whether the corresponding memory instruction |
6108 | // is scalarized or not. Therefore, we handle GEPs with the memory |
6109 | // instruction cost. |
6110 | return 0; |
6111 | case Instruction::Br: { |
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6115 | // Note that the conditional branch from the loop latch will be replaced by |
6116 | // a single branch controlling the loop, so there is no extra overhead from |
6117 | // scalarization. |
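    // Illustrative example: with VF = 4 and one predicated block, the cost
    // below charges 4 extracts from the <4 x i1> compare plus 4 scalar
    // branches, one per guarded scalar block created during scalarization.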
6118 | bool ScalarPredicatedBB = false; |
6119 | BranchInst *BI = cast<BranchInst>(Val: I); |
6120 | if (VF.isVector() && BI->isConditional() && |
6121 | (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) || |
6122 | PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) && |
6123 | BI->getParent() != TheLoop->getLoopLatch()) |
6124 | ScalarPredicatedBB = true; |
6125 | |
6126 | if (ScalarPredicatedBB) { |
6127 | // Not possible to scalarize scalable vector with predicated instructions. |
6128 | if (VF.isScalable()) |
6129 | return InstructionCost::getInvalid(); |
6130 | // Return cost for branches around scalarized and predicated blocks. |
6131 | auto *VecI1Ty = |
6132 | VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF); |
6133 | return ( |
6134 | TTI.getScalarizationOverhead( |
6135 | Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), |
6136 | /*Insert*/ false, /*Extract*/ true, CostKind) + |
6137 | (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue())); |
6138 | } |
6139 | |
6140 | if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar()) |
6141 | // The back-edge branch will remain, as will all scalar branches. |
6142 | return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind); |
6143 | |
6144 | // This branch will be eliminated by if-conversion. |
6145 | return 0; |
6146 | // Note: We currently assume zero cost for an unconditional branch inside |
6147 | // a predicated block since it will become a fall-through, although we |
6148 | // may decide in the future to call TTI for all branches. |
6149 | } |
6150 | case Instruction::Switch: { |
6151 | if (VF.isScalar()) |
6152 | return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind); |
6153 | auto *Switch = cast<SwitchInst>(Val: I); |
6154 | return Switch->getNumCases() * |
6155 | TTI.getCmpSelInstrCost( |
6156 | Opcode: Instruction::ICmp, |
6157 | ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF), |
6158 | CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF), |
6159 | VecPred: CmpInst::ICMP_EQ, CostKind); |
6160 | } |
6161 | case Instruction::PHI: { |
6162 | auto *Phi = cast<PHINode>(Val: I); |
6163 | |
6164 | // First-order recurrences are replaced by vector shuffles inside the loop. |
6165 | if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) { |
6166 | SmallVector<int> Mask(VF.getKnownMinValue()); |
6167 | std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1); |
6168 | return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice, |
6169 | DstTy: cast<VectorType>(Val: VectorTy), |
6170 | SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind, |
6171 | Index: VF.getKnownMinValue() - 1); |
6172 | } |
6173 | |
6174 | // Phi nodes in non-header blocks (not inductions, reductions, etc.) are |
6175 | // converted into select instructions. We require N - 1 selects per phi |
6176 | // node, where N is the number of incoming values. |
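    // Illustrative example: a phi merging three incoming values (N = 3) is
    // lowered to N - 1 = 2 vector selects, each keyed off the corresponding
    // edge mask, which is what the cost computed below models.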
6177 | if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { |
6178 | Type *ResultTy = Phi->getType(); |
6179 | |
6180 | // All instructions in an Any-of reduction chain are narrowed to bool. |
6181 | // Check if that is the case for this phi node. |
      auto *HeaderUser = cast_if_present<PHINode>(
6183 | Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * { |
6184 | auto *Phi = dyn_cast<PHINode>(Val: U); |
6185 | if (Phi && Phi->getParent() == TheLoop->getHeader()) |
6186 | return Phi; |
6187 | return nullptr; |
6188 | })); |
6189 | if (HeaderUser) { |
6190 | auto &ReductionVars = Legal->getReductionVars(); |
6191 | auto Iter = ReductionVars.find(Key: HeaderUser); |
6192 | if (Iter != ReductionVars.end() && |
6193 | RecurrenceDescriptor::isAnyOfRecurrenceKind( |
6194 | Kind: Iter->second.getRecurrenceKind())) |
6195 | ResultTy = Type::getInt1Ty(C&: Phi->getContext()); |
6196 | } |
6197 | return (Phi->getNumIncomingValues() - 1) * |
6198 | TTI.getCmpSelInstrCost( |
6199 | Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF), |
6200 | CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF), |
6201 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
6202 | } |
6203 | |
    // When tail folding with EVL, if the phi is part of an out-of-loop
    // reduction then it will be transformed into a wide vp_merge.
6206 | if (VF.isVector() && foldTailWithEVL() && |
6207 | Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) { |
6208 | IntrinsicCostAttributes ICA( |
6209 | Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF), |
6210 | {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)}); |
6211 | return TTI.getIntrinsicInstrCost(ICA, CostKind); |
6212 | } |
6213 | |
6214 | return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind); |
6215 | } |
6216 | case Instruction::UDiv: |
6217 | case Instruction::SDiv: |
6218 | case Instruction::URem: |
6219 | case Instruction::SRem: |
6220 | if (VF.isVector() && isPredicatedInst(I)) { |
6221 | const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); |
6222 | return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? |
6223 | ScalarCost : SafeDivisorCost; |
6224 | } |
6225 | // We've proven all lanes safe to speculate, fall through. |
6226 | [[fallthrough]]; |
6227 | case Instruction::Add: |
6228 | case Instruction::Sub: { |
6229 | auto Info = Legal->getHistogramInfo(I); |
6230 | if (Info && VF.isVector()) { |
6231 | const HistogramInfo *HGram = Info.value(); |
6232 | // Assume that a non-constant update value (or a constant != 1) requires |
6233 | // a multiply, and add that into the cost. |
6234 | InstructionCost MulCost = TTI::TCC_Free; |
6235 | ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1)); |
6236 | if (!RHS || RHS->getZExtValue() != 1) |
6237 | MulCost = |
6238 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6239 | |
6240 | // Find the cost of the histogram operation itself. |
6241 | Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF); |
6242 | Type *ScalarTy = I->getType(); |
6243 | Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF); |
6244 | IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add, |
6245 | Type::getVoidTy(C&: I->getContext()), |
6246 | {PtrTy, ScalarTy, MaskTy}); |
6247 | |
6248 | // Add the costs together with the add/sub operation. |
6249 | return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost + |
6250 | TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind); |
6251 | } |
6252 | [[fallthrough]]; |
6253 | } |
6254 | case Instruction::FAdd: |
6255 | case Instruction::FSub: |
6256 | case Instruction::Mul: |
6257 | case Instruction::FMul: |
6258 | case Instruction::FDiv: |
6259 | case Instruction::FRem: |
6260 | case Instruction::Shl: |
6261 | case Instruction::LShr: |
6262 | case Instruction::AShr: |
6263 | case Instruction::And: |
6264 | case Instruction::Or: |
6265 | case Instruction::Xor: { |
6266 | // If we're speculating on the stride being 1, the multiplication may |
6267 | // fold away. We can generalize this for all operations using the notion |
6268 | // of neutral elements. (TODO) |
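    // Illustrative example: for an access like 'p[i * Stride]' vectorized
    // under a runtime check that Stride == 1, the 'mul' feeding the address
    // computation folds away, so it is costed as free below.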
6269 | if (I->getOpcode() == Instruction::Mul && |
6270 | ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) && |
6271 | PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) || |
6272 | (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) && |
6273 | PSE.getSCEV(V: I->getOperand(i: 1))->isOne()))) |
6274 | return 0; |
6275 | |
6276 | // Detect reduction patterns |
6277 | if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy)) |
6278 | return *RedCost; |
6279 | |
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6282 | Value *Op2 = I->getOperand(i: 1); |
6283 | if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) && |
6284 | PSE.getSE()->isSCEVable(Ty: Op2->getType()) && |
6285 | isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) { |
6286 | Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue(); |
6287 | } |
6288 | auto Op2Info = TTI.getOperandInfo(V: Op2); |
6289 | if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue && |
6290 | shouldConsiderInvariant(Op: Op2)) |
6291 | Op2Info.Kind = TargetTransformInfo::OK_UniformValue; |
6292 | |
6293 | SmallVector<const Value *, 4> Operands(I->operand_values()); |
6294 | return TTI.getArithmeticInstrCost( |
6295 | Opcode: I->getOpcode(), Ty: VectorTy, CostKind, |
6296 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
6297 | Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI); |
6298 | } |
6299 | case Instruction::FNeg: { |
6300 | return TTI.getArithmeticInstrCost( |
6301 | Opcode: I->getOpcode(), Ty: VectorTy, CostKind, |
6302 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
6303 | Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
6304 | Args: I->getOperand(i: 0), CxtI: I); |
6305 | } |
6306 | case Instruction::Select: { |
6307 | SelectInst *SI = cast<SelectInst>(Val: I); |
6308 | const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition()); |
6309 | bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop)); |
6310 | |
6311 | const Value *Op0, *Op1; |
6312 | using namespace llvm::PatternMatch; |
6313 | if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) || |
6314 | match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) { |
6315 | // select x, y, false --> x & y |
6316 | // select x, true, y --> x | y |
6317 | const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0); |
6318 | const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1); |
6319 | assert(Op0->getType()->getScalarSizeInBits() == 1 && |
6320 | Op1->getType()->getScalarSizeInBits() == 1); |
6321 | |
6322 | SmallVector<const Value *, 2> Operands{Op0, Op1}; |
6323 | return TTI.getArithmeticInstrCost( |
6324 | Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy, |
6325 | CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I); |
6326 | } |
6327 | |
6328 | Type *CondTy = SI->getCondition()->getType(); |
6329 | if (!ScalarCond) |
6330 | CondTy = VectorType::get(ElementType: CondTy, EC: VF); |
6331 | |
6332 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
6333 | if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition())) |
6334 | Pred = Cmp->getPredicate(); |
6335 | return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred, |
6336 | CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
6337 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I); |
6338 | } |
6339 | case Instruction::ICmp: |
6340 | case Instruction::FCmp: { |
6341 | Type *ValTy = I->getOperand(i: 0)->getType(); |
6342 | |
6343 | if (canTruncateToMinimalBitwidth(I, VF)) { |
6344 | [[maybe_unused]] Instruction *Op0AsInstruction = |
6345 | dyn_cast<Instruction>(Val: I->getOperand(i: 0)); |
6346 | assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) || |
6347 | MinBWs[I] == MinBWs[Op0AsInstruction]) && |
6348 | "if both the operand and the compare are marked for " |
6349 | "truncation, they must have the same bitwidth" ); |
6350 | ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]); |
6351 | } |
6352 | |
6353 | VectorTy = toVectorTy(Scalar: ValTy, EC: VF); |
6354 | return TTI.getCmpSelInstrCost( |
6355 | Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy), |
6356 | VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind, |
6357 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I); |
6358 | } |
6359 | case Instruction::Store: |
6360 | case Instruction::Load: { |
6361 | ElementCount Width = VF; |
6362 | if (Width.isVector()) { |
6363 | InstWidening Decision = getWideningDecision(I, VF: Width); |
6364 | assert(Decision != CM_Unknown && |
6365 | "CM decision should be taken at this point" ); |
6366 | if (getWideningCost(I, VF) == InstructionCost::getInvalid()) |
6367 | return InstructionCost::getInvalid(); |
6368 | if (Decision == CM_Scalarize) |
6369 | Width = ElementCount::getFixed(MinVal: 1); |
6370 | } |
6371 | VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width); |
6372 | return getMemoryInstructionCost(I, VF); |
6373 | } |
6374 | case Instruction::BitCast: |
6375 | if (I->getType()->isPointerTy()) |
6376 | return 0; |
6377 | [[fallthrough]]; |
6378 | case Instruction::ZExt: |
6379 | case Instruction::SExt: |
6380 | case Instruction::FPToUI: |
6381 | case Instruction::FPToSI: |
6382 | case Instruction::FPExt: |
6383 | case Instruction::PtrToInt: |
6384 | case Instruction::IntToPtr: |
6385 | case Instruction::SIToFP: |
6386 | case Instruction::UIToFP: |
6387 | case Instruction::Trunc: |
6388 | case Instruction::FPTrunc: { |
6389 | // Computes the CastContextHint from a Load/Store instruction. |
6390 | auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint { |
6391 | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
6392 | "Expected a load or a store!" ); |
6393 | |
6394 | if (VF.isScalar() || !TheLoop->contains(Inst: I)) |
6395 | return TTI::CastContextHint::Normal; |
6396 | |
6397 | switch (getWideningDecision(I, VF)) { |
6398 | case LoopVectorizationCostModel::CM_GatherScatter: |
6399 | return TTI::CastContextHint::GatherScatter; |
6400 | case LoopVectorizationCostModel::CM_Interleave: |
6401 | return TTI::CastContextHint::Interleave; |
6402 | case LoopVectorizationCostModel::CM_Scalarize: |
6403 | case LoopVectorizationCostModel::CM_Widen: |
6404 | return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked |
6405 | : TTI::CastContextHint::Normal; |
6406 | case LoopVectorizationCostModel::CM_Widen_Reverse: |
6407 | return TTI::CastContextHint::Reversed; |
6408 | case LoopVectorizationCostModel::CM_Unknown: |
6409 | llvm_unreachable("Instr did not go through cost modelling?" ); |
6410 | case LoopVectorizationCostModel::CM_VectorCall: |
6411 | case LoopVectorizationCostModel::CM_IntrinsicCall: |
6412 | llvm_unreachable_internal(msg: "Instr has invalid widening decision" ); |
6413 | } |
6414 | |
6415 | llvm_unreachable("Unhandled case!" ); |
6416 | }; |
6417 | |
6418 | unsigned Opcode = I->getOpcode(); |
6419 | TTI::CastContextHint CCH = TTI::CastContextHint::None; |
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
6421 | if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) { |
6422 | if (I->hasOneUse()) |
6423 | if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin())) |
6424 | CCH = ComputeCCH(Store); |
6425 | } |
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
6427 | else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt || |
6428 | Opcode == Instruction::FPExt) { |
6429 | if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0))) |
6430 | CCH = ComputeCCH(Load); |
6431 | } |
6432 | |
6433 | // We optimize the truncation of induction variables having constant |
6434 | // integer steps. The cost of these truncations is the same as the scalar |
6435 | // operation. |
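    // Illustrative example: 'trunc i64 %iv to i32' on an induction with a
    // constant step can be rewritten as a truncated induction, so only the
    // scalar trunc cost is charged here rather than a vector cast.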
6436 | if (isOptimizableIVTruncate(I, VF)) { |
6437 | auto *Trunc = cast<TruncInst>(Val: I); |
6438 | return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(), |
6439 | Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc); |
6440 | } |
6441 | |
6442 | // Detect reduction patterns |
6443 | if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy)) |
6444 | return *RedCost; |
6445 | |
6446 | Type *SrcScalarTy = I->getOperand(i: 0)->getType(); |
6447 | Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0)); |
6448 | if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF)) |
6449 | SrcScalarTy = |
6450 | IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]); |
6451 | Type *SrcVecTy = |
6452 | VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy; |
6453 | |
6454 | if (canTruncateToMinimalBitwidth(I, VF)) { |
6455 | // If the result type is <= the source type, there will be no extend |
6456 | // after truncating the users to the minimal required bitwidth. |
6457 | if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() && |
6458 | (I->getOpcode() == Instruction::ZExt || |
6459 | I->getOpcode() == Instruction::SExt)) |
6460 | return 0; |
6461 | } |
6462 | |
6463 | return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I); |
6464 | } |
6465 | case Instruction::Call: |
6466 | return getVectorCallCost(CI: cast<CallInst>(Val: I), VF); |
6467 | case Instruction::ExtractValue: |
6468 | return TTI.getInstructionCost(U: I, CostKind); |
6469 | case Instruction::Alloca: |
6470 | // We cannot easily widen alloca to a scalable alloca, as |
6471 | // the result would need to be a vector of pointers. |
6472 | if (VF.isScalable()) |
6473 | return InstructionCost::getInvalid(); |
6474 | [[fallthrough]]; |
6475 | default: |
6476 | // This opcode is unknown. Assume that it is the same as 'mul'. |
6477 | return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6478 | } // end of switch. |
6479 | } |
6480 | |
6481 | void LoopVectorizationCostModel::collectValuesToIgnore() { |
6482 | // Ignore ephemeral values. |
6483 | CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore); |
6484 | |
6485 | SmallVector<Value *, 4> DeadInterleavePointerOps; |
6486 | SmallVector<Value *, 4> DeadOps; |
6487 | |
6488 | // If a scalar epilogue is required, users outside the loop won't use |
6489 | // live-outs from the vector loop but from the scalar epilogue. Ignore them if |
6490 | // that is the case. |
6491 | bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true); |
6492 | auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) { |
6493 | return RequiresScalarEpilogue && |
6494 | !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent()); |
6495 | }; |
6496 | |
6497 | LoopBlocksDFS DFS(TheLoop); |
6498 | DFS.perform(LI); |
6499 | MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps; |
6500 | for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO()))) |
6501 | for (Instruction &I : reverse(C&: *BB)) { |
      // Find all stores to invariant variables. Since they are going to sink
      // outside the loop, we do not need to calculate their cost.
6504 | StoreInst *SI; |
6505 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
6506 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) { |
6507 | ValuesToIgnore.insert(Ptr: &I); |
6508 | DeadInvariantStoreOps[SI->getPointerOperand()].push_back( |
6509 | Elt: SI->getValueOperand()); |
6510 | } |
6511 | |
6512 | if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I)) |
6513 | continue; |
6514 | |
      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps, to seed the worklist.
6517 | if (wouldInstructionBeTriviallyDead(I: &I, TLI) && |
6518 | all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) { |
6519 | return VecValuesToIgnore.contains(Ptr: U) || |
6520 | ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U); |
6521 | })) |
6522 | DeadOps.push_back(Elt: &I); |
6523 | |
6524 | // For interleave groups, we only create a pointer for the start of the |
6525 | // interleave group. Queue up addresses of group members except the insert |
6526 | // position for further processing. |
6527 | if (isAccessInterleaved(Instr: &I)) { |
6528 | auto *Group = getInterleavedAccessGroup(Instr: &I); |
6529 | if (Group->getInsertPos() == &I) |
6530 | continue; |
6531 | Value *PointerOp = getLoadStorePointerOperand(V: &I); |
6532 | DeadInterleavePointerOps.push_back(Elt: PointerOp); |
6533 | } |
6534 | |
      // Queue branches for analysis. They are dead if their successors only
      // contain dead instructions.
6537 | if (auto *Br = dyn_cast<BranchInst>(Val: &I)) { |
6538 | if (Br->isConditional()) |
6539 | DeadOps.push_back(Elt: &I); |
6540 | } |
6541 | } |
6542 | |
6543 | // Mark ops feeding interleave group members as free, if they are only used |
6544 | // by other dead computations. |
6545 | for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) { |
6546 | auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]); |
6547 | if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) { |
6548 | Instruction *UI = cast<Instruction>(Val: U); |
6549 | return !VecValuesToIgnore.contains(Ptr: U) && |
6550 | (!isAccessInterleaved(Instr: UI) || |
6551 | getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI); |
6552 | })) |
6553 | continue; |
6554 | VecValuesToIgnore.insert(Ptr: Op); |
6555 | DeadInterleavePointerOps.append(in_start: Op->op_begin(), in_end: Op->op_end()); |
6556 | } |
6557 | |
6558 | for (const auto &[_, Ops] : DeadInvariantStoreOps) |
6559 | llvm::append_range(C&: DeadOps, R: ArrayRef(Ops).drop_back()); |
6560 | |
6561 | // Mark ops that would be trivially dead and are only used by ignored |
6562 | // instructions as free. |
  BasicBlock *Header = TheLoop->getHeader();
6564 | |
6565 | // Returns true if the block contains only dead instructions. Such blocks will |
6566 | // be removed by VPlan-to-VPlan transforms and won't be considered by the |
6567 | // VPlan-based cost model, so skip them in the legacy cost-model as well. |
6568 | auto IsEmptyBlock = [this](BasicBlock *BB) { |
6569 | return all_of(Range&: *BB, P: [this](Instruction &I) { |
6570 | return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) || |
6571 | (isa<BranchInst>(Val: &I) && !cast<BranchInst>(Val: &I)->isConditional()); |
6572 | }); |
6573 | }; |
6574 | for (unsigned I = 0; I != DeadOps.size(); ++I) { |
6575 | auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]); |
6576 | |
6577 | // Check if the branch should be considered dead. |
6578 | if (auto *Br = dyn_cast_or_null<BranchInst>(Val: Op)) { |
6579 | BasicBlock *ThenBB = Br->getSuccessor(i: 0); |
6580 | BasicBlock *ElseBB = Br->getSuccessor(i: 1); |
      // Don't consider branches leaving the loop for simplification.
6582 | if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB)) |
6583 | continue; |
6584 | bool ThenEmpty = IsEmptyBlock(ThenBB); |
6585 | bool ElseEmpty = IsEmptyBlock(ElseBB); |
6586 | if ((ThenEmpty && ElseEmpty) || |
6587 | (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB && |
6588 | ElseBB->phis().empty()) || |
6589 | (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB && |
6590 | ThenBB->phis().empty())) { |
6591 | VecValuesToIgnore.insert(Ptr: Br); |
6592 | DeadOps.push_back(Elt: Br->getCondition()); |
6593 | } |
6594 | continue; |
6595 | } |
6596 | |
6597 | // Skip any op that shouldn't be considered dead. |
6598 | if (!Op || !TheLoop->contains(Inst: Op) || |
6599 | (isa<PHINode>(Val: Op) && Op->getParent() == Header) || |
6600 | !wouldInstructionBeTriviallyDead(I: Op, TLI) || |
6601 | any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) { |
6602 | return !VecValuesToIgnore.contains(Ptr: U) && |
6603 | !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U); |
6604 | })) |
6605 | continue; |
6606 | |
6607 | // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore |
6608 | // which applies for both scalar and vector versions. Otherwise it is only |
6609 | // dead in vector versions, so only add it to VecValuesToIgnore. |
6610 | if (all_of(Range: Op->users(), |
6611 | P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); })) |
6612 | ValuesToIgnore.insert(Ptr: Op); |
6613 | |
6614 | VecValuesToIgnore.insert(Ptr: Op); |
6615 | DeadOps.append(in_start: Op->op_begin(), in_end: Op->op_end()); |
6616 | } |
6617 | |
6618 | // Ignore type-promoting instructions we identified during reduction |
6619 | // detection. |
6620 | for (const auto &Reduction : Legal->getReductionVars()) { |
6621 | const RecurrenceDescriptor &RedDes = Reduction.second; |
6622 | const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); |
6623 | VecValuesToIgnore.insert_range(R: Casts); |
6624 | } |
6625 | // Ignore type-casting instructions we identified during induction |
6626 | // detection. |
6627 | for (const auto &Induction : Legal->getInductionVars()) { |
6628 | const InductionDescriptor &IndDes = Induction.second; |
6629 | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
6630 | VecValuesToIgnore.insert_range(R: Casts); |
6631 | } |
6632 | } |
6633 | |
6634 | void LoopVectorizationCostModel::collectInLoopReductions() { |
6635 | // Avoid duplicating work finding in-loop reductions. |
6636 | if (!InLoopReductions.empty()) |
6637 | return; |
6638 | |
6639 | for (const auto &Reduction : Legal->getReductionVars()) { |
6640 | PHINode *Phi = Reduction.first; |
6641 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
6642 | |
6643 | // We don't collect reductions that are type promoted (yet). |
6644 | if (RdxDesc.getRecurrenceType() != Phi->getType()) |
6645 | continue; |
6646 | |
6647 | // If the target would prefer this reduction to happen "in-loop", then we |
6648 | // want to record it as such. |
6649 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
6650 | if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
6651 | !TTI.preferInLoopReduction(Kind, Ty: Phi->getType())) |
6652 | continue; |
6653 | |
6654 | // Check that we can correctly put the reductions into the loop, by |
6655 | // finding the chain of operations that leads from the phi to the loop |
6656 | // exit value. |
6657 | SmallVector<Instruction *, 4> ReductionOperations = |
6658 | RdxDesc.getReductionOpChain(Phi, L: TheLoop); |
6659 | bool InLoop = !ReductionOperations.empty(); |
6660 | |
6661 | if (InLoop) { |
6662 | InLoopReductions.insert(Ptr: Phi); |
6663 | // Add the elements to InLoopReductionImmediateChains for cost modelling. |
6664 | Instruction *LastChain = Phi; |
6665 | for (auto *I : ReductionOperations) { |
6666 | InLoopReductionImmediateChains[I] = LastChain; |
6667 | LastChain = I; |
6668 | } |
6669 | } |
6670 | LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop" ) |
6671 | << " reduction for phi: " << *Phi << "\n" ); |
6672 | } |
6673 | } |
6674 | |
6675 | // This function will select a scalable VF if the target supports scalable |
6676 | // vectors and a fixed one otherwise. |
6677 | // TODO: we could return a pair of values that specify the max VF and |
6678 | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6680 | // doesn't have a cost model that can choose which plan to execute if |
6681 | // more than one is generated. |
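// Illustrative example: with scalable vectors enabled, a minimum register
// width of 128 bits and a widest element type of 32 bits, this returns
// ElementCount::getScalable(4), i.e. VF = vscale x 4.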
6682 | static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, |
6683 | LoopVectorizationCostModel &CM) { |
6684 | unsigned WidestType; |
6685 | std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes(); |
6686 | |
6687 | TargetTransformInfo::RegisterKind RegKind = |
6688 | TTI.enableScalableVectorization() |
6689 | ? TargetTransformInfo::RGK_ScalableVector |
6690 | : TargetTransformInfo::RGK_FixedWidthVector; |
6691 | |
6692 | TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind); |
6693 | unsigned N = RegSize.getKnownMinValue() / WidestType; |
6694 | return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable()); |
6695 | } |
6696 | |
6697 | VectorizationFactor |
6698 | LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { |
6699 | ElementCount VF = UserVF; |
6700 | // Outer loop handling: They may require CFG and instruction level |
6701 | // transformations before even evaluating whether vectorization is profitable. |
6702 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
6703 | // the vectorization pipeline. |
6704 | if (!OrigLoop->isInnermost()) { |
6705 | // If the user doesn't provide a vectorization factor, determine a |
6706 | // reasonable one. |
6707 | if (UserVF.isZero()) { |
6708 | VF = determineVPlanVF(TTI, CM); |
6709 | LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n" ); |
6710 | |
6711 | // Make sure we have a VF > 1 for stress testing. |
6712 | if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) { |
6713 | LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " |
6714 | << "overriding computed VF.\n" ); |
6715 | VF = ElementCount::getFixed(MinVal: 4); |
6716 | } |
6717 | } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() && |
6718 | !ForceTargetSupportsScalableVectors) { |
6719 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but " |
6720 | << "not supported by the target.\n" ); |
6721 | reportVectorizationFailure( |
6722 | DebugMsg: "Scalable vectorization requested but not supported by the target" , |
6723 | OREMsg: "the scalable user-specified vectorization width for outer-loop " |
6724 | "vectorization cannot be used because the target does not support " |
6725 | "scalable vectors." , |
6726 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop: OrigLoop); |
6727 | return VectorizationFactor::Disabled(); |
6728 | } |
6729 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled." ); |
6730 | assert(isPowerOf2_32(VF.getKnownMinValue()) && |
6731 | "VF needs to be a power of two" ); |
6732 | LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "" ) |
6733 | << "VF " << VF << " to build VPlans.\n" ); |
6734 | buildVPlans(MinVF: VF, MaxVF: VF); |
6735 | |
6736 | if (VPlans.empty()) |
6737 | return VectorizationFactor::Disabled(); |
6738 | |
6739 | // For VPlan build stress testing, we bail out after VPlan construction. |
6740 | if (VPlanBuildStressTest) |
6741 | return VectorizationFactor::Disabled(); |
6742 | |
6743 | return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; |
6744 | } |
6745 | |
6746 | LLVM_DEBUG( |
6747 | dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the " |
6748 | "VPlan-native path.\n" ); |
6749 | return VectorizationFactor::Disabled(); |
6750 | } |
6751 | |
6752 | void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { |
6753 | assert(OrigLoop->isInnermost() && "Inner loop expected." ); |
6754 | CM.collectValuesToIgnore(); |
6755 | CM.collectElementTypesForWidening(); |
6756 | |
6757 | FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); |
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6759 | return; |
6760 | |
  // Invalidate interleave groups if all blocks of the loop will be predicated.
6762 | if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) && |
6763 | !useMaskedInterleavedAccesses(TTI)) { |
6764 | LLVM_DEBUG( |
6765 | dbgs() |
6766 | << "LV: Invalidate all interleaved groups due to fold-tail by masking " |
6767 | "which requires masked-interleaved support.\n" ); |
6768 | if (CM.InterleaveInfo.invalidateGroups()) |
6769 | // Invalidating interleave groups also requires invalidating all decisions |
6770 | // based on them, which includes widening decisions and uniform and scalar |
6771 | // values. |
6772 | CM.invalidateCostModelingDecisions(); |
6773 | } |
6774 | |
6775 | if (CM.foldTailByMasking()) |
6776 | Legal->prepareToFoldTailByMasking(); |
6777 | |
6778 | ElementCount MaxUserVF = |
6779 | UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; |
6780 | if (UserVF) { |
6781 | if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) { |
6782 | reportVectorizationInfo( |
6783 | Msg: "UserVF ignored because it may be larger than the maximal safe VF" , |
6784 | ORETag: "InvalidUserVF" , ORE, TheLoop: OrigLoop); |
6785 | } else { |
6786 | assert(isPowerOf2_32(UserVF.getKnownMinValue()) && |
6787 | "VF needs to be a power of two" ); |
6788 | // Collect the instructions (and their associated costs) that will be more |
6789 | // profitable to scalarize. |
6790 | CM.collectInLoopReductions(); |
6791 | if (CM.selectUserVectorizationFactor(UserVF)) { |
6792 | LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n" ); |
6793 | buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF); |
6794 | LLVM_DEBUG(printPlans(dbgs())); |
6795 | return; |
6796 | } |
6797 | reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs." , |
6798 | ORETag: "InvalidCost" , ORE, TheLoop: OrigLoop); |
6799 | } |
6800 | } |
6801 | |
6802 | // Collect the Vectorization Factor Candidates. |
6803 | SmallVector<ElementCount> VFCandidates; |
6804 | for (auto VF = ElementCount::getFixed(MinVal: 1); |
6805 | ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2) |
6806 | VFCandidates.push_back(Elt: VF); |
6807 | for (auto VF = ElementCount::getScalable(MinVal: 1); |
6808 | ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2) |
6809 | VFCandidates.push_back(Elt: VF); |
6810 | |
6811 | CM.collectInLoopReductions(); |
6812 | for (const auto &VF : VFCandidates) { |
6813 | // Collect Uniform and Scalar instructions after vectorization with VF. |
6814 | CM.collectNonVectorizedAndSetWideningDecisions(VF); |
6815 | } |
6816 | |
6817 | buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF); |
6818 | buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF); |
6819 | |
6820 | LLVM_DEBUG(printPlans(dbgs())); |
6821 | } |
6822 | |
6823 | InstructionCost VPCostContext::getLegacyCost(Instruction *UI, |
6824 | ElementCount VF) const { |
6825 | if (ForceTargetInstructionCost.getNumOccurrences()) |
6826 | return InstructionCost(ForceTargetInstructionCost.getNumOccurrences()); |
6827 | return CM.getInstructionCost(I: UI, VF); |
6828 | } |
6829 | |
6830 | bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I, |
6831 | ElementCount VF) const { |
6832 | return CM.isUniformAfterVectorization(I, VF); |
6833 | } |
6834 | |
6835 | bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { |
6836 | return CM.ValuesToIgnore.contains(Ptr: UI) || |
6837 | (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) || |
6838 | SkipCostComputation.contains(Ptr: UI); |
6839 | } |
6840 | |
6841 | InstructionCost |
6842 | LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, |
6843 | VPCostContext &CostCtx) const { |
6844 | InstructionCost Cost; |
6845 | // Cost modeling for inductions is inaccurate in the legacy cost model |
  // compared to the recipes that are generated. To match here, initially during
  // VPlan cost-model bring-up, directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
6849 | // any recipes associated with the original induction increment instruction |
6850 | // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute |
6851 | // the cost of induction phis and increments (both that are represented by |
6852 | // recipes and those that are not), to avoid distinguishing between them here, |
6853 | // and skip all recipes that represent induction phis and increments (the |
6854 | // former case) later on, if they exist, to avoid counting them twice. |
6855 | // Similarly we pre-compute the cost of any optimized truncates. |
6856 | // TODO: Switch to more accurate costing based on VPlan. |
6857 | for (const auto &[IV, IndDesc] : Legal->getInductionVars()) { |
6858 | Instruction *IVInc = cast<Instruction>( |
6859 | Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch())); |
6860 | SmallVector<Instruction *> IVInsts = {IVInc}; |
6861 | for (unsigned I = 0; I != IVInsts.size(); I++) { |
6862 | for (Value *Op : IVInsts[I]->operands()) { |
6863 | auto *OpI = dyn_cast<Instruction>(Val: Op); |
6864 | if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse()) |
6865 | continue; |
6866 | IVInsts.push_back(Elt: OpI); |
6867 | } |
6868 | } |
6869 | IVInsts.push_back(Elt: IV); |
6870 | for (User *U : IV->users()) { |
6871 | auto *CI = cast<Instruction>(Val: U); |
6872 | if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF)) |
6873 | continue; |
6874 | IVInsts.push_back(Elt: CI); |
6875 | } |
6876 | |
6877 | // If the vector loop gets executed exactly once with the given VF, ignore |
6878 | // the costs of comparison and induction instructions, as they'll get |
6879 | // simplified away. |
6880 | // TODO: Remove this code after stepping away from the legacy cost model and |
6881 | // adding code to simplify VPlans before calculating their costs. |
6882 | auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop); |
6883 | if (TC == VF && !CM.foldTailByMasking()) |
6884 | addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(), |
6885 | InstsToIgnore&: CostCtx.SkipCostComputation); |
6886 | |
6887 | for (Instruction *IVInst : IVInsts) { |
6888 | if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector())) |
6889 | continue; |
6890 | InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF); |
6891 | LLVM_DEBUG({ |
6892 | dbgs() << "Cost of " << InductionCost << " for VF " << VF |
6893 | << ": induction instruction " << *IVInst << "\n" ; |
6894 | }); |
6895 | Cost += InductionCost; |
6896 | CostCtx.SkipCostComputation.insert(Ptr: IVInst); |
6897 | } |
6898 | } |
6899 | |
6900 | /// Compute the cost of all exiting conditions of the loop using the legacy |
6901 | /// cost model. This is to match the legacy behavior, which adds the cost of |
6902 | /// all exit conditions. Note that this over-estimates the cost, as there will |
6903 | /// be a single condition to control the vector loop. |
6904 | SmallVector<BasicBlock *> Exiting; |
6905 | CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting); |
6906 | SetVector<Instruction *> ExitInstrs; |
6907 | // Collect all exit conditions. |
6908 | for (BasicBlock *EB : Exiting) { |
6909 | auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator()); |
6910 | if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector())) |
6911 | continue; |
6912 | if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) { |
6913 | ExitInstrs.insert(X: CondI); |
6914 | } |
6915 | } |
6916 | // Compute the cost of all instructions only feeding the exit conditions. |
6917 | for (unsigned I = 0; I != ExitInstrs.size(); ++I) { |
6918 | Instruction *CondI = ExitInstrs[I]; |
6919 | if (!OrigLoop->contains(Inst: CondI) || |
6920 | !CostCtx.SkipCostComputation.insert(Ptr: CondI).second) |
6921 | continue; |
6922 | InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF); |
6923 | LLVM_DEBUG({ |
6924 | dbgs() << "Cost of " << CondICost << " for VF " << VF |
6925 | << ": exit condition instruction " << *CondI << "\n" ; |
6926 | }); |
6927 | Cost += CondICost; |
6928 | for (Value *Op : CondI->operands()) { |
6929 | auto *OpI = dyn_cast<Instruction>(Val: Op); |
6930 | if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) || |
6931 | any_of(Range: OpI->users(), P: [&ExitInstrs, this](User *U) { |
6932 | return OrigLoop->contains(BB: cast<Instruction>(Val: U)->getParent()) && |
6933 | !ExitInstrs.contains(key: cast<Instruction>(Val: U)); |
6934 | })) |
6935 | continue; |
6936 | ExitInstrs.insert(X: OpI); |
6937 | } |
6938 | } |
6939 | |
6940 | // Pre-compute the costs for branches except for the backedge, as the number |
6941 | // of replicate regions in a VPlan may not directly match the number of |
6942 | // branches, which would lead to different decisions. |
6943 | // TODO: Compute cost of branches for each replicate region in the VPlan, |
6944 | // which is more accurate than the legacy cost model. |
6945 | for (BasicBlock *BB : OrigLoop->blocks()) { |
6946 | if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector())) |
6947 | continue; |
6948 | CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator()); |
6949 | if (BB == OrigLoop->getLoopLatch()) |
6950 | continue; |
6951 | auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF); |
6952 | Cost += BranchCost; |
6953 | } |
6954 | |
6955 | // Pre-compute costs for instructions that are forced-scalar or profitable to |
6956 | // scalarize. Their costs will be computed separately in the legacy cost |
6957 | // model. |
6958 | for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) { |
6959 | if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector())) |
6960 | continue; |
6961 | CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar); |
6962 | InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF); |
6963 | LLVM_DEBUG({ |
6964 | dbgs() << "Cost of " << ForcedCost << " for VF " << VF |
6965 | << ": forced scalar " << *ForcedScalar << "\n" ; |
6966 | }); |
6967 | Cost += ForcedCost; |
6968 | } |
6969 | for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) { |
6970 | if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector())) |
6971 | continue; |
6972 | CostCtx.SkipCostComputation.insert(Ptr: Scalarized); |
6973 | LLVM_DEBUG({ |
6974 | dbgs() << "Cost of " << ScalarCost << " for VF " << VF |
6975 | << ": profitable to scalarize " << *Scalarized << "\n" ; |
6976 | }); |
6977 | Cost += ScalarCost; |
6978 | } |
6979 | |
6980 | return Cost; |
6981 | } |
6982 | |
6983 | InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, |
6984 | ElementCount VF) const { |
6985 | VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, |
6986 | CM.CostKind); |
6987 | InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); |
6988 | |
6989 | // Now compute and add the VPlan-based cost. |
6990 | Cost += Plan.cost(VF, Ctx&: CostCtx); |
6991 | #ifndef NDEBUG |
6992 | unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); |
6993 | LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost |
6994 | << " (Estimated cost per lane: " ); |
6995 | if (Cost.isValid()) { |
6996 | double CostPerLane = double(Cost.getValue()) / EstimatedWidth; |
6997 | LLVM_DEBUG(dbgs() << format("%.1f" , CostPerLane)); |
6998 | } else /* No point dividing an invalid cost - it will still be invalid */ |
6999 | LLVM_DEBUG(dbgs() << "Invalid" ); |
7000 | LLVM_DEBUG(dbgs() << ")\n" ); |
7001 | #endif |
7002 | return Cost; |
7003 | } |
7004 | |
7005 | #ifndef NDEBUG |
/// Return true if the original loop \p TheLoop contains any instructions that do
7007 | /// not have corresponding recipes in \p Plan and are not marked to be ignored |
7008 | /// in \p CostCtx. This means the VPlan contains simplification that the legacy |
7009 | /// cost-model did not account for. |
7010 | static bool planContainsAdditionalSimplifications(VPlan &Plan, |
7011 | VPCostContext &CostCtx, |
7012 | Loop *TheLoop, |
7013 | ElementCount VF) { |
7014 | // First collect all instructions for the recipes in Plan. |
7015 | auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * { |
7016 | if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) |
7017 | return dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); |
7018 | if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) |
7019 | return &WidenMem->getIngredient(); |
7020 | return nullptr; |
7021 | }; |
7022 | |
7023 | DenseSet<Instruction *> SeenInstrs; |
7024 | auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); |
7025 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) { |
7026 | for (VPRecipeBase &R : *VPBB) { |
7027 | if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) { |
7028 | auto *IG = IR->getInterleaveGroup(); |
7029 | unsigned NumMembers = IG->getNumMembers(); |
7030 | for (unsigned I = 0; I != NumMembers; ++I) { |
7031 | if (Instruction *M = IG->getMember(I)) |
7032 | SeenInstrs.insert(M); |
7033 | } |
7034 | continue; |
7035 | } |
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost them while the legacy model will.
7038 | if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) { |
7039 | if (none_of(FOR->users(), [](VPUser *U) { |
7040 | auto *VPI = dyn_cast<VPInstruction>(U); |
7041 | return VPI && VPI->getOpcode() == |
7042 | VPInstruction::FirstOrderRecurrenceSplice; |
7043 | })) |
7044 | return true; |
7045 | } |
7046 | // The VPlan-based cost model is more accurate for partial reduction and |
7047 | // comparing against the legacy cost isn't desirable. |
7048 | if (isa<VPPartialReductionRecipe>(&R)) |
7049 | return true; |
7050 | |
7051 | /// If a VPlan transform folded a recipe to one producing a single-scalar, |
7052 | /// but the original instruction wasn't uniform-after-vectorization in the |
7053 | /// legacy cost model, the legacy cost overestimates the actual cost. |
7054 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) { |
7055 | if (RepR->isSingleScalar() && |
7056 | !CostCtx.isLegacyUniformAfterVectorization( |
7057 | RepR->getUnderlyingInstr(), VF)) |
7058 | return true; |
7059 | } |
7060 | if (Instruction *UI = GetInstructionForCost(&R)) { |
7061 | // If we adjusted the predicate of the recipe, the cost in the legacy |
7062 | // cost model may be different. |
7063 | if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) { |
7064 | if ((WidenCmp->getOpcode() == Instruction::ICmp || |
7065 | WidenCmp->getOpcode() == Instruction::FCmp) && |
7066 | WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate()) |
7067 | return true; |
7068 | } |
7069 | SeenInstrs.insert(UI); |
7070 | } |
7071 | } |
7072 | } |
7073 | |
7074 | // Return true if the loop contains any instructions that are not also part of |
7075 | // the VPlan or are skipped for VPlan-based cost computations. This indicates |
7076 | // that the VPlan contains extra simplifications. |
7077 | return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, |
7078 | TheLoop](BasicBlock *BB) { |
7079 | return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { |
      // Skip induction phis when checking for simplifications, as they may not
      // be lowered directly to a corresponding PHI recipe.
7082 | if (isa<PHINode>(&I) && BB == TheLoop->getHeader() && |
7083 | CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I))) |
7084 | return false; |
7085 | return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); |
7086 | }); |
7087 | }); |
7088 | } |
7089 | #endif |
7090 | |
7091 | VectorizationFactor LoopVectorizationPlanner::computeBestVF() { |
7092 | if (VPlans.empty()) |
7093 | return VectorizationFactor::Disabled(); |
7094 | // If there is a single VPlan with a single VF, return it directly. |
7095 | VPlan &FirstPlan = *VPlans[0]; |
7096 | if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1) |
7097 | return {*FirstPlan.vectorFactors().begin(), 0, 0}; |
7098 | |
7099 | LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: " |
7100 | << (CM.CostKind == TTI::TCK_RecipThroughput |
7101 | ? "Reciprocal Throughput\n" |
7102 | : CM.CostKind == TTI::TCK_Latency |
7103 | ? "Instruction Latency\n" |
7104 | : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n" |
7105 | : CM.CostKind == TTI::TCK_SizeAndLatency |
7106 | ? "Code Size and Latency\n" |
7107 | : "Unknown\n" )); |
7108 | |
7109 | ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1); |
7110 | assert(hasPlanWithVF(ScalarVF) && |
7111 | "More than a single plan/VF w/o any plan having scalar VF" ); |
7112 | |
7113 | // TODO: Compute scalar cost using VPlan-based cost model. |
7114 | InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF); |
7115 | LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n" ); |
7116 | VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost); |
7117 | VectorizationFactor BestFactor = ScalarFactor; |
7118 | |
7119 | bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
7120 | if (ForceVectorization) { |
7121 | // Ignore scalar width, because the user explicitly wants vectorization. |
7122 | // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
7123 | // evaluation. |
7124 | BestFactor.Cost = InstructionCost::getMax(); |
7125 | } |
7126 | |
7127 | for (auto &P : VPlans) { |
7128 | ArrayRef<ElementCount> VFs(P->vectorFactors().begin(), |
7129 | P->vectorFactors().end()); |
7130 | |
7131 | SmallVector<VPRegisterUsage, 8> RUs; |
7132 | if (CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_ScalableVector) || |
7133 | CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_FixedWidthVector)) |
7134 | RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore); |
7135 | |
7136 | for (unsigned I = 0; I < VFs.size(); I++) { |
7137 | ElementCount VF = VFs[I]; |
7138 | if (VF.isScalar()) |
7139 | continue; |
7140 | if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) { |
7141 | LLVM_DEBUG( |
7142 | dbgs() |
7143 | << "LV: Not considering vector loop of width " << VF |
7144 | << " because it will not generate any vector instructions.\n" ); |
7145 | continue; |
7146 | } |
7147 | if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) { |
7148 | LLVM_DEBUG( |
7149 | dbgs() |
7150 | << "LV: Not considering vector loop of width " << VF |
7151 | << " because it would cause replicated blocks to be generated," |
7152 | << " which isn't allowed when optimizing for size.\n" ); |
7153 | continue; |
7154 | } |
7155 | |
7156 | InstructionCost Cost = cost(Plan&: *P, VF); |
7157 | VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); |
7158 | |
7159 | if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) { |
7160 | LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " |
7161 | << VF << " because it uses too many registers\n" ); |
7162 | continue; |
7163 | } |
7164 | |
7165 | if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail())) |
7166 | BestFactor = CurrentFactor; |
7167 | |
      // If profitable, add it to the ProfitableVFs list.
7169 | if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail())) |
7170 | ProfitableVFs.push_back(Elt: CurrentFactor); |
7171 | } |
7172 | } |
7173 | |
7174 | #ifndef NDEBUG |
7175 | // Select the optimal vectorization factor according to the legacy cost-model. |
7176 | // This is now only used to verify the decisions by the new VPlan-based |
7177 | // cost-model and will be retired once the VPlan-based cost-model is |
7178 | // stabilized. |
7179 | VectorizationFactor LegacyVF = selectVectorizationFactor(); |
7180 | VPlan &BestPlan = getPlanFor(BestFactor.Width); |
7181 | |
7182 | // Pre-compute the cost and use it to check if BestPlan contains any |
7183 | // simplifications not accounted for in the legacy cost model. If that's the |
7184 | // case, don't trigger the assertion, as the extra simplifications may cause a |
7185 | // different VF to be picked by the VPlan-based cost model. |
7186 | VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, |
7187 | CM.CostKind); |
7188 | precomputeCosts(BestPlan, BestFactor.Width, CostCtx); |
7189 | // Verify that the VPlan-based and legacy cost models agree, except for VPlans |
7190 | // with early exits and plans with additional VPlan simplifications. The |
7191 | // legacy cost model doesn't properly model costs for such loops. |
7192 | assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || |
7193 | planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), |
7194 | CostCtx, OrigLoop, |
7195 | BestFactor.Width) || |
7196 | planContainsAdditionalSimplifications( |
7197 | getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) && |
7198 | " VPlan cost model and legacy cost model disagreed" ); |
7199 | assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && |
7200 | "when vectorizing, the scalar cost must be computed." ); |
7201 | #endif |
7202 | |
7203 | LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n" ); |
7204 | return BestFactor; |
7205 | } |
7206 | |
7207 | static void addRuntimeUnrollDisableMetaData(Loop *L) { |
7208 | SmallVector<Metadata *, 4> MDs; |
7209 | // Reserve first location for self reference to the LoopID metadata node. |
7210 | MDs.push_back(Elt: nullptr); |
7211 | bool IsUnrollMetadata = false; |
7212 | MDNode *LoopID = L->getLoopID(); |
7213 | if (LoopID) { |
7214 | // First find existing loop unrolling disable metadata. |
7215 | for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) { |
7216 | auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I)); |
7217 | if (MD) { |
7218 | const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0)); |
7219 | IsUnrollMetadata = |
7220 | S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable" ); |
7221 | } |
7222 | MDs.push_back(Elt: LoopID->getOperand(I)); |
7223 | } |
7224 | } |
7225 | |
7226 | if (!IsUnrollMetadata) { |
7227 | // Add runtime unroll disable metadata. |
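// For example (illustrative), for a loop with existing ID
//   !0 = distinct !{!0, !1}
// this produces
//   !2 = !{!"llvm.loop.unroll.runtime.disable"}
//   !3 = distinct !{!3, !1, !2}
// and sets !3 as the new llvm.loop ID of L.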
7228 | LLVMContext &Context = L->getHeader()->getContext(); |
7229 | SmallVector<Metadata *, 1> DisableOperands; |
7230 | DisableOperands.push_back( |
7231 | Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable" )); |
7232 | MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands); |
7233 | MDs.push_back(Elt: DisableNode); |
7234 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
7235 | // Set operand 0 to refer to the loop id itself. |
7236 | NewLoopID->replaceOperandWith(I: 0, New: NewLoopID); |
7237 | L->setLoopID(NewLoopID); |
7238 | } |
7239 | } |
7240 | |
7241 | static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) { |
7242 | using namespace VPlanPatternMatch; |
7243 | assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult && |
7244 | "RdxResult must be ComputeFindIVResult" ); |
7245 | VPValue *StartVPV = RdxResult->getOperand(N: 1); |
7246 | match(V: StartVPV, P: m_Freeze(Op0: m_VPValue(V&: StartVPV))); |
7247 | return StartVPV->getLiveInIRValue(); |
7248 | } |
7249 | |
// If \p EpiResumePhiR is a resume VPPhi for a reduction when vectorizing the
// epilogue loop, fix the reduction's scalar PHI node by adding the incoming
// value from the main vector loop.
7253 | static void fixReductionScalarResumeWhenVectorizingEpilog( |
7254 | VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) { |
7255 | // Get the VPInstruction computing the reduction result in the middle block. |
7256 | // The first operand may not be from the middle block if it is not connected |
7257 | // to the scalar preheader. In that case, there's nothing to fix. |
7258 | VPValue *Incoming = EpiResumePhiR->getOperand(N: 0); |
7259 | match(V: Incoming, P: VPlanPatternMatch::m_ZExtOrSExt( |
7260 | Op0: VPlanPatternMatch::m_VPValue(V&: Incoming))); |
7261 | auto *EpiRedResult = dyn_cast<VPInstruction>(Val: Incoming); |
7262 | if (!EpiRedResult || |
7263 | (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult && |
7264 | EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult && |
7265 | EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult)) |
7266 | return; |
7267 | |
auto *EpiRedHeaderPhi =
cast<VPReductionPHIRecipe>(Val: EpiRedResult->getOperand(N: 0));
7270 | const RecurrenceDescriptor &RdxDesc = |
7271 | EpiRedHeaderPhi->getRecurrenceDescriptor(); |
7272 | Value *MainResumeValue; |
7273 | if (auto *VPI = dyn_cast<VPInstruction>(Val: EpiRedHeaderPhi->getStartValue())) { |
7274 | assert((VPI->getOpcode() == VPInstruction::Broadcast || |
7275 | VPI->getOpcode() == VPInstruction::ReductionStartVector) && |
7276 | "unexpected start recipe" ); |
7277 | MainResumeValue = VPI->getOperand(N: 0)->getUnderlyingValue(); |
7278 | } else |
7279 | MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); |
7280 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind( |
7281 | Kind: RdxDesc.getRecurrenceKind())) { |
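// For AnyOf reductions the main resume value is expected to be of the form
// icmp ne %rdx, %start (checked by the asserts below); strip the compare to
// recover the underlying resume value (illustrative names).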
7282 | [[maybe_unused]] Value *StartV = |
7283 | EpiRedResult->getOperand(N: 1)->getLiveInIRValue(); |
7284 | auto *Cmp = cast<ICmpInst>(Val: MainResumeValue); |
7285 | assert(Cmp->getPredicate() == CmpInst::ICMP_NE && |
7286 | "AnyOf expected to start with ICMP_NE" ); |
7287 | assert(Cmp->getOperand(1) == StartV && |
7288 | "AnyOf expected to start by comparing main resume value to original " |
7289 | "start value" ); |
7290 | MainResumeValue = Cmp->getOperand(i_nocapture: 0); |
7291 | } else if (RecurrenceDescriptor::isFindIVRecurrenceKind( |
7292 | Kind: RdxDesc.getRecurrenceKind())) { |
7293 | Value *StartV = getStartValueFromReductionResult(RdxResult: EpiRedResult); |
7294 | Value *SentinelV = EpiRedResult->getOperand(N: 2)->getLiveInIRValue(); |
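// For FindIV reductions the main resume value is expected to look like
// select (icmp eq %rdx, %start), %sentinel, %rdx (checked below); recover the
// underlying resume value from the select's false operand (illustrative
// names).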
7295 | using namespace llvm::PatternMatch; |
7296 | Value *Cmp, *OrigResumeV, *CmpOp; |
7297 | [[maybe_unused]] bool IsExpectedPattern = |
7298 | match(V: MainResumeValue, |
7299 | P: m_Select(C: m_OneUse(SubPattern: m_Value(V&: Cmp)), L: m_Specific(V: SentinelV), |
7300 | R: m_Value(V&: OrigResumeV))) && |
7301 | (match(V: Cmp, P: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: OrigResumeV), |
7302 | R: m_Value(V&: CmpOp))) && |
7303 | ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(V: CmpOp)))); |
7304 | assert(IsExpectedPattern && "Unexpected reduction resume pattern" ); |
7305 | MainResumeValue = OrigResumeV; |
7306 | } |
7307 | PHINode *MainResumePhi = cast<PHINode>(Val: MainResumeValue); |
7308 | |
7309 | // When fixing reductions in the epilogue loop we should already have |
7310 | // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry |
7311 | // over the incoming values correctly. |
7312 | auto *EpiResumePhi = cast<PHINode>(Val: State.get(Def: EpiResumePhiR, IsScalar: true)); |
7313 | EpiResumePhi->setIncomingValueForBlock( |
7314 | BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock)); |
7315 | } |
7316 | |
7317 | DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( |
7318 | ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, |
7319 | InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) { |
7320 | assert(BestVPlan.hasVF(BestVF) && |
7321 | "Trying to execute plan with unsupported VF" ); |
7322 | assert(BestVPlan.hasUF(BestUF) && |
7323 | "Trying to execute plan with unsupported UF" ); |
7324 | // TODO: Move to VPlan transform stage once the transition to the VPlan-based |
7325 | // cost model is complete for better cost estimates. |
7326 | VPlanTransforms::runPass(Fn: VPlanTransforms::unrollByUF, Plan&: BestVPlan, Args&: BestUF, |
7327 | Args&: OrigLoop->getHeader()->getContext()); |
7328 | VPlanTransforms::runPass(Fn: VPlanTransforms::replicateByVF, Plan&: BestVPlan, Args&: BestVF); |
7329 | VPlanTransforms::runPass(Fn: VPlanTransforms::materializeBroadcasts, Plan&: BestVPlan); |
7330 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) { |
7331 | std::optional<unsigned> VScale = CM.getVScaleForTuning(); |
7332 | VPlanTransforms::runPass(Fn: VPlanTransforms::addBranchWeightToMiddleTerminator, |
7333 | Plan&: BestVPlan, Args&: BestVF, Args&: VScale); |
7334 | } |
7335 | VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE); |
7336 | VPlanTransforms::simplifyRecipes(Plan&: BestVPlan, CanonicalIVTy&: *Legal->getWidestInductionType()); |
7337 | VPlanTransforms::narrowInterleaveGroups( |
7338 | Plan&: BestVPlan, VF: BestVF, |
7339 | VectorRegWidth: TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)); |
7340 | VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan); |
7341 | |
7342 | VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan, |
7343 | CanonicalIVTy&: *Legal->getWidestInductionType()); |
7344 | // Regions are dissolved after optimizing for VF and UF, which completely |
7345 | // removes unneeded loop regions first. |
7346 | VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan); |
7347 | // Perform the actual loop transformation. |
7348 | VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, |
7349 | OrigLoop->getParentLoop(), |
7350 | Legal->getWidestInductionType()); |
7351 | |
7352 | #ifdef EXPENSIVE_CHECKS |
7353 | assert(DT->verify(DominatorTree::VerificationLevel::Fast)); |
7354 | #endif |
7355 | |
7356 | // 0. Generate SCEV-dependent code in the entry, including TripCount, before |
7357 | // making any changes to the CFG. |
7358 | DenseMap<const SCEV *, Value *> ExpandedSCEVs; |
7359 | auto *Entry = cast<VPIRBasicBlock>(Val: BestVPlan.getEntry()); |
7360 | State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator()); |
7361 | for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) { |
7362 | auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R); |
7363 | if (!ExpSCEV) |
7364 | continue; |
7365 | ExpSCEV->execute(State); |
7366 | ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(Def: ExpSCEV, Lane: VPLane(0)); |
7367 | VPValue *Exp = BestVPlan.getOrAddLiveIn(V: ExpandedSCEVs[ExpSCEV->getSCEV()]); |
7368 | ExpSCEV->replaceAllUsesWith(New: Exp); |
7369 | if (BestVPlan.getTripCount() == ExpSCEV) |
7370 | BestVPlan.resetTripCount(NewTripCount: Exp); |
7371 | ExpSCEV->eraseFromParent(); |
7372 | } |
7373 | |
7374 | if (!ILV.getTripCount()) |
7375 | ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Lane: VPLane(0))); |
7376 | else |
7377 | assert(VectorizingEpilogue && "should only re-use the existing trip " |
7378 | "count during epilogue vectorization" ); |
7379 | |
7380 | // 1. Set up the skeleton for vectorization, including vector pre-header and |
7381 | // middle block. The vector loop is created during VPlan execution. |
7382 | VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: Entry->getSuccessors()[1]); |
7383 | State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); |
7384 | if (VectorizingEpilogue) |
7385 | VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan); |
7386 | |
7387 | assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) && |
7388 | "final VPlan is invalid" ); |
7389 | |
7390 | ILV.printDebugTracesAtStart(); |
7391 | |
7392 | //===------------------------------------------------===// |
7393 | // |
// Notice: any optimization or new instruction that goes
7395 | // into the code below should also be implemented in |
7396 | // the cost-model. |
7397 | // |
7398 | //===------------------------------------------------===// |
7399 | |
7400 | // 2. Copy and widen instructions from the old loop into the new loop. |
7401 | BestVPlan.prepareToExecute( |
7402 | TripCount: ILV.getTripCount(), |
7403 | VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: ILV.LoopVectorPreHeader), State); |
7404 | replaceVPBBWithIRVPBB(VPBB: VectorPH, IRBB: State.CFG.PrevBB); |
7405 | |
7406 | BestVPlan.execute(State: &State); |
7407 | |
7408 | // 2.5 When vectorizing the epilogue, fix reduction resume values from the |
7409 | // additional bypass block. |
7410 | if (VectorizingEpilogue) { |
7411 | assert(!BestVPlan.hasEarlyExit() && |
7412 | "Epilogue vectorisation not yet supported with early exits" ); |
7413 | BasicBlock *PH = OrigLoop->getLoopPreheader(); |
7414 | BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); |
7415 | for (auto *Pred : predecessors(BB: PH)) { |
7416 | for (PHINode &Phi : PH->phis()) { |
7417 | if (Phi.getBasicBlockIndex(BB: Pred) != -1) |
7418 | continue; |
7419 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred); |
7420 | } |
7421 | } |
7422 | VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader(); |
7423 | if (ScalarPH->getNumPredecessors() > 0) { |
7424 | // If ScalarPH has predecessors, we may need to update its reduction |
7425 | // resume values. |
7426 | for (VPRecipeBase &R : ScalarPH->phis()) { |
7427 | fixReductionScalarResumeWhenVectorizingEpilog(EpiResumePhiR: cast<VPPhi>(Val: &R), State, |
7428 | BypassBlock); |
7429 | } |
7430 | } |
7431 | } |
7432 | |
7433 | // 2.6. Maintain Loop Hints |
7434 | // Keep all loop hints from the original loop on the vector loop (we'll |
7435 | // replace the vectorizer-specific hints below). |
VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7437 | if (HeaderVPBB) { |
7438 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
7439 | |
7440 | std::optional<MDNode *> VectorizedLoopID = |
7441 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
7442 | LLVMLoopVectorizeFollowupVectorized}); |
7443 | |
7444 | Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]); |
7445 | if (VectorizedLoopID) { |
7446 | L->setLoopID(*VectorizedLoopID); |
7447 | } else { |
7448 | // Keep all loop hints from the original loop on the vector loop (we'll |
7449 | // replace the vectorizer-specific hints below). |
7450 | if (MDNode *LID = OrigLoop->getLoopID()) |
7451 | L->setLoopID(LID); |
7452 | |
7453 | LoopVectorizeHints Hints(L, true, *ORE); |
7454 | Hints.setAlreadyVectorized(); |
7455 | |
7456 | // Check if it's EVL-vectorized and mark the corresponding metadata. |
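// If so, the loop ID gets an extra metadata node of the form (illustrative):
//   !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}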
7457 | bool IsEVLVectorized = |
7458 | llvm::any_of(Range&: *HeaderVPBB, P: [](const VPRecipeBase &Recipe) { |
// Looking for the ExplicitVectorLength VPInstruction.
7460 | if (const auto *VI = dyn_cast<VPInstruction>(Val: &Recipe)) |
7461 | return VI->getOpcode() == VPInstruction::ExplicitVectorLength; |
7462 | return false; |
7463 | }); |
7464 | if (IsEVLVectorized) { |
7465 | LLVMContext &Context = L->getHeader()->getContext(); |
7466 | MDNode *LoopID = L->getLoopID(); |
7467 | auto *IsEVLVectorizedMD = MDNode::get( |
7468 | Context, |
7469 | MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized.tailfoldingstyle" ), |
7470 | MDString::get(Context, Str: "evl" )}); |
7471 | MDNode *NewLoopID = makePostTransformationMetadata(Context, OrigLoopID: LoopID, RemovePrefixes: {}, |
7472 | AddAttrs: {IsEVLVectorizedMD}); |
7473 | L->setLoopID(NewLoopID); |
7474 | } |
7475 | } |
7476 | TargetTransformInfo::UnrollingPreferences UP; |
7477 | TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); |
7478 | if (!UP.UnrollVectorizedLoop || VectorizingEpilogue) |
7479 | addRuntimeUnrollDisableMetaData(L); |
7480 | } |
7481 | |
7482 | // 3. Fix the vectorized code: take care of header phi's, live-outs, |
7483 | // predication, updating analyses. |
7484 | ILV.fixVectorizedLoop(State); |
7485 | |
7486 | ILV.printDebugTracesAtEnd(); |
7487 | |
7488 | return ExpandedSCEVs; |
7489 | } |
7490 | |
7491 | //===--------------------------------------------------------------------===// |
7492 | // EpilogueVectorizerMainLoop |
7493 | //===--------------------------------------------------------------------===// |
7494 | |
7495 | /// This function is partially responsible for generating the control flow |
7496 | /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. |
7497 | BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { |
7498 | createVectorLoopSkeleton(Prefix: "" ); |
7499 | |
7500 | // Generate the code to check the minimum iteration count of the vector |
7501 | // epilogue (see below). |
7502 | EPI.EpilogueIterationCountCheck = |
7503 | emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true); |
7504 | EPI.EpilogueIterationCountCheck->setName("iter.check" ); |
7505 | |
7506 | // Generate the code to check any assumptions that we've made for SCEV |
7507 | // expressions. |
7508 | EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader); |
7509 | |
7510 | // Generate the code that checks at runtime if arrays overlap. We put the |
7511 | // checks into a separate block to make the more common case of few elements |
7512 | // faster. |
7513 | EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader); |
7514 | |
7515 | // Generate the iteration count check for the main loop, *after* the check |
7516 | // for the epilogue loop, so that the path-length is shorter for the case |
7517 | // that goes directly through the vector epilogue. The longer-path length for |
7518 | // the main loop is compensated for, by the gain from vectorizing the larger |
7519 | // trip count. Note: the branch will get updated later on when we vectorize |
7520 | // the epilogue. |
7521 | EPI.MainLoopIterationCountCheck = |
7522 | emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false); |
7523 | |
7524 | // Generate the induction variable. |
7525 | EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader); |
7526 | |
7527 | replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader); |
7528 | return LoopVectorPreHeader; |
7529 | } |
7530 | |
7531 | void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { |
7532 | LLVM_DEBUG({ |
7533 | dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" |
7534 | << "Main Loop VF:" << EPI.MainLoopVF |
7535 | << ", Main Loop UF:" << EPI.MainLoopUF |
7536 | << ", Epilogue Loop VF:" << EPI.EpilogueVF |
7537 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7538 | }); |
7539 | } |
7540 | |
7541 | void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { |
7542 | DEBUG_WITH_TYPE(VerboseDebug, { |
7543 | dbgs() << "intermediate fn:\n" |
7544 | << *OrigLoop->getHeader()->getParent() << "\n" ; |
7545 | }); |
7546 | } |
7547 | |
7548 | BasicBlock * |
7549 | EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, |
7550 | bool ForEpilogue) { |
7551 | assert(Bypass && "Expected valid bypass basic block." ); |
7552 | Value *Count = getTripCount(); |
7553 | MinProfitableTripCount = ElementCount::getFixed(MinVal: 0); |
7554 | Value *CheckMinIters = createIterationCountCheck( |
7555 | VF: ForEpilogue ? EPI.EpilogueVF : VF, UF: ForEpilogue ? EPI.EpilogueUF : UF); |
7556 | |
7557 | BasicBlock *const TCCheckBlock = LoopVectorPreHeader; |
7558 | if (!ForEpilogue) |
7559 | TCCheckBlock->setName("vector.main.loop.iter.check" ); |
7560 | |
7561 | // Create new preheader for vector loop. |
7562 | LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), |
7563 | DT: static_cast<DominatorTree *>(nullptr), LI, |
7564 | MSSAU: nullptr, BBName: "vector.ph" ); |
7565 | |
7566 | if (ForEpilogue) { |
7567 | // Save the trip count so we don't have to regenerate it in the |
7568 | // vec.epilog.iter.check. This is safe to do because the trip count |
7569 | // generated here dominates the vector epilog iter check. |
7570 | EPI.TripCount = Count; |
7571 | } |
7572 | |
7573 | BranchInst &BI = |
7574 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
7575 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) |
7576 | setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false); |
7577 | ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI); |
7578 | |
7579 | // When vectorizing the main loop, its trip-count check is placed in a new |
7580 | // block, whereas the overall trip-count check is placed in the VPlan entry |
7581 | // block. When vectorizing the epilogue loop, its trip-count check is placed |
7582 | // in the VPlan entry block. |
7583 | if (!ForEpilogue) |
7584 | introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock); |
7585 | return TCCheckBlock; |
7586 | } |
7587 | |
7588 | //===--------------------------------------------------------------------===// |
7589 | // EpilogueVectorizerEpilogueLoop |
7590 | //===--------------------------------------------------------------------===// |
7591 | |
7592 | /// This function is partially responsible for generating the control flow |
7593 | /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. |
7594 | BasicBlock * |
7595 | EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { |
7596 | createVectorLoopSkeleton(Prefix: "vec.epilog." ); |
7597 | |
// Now, compare the remaining count; if there aren't enough iterations to
// execute the vectorized epilogue, skip to the scalar part.
7600 | LoopVectorPreHeader->setName("vec.epilog.ph" ); |
7601 | BasicBlock *VecEpilogueIterationCountCheck = |
7602 | SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->begin(), DT, LI, |
7603 | MSSAU: nullptr, BBName: "vec.epilog.iter.check" , Before: true); |
7604 | emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader, |
7605 | Insert: VecEpilogueIterationCountCheck); |
7606 | AdditionalBypassBlock = VecEpilogueIterationCountCheck; |
7607 | |
7608 | // Adjust the control flow taking the state info from the main loop |
7609 | // vectorization into account. |
7610 | assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && |
7611 | "expected this to be saved from the previous pass." ); |
7612 | EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( |
7613 | From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader); |
7614 | |
7615 | EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( |
7616 | From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader); |
7617 | |
7618 | if (EPI.SCEVSafetyCheck) |
7619 | EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith( |
7620 | From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader); |
7621 | if (EPI.MemSafetyCheck) |
7622 | EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith( |
7623 | From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader); |
7624 | |
7625 | DT->changeImmediateDominator(BB: LoopScalarPreHeader, |
7626 | NewBB: EPI.EpilogueIterationCountCheck); |
7627 | |
7628 | // The vec.epilog.iter.check block may contain Phi nodes from inductions or |
7629 | // reductions which merge control-flow from the latch block and the middle |
7630 | // block. Update the incoming values here and move the Phi into the preheader. |
7631 | SmallVector<PHINode *, 4> PhisInBlock( |
7632 | llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis())); |
7633 | |
7634 | for (PHINode *Phi : PhisInBlock) { |
7635 | Phi->moveBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt()); |
7636 | Phi->replaceIncomingBlockWith( |
7637 | Old: VecEpilogueIterationCountCheck->getSinglePredecessor(), |
7638 | New: VecEpilogueIterationCountCheck); |
7639 | |
7640 | // If the phi doesn't have an incoming value from the |
7641 | // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming |
7642 | // value and also those from other check blocks. This is needed for |
7643 | // reduction phis only. |
7644 | if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) { |
7645 | return EPI.EpilogueIterationCountCheck == IncB; |
7646 | })) |
7647 | continue; |
7648 | Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck); |
7649 | if (EPI.SCEVSafetyCheck) |
7650 | Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck); |
7651 | if (EPI.MemSafetyCheck) |
7652 | Phi->removeIncomingValue(BB: EPI.MemSafetyCheck); |
7653 | } |
7654 | |
7655 | replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader); |
7656 | return LoopVectorPreHeader; |
7657 | } |
7658 | |
7659 | BasicBlock * |
7660 | EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( |
7661 | BasicBlock *Bypass, BasicBlock *Insert) { |
7662 | |
7663 | assert(EPI.TripCount && |
7664 | "Expected trip count to have been saved in the first pass." ); |
7665 | Value *TC = EPI.TripCount; |
7666 | IRBuilder<> Builder(Insert->getTerminator()); |
7667 | Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining" ); |
7668 | |
// Generate code to check if the remaining iteration count is less than
// VF * UF of the vector epilogue loop.
7671 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()) |
7672 | ? ICmpInst::ICMP_ULE |
7673 | : ICmpInst::ICMP_ULT; |
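// (With ULE we also skip the vector epilogue when the remaining count exactly
// equals its step, leaving at least one iteration for the required scalar
// epilogue.)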
7674 | |
7675 | Value *CheckMinIters = |
7676 | Builder.CreateICmp(P, LHS: Count, |
7677 | RHS: createStepForVF(B&: Builder, Ty: Count->getType(), |
7678 | VF: EPI.EpilogueVF, Step: EPI.EpilogueUF), |
7679 | Name: "min.epilog.iters.check" ); |
7680 | |
7681 | BranchInst &BI = |
7682 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
7683 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) { |
7684 | // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't |
7685 | // think the MainLoopStep is correct. |
7686 | unsigned MainLoopStep = UF * VF.getKnownMinValue(); |
7687 | unsigned EpilogueLoopStep = |
7688 | EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); |
7689 | // We assume the remaining `Count` is equally distributed in |
7690 | // [0, MainLoopStep) |
7691 | // So the probability for `Count < EpilogueLoopStep` should be |
7692 | // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep |
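// For example (illustrative values), with MainLoopStep = 8 (VF=4, UF=2) and
// EpilogueLoopStep = 2 (VF=2, UF=1), the estimated skip probability is 2/8
// and the resulting weights are {2, 6}.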
7693 | unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep); |
7694 | const uint32_t Weights[] = {EstimatedSkipCount, |
7695 | MainLoopStep - EstimatedSkipCount}; |
7696 | setBranchWeights(I&: BI, Weights, /*IsExpected=*/false); |
7697 | } |
7698 | ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI); |
7699 | |
7700 | // A new entry block has been created for the epilogue VPlan. Hook it in, as |
7701 | // otherwise we would try to modify the entry to the main vector loop. |
7702 | VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: Insert); |
7703 | VPBasicBlock *OldEntry = Plan.getEntry(); |
7704 | VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry); |
7705 | Plan.setEntry(NewEntry); |
7706 | // OldEntry is now dead and will be cleaned up when the plan gets destroyed. |
7707 | |
7708 | return Insert; |
7709 | } |
7710 | |
7711 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { |
7712 | LLVM_DEBUG({ |
7713 | dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" |
7714 | << "Epilogue Loop VF:" << EPI.EpilogueVF |
7715 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7716 | }); |
7717 | } |
7718 | |
7719 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { |
7720 | DEBUG_WITH_TYPE(VerboseDebug, { |
7721 | dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n" ; |
7722 | }); |
7723 | } |
7724 | |
7725 | VPWidenMemoryRecipe * |
7726 | VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands, |
7727 | VFRange &Range) { |
7728 | assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && |
7729 | "Must be called with either a load or store" ); |
7730 | |
7731 | auto WillWiden = [&](ElementCount VF) -> bool { |
7732 | LoopVectorizationCostModel::InstWidening Decision = |
7733 | CM.getWideningDecision(I, VF); |
7734 | assert(Decision != LoopVectorizationCostModel::CM_Unknown && |
7735 | "CM decision should be taken at this point." ); |
7736 | if (Decision == LoopVectorizationCostModel::CM_Interleave) |
7737 | return true; |
7738 | if (CM.isScalarAfterVectorization(I, VF) || |
7739 | CM.isProfitableToScalarize(I, VF)) |
7740 | return false; |
7741 | return Decision != LoopVectorizationCostModel::CM_Scalarize; |
7742 | }; |
7743 | |
7744 | if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range)) |
7745 | return nullptr; |
7746 | |
7747 | VPValue *Mask = nullptr; |
7748 | if (Legal->isMaskRequired(I)) |
7749 | Mask = getBlockInMask(VPBB: Builder.getInsertBlock()); |
7750 | |
7751 | // Determine if the pointer operand of the access is either consecutive or |
7752 | // reverse consecutive. |
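// E.g. an access such as a[i] is consecutive, while one such as a[n - i] is
// reverse consecutive; the latter is addressed via a VPVectorEndPointerRecipe
// with stride -1 below (illustrative example).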
7753 | LoopVectorizationCostModel::InstWidening Decision = |
7754 | CM.getWideningDecision(I, VF: Range.Start); |
7755 | bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; |
7756 | bool Consecutive = |
7757 | Reverse || Decision == LoopVectorizationCostModel::CM_Widen; |
7758 | |
7759 | VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1]; |
7760 | if (Consecutive) { |
7761 | auto *GEP = dyn_cast<GetElementPtrInst>( |
7762 | Val: Ptr->getUnderlyingValue()->stripPointerCasts()); |
7763 | VPSingleDefRecipe *VectorPtr; |
7764 | if (Reverse) { |
// When folding the tail, we may compute an address that we don't compute in
// the original scalar loop, and it may not be inbounds. Drop Inbounds in that
// case.
7768 | GEPNoWrapFlags Flags = |
7769 | (CM.foldTailByMasking() || !GEP || !GEP->isInBounds()) |
7770 | ? GEPNoWrapFlags::none() |
7771 | : GEPNoWrapFlags::inBounds(); |
7772 | VectorPtr = |
7773 | new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), |
7774 | /*Stride*/ -1, Flags, I->getDebugLoc()); |
7775 | } else { |
7776 | VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), |
7777 | GEP ? GEP->getNoWrapFlags() |
7778 | : GEPNoWrapFlags::none(), |
7779 | I->getDebugLoc()); |
7780 | } |
7781 | Builder.insert(R: VectorPtr); |
7782 | Ptr = VectorPtr; |
7783 | } |
7784 | if (LoadInst *Load = dyn_cast<LoadInst>(Val: I)) |
7785 | return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, |
7786 | VPIRMetadata(*Load, LVer), I->getDebugLoc()); |
7787 | |
7788 | StoreInst *Store = cast<StoreInst>(Val: I); |
7789 | return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, |
7790 | Reverse, VPIRMetadata(*Store, LVer), |
7791 | I->getDebugLoc()); |
7792 | } |
7793 | |
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7795 | /// insert a recipe to expand the step for the induction recipe. |
7796 | static VPWidenIntOrFpInductionRecipe * |
7797 | createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, |
7798 | VPValue *Start, const InductionDescriptor &IndDesc, |
7799 | VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) { |
7800 | assert(IndDesc.getStartValue() == |
7801 | Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader())); |
7802 | assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) && |
7803 | "step must be loop invariant" ); |
7804 | |
7805 | VPValue *Step = |
7806 | vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE); |
7807 | if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) { |
7808 | return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), |
7809 | IndDesc, TruncI, |
7810 | TruncI->getDebugLoc()); |
7811 | } |
7812 | assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here" ); |
7813 | return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(), |
7814 | IndDesc, Phi->getDebugLoc()); |
7815 | } |
7816 | |
7817 | VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( |
7818 | PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) { |
7819 | |
7820 | // Check if this is an integer or fp induction. If so, build the recipe that |
7821 | // produces its scalar and vector values. |
7822 | if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) |
7823 | return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan, |
7824 | SE&: *PSE.getSE(), OrigLoop&: *OrigLoop); |
7825 | |
7826 | // Check if this is pointer induction. If so, build the recipe for it. |
7827 | if (auto *II = Legal->getPointerInductionDescriptor(Phi)) { |
7828 | VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(), |
7829 | SE&: *PSE.getSE()); |
7830 | return new VPWidenPointerInductionRecipe( |
7831 | Phi, Operands[0], Step, &Plan.getVFxUF(), *II, |
7832 | LoopVectorizationPlanner::getDecisionAndClampRange( |
7833 | Predicate: [&](ElementCount VF) { |
7834 | return CM.isScalarAfterVectorization(I: Phi, VF); |
7835 | }, |
7836 | Range), |
7837 | Phi->getDebugLoc()); |
7838 | } |
7839 | return nullptr; |
7840 | } |
7841 | |
7842 | VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( |
7843 | TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) { |
7844 | // Optimize the special case where the source is a constant integer |
7845 | // induction variable. Notice that we can only optimize the 'trunc' case |
7846 | // because (a) FP conversions lose precision, (b) sext/zext may wrap, and |
7847 | // (c) other casts depend on pointer size. |
7848 | |
7849 | // Determine whether \p K is a truncation based on an induction variable that |
7850 | // can be optimized. |
7851 | auto IsOptimizableIVTruncate = |
7852 | [&](Instruction *K) -> std::function<bool(ElementCount)> { |
7853 | return [=](ElementCount VF) -> bool { |
7854 | return CM.isOptimizableIVTruncate(I: K, VF); |
7855 | }; |
7856 | }; |
7857 | |
7858 | if (LoopVectorizationPlanner::getDecisionAndClampRange( |
7859 | Predicate: IsOptimizableIVTruncate(I), Range)) { |
7860 | |
7861 | auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0)); |
7862 | const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); |
7863 | VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue()); |
7864 | return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(), |
7865 | OrigLoop&: *OrigLoop); |
7866 | } |
7867 | return nullptr; |
7868 | } |
7869 | |
7870 | VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, |
7871 | ArrayRef<VPValue *> Operands, |
7872 | VFRange &Range) { |
7873 | bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( |
7874 | Predicate: [this, CI](ElementCount VF) { |
7875 | return CM.isScalarWithPredication(I: CI, VF); |
7876 | }, |
7877 | Range); |
7878 | |
7879 | if (IsPredicated) |
7880 | return nullptr; |
7881 | |
7882 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
7883 | if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || |
7884 | ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect || |
7885 | ID == Intrinsic::pseudoprobe || |
7886 | ID == Intrinsic::experimental_noalias_scope_decl)) |
7887 | return nullptr; |
7888 | |
7889 | SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size())); |
7890 | |
7891 | // Is it beneficial to perform intrinsic call compared to lib call? |
7892 | bool ShouldUseVectorIntrinsic = |
7893 | ID && LoopVectorizationPlanner::getDecisionAndClampRange( |
7894 | Predicate: [&](ElementCount VF) -> bool { |
7895 | return CM.getCallWideningDecision(CI, VF).Kind == |
7896 | LoopVectorizationCostModel::CM_IntrinsicCall; |
7897 | }, |
7898 | Range); |
7899 | if (ShouldUseVectorIntrinsic) |
7900 | return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), |
7901 | CI->getDebugLoc()); |
7902 | |
7903 | Function *Variant = nullptr; |
7904 | std::optional<unsigned> MaskPos; |
// Is it better to call a vectorized version of the function than to scalarize
// the call?
7907 | auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( |
7908 | Predicate: [&](ElementCount VF) -> bool { |
// The following case may be scalarized depending on the VF.
// The flag shows whether we can use a usual call for the vectorized
// version of the instruction.
7912 | |
7913 | // If we've found a variant at a previous VF, then stop looking. A |
7914 | // vectorized variant of a function expects input in a certain shape |
7915 | // -- basically the number of input registers, the number of lanes |
7916 | // per register, and whether there's a mask required. |
7917 | // We store a pointer to the variant in the VPWidenCallRecipe, so |
7918 | // once we have an appropriate variant it's only valid for that VF. |
7919 | // This will force a different vplan to be generated for each VF that |
7920 | // finds a valid variant. |
7921 | if (Variant) |
7922 | return false; |
7923 | LoopVectorizationCostModel::CallWideningDecision Decision = |
7924 | CM.getCallWideningDecision(CI, VF); |
7925 | if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) { |
7926 | Variant = Decision.Variant; |
7927 | MaskPos = Decision.MaskPos; |
7928 | return true; |
7929 | } |
7930 | |
7931 | return false; |
7932 | }, |
7933 | Range); |
7934 | if (ShouldUseVectorCall) { |
7935 | if (MaskPos.has_value()) { |
7936 | // We have 2 cases that would require a mask: |
7937 | // 1) The block needs to be predicated, either due to a conditional |
7938 | // in the scalar loop or use of an active lane mask with |
7939 | // tail-folding, and we use the appropriate mask for the block. |
7940 | // 2) No mask is required for the block, but the only available |
7941 | // vector variant at this VF requires a mask, so we synthesize an |
7942 | // all-true mask. |
7943 | VPValue *Mask = nullptr; |
7944 | if (Legal->isMaskRequired(I: CI)) |
7945 | Mask = getBlockInMask(VPBB: Builder.getInsertBlock()); |
7946 | else |
7947 | Mask = Plan.getOrAddLiveIn( |
7948 | V: ConstantInt::getTrue(Ty: IntegerType::getInt1Ty(C&: CI->getContext()))); |
7949 | |
7950 | Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask); |
7951 | } |
7952 | |
7953 | Ops.push_back(Elt: Operands.back()); |
7954 | return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc()); |
7955 | } |
7956 | |
7957 | return nullptr; |
7958 | } |
7959 | |
7960 | bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { |
7961 | assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && |
7962 | !isa<StoreInst>(I) && "Instruction should have been handled earlier" ); |
7963 | // Instruction should be widened, unless it is scalar after vectorization, |
7964 | // scalarization is profitable or it is predicated. |
7965 | auto WillScalarize = [this, I](ElementCount VF) -> bool { |
7966 | return CM.isScalarAfterVectorization(I, VF) || |
7967 | CM.isProfitableToScalarize(I, VF) || |
7968 | CM.isScalarWithPredication(I, VF); |
7969 | }; |
7970 | return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize, |
7971 | Range); |
7972 | } |
7973 | |
7974 | VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, |
7975 | ArrayRef<VPValue *> Operands) { |
7976 | switch (I->getOpcode()) { |
7977 | default: |
7978 | return nullptr; |
7979 | case Instruction::SDiv: |
7980 | case Instruction::UDiv: |
7981 | case Instruction::SRem: |
7982 | case Instruction::URem: { |
7983 | // If not provably safe, use a select to form a safe divisor before widening the |
7984 | // div/rem operation itself. Otherwise fall through to general handling below. |
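// E.g. for a predicated udiv %a, %b this conceptually becomes
//   %safe.rhs = select <block-mask>, %b, 1
//   %res      = udiv %a, %safe.rhs
// so masked-off lanes never divide by a potentially trapping divisor
// (illustrative names).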
7985 | if (CM.isPredicatedInst(I)) { |
7986 | SmallVector<VPValue *> Ops(Operands); |
7987 | VPValue *Mask = getBlockInMask(VPBB: Builder.getInsertBlock()); |
7988 | VPValue *One = |
7989 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false)); |
7990 | auto *SafeRHS = Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: I->getDebugLoc()); |
7991 | Ops[1] = SafeRHS; |
7992 | return new VPWidenRecipe(*I, Ops); |
7993 | } |
7994 | [[fallthrough]]; |
7995 | } |
7996 | case Instruction::Add: |
7997 | case Instruction::And: |
7998 | case Instruction::AShr: |
7999 | case Instruction::FAdd: |
8000 | case Instruction::FCmp: |
8001 | case Instruction::FDiv: |
8002 | case Instruction::FMul: |
8003 | case Instruction::FNeg: |
8004 | case Instruction::FRem: |
8005 | case Instruction::FSub: |
8006 | case Instruction::ICmp: |
8007 | case Instruction::LShr: |
8008 | case Instruction::Mul: |
8009 | case Instruction::Or: |
8010 | case Instruction::Select: |
8011 | case Instruction::Shl: |
8012 | case Instruction::Sub: |
8013 | case Instruction::Xor: |
8014 | case Instruction::Freeze: { |
8015 | SmallVector<VPValue *> NewOps(Operands); |
8016 | if (Instruction::isBinaryOp(Opcode: I->getOpcode())) { |
8017 | // The legacy cost model uses SCEV to check if some of the operands are |
8018 | // constants. To match the legacy cost model's behavior, use SCEV to try |
8019 | // to replace operands with constants. |
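// E.g. a live-in operand that SCEV can fold to the constant 4 is replaced by
// the literal constant here (hypothetical example).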
8020 | ScalarEvolution &SE = *PSE.getSE(); |
8021 | auto GetConstantViaSCEV = [this, &SE](VPValue *Op) { |
8022 | if (!Op->isLiveIn()) |
8023 | return Op; |
8024 | Value *V = Op->getUnderlyingValue(); |
8025 | if (isa<Constant>(Val: V) || !SE.isSCEVable(Ty: V->getType())) |
8026 | return Op; |
8027 | auto *C = dyn_cast<SCEVConstant>(Val: SE.getSCEV(V)); |
8028 | if (!C) |
8029 | return Op; |
8030 | return Plan.getOrAddLiveIn(V: C->getValue()); |
8031 | }; |
8032 | // For Mul, the legacy cost model checks both operands. |
8033 | if (I->getOpcode() == Instruction::Mul) |
8034 | NewOps[0] = GetConstantViaSCEV(NewOps[0]); |
8035 | // For other binops, the legacy cost model only checks the second operand. |
8036 | NewOps[1] = GetConstantViaSCEV(NewOps[1]); |
8037 | } |
8038 | return new VPWidenRecipe(*I, NewOps); |
8039 | } |
8040 | case Instruction::ExtractValue: { |
8041 | SmallVector<VPValue *> NewOps(Operands); |
8042 | Type *I32Ty = IntegerType::getInt32Ty(C&: I->getContext()); |
8043 | auto *EVI = cast<ExtractValueInst>(Val: I); |
8044 | assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index" ); |
8045 | unsigned Idx = EVI->getIndices()[0]; |
8046 | NewOps.push_back(Elt: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: Idx, IsSigned: false))); |
8047 | return new VPWidenRecipe(*I, NewOps); |
8048 | } |
8049 | }; |
8050 | } |
8051 | |
8052 | VPHistogramRecipe * |
8053 | VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI, |
8054 | ArrayRef<VPValue *> Operands) { |
8055 | // FIXME: Support other operations. |
8056 | unsigned Opcode = HI->Update->getOpcode(); |
8057 | assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) && |
8058 | "Histogram update operation must be an Add or Sub" ); |
8059 | |
8060 | SmallVector<VPValue *, 3> HGramOps; |
8061 | // Bucket address. |
8062 | HGramOps.push_back(Elt: Operands[1]); |
8063 | // Increment value. |
8064 | HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1))); |
8065 | |
8066 | // In case of predicated execution (due to tail-folding, or conditional |
8067 | // execution, or both), pass the relevant mask. |
8068 | if (Legal->isMaskRequired(I: HI->Store)) |
8069 | HGramOps.push_back(Elt: getBlockInMask(VPBB: Builder.getInsertBlock())); |
8070 | |
8071 | return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc()); |
8072 | } |
8073 | |
8074 | VPReplicateRecipe * |
8075 | VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands, |
8076 | VFRange &Range) { |
8077 | bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( |
8078 | Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, |
8079 | Range); |
8080 | |
8081 | bool IsPredicated = CM.isPredicatedInst(I); |
8082 | |
8083 | // Even if the instruction is not marked as uniform, there are certain |
8084 | // intrinsic calls that can be effectively treated as such, so we check for |
8085 | // them here. Conservatively, we only do this for scalable vectors, since |
8086 | // for fixed-width VFs we can always fall back on full scalarization. |
8087 | if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) { |
8088 | switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) { |
8089 | case Intrinsic::assume: |
8090 | case Intrinsic::lifetime_start: |
8091 | case Intrinsic::lifetime_end: |
8092 | // For scalable vectors if one of the operands is variant then we still |
8093 | // want to mark as uniform, which will generate one instruction for just |
8094 | // the first lane of the vector. We can't scalarize the call in the same |
8095 | // way as for fixed-width vectors because we don't know how many lanes |
8096 | // there are. |
8097 | // |
8098 | // The reasons for doing it this way for scalable vectors are: |
// 1. For the assume intrinsic, generating the instruction for the first
//    lane is still better than not generating any at all. For
8101 | // example, the input may be a splat across all lanes. |
8102 | // 2. For the lifetime start/end intrinsics the pointer operand only |
8103 | // does anything useful when the input comes from a stack object, |
8104 | // which suggests it should always be uniform. For non-stack objects |
8105 | // the effect is to poison the object, which still allows us to |
8106 | // remove the call. |
8107 | IsUniform = true; |
8108 | break; |
8109 | default: |
8110 | break; |
8111 | } |
8112 | } |
8113 | VPValue *BlockInMask = nullptr; |
8114 | if (!IsPredicated) { |
8115 | // Finalize the recipe for Instr, first if it is not predicated. |
8116 | LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n" ); |
8117 | } else { |
8118 | LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n" ); |
8119 | // Instructions marked for predication are replicated and a mask operand is |
8120 | // added initially. Masked replicate recipes will later be placed under an |
8121 | // if-then construct to prevent side-effects. Generate recipes to compute |
8122 | // the block mask for this region. |
8123 | BlockInMask = getBlockInMask(VPBB: Builder.getInsertBlock()); |
8124 | } |
8125 | |
8126 | // Note that there is some custom logic to mark some intrinsics as uniform |
8127 | // manually above for scalable vectors, which this assert needs to account for |
8128 | // as well. |
8129 | assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || |
8130 | (Range.Start.isScalable() && isa<IntrinsicInst>(I))) && |
8131 | "Should not predicate a uniform recipe" ); |
8132 | auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask, |
8133 | VPIRMetadata(*I, LVer)); |
8134 | return Recipe; |
8135 | } |
8136 | |
8137 | /// Find all possible partial reductions in the loop and track all of those that |
8138 | /// are valid so recipes can be formed later. |
8139 | void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { |
8140 | // Find all possible partial reductions. |
8141 | SmallVector<std::pair<PartialReductionChain, unsigned>> |
8142 | PartialReductionChains; |
8143 | for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) { |
8144 | getScaledReductions(PHI: Phi, RdxExitInstr: RdxDesc.getLoopExitInstr(), Range, |
8145 | Chains&: PartialReductionChains); |
8146 | } |
8147 | |
8148 | // A partial reduction is invalid if any of its extends are used by |
8149 | // something that isn't another partial reduction. This is because the |
8150 | // extends are intended to be lowered along with the reduction itself. |
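// E.g. for acc += sext(a[i]) * sext(b[i]), both sexts must be used only by
// the multiply forming the partial reduction; if either extend has another
// user, the chain is not recorded as a scaled reduction (illustrative
// example).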
8151 | |
8152 | // Build up a set of partial reduction ops for efficient use checking. |
8153 | SmallSet<User *, 4> PartialReductionOps; |
8154 | for (const auto &[PartialRdx, _] : PartialReductionChains) |
8155 | PartialReductionOps.insert(Ptr: PartialRdx.ExtendUser); |
8156 | |
8157 | auto ExtendIsOnlyUsedByPartialReductions = |
8158 | [&PartialReductionOps](Instruction *Extend) { |
8159 | return all_of(Range: Extend->users(), P: [&](const User *U) { |
8160 | return PartialReductionOps.contains(Ptr: U); |
8161 | }); |
8162 | }; |
8163 | |
8164 | // Check if each use of a chain's two extends is a partial reduction |
8165 | // and only add those that don't have non-partial reduction users. |
8166 | for (auto Pair : PartialReductionChains) { |
8167 | PartialReductionChain Chain = Pair.first; |
8168 | if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) && |
8169 | (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))) |
8170 | ScaledReductionMap.try_emplace(Key: Chain.Reduction, Args&: Pair.second); |
8171 | } |
8172 | } |
8173 | |
8174 | bool VPRecipeBuilder::getScaledReductions( |
8175 | Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range, |
8176 | SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) { |
8177 | if (!CM.TheLoop->contains(Inst: RdxExitInstr)) |
8178 | return false; |
8179 | |
8180 | auto *Update = dyn_cast<BinaryOperator>(Val: RdxExitInstr); |
8181 | if (!Update) |
8182 | return false; |
8183 | |
8184 | Value *Op = Update->getOperand(i_nocapture: 0); |
8185 | Value *PhiOp = Update->getOperand(i_nocapture: 1); |
8186 | if (Op == PHI) |
8187 | std::swap(a&: Op, b&: PhiOp); |
8188 | |
8189 | // Try and get a scaled reduction from the first non-phi operand. |
8190 | // If one is found, we use the discovered reduction instruction in |
8191 | // place of the accumulator for costing. |
8192 | if (auto *OpInst = dyn_cast<Instruction>(Val: Op)) { |
8193 | if (getScaledReductions(PHI, RdxExitInstr: OpInst, Range, Chains)) { |
8194 | PHI = Chains.rbegin()->first.Reduction; |
8195 | |
8196 | Op = Update->getOperand(i_nocapture: 0); |
8197 | PhiOp = Update->getOperand(i_nocapture: 1); |
8198 | if (Op == PHI) |
8199 | std::swap(a&: Op, b&: PhiOp); |
8200 | } |
8201 | } |
8202 | if (PhiOp != PHI) |
8203 | return false; |
8204 | |
8205 | using namespace llvm::PatternMatch; |
8206 | |
8207 | // If the update is a binary operator, check both of its operands to see if |
8208 | // they are extends. Otherwise, see if the update comes directly from an |
8209 | // extend. |
8210 | Instruction *Exts[2] = {nullptr}; |
8211 | BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Val: Op); |
8212 | std::optional<unsigned> BinOpc; |
8213 | Type *ExtOpTypes[2] = {nullptr}; |
8214 | |
8215 | auto CollectExtInfo = [&Exts, |
8216 | &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool { |
8217 | unsigned I = 0; |
8218 | for (Value *OpI : Ops) { |
8219 | Value *ExtOp; |
8220 | if (!match(V: OpI, P: m_ZExtOrSExt(Op: m_Value(V&: ExtOp)))) |
8221 | return false; |
8222 | Exts[I] = cast<Instruction>(Val: OpI); |
8223 | ExtOpTypes[I] = ExtOp->getType(); |
8224 | I++; |
8225 | } |
8226 | return true; |
8227 | }; |
8228 | |
8229 | if (ExtendUser) { |
8230 | if (!ExtendUser->hasOneUse()) |
8231 | return false; |
8232 | |
// Use the side-effect of match to replace ExtendUser only if the pattern is
// matched; we don't care at this point whether it actually matched.
8235 | match(V: ExtendUser, P: m_Neg(V: m_BinOp(I&: ExtendUser))); |
8236 | |
8237 | SmallVector<Value *> Ops(ExtendUser->operands()); |
8238 | if (!CollectExtInfo(Ops)) |
8239 | return false; |
8240 | |
8241 | BinOpc = std::make_optional(t: ExtendUser->getOpcode()); |
8242 | } else if (match(V: Update, P: m_Add(L: m_Value(), R: m_Value()))) { |
8243 | // We already know the operands for Update are Op and PhiOp. |
8244 | SmallVector<Value *> Ops({Op}); |
8245 | if (!CollectExtInfo(Ops)) |
8246 | return false; |
8247 | |
8248 | ExtendUser = Update; |
8249 | BinOpc = std::nullopt; |
8250 | } else |
8251 | return false; |
8252 | |
8253 | TTI::PartialReductionExtendKind OpAExtend = |
8254 | TTI::getPartialReductionExtendKind(I: Exts[0]); |
8255 | TTI::PartialReductionExtendKind OpBExtend = |
8256 | Exts[1] ? TTI::getPartialReductionExtendKind(I: Exts[1]) : TTI::PR_None; |
8257 | PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser); |
8258 | |
8259 | TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits(); |
8260 | TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits(); |
8261 | if (!PHISize.hasKnownScalarFactor(RHS: ASize)) |
8262 | return false; |
8263 | unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(RHS: ASize); |
8264 | |
8265 | if (LoopVectorizationPlanner::getDecisionAndClampRange( |
8266 | Predicate: [&](ElementCount VF) { |
8267 | InstructionCost Cost = TTI->getPartialReductionCost( |
8268 | Opcode: Update->getOpcode(), InputTypeA: ExtOpTypes[0], InputTypeB: ExtOpTypes[1], |
8269 | AccumType: PHI->getType(), VF, OpAExtend, OpBExtend, BinOp: BinOpc, CostKind: CM.CostKind); |
8270 | return Cost.isValid(); |
8271 | }, |
8272 | Range)) { |
8273 | Chains.emplace_back(Args&: Chain, Args&: TargetScaleFactor); |
8274 | return true; |
8275 | } |
8276 | |
8277 | return false; |
8278 | } |
8279 | |
8280 | VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, |
8281 | VFRange &Range) { |
8282 | // First, check for specific widening recipes that deal with inductions, Phi |
8283 | // nodes, calls and memory operations. |
8284 | VPRecipeBase *Recipe; |
8285 | Instruction *Instr = R->getUnderlyingInstr(); |
8286 | SmallVector<VPValue *, 4> Operands(R->operands()); |
8287 | if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(Val: R)) { |
8288 | VPBasicBlock *Parent = PhiR->getParent(); |
8289 | [[maybe_unused]] VPRegionBlock *LoopRegionOf = |
8290 | Parent->getEnclosingLoopRegion(); |
8291 | assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent && |
8292 | "Non-header phis should have been handled during predication" ); |
8293 | auto *Phi = cast<PHINode>(Val: R->getUnderlyingInstr()); |
8294 | assert(Operands.size() == 2 && "Must have 2 operands for header phis" ); |
8295 | if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range))) |
8296 | return Recipe; |
8297 | |
8298 | VPHeaderPHIRecipe *PhiRecipe = nullptr; |
8299 | assert((Legal->isReductionVariable(Phi) || |
8300 | Legal->isFixedOrderRecurrence(Phi)) && |
8301 | "can only widen reductions and fixed-order recurrences here" ); |
8302 | VPValue *StartV = Operands[0]; |
8303 | if (Legal->isReductionVariable(PN: Phi)) { |
8304 | const RecurrenceDescriptor &RdxDesc = |
8305 | Legal->getReductionVars().find(Key: Phi)->second; |
8306 | assert(RdxDesc.getRecurrenceStartValue() == |
8307 | Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); |
8308 | |
8309 | // If the PHI is used by a partial reduction, set the scale factor. |
8310 | unsigned ScaleFactor = |
8311 | getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr()).value_or(u: 1); |
8312 | PhiRecipe = new VPReductionPHIRecipe( |
8313 | Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi), |
8314 | CM.useOrderedReductions(RdxDesc), ScaleFactor); |
8315 | } else { |
8316 | // TODO: Currently fixed-order recurrences are modeled as chains of |
8317 | // first-order recurrences. If there are no users of the intermediate |
8318 | // recurrences in the chain, the fixed order recurrence should be modeled |
8319 | // directly, enabling more efficient codegen. |
8320 | PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV); |
8321 | } |
8322 | // Add backedge value. |
8323 | PhiRecipe->addOperand(Operand: Operands[1]); |
8324 | return PhiRecipe; |
8325 | } |
8326 | |
8327 | if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate( |
8328 | I: cast<TruncInst>(Val: Instr), Operands, Range))) |
8329 | return Recipe; |
8330 | |
8331 | // All widen recipes below deal only with VF > 1. |
8332 | if (LoopVectorizationPlanner::getDecisionAndClampRange( |
8333 | Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range)) |
8334 | return nullptr; |
8335 | |
8336 | if (auto *CI = dyn_cast<CallInst>(Val: Instr)) |
8337 | return tryToWidenCall(CI, Operands, Range); |
8338 | |
8339 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: Instr)) |
8340 | if (auto HistInfo = Legal->getHistogramInfo(I: SI)) |
8341 | return tryToWidenHistogram(HI: *HistInfo, Operands); |
8342 | |
8343 | if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr)) |
8344 | return tryToWidenMemory(I: Instr, Operands, Range); |
8345 | |
8346 | if (std::optional<unsigned> ScaleFactor = getScalingForReduction(ExitInst: Instr)) |
8347 | return tryToCreatePartialReduction(Reduction: Instr, Operands, ScaleFactor: ScaleFactor.value()); |
8348 | |
8349 | if (!shouldWiden(I: Instr, Range)) |
8350 | return nullptr; |
8351 | |
8352 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Instr)) |
8353 | return new VPWidenGEPRecipe(GEP, Operands); |
8354 | |
8355 | if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) { |
8356 | return new VPWidenSelectRecipe(*SI, Operands); |
8357 | } |
8358 | |
8359 | if (auto *CI = dyn_cast<CastInst>(Val: Instr)) { |
8360 | return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(), |
8361 | *CI); |
8362 | } |
8363 | |
8364 | return tryToWiden(I: Instr, Operands); |
8365 | } |
8366 | |
8367 | VPRecipeBase * |
8368 | VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, |
8369 | ArrayRef<VPValue *> Operands, |
8370 | unsigned ScaleFactor) { |
8371 | assert(Operands.size() == 2 && |
8372 | "Unexpected number of operands for partial reduction" ); |
8373 | |
8374 | VPValue *BinOp = Operands[0]; |
8375 | VPValue *Accumulator = Operands[1]; |
8376 | VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe(); |
8377 | if (isa<VPReductionPHIRecipe>(Val: BinOpRecipe) || |
8378 | isa<VPPartialReductionRecipe>(Val: BinOpRecipe)) |
8379 | std::swap(a&: BinOp, b&: Accumulator); |
8380 | |
8381 | unsigned ReductionOpcode = Reduction->getOpcode(); |
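// A partial reduction using a subtraction, e.g. 'Acc -= X', is rewritten
// below as 'Acc += (0 - X)', so only the Add form needs to be handled by the
// partial reduction recipe.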
8382 | if (ReductionOpcode == Instruction::Sub) { |
8383 | auto *const Zero = ConstantInt::get(Ty: Reduction->getType(), V: 0); |
8384 | SmallVector<VPValue *, 2> Ops; |
8385 | Ops.push_back(Elt: Plan.getOrAddLiveIn(V: Zero)); |
8386 | Ops.push_back(Elt: BinOp); |
8387 | BinOp = new VPWidenRecipe(*Reduction, Ops); |
8388 | Builder.insert(R: BinOp->getDefiningRecipe()); |
8389 | ReductionOpcode = Instruction::Add; |
8390 | } |
8391 | |
8392 | VPValue *Cond = nullptr; |
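// In predicated blocks, masked-off lanes must contribute the additive neutral
// element, so the input is replaced by select(mask, BinOp, 0) below.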
8393 | if (CM.blockNeedsPredicationForAnyReason(BB: Reduction->getParent())) { |
8394 | assert((ReductionOpcode == Instruction::Add || |
8395 | ReductionOpcode == Instruction::Sub) && |
8396 | "Expected an ADD or SUB operation for predicated partial " |
8397 | "reductions (because the neutral element in the mask is zero)!" ); |
8398 | Cond = getBlockInMask(VPBB: Builder.getInsertBlock()); |
8399 | VPValue *Zero = |
8400 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: Reduction->getType(), V: 0)); |
8401 | BinOp = Builder.createSelect(Cond, TrueVal: BinOp, FalseVal: Zero, DL: Reduction->getDebugLoc()); |
8402 | } |
8403 | return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond, |
8404 | ScaleFactor, Reduction); |
8405 | } |
8406 | |
8407 | void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, |
8408 | ElementCount MaxVF) { |
8409 | if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF)) |
8410 | return; |
8411 | |
8412 | assert(OrigLoop->isInnermost() && "Inner loop expected." ); |
8413 | |
8414 | const LoopAccessInfo *LAI = Legal->getLAI(); |
8415 | LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(), |
8416 | OrigLoop, LI, DT, PSE.getSE()); |
8417 | if (!LAI->getRuntimePointerChecking()->getChecks().empty() && |
8418 | !LAI->getRuntimePointerChecking()->getDiffChecks()) { |
8419 | // Only use noalias metadata when using memory checks guaranteeing no |
8420 | // overlap across all iterations. |
8421 | LVer.prepareNoAliasMetadata(); |
8422 | } |
8423 | |
8424 | auto MaxVFTimes2 = MaxVF * 2; |
8425 | auto VPlan0 = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI); |
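// Build a VPlan for each sub-range of VFs in [MinVF, MaxVF * 2). Each call
// clamps SubRange.End based on the decisions taken while building the plan,
// so e.g. with MinVF = 1 and MaxVF = 4 this may produce one plan for VF = 1
// and another for VF = 2 and 4.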
8426 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) { |
8427 | VFRange SubRange = {VF, MaxVFTimes2}; |
8428 | if (auto Plan = tryToBuildVPlanWithVPRecipes( |
8429 | InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) { |
8430 | bool HasScalarVF = Plan->hasScalarVFOnly(); |
8431 | // Now optimize the initial VPlan. |
8432 | if (!HasScalarVF) |
8433 | VPlanTransforms::runPass(Fn: VPlanTransforms::truncateToMinimalBitwidths, |
8434 | Plan&: *Plan, Args: CM.getMinimalBitwidths()); |
8435 | VPlanTransforms::runPass(Fn: VPlanTransforms::optimize, Plan&: *Plan); |
8436 | // TODO: try to put it close to addActiveLaneMask(). |
8437 | // Discard the plan if it is not EVL-compatible |
8438 | if (CM.foldTailWithEVL() && !HasScalarVF && |
8439 | !VPlanTransforms::runPass(Transform: VPlanTransforms::tryAddExplicitVectorLength, |
8440 | Plan&: *Plan, Args: CM.getMaxSafeElements())) |
8441 | break; |
8442 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
8443 | VPlans.push_back(Elt: std::move(Plan)); |
8444 | } |
8445 | VF = SubRange.End; |
8446 | } |
8447 | } |
8448 | |
8449 | /// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the |
8450 | /// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute |
8451 | /// the end value of the induction. |
8452 | static VPInstruction *addResumePhiRecipeForInduction( |
8453 | VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, |
8454 | VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) { |
8455 | auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV); |
8456 | // Truncated wide inductions resume from the last lane of their vector value |
8457 | // in the last vector iteration which is handled elsewhere. |
8458 | if (WideIntOrFp && WideIntOrFp->getTruncInst()) |
8459 | return nullptr; |
8460 | |
8461 | VPValue *Start = WideIV->getStartValue(); |
8462 | VPValue *Step = WideIV->getStepValue(); |
8463 | const InductionDescriptor &ID = WideIV->getInductionDescriptor(); |
8464 | VPValue *EndValue = VectorTC; |
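// For the canonical induction the resume value is simply the vector trip
// count. Otherwise derive the end value from it, e.g. for an integer
// induction with start Start and step Step it is Start + VectorTC * Step.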
8465 | if (!WideIntOrFp || !WideIntOrFp->isCanonical()) { |
8466 | EndValue = VectorPHBuilder.createDerivedIV( |
8467 | Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()), |
8468 | Start, Current: VectorTC, Step); |
8469 | } |
8470 | |
8471 | // EndValue is derived from the vector trip count (which has the same type as |
8472 | // the widest induction) and thus may be wider than the induction here. |
8473 | Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV); |
8474 | if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) { |
8475 | EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue, |
8476 | ResultTy: ScalarTypeOfWideIV, |
8477 | DL: WideIV->getDebugLoc()); |
8478 | } |
8479 | |
8480 | auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi( |
8481 | IncomingValues: {EndValue, Start}, DL: WideIV->getDebugLoc(), Name: "bc.resume.val" ); |
8482 | return ResumePhiRecipe; |
8483 | } |
8484 | |
8485 | /// Create resume phis in the scalar preheader for first-order recurrences, |
8486 | /// reductions and inductions, and update the VPIRInstructions wrapping the |
8487 | /// original phis in the scalar header. End values for inductions are added to |
8488 | /// \p IVEndValues. |
8489 | static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, |
8490 | DenseMap<VPValue *, VPValue *> &IVEndValues) { |
8491 | VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); |
8492 | auto *ScalarPH = Plan.getScalarPreheader(); |
8493 | auto *MiddleVPBB = cast<VPBasicBlock>(Val: ScalarPH->getPredecessors()[0]); |
8494 | VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); |
8495 | VPBuilder VectorPHBuilder( |
8496 | cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor())); |
8497 | VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); |
8498 | VPBuilder ScalarPHBuilder(ScalarPH); |
8499 | for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) { |
8500 | auto *ScalarPhiIRI = cast<VPIRPhi>(Val: &ScalarPhiR); |
8501 | |
8502 | // TODO: Extract final value from induction recipe initially, optimize to |
8503 | // pre-computed end value together in optimizeInductionExitUsers. |
8504 | auto *VectorPhiR = |
8505 | cast<VPHeaderPHIRecipe>(Val: Builder.getRecipe(I: &ScalarPhiIRI->getIRPhi())); |
8506 | if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(Val: VectorPhiR)) { |
8507 | if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction( |
8508 | WideIV: WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo, |
8509 | VectorTC: &Plan.getVectorTripCount())) { |
8510 | assert(isa<VPPhi>(ResumePhi) && "Expected a phi" ); |
8511 | IVEndValues[WideIVR] = ResumePhi->getOperand(N: 0); |
8512 | ScalarPhiIRI->addOperand(Operand: ResumePhi); |
8513 | continue; |
8514 | } |
8515 | // TODO: Also handle truncated inductions here. Computing end-values |
8516 | // separately should be done as VPlan-to-VPlan optimization, after |
8517 | // legalizing all resume values to use the last lane from the loop. |
8518 | assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() && |
8519 | "should only skip truncated wide inductions" ); |
8520 | continue; |
8521 | } |
8522 | |
8523 | // The backedge value provides the value to resume coming out of a loop, |
8524 | // which for FORs is a vector whose last element needs to be extracted. The |
8525 | // start value provides the value if the loop is bypassed. |
8526 | bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(Val: VectorPhiR); |
8527 | auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue(); |
8528 | assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && |
8529 | "Cannot handle loops with uncountable early exits" ); |
8530 | if (IsFOR) |
8531 | ResumeFromVectorLoop = MiddleBuilder.createNaryOp( |
8532 | Opcode: VPInstruction::ExtractLastElement, Operands: {ResumeFromVectorLoop}, Inst: {}, |
8533 | Name: "vector.recur.extract" ); |
8534 | StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx" ; |
8535 | auto *ResumePhiR = ScalarPHBuilder.createScalarPhi( |
8536 | IncomingValues: {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, DL: {}, Name); |
8537 | ScalarPhiIRI->addOperand(Operand: ResumePhiR); |
8538 | } |
8539 | } |
8540 | |
// Collect VPIRInstructions for phis in the exit block reached from the latch,
// i.e. the exit block whose single predecessor is the middle block.
8542 | static SetVector<VPIRInstruction *> collectUsersInLatchExitBlock(VPlan &Plan) { |
8543 | SetVector<VPIRInstruction *> ExitUsersToFix; |
8544 | for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) { |
8545 | |
8546 | if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock()) |
8547 | continue; |
8548 | |
8549 | for (VPRecipeBase &R : ExitVPBB->phis()) { |
8550 | auto *ExitIRI = cast<VPIRPhi>(Val: &R); |
8551 | assert(ExitIRI->getNumOperands() == 1 && "must have a single operand" ); |
8552 | VPValue *V = ExitIRI->getOperand(N: 0); |
8553 | if (V->isLiveIn()) |
8554 | continue; |
8555 | assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() && |
8556 | "Only recipes defined inside a region should need fixing." ); |
8557 | ExitUsersToFix.insert(X: ExitIRI); |
8558 | } |
8559 | } |
8560 | return ExitUsersToFix; |
8561 | } |
8562 | |
8563 | // Add exit values to \p Plan. Extracts are added for each entry in \p |
8564 | // ExitUsersToFix if needed and their operands are updated. |
8565 | static void |
8566 | addUsersInExitBlocks(VPlan &Plan, |
8567 | const SetVector<VPIRInstruction *> &ExitUsersToFix) { |
8568 | if (ExitUsersToFix.empty()) |
8569 | return; |
8570 | |
8571 | auto *MiddleVPBB = Plan.getMiddleBlock(); |
8572 | VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); |
8573 | |
8574 | // Introduce extract for exiting values and update the VPIRInstructions |
8575 | // modeling the corresponding LCSSA phis. |
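// For example, an LCSSA phi '%x.lcssa = phi [ %x, %loop ]' in the exit block
// has its operand replaced by an extract of the last lane of the widened %x,
// created in the middle block.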
8576 | for (VPIRInstruction *ExitIRI : ExitUsersToFix) { |
8577 | assert(ExitIRI->getNumOperands() == 1 && |
8578 | ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB && |
8579 | "exit values from early exits must be fixed when branch to " |
8580 | "early-exit is added" ); |
8581 | ExitIRI->extractLastLaneOfFirstOperand(Builder&: B); |
8582 | } |
8583 | } |
8584 | |
/// Handle users in the original exit block for first-order recurrences. The
/// penultimate value of a recurrence is fed to its LCSSA phi users in the
/// original exit block via the VPIRInstruction wrapping the LCSSA phi.
8589 | static void addExitUsersForFirstOrderRecurrences( |
8590 | VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) { |
8591 | VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); |
8592 | auto *ScalarPHVPBB = Plan.getScalarPreheader(); |
8593 | auto *MiddleVPBB = Plan.getMiddleBlock(); |
8594 | VPBuilder ScalarPHBuilder(ScalarPHVPBB); |
8595 | VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi()); |
8596 | |
8597 | auto IsScalableOne = [](ElementCount VF) -> bool { |
8598 | return VF == ElementCount::getScalable(MinVal: 1); |
8599 | }; |
8600 | |
for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8602 | auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi); |
8603 | if (!FOR) |
8604 | continue; |
8605 | |
8606 | assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() && |
8607 | "Cannot handle loops with uncountable early exits" ); |
8608 | |
8609 | // This is the second phase of vectorizing first-order recurrences, creating |
// extracts for users outside the loop. An overview of the transformation is
8611 | // described below. Suppose we have the following loop with some use after |
8612 | // the loop of the last a[i-1], |
8613 | // |
8614 | // for (int i = 0; i < n; ++i) { |
8615 | // t = a[i - 1]; |
8616 | // b[i] = a[i] - t; |
8617 | // } |
8618 | // use t; |
8619 | // |
8620 | // There is a first-order recurrence on "a". For this loop, the shorthand |
8621 | // scalar IR looks like: |
8622 | // |
8623 | // scalar.ph: |
8624 | // s.init = a[-1] |
8625 | // br scalar.body |
8626 | // |
8627 | // scalar.body: |
8628 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
8629 | // s1 = phi [s.init, scalar.ph], [s2, scalar.body] |
8630 | // s2 = a[i] |
8631 | // b[i] = s2 - s1 |
8632 | // br cond, scalar.body, exit.block |
8633 | // |
8634 | // exit.block: |
8635 | // use = lcssa.phi [s1, scalar.body] |
8636 | // |
// In this example, s1 is a recurrence because its value depends on the
8638 | // previous iteration. In the first phase of vectorization, we created a |
8639 | // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts |
8640 | // for users in the scalar preheader and exit block. |
8641 | // |
8642 | // vector.ph: |
8643 | // v_init = vector(..., ..., ..., a[-1]) |
8644 | // br vector.body |
8645 | // |
8646 | // vector.body |
8647 | // i = phi [0, vector.ph], [i+4, vector.body] |
8648 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
8649 | // v2 = a[i, i+1, i+2, i+3] |
8650 | // b[i] = v2 - v1 |
8651 | // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) |
8652 | // b[i, i+1, i+2, i+3] = v2 - v1 |
8653 | // br cond, vector.body, middle.block |
8654 | // |
8655 | // middle.block: |
8656 | // vector.recur.extract.for.phi = v2(2) |
8657 | // vector.recur.extract = v2(3) |
8658 | // br cond, scalar.ph, exit.block |
8659 | // |
8660 | // scalar.ph: |
8661 | // scalar.recur.init = phi [vector.recur.extract, middle.block], |
8662 | // [s.init, otherwise] |
8663 | // br scalar.body |
8664 | // |
8665 | // scalar.body: |
8666 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
8667 | // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] |
8668 | // s2 = a[i] |
8669 | // b[i] = s2 - s1 |
8670 | // br cond, scalar.body, exit.block |
8671 | // |
8672 | // exit.block: |
8673 | // lo = lcssa.phi [s1, scalar.body], |
8674 | // [vector.recur.extract.for.phi, middle.block] |
8675 | // |
8676 | // Now update VPIRInstructions modeling LCSSA phis in the exit block. |
8677 | // Extract the penultimate value of the recurrence and use it as operand for |
8678 | // the VPIRInstruction modeling the phi. |
8679 | for (VPIRInstruction *ExitIRI : ExitUsersToFix) { |
8680 | if (ExitIRI->getOperand(N: 0) != FOR) |
8681 | continue; |
8682 | // For VF vscale x 1, if vscale = 1, we are unable to extract the |
8683 | // penultimate value of the recurrence. Instead, we rely on function |
8684 | // addUsersInExitBlocks to extract the last element from the result of |
8685 | // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the |
8686 | // recurrence phi in ExitUsersToFix. |
8687 | // TODO: Consider vscale_range info and UF. |
8688 | if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne, |
8689 | Range)) |
8690 | return; |
8691 | VPValue *PenultimateElement = MiddleBuilder.createNaryOp( |
8692 | Opcode: VPInstruction::ExtractPenultimateElement, Operands: {FOR->getBackedgeValue()}, |
8693 | Inst: {}, Name: "vector.recur.extract.for.phi" ); |
8694 | ExitIRI->setOperand(I: 0, New: PenultimateElement); |
8695 | ExitUsersToFix.remove(X: ExitIRI); |
8696 | } |
8697 | } |
8698 | } |
8699 | |
8700 | VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( |
8701 | VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) { |
8702 | |
8703 | using namespace llvm::VPlanPatternMatch; |
8704 | SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; |
8705 | |
8706 | // --------------------------------------------------------------------------- |
8707 | // Build initial VPlan: Scan the body of the loop in a topological order to |
8708 | // visit each basic block after having visited its predecessor basic blocks. |
8709 | // --------------------------------------------------------------------------- |
8710 | |
8711 | // Create initial VPlan skeleton, having a basic block for the pre-header |
8712 | // which contains SCEV expansions that need to happen before the CFG is |
8713 | // modified; a basic block for the vector pre-header, followed by a region for |
8714 | // the vector loop, followed by the middle basic block. The skeleton vector |
8715 | // loop region contains a header and latch basic blocks. |
8716 | |
8717 | bool RequiresScalarEpilogueCheck = |
8718 | LoopVectorizationPlanner::getDecisionAndClampRange( |
8719 | Predicate: [this](ElementCount VF) { |
8720 | return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()); |
8721 | }, |
8722 | Range); |
8723 | VPlanTransforms::prepareForVectorization( |
8724 | Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck, |
8725 | TailFolded: CM.foldTailByMasking(), TheLoop: OrigLoop, |
8726 | IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), |
8727 | HasUncountableExit: Legal->hasUncountableEarlyExit(), Range); |
8728 | VPlanTransforms::createLoopRegions(Plan&: *Plan); |
8729 | |
// Don't use getDecisionAndClampRange here, because we don't know the UF yet,
// so it is better to be conservative here rather than to split the range up
// into different VPlans.
8733 | // TODO: Consider using getDecisionAndClampRange here to split up VPlans. |
8734 | bool IVUpdateMayOverflow = false; |
8735 | for (ElementCount VF : Range) |
8736 | IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF); |
8737 | |
8738 | TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); |
8739 | // Use NUW for the induction increment if we proved that it won't overflow in |
// the vector loop or when not folding the tail. In the latter case, we know
8741 | // that the canonical induction increment will not overflow as the vector trip |
8742 | // count is >= increment and a multiple of the increment. |
8743 | bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None; |
8744 | if (!HasNUW) { |
8745 | auto *IVInc = Plan->getVectorLoopRegion() |
8746 | ->getExitingBasicBlock() |
8747 | ->getTerminator() |
8748 | ->getOperand(N: 0); |
8749 | assert(match(IVInc, m_VPInstruction<Instruction::Add>( |
8750 | m_Specific(Plan->getCanonicalIV()), m_VPValue())) && |
8751 | "Did not find the canonical IV increment" ); |
8752 | cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags(); |
8753 | } |
8754 | |
8755 | // --------------------------------------------------------------------------- |
8756 | // Pre-construction: record ingredients whose recipes we'll need to further |
8757 | // process after constructing the initial VPlan. |
8758 | // --------------------------------------------------------------------------- |
8759 | |
8760 | // For each interleave group which is relevant for this (possibly trimmed) |
8761 | // Range, add it to the set of groups to be later applied to the VPlan and add |
8762 | // placeholders for its members' Recipes which we'll be replacing with a |
8763 | // single VPInterleaveRecipe. |
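// For example, loads of a[2*i] and a[2*i+1] forming a factor-2 group are
// later replaced by a single wide load of a[2*i .. 2*i+1] whose results are
// de-interleaved into the two original values.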
8764 | for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { |
8765 | auto ApplyIG = [IG, this](ElementCount VF) -> bool { |
8766 | bool Result = (VF.isVector() && // Query is illegal for VF == 1 |
8767 | CM.getWideningDecision(I: IG->getInsertPos(), VF) == |
8768 | LoopVectorizationCostModel::CM_Interleave); |
8769 | // For scalable vectors, the interleave factors must be <= 8 since we |
8770 | // require the (de)interleaveN intrinsics instead of shufflevectors. |
8771 | assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) && |
8772 | "Unsupported interleave factor for scalable vectors" ); |
8773 | return Result; |
8774 | }; |
8775 | if (!getDecisionAndClampRange(Predicate: ApplyIG, Range)) |
8776 | continue; |
8777 | InterleaveGroups.insert(Ptr: IG); |
8778 | } |
8779 | |
8780 | // --------------------------------------------------------------------------- |
8781 | // Predicate and linearize the top-level loop region. |
8782 | // --------------------------------------------------------------------------- |
8783 | auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize( |
8784 | Plan&: *Plan, FoldTail: CM.foldTailByMasking()); |
8785 | |
8786 | // --------------------------------------------------------------------------- |
8787 | // Construct wide recipes and apply predication for original scalar |
8788 | // VPInstructions in the loop. |
8789 | // --------------------------------------------------------------------------- |
8790 | VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, |
8791 | Builder, BlockMaskCache, LVer); |
8792 | RecipeBuilder.collectScaledReductions(Range); |
8793 | |
8794 | // Scan the body of the loop in a topological order to visit each basic block |
8795 | // after having visited its predecessor basic blocks. |
8796 | VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); |
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8798 | ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( |
8799 | HeaderVPBB); |
8800 | |
8801 | auto *MiddleVPBB = Plan->getMiddleBlock(); |
8802 | VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); |
8803 | // Mapping from VPValues in the initial plan to their widened VPValues. Needed |
8804 | // temporarily to update created block masks. |
8805 | DenseMap<VPValue *, VPValue *> Old2New; |
8806 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) { |
8807 | // Convert input VPInstructions to widened recipes. |
8808 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
8809 | auto *SingleDef = cast<VPSingleDefRecipe>(Val: &R); |
8810 | auto *UnderlyingValue = SingleDef->getUnderlyingValue(); |
8811 | // Skip recipes that do not need transforming, including canonical IV, |
8812 | // wide canonical IV and VPInstructions without underlying values. The |
8813 | // latter are added above for masking. |
8814 | // FIXME: Migrate code relying on the underlying instruction from VPlan0 |
8815 | // to construct recipes below to not use the underlying instruction. |
8816 | if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>( |
8817 | Val: &R) || |
8818 | (isa<VPInstruction>(Val: &R) && !UnderlyingValue)) |
8819 | continue; |
8820 | |
8821 | // FIXME: VPlan0, which models a copy of the original scalar loop, should |
8822 | // not use VPWidenPHIRecipe to model the phis. |
8823 | assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) && |
8824 | UnderlyingValue && "unsupported recipe" ); |
8825 | |
8826 | // TODO: Gradually replace uses of underlying instruction by analyses on |
8827 | // VPlan. |
8828 | Instruction *Instr = cast<Instruction>(Val: UnderlyingValue); |
8829 | Builder.setInsertPoint(SingleDef); |
8830 | |
8831 | // The stores with invariant address inside the loop will be deleted, and |
8832 | // in the exit block, a uniform store recipe will be created for the final |
8833 | // invariant store of the reduction. |
8834 | StoreInst *SI; |
8835 | if ((SI = dyn_cast<StoreInst>(Val: Instr)) && |
8836 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) { |
8837 | // Only create recipe for the final invariant store of the reduction. |
8838 | if (Legal->isInvariantStoreOfReduction(SI)) { |
8839 | auto *Recipe = |
8840 | new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */, |
8841 | nullptr /*Mask*/, VPIRMetadata(*SI, LVer)); |
8842 | Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP); |
8843 | } |
8844 | R.eraseFromParent(); |
8845 | continue; |
8846 | } |
8847 | |
8848 | VPRecipeBase *Recipe = |
8849 | RecipeBuilder.tryToCreateWidenRecipe(R: SingleDef, Range); |
8850 | if (!Recipe) { |
8851 | SmallVector<VPValue *, 4> Operands(R.operands()); |
8852 | Recipe = RecipeBuilder.handleReplication(I: Instr, Operands, Range); |
8853 | } |
8854 | |
8855 | RecipeBuilder.setRecipe(I: Instr, R: Recipe); |
8856 | if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) { |
8857 | // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be |
8858 | // moved to the phi section in the header. |
8859 | Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi()); |
8860 | } else { |
8861 | Builder.insert(R: Recipe); |
8862 | } |
8863 | if (Recipe->getNumDefinedValues() == 1) { |
8864 | SingleDef->replaceAllUsesWith(New: Recipe->getVPSingleValue()); |
8865 | Old2New[SingleDef] = Recipe->getVPSingleValue(); |
8866 | } else { |
8867 | assert(Recipe->getNumDefinedValues() == 0 && |
8868 | "Unexpected multidef recipe" ); |
8869 | R.eraseFromParent(); |
8870 | } |
8871 | } |
8872 | } |
8873 | |
8874 | // replaceAllUsesWith above may invalidate the block masks. Update them here. |
8875 | // TODO: Include the masks as operands in the predicated VPlan directly |
8876 | // to remove the need to keep a map of masks beyond the predication |
8877 | // transform. |
8878 | RecipeBuilder.updateBlockMaskCache(Old2New); |
8879 | for (const auto &[Old, _] : Old2New) |
8880 | Old->getDefiningRecipe()->eraseFromParent(); |
8881 | |
8882 | assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && |
8883 | !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && |
8884 | "entry block must be set to a VPRegionBlock having a non-empty entry " |
8885 | "VPBasicBlock" ); |
8886 | |
8887 | // Update wide induction increments to use the same step as the corresponding |
8888 | // wide induction. This enables detecting induction increments directly in |
8889 | // VPlan and removes redundant splats. |
8890 | for (const auto &[Phi, ID] : Legal->getInductionVars()) { |
8891 | auto *IVInc = cast<Instruction>( |
8892 | Val: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch())); |
8893 | if (IVInc->getOperand(i: 0) != Phi || IVInc->getOpcode() != Instruction::Add) |
8894 | continue; |
8895 | VPWidenInductionRecipe *WideIV = |
8896 | cast<VPWidenInductionRecipe>(Val: RecipeBuilder.getRecipe(I: Phi)); |
8897 | VPRecipeBase *R = RecipeBuilder.getRecipe(I: IVInc); |
8898 | R->setOperand(I: 1, New: WideIV->getStepValue()); |
8899 | } |
8900 | |
8901 | DenseMap<VPValue *, VPValue *> IVEndValues; |
8902 | addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues); |
8903 | SetVector<VPIRInstruction *> ExitUsersToFix = |
8904 | collectUsersInLatchExitBlock(Plan&: *Plan); |
8905 | addExitUsersForFirstOrderRecurrences(Plan&: *Plan, ExitUsersToFix, Range); |
8906 | addUsersInExitBlocks(Plan&: *Plan, ExitUsersToFix); |
8907 | |
8908 | // --------------------------------------------------------------------------- |
8909 | // Transform initial VPlan: Apply previously taken decisions, in order, to |
8910 | // bring the VPlan to its final state. |
8911 | // --------------------------------------------------------------------------- |
8912 | |
// Adjust the recipes for any in-loop reductions.
8914 | adjustRecipesForReductions(Plan, RecipeBuilder, MinVF: Range.Start); |
8915 | |
8916 | // Transform recipes to abstract recipes if it is legal and beneficial and |
8917 | // clamp the range for better cost estimation. |
8918 | // TODO: Enable following transform when the EVL-version of extended-reduction |
8919 | // and mulacc-reduction are implemented. |
8920 | if (!CM.foldTailWithEVL()) { |
8921 | VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, |
8922 | CM.CostKind); |
8923 | VPlanTransforms::runPass(Fn: VPlanTransforms::convertToAbstractRecipes, Plan&: *Plan, |
8924 | Args&: CostCtx, Args&: Range); |
8925 | } |
8926 | |
8927 | for (ElementCount VF : Range) |
8928 | Plan->addVF(VF); |
8929 | Plan->setName("Initial VPlan" ); |
8930 | |
8931 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
8932 | // for this VPlan, replace the Recipes widening its memory instructions with a |
8933 | // single VPInterleaveRecipe at its insertion point. |
8934 | VPlanTransforms::runPass(Fn: VPlanTransforms::createInterleaveGroups, Plan&: *Plan, |
8935 | Args: InterleaveGroups, Args&: RecipeBuilder, |
8936 | Args: CM.isScalarEpilogueAllowed()); |
8937 | |
8938 | // Replace VPValues for known constant strides guaranteed by predicate scalar |
8939 | // evolution. |
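// For example, if the SCEV predicates guarantee that a symbolic stride %n
// equals 1, uses of %n (and of its sext/zext) inside the vector loop region
// or its preheader are replaced by the constant 1 below.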
8940 | auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { |
8941 | auto *R = cast<VPRecipeBase>(Val: &U); |
8942 | return R->getParent()->getParent() || |
8943 | R->getParent() == |
8944 | Plan->getVectorLoopRegion()->getSinglePredecessor(); |
8945 | }; |
8946 | for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { |
8947 | auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue(); |
8948 | auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV)); |
8949 | // Only handle constant strides for now. |
8950 | if (!ScevStride) |
8951 | continue; |
8952 | |
8953 | auto *CI = Plan->getOrAddLiveIn( |
8954 | V: ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt())); |
8955 | if (VPValue *StrideVPV = Plan->getLiveIn(V: StrideV)) |
8956 | StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride); |
8957 | |
8958 | // The versioned value may not be used in the loop directly but through a |
8959 | // sext/zext. Add new live-ins in those cases. |
8960 | for (Value *U : StrideV->users()) { |
8961 | if (!isa<SExtInst, ZExtInst>(Val: U)) |
8962 | continue; |
8963 | VPValue *StrideVPV = Plan->getLiveIn(V: U); |
8964 | if (!StrideVPV) |
8965 | continue; |
8966 | unsigned BW = U->getType()->getScalarSizeInBits(); |
8967 | APInt C = isa<SExtInst>(Val: U) ? ScevStride->getAPInt().sext(width: BW) |
8968 | : ScevStride->getAPInt().zext(width: BW); |
8969 | VPValue *CI = Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: U->getType(), V: C)); |
8970 | StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride); |
8971 | } |
8972 | } |
8973 | |
8974 | auto BlockNeedsPredication = [this](BasicBlock *BB) { |
8975 | return Legal->blockNeedsPredication(BB); |
8976 | }; |
8977 | VPlanTransforms::runPass(Fn: VPlanTransforms::dropPoisonGeneratingRecipes, Plan&: *Plan, |
8978 | Args: BlockNeedsPredication); |
8979 | |
8980 | // Sink users of fixed-order recurrence past the recipe defining the previous |
8981 | // value and introduce FirstOrderRecurrenceSplice VPInstructions. |
8982 | if (!VPlanTransforms::runPass(Transform: VPlanTransforms::adjustFixedOrderRecurrences, |
8983 | Plan&: *Plan, Args&: Builder)) |
8984 | return nullptr; |
8985 | |
8986 | if (useActiveLaneMask(Style)) { |
8987 | // TODO: Move checks to VPlanTransforms::addActiveLaneMask once |
8988 | // TailFoldingStyle is visible there. |
8989 | bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); |
8990 | bool WithoutRuntimeCheck = |
8991 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
8992 | VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow, |
8993 | DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck); |
8994 | } |
8995 | VPlanTransforms::optimizeInductionExitUsers(Plan&: *Plan, EndValues&: IVEndValues); |
8996 | |
8997 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
8998 | return Plan; |
8999 | } |
9000 | |
9001 | VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { |
9002 | // Outer loop handling: They may require CFG and instruction level |
9003 | // transformations before even evaluating whether vectorization is profitable. |
9004 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
9005 | // the vectorization pipeline. |
9006 | assert(!OrigLoop->isInnermost()); |
9007 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled." ); |
9008 | |
9009 | auto Plan = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI); |
9010 | VPlanTransforms::prepareForVectorization( |
9011 | Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck: true, TailFolded: false, TheLoop: OrigLoop, |
9012 | IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), HasUncountableExit: false, |
9013 | Range); |
9014 | VPlanTransforms::createLoopRegions(Plan&: *Plan); |
9015 | |
9016 | for (ElementCount VF : Range) |
9017 | Plan->addVF(VF); |
9018 | |
9019 | if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( |
9020 | Plan, |
9021 | GetIntOrFpInductionDescriptor: [this](PHINode *P) { |
9022 | return Legal->getIntOrFpInductionDescriptor(Phi: P); |
9023 | }, |
9024 | SE&: *PSE.getSE(), TLI: *TLI)) |
9025 | return nullptr; |
9026 | |
9027 | // Collect mapping of IR header phis to header phi recipes, to be used in |
9028 | // addScalarResumePhis. |
9029 | DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache; |
9030 | VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, |
9031 | Builder, BlockMaskCache, nullptr /*LVer*/); |
9032 | for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
9033 | if (isa<VPCanonicalIVPHIRecipe>(Val: &R)) |
9034 | continue; |
auto *HeaderR = cast<VPHeaderPHIRecipe>(Val: &R);
9036 | RecipeBuilder.setRecipe(I: HeaderR->getUnderlyingInstr(), R: HeaderR); |
9037 | } |
9038 | DenseMap<VPValue *, VPValue *> IVEndValues; |
9039 | // TODO: IVEndValues are not used yet in the native path, to optimize exit |
9040 | // values. |
9041 | addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues); |
9042 | |
9043 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
9044 | return Plan; |
9045 | } |
9046 | |
9047 | // Adjust the recipes for reductions. For in-loop reductions the chain of |
// instructions leading from the loop exit instr to the phi needs to be converted
9049 | // to reductions, with one operand being vector and the other being the scalar |
9050 | // reduction chain. For other reductions, a select is introduced between the phi |
9051 | // and users outside the vector region when folding the tail. |
9052 | // |
9053 | // A ComputeReductionResult recipe is added to the middle block, also for |
9054 | // in-loop reductions which compute their result in-loop, because generating |
9055 | // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. |
9056 | // |
9057 | // Adjust AnyOf reductions; replace the reduction phi for the selected value |
9058 | // with a boolean reduction phi node to check if the condition is true in any |
// iteration. The final value is selected by the final ComputeAnyOfResult.
9060 | void LoopVectorizationPlanner::adjustRecipesForReductions( |
9061 | VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { |
9062 | using namespace VPlanPatternMatch; |
9063 | VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); |
VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9065 | VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock(); |
9066 | SmallVector<VPRecipeBase *> ToDelete; |
9067 | |
9068 | for (VPRecipeBase &R : Header->phis()) { |
9069 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
9070 | if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) |
9071 | continue; |
9072 | |
9073 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
9074 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
9075 | assert( |
9076 | !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
9077 | !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) && |
9078 | "AnyOf and FindIV reductions are not allowed for in-loop reductions" ); |
9079 | |
9080 | // Collect the chain of "link" recipes for the reduction starting at PhiR. |
9081 | SetVector<VPSingleDefRecipe *> Worklist; |
9082 | Worklist.insert(X: PhiR); |
9083 | for (unsigned I = 0; I != Worklist.size(); ++I) { |
9084 | VPSingleDefRecipe *Cur = Worklist[I]; |
9085 | for (VPUser *U : Cur->users()) { |
9086 | auto *UserRecipe = cast<VPSingleDefRecipe>(Val: U); |
9087 | if (!UserRecipe->getParent()->getEnclosingLoopRegion()) { |
9088 | assert((UserRecipe->getParent() == MiddleVPBB || |
9089 | UserRecipe->getParent() == Plan->getScalarPreheader()) && |
9090 | "U must be either in the loop region, the middle block or the " |
9091 | "scalar preheader." ); |
9092 | continue; |
9093 | } |
9094 | Worklist.insert(X: UserRecipe); |
9095 | } |
9096 | } |
9097 | |
9098 | // Visit operation "Links" along the reduction chain top-down starting from |
9099 | // the phi until LoopExitValue. We keep track of the previous item |
9100 | // (PreviousLink) to tell which of the two operands of a Link will remain |
9101 | // scalar and which will be reduced. For minmax by select(cmp), Link will be |
// the select instructions. Blend recipes of in-loop reduction phis will
9103 | // get folded to their non-phi operand, as the reduction recipe handles the |
9104 | // condition directly. |
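// For example, an in-loop integer add reduction chain
//   %sum.next = add %sum.phi, %x
// is rewritten below into a VPReductionRecipe that reduces the vector operand
// %x into the scalar chain value produced by the previous link.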
9105 | VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. |
9106 | for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { |
9107 | if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink)) { |
9108 | assert(Blend->getNumIncomingValues() == 2 && |
9109 | "Blend must have 2 incoming values" ); |
9110 | if (Blend->getIncomingValue(Idx: 0) == PhiR) { |
9111 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1)); |
9112 | } else { |
9113 | assert(Blend->getIncomingValue(1) == PhiR && |
9114 | "PhiR must be an operand of the blend" ); |
9115 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0)); |
9116 | } |
9117 | continue; |
9118 | } |
9119 | |
9120 | Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); |
9121 | |
9122 | // Index of the first operand which holds a non-mask vector operand. |
9123 | unsigned IndexOfFirstOperand; |
9124 | // Recognize a call to the llvm.fmuladd intrinsic. |
9125 | bool IsFMulAdd = (Kind == RecurKind::FMulAdd); |
9126 | VPValue *VecOp; |
9127 | VPBasicBlock *LinkVPBB = CurrentLink->getParent(); |
9128 | if (IsFMulAdd) { |
9129 | assert( |
9130 | RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && |
9131 | "Expected instruction to be a call to the llvm.fmuladd intrinsic" ); |
9132 | assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || |
9133 | isa<VPWidenIntrinsicRecipe>(CurrentLink)) && |
9134 | CurrentLink->getOperand(2) == PreviousLink && |
9135 | "expected a call where the previous link is the added operand" ); |
9136 | |
9137 | // If the instruction is a call to the llvm.fmuladd intrinsic then we |
9138 | // need to create an fmul recipe (multiplying the first two operands of |
9139 | // the fmuladd together) to use as the vector operand for the fadd |
9140 | // reduction. |
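// For example, %r = call @llvm.fmuladd(%a, %b, %sum) becomes
// %mul = fmul %a, %b followed by an fadd reduction of %mul into the chain.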
9141 | VPInstruction *FMulRecipe = new VPInstruction( |
9142 | Instruction::FMul, |
9143 | {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)}, |
9144 | CurrentLinkI->getFastMathFlags()); |
9145 | LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator()); |
9146 | VecOp = FMulRecipe; |
9147 | } else { |
9148 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
9149 | if (isa<VPWidenRecipe>(Val: CurrentLink)) { |
9150 | assert(isa<CmpInst>(CurrentLinkI) && |
9151 | "need to have the compare of the select" ); |
9152 | continue; |
9153 | } |
9154 | assert(isa<VPWidenSelectRecipe>(CurrentLink) && |
9155 | "must be a select recipe" ); |
9156 | IndexOfFirstOperand = 1; |
9157 | } else { |
9158 | assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && |
9159 | "Expected to replace a VPWidenSC" ); |
9160 | IndexOfFirstOperand = 0; |
9161 | } |
9162 | // Note that for non-commutable operands (cmp-selects), the semantics of |
9163 | // the cmp-select are captured in the recurrence kind. |
9164 | unsigned VecOpId = |
9165 | CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink |
9166 | ? IndexOfFirstOperand + 1 |
9167 | : IndexOfFirstOperand; |
9168 | VecOp = CurrentLink->getOperand(N: VecOpId); |
9169 | assert(VecOp != PreviousLink && |
9170 | CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - |
9171 | (VecOpId - IndexOfFirstOperand)) == |
9172 | PreviousLink && |
9173 | "PreviousLink must be the operand other than VecOp" ); |
9174 | } |
9175 | |
9176 | VPValue *CondOp = nullptr; |
9177 | if (CM.blockNeedsPredicationForAnyReason(BB: CurrentLinkI->getParent())) |
9178 | CondOp = RecipeBuilder.getBlockInMask(VPBB: CurrentLink->getParent()); |
9179 | |
9180 | // Non-FP RdxDescs will have all fast math flags set, so clear them. |
9181 | FastMathFlags FMFs = isa<FPMathOperator>(Val: CurrentLinkI) |
9182 | ? RdxDesc.getFastMathFlags() |
9183 | : FastMathFlags(); |
9184 | auto *RedRecipe = new VPReductionRecipe( |
9185 | Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp, |
9186 | CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc()); |
9187 | // Append the recipe to the end of the VPBasicBlock because we need to |
// ensure that it comes after all of its inputs, including CondOp.
9189 | // Delete CurrentLink as it will be invalid if its operand is replaced |
9190 | // with a reduction defined at the bottom of the block in the next link. |
9191 | if (LinkVPBB->getNumSuccessors() == 0) |
9192 | RedRecipe->insertBefore(InsertPos: &*std::prev(x: std::prev(x: LinkVPBB->end()))); |
9193 | else |
9194 | LinkVPBB->appendRecipe(Recipe: RedRecipe); |
9195 | |
9196 | CurrentLink->replaceAllUsesWith(New: RedRecipe); |
9197 | ToDelete.push_back(Elt: CurrentLink); |
9198 | PreviousLink = RedRecipe; |
9199 | } |
9200 | } |
9201 | VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock(); |
9202 | Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end()))); |
9203 | VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi(); |
9204 | for (VPRecipeBase &R : |
9205 | Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
9206 | VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
9207 | if (!PhiR) |
9208 | continue; |
9209 | |
9210 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
9211 | Type *PhiTy = PhiR->getUnderlyingValue()->getType(); |
9212 | // If tail is folded by masking, introduce selects between the phi |
9213 | // and the users outside the vector region of each reduction, at the |
9214 | // beginning of the dedicated latch block. |
9215 | auto *OrigExitingVPV = PhiR->getBackedgeValue(); |
9216 | auto *NewExitingVPV = PhiR->getBackedgeValue(); |
9217 | // Don't output selects for partial reductions because they have an output |
9218 | // with fewer lanes than the VF. So the operands of the select would have |
9219 | // different numbers of lanes. Partial reductions mask the input instead. |
9220 | if (!PhiR->isInLoop() && CM.foldTailByMasking() && |
9221 | !isa<VPPartialReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe())) { |
9222 | VPValue *Cond = RecipeBuilder.getBlockInMask(VPBB: PhiR->getParent()); |
9223 | std::optional<FastMathFlags> FMFs = |
9224 | PhiTy->isFloatingPointTy() |
9225 | ? std::make_optional(t: RdxDesc.getFastMathFlags()) |
9226 | : std::nullopt; |
9227 | NewExitingVPV = |
9228 | Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "" , FMFs); |
9229 | OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) { |
9230 | return isa<VPInstruction>(Val: &U) && |
9231 | (cast<VPInstruction>(Val: &U)->getOpcode() == |
9232 | VPInstruction::ComputeAnyOfResult || |
9233 | cast<VPInstruction>(Val: &U)->getOpcode() == |
9234 | VPInstruction::ComputeReductionResult || |
9235 | cast<VPInstruction>(Val: &U)->getOpcode() == |
9236 | VPInstruction::ComputeFindIVResult); |
9237 | }); |
9238 | if (CM.usePredicatedReductionSelect()) |
9239 | PhiR->setOperand(I: 1, New: NewExitingVPV); |
9240 | } |
9241 | |
9242 | // We want code in the middle block to appear to execute on the location of |
9243 | // the scalar loop's latch terminator because: (a) it is all compiler |
9244 | // generated, (b) these instructions are always executed after evaluating |
9245 | // the latch conditional branch, and (c) other passes may add new |
9246 | // predecessors which terminate on this line. This is the easiest way to |
9247 | // ensure we don't accidentally cause an extra step back into the loop while |
9248 | // debugging. |
9249 | DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); |
9250 | |
9251 | // TODO: At the moment ComputeReductionResult also drives creation of the |
9252 | // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here |
9253 | // even for in-loop reductions, until the reduction resume value handling is |
9254 | // also modeled in VPlan. |
9255 | VPInstruction *FinalReductionResult; |
9256 | VPBuilder::InsertPointGuard Guard(Builder); |
9257 | Builder.setInsertPoint(TheBB: MiddleVPBB, IP); |
9258 | if (RecurrenceDescriptor::isFindIVRecurrenceKind( |
9259 | Kind: RdxDesc.getRecurrenceKind())) { |
9260 | VPValue *Start = PhiR->getStartValue(); |
9261 | VPValue *Sentinel = Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue()); |
9262 | FinalReductionResult = |
9263 | Builder.createNaryOp(Opcode: VPInstruction::ComputeFindIVResult, |
9264 | Operands: {PhiR, Start, Sentinel, NewExitingVPV}, DL: ExitDL); |
9265 | } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind( |
9266 | Kind: RdxDesc.getRecurrenceKind())) { |
9267 | VPValue *Start = PhiR->getStartValue(); |
9268 | FinalReductionResult = |
9269 | Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult, |
9270 | Operands: {PhiR, Start, NewExitingVPV}, DL: ExitDL); |
9271 | } else { |
9272 | VPIRFlags Flags = RecurrenceDescriptor::isFloatingPointRecurrenceKind( |
9273 | Kind: RdxDesc.getRecurrenceKind()) |
9274 | ? VPIRFlags(RdxDesc.getFastMathFlags()) |
9275 | : VPIRFlags(); |
9276 | FinalReductionResult = |
9277 | Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult, |
9278 | Operands: {PhiR, NewExitingVPV}, Flags, DL: ExitDL); |
9279 | } |
9280 | // If the vector reduction can be performed in a smaller type, we truncate |
9281 | // then extend the loop exit value to enable InstCombine to evaluate the |
9282 | // entire expression in the smaller type. |
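// For example, an add reduction of i8 values accumulated in an i32 phi is
// truncated to i8 here, and the final result is sign- or zero-extended back
// to i32 depending on the signedness recorded in the recurrence descriptor.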
9283 | if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && |
9284 | !RecurrenceDescriptor::isAnyOfRecurrenceKind( |
9285 | Kind: RdxDesc.getRecurrenceKind())) { |
9286 | assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!" ); |
9287 | assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind( |
9288 | RdxDesc.getRecurrenceKind()) && |
9289 | "Unexpected truncated min-max recurrence!" ); |
9290 | Type *RdxTy = RdxDesc.getRecurrenceType(); |
9291 | auto *Trunc = |
9292 | new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); |
9293 | Instruction::CastOps ExtendOpc = |
9294 | RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt; |
9295 | auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy); |
9296 | Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe()); |
9297 | Extnd->insertAfter(InsertPos: Trunc); |
9298 | if (PhiR->getOperand(N: 1) == NewExitingVPV) |
9299 | PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue()); |
9300 | |
9301 | // Update ComputeReductionResult with the truncated exiting value and |
9302 | // extend its result. |
9303 | FinalReductionResult->setOperand(I: 1, New: Trunc); |
9304 | FinalReductionResult = |
9305 | Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {}); |
9306 | } |
9307 | |
9308 | // Update all users outside the vector region. Also replace redundant |
9309 | // ExtractLastElement. |
9310 | for (auto *U : to_vector(Range: OrigExitingVPV->users())) { |
9311 | auto *Parent = cast<VPRecipeBase>(Val: U)->getParent(); |
9312 | if (FinalReductionResult == U || Parent->getParent()) |
9313 | continue; |
9314 | U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult); |
9315 | if (match(U, P: m_VPInstruction<VPInstruction::ExtractLastElement>( |
9316 | Op0: m_VPValue()))) |
9317 | cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult); |
9318 | } |
9319 | |
9320 | // Adjust AnyOf reductions; replace the reduction phi for the selected value |
9321 | // with a boolean reduction phi node to check if the condition is true in |
9322 | // any iteration. The final value is selected by the final |
// ComputeAnyOfResult.
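// For example, for 'r = cond ? b : r' with loop-invariant b, the select is
// replaced by 'any.of = or(any.of.phi, cond)' and ComputeAnyOfResult in the
// middle block later picks either b or the start value of r, depending on
// whether the condition was true in any iteration.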
9324 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind( |
9325 | Kind: RdxDesc.getRecurrenceKind())) { |
9326 | auto *Select = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) { |
9327 | return isa<VPWidenSelectRecipe>(Val: U) || |
9328 | (isa<VPReplicateRecipe>(Val: U) && |
9329 | cast<VPReplicateRecipe>(Val: U)->getUnderlyingInstr()->getOpcode() == |
9330 | Instruction::Select); |
9331 | })); |
9332 | VPValue *Cmp = Select->getOperand(N: 0); |
9333 | // If the compare is checking the reduction PHI node, adjust it to check |
9334 | // the start value. |
9335 | if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) |
9336 | CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue()); |
9337 | Builder.setInsertPoint(Select); |
9338 | |
9339 | // If the true value of the select is the reduction phi, the new value is |
9340 | // selected if the negated condition is true in any iteration. |
9341 | if (Select->getOperand(N: 1) == PhiR) |
9342 | Cmp = Builder.createNot(Operand: Cmp); |
9343 | VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp); |
9344 | Select->getVPSingleValue()->replaceAllUsesWith(New: Or); |
9345 | // Delete Select now that it has invalid types. |
9346 | ToDelete.push_back(Elt: Select); |
9347 | |
9348 | // Convert the reduction phi to operate on bools. |
9349 | PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: ConstantInt::getFalse( |
9350 | Context&: OrigLoop->getHeader()->getContext()))); |
9351 | continue; |
9352 | } |
9353 | |
9354 | if (RecurrenceDescriptor::isFindIVRecurrenceKind( |
9355 | Kind: RdxDesc.getRecurrenceKind())) { |
9356 | // Adjust the start value for FindFirstIV/FindLastIV recurrences to use |
9357 | // the sentinel value after generating the ResumePhi recipe, which uses |
9358 | // the original start value. |
9359 | PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue())); |
9360 | } |
9361 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
9362 | if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) && |
9363 | !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) && |
9364 | !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))) { |
9365 | VPBuilder PHBuilder(Plan->getVectorPreheader()); |
9366 | VPValue *Iden = Plan->getOrAddLiveIn( |
9367 | V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: RdxDesc.getFastMathFlags())); |
9368 | // If the PHI is used by a partial reduction, set the scale factor. |
9369 | unsigned ScaleFactor = |
9370 | RecipeBuilder.getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr()) |
9371 | .value_or(u: 1); |
9372 | Type *I32Ty = IntegerType::getInt32Ty(C&: PhiTy->getContext()); |
9373 | auto *ScaleFactorVPV = |
9374 | Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: ScaleFactor)); |
9375 | VPValue *StartV = PHBuilder.createNaryOp( |
9376 | Opcode: VPInstruction::ReductionStartVector, |
9377 | Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV}, |
9378 | Flags: PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags() |
9379 | : FastMathFlags()); |
9380 | PhiR->setOperand(I: 0, New: StartV); |
9381 | } |
9382 | } |
9383 | for (VPRecipeBase *R : ToDelete) |
9384 | R->eraseFromParent(); |
9385 | |
9386 | VPlanTransforms::runPass(Fn: VPlanTransforms::clearReductionWrapFlags, Plan&: *Plan); |
9387 | } |
9388 | |
9389 | void VPDerivedIVRecipe::execute(VPTransformState &State) { |
9390 | assert(!State.Lane && "VPDerivedIVRecipe being replicated." ); |
9391 | |
9392 | // Fast-math-flags propagate from the original induction instruction. |
9393 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
9394 | if (FPBinOp) |
9395 | State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); |
9396 | |
9397 | Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0)); |
9398 | Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0)); |
9399 | Value *DerivedIV = emitTransformedIndex( |
9400 | B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind, |
9401 | InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp)); |
9402 | DerivedIV->setName(Name); |
9403 | // If index is the vector trip count, the concrete value will only be set in |
9404 | // prepareToExecute, leading to missed simplifications, e.g. if it is 0. |
9405 | // TODO: Remove the special case for the vector trip count once it is computed |
9406 | // in VPlan and can be used during VPlan simplification. |
9407 | assert((DerivedIV != Index || |
9408 | getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) && |
9409 | "IV didn't need transforming?" ); |
9410 | State.set(Def: this, V: DerivedIV, Lane: VPLane(0)); |
9411 | } |
9412 | |
9413 | // Determine how to lower the scalar epilogue, which depends on 1) optimising |
9414 | // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing |
9415 | // predication, and 4) a TTI hook that analyses whether the loop is suitable |
9416 | // for predication. |
9417 | static ScalarEpilogueLowering getScalarEpilogueLowering( |
9418 | Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, |
9419 | BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, |
9420 | LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { |
9421 | // 1) OptSize takes precedence over all other options, i.e. if this is set, |
9422 | // don't look at hints or options, and don't request a scalar epilogue. |
9423 | // (For PGSO, as shouldOptimizeForSize isn't currently accessible from |
9424 | // LoopAccessInfo (due to code dependency and not being able to reliably get |
9425 | // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection |
9426 | // of strides in LoopAccessInfo::analyzeLoop() and vectorize without |
9427 | // versioning when the vectorization is forced, unlike hasOptSize. So revert |
9428 | // back to the old way and vectorize with versioning when forced. See D81345.) |
9429 | if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI, |
9430 | QueryType: PGSOQueryType::IRPass) && |
9431 | Hints.getForce() != LoopVectorizeHints::FK_Enabled)) |
9432 | return CM_ScalarEpilogueNotAllowedOptSize; |
9433 | |
9434 | // 2) If set, obey the directives |
9435 | if (PreferPredicateOverEpilogue.getNumOccurrences()) { |
9436 | switch (PreferPredicateOverEpilogue) { |
9437 | case PreferPredicateTy::ScalarEpilogue: |
9438 | return CM_ScalarEpilogueAllowed; |
9439 | case PreferPredicateTy::PredicateElseScalarEpilogue: |
9440 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9441 | case PreferPredicateTy::PredicateOrDontVectorize: |
9442 | return CM_ScalarEpilogueNotAllowedUsePredicate; |
9443 | }; |
9444 | } |
9445 | |
9446 | // 3) If set, obey the hints |
9447 | switch (Hints.getPredicate()) { |
9448 | case LoopVectorizeHints::FK_Enabled: |
9449 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9450 | case LoopVectorizeHints::FK_Disabled: |
9451 | return CM_ScalarEpilogueAllowed; |
9452 | }; |
9453 | |
9454 | // 4) if the TTI hook indicates this is profitable, request predication. |
9455 | TailFoldingInfo TFI(TLI, &LVL, IAI); |
9456 | if (TTI->preferPredicateOverEpilogue(TFI: &TFI)) |
9457 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9458 | |
9459 | return CM_ScalarEpilogueAllowed; |
9460 | } |
9461 | |
9462 | // Process the loop in the VPlan-native vectorization path. This path builds |
9463 | // VPlan upfront in the vectorization pipeline, which allows to apply |
9464 | // VPlan-to-VPlan transformations from the very beginning without modifying the |
9465 | // input LLVM IR. |
9466 | static bool processLoopInVPlanNativePath( |
9467 | Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, |
9468 | LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, |
9469 | TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, |
9470 | OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, |
9471 | ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, |
9472 | LoopVectorizationRequirements &Requirements) { |
9473 | |
9474 | if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) { |
9475 | LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n" ); |
9476 | return false; |
9477 | } |
9478 | assert(EnableVPlanNativePath && "VPlan-native path is disabled." ); |
9479 | Function *F = L->getHeader()->getParent(); |
9480 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); |
9481 | |
9482 | ScalarEpilogueLowering SEL = |
9483 | getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI); |
9484 | |
9485 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, |
9486 | &Hints, IAI, PSI, BFI); |
9487 | // Use the planner for outer loop vectorization. |
9488 | // TODO: CM is not used at this point inside the planner. Turn CM into an |
9489 | // optional argument if we don't need it in the future. |
9490 | LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, |
9491 | ORE); |
9492 | |
9493 | // Get user vectorization factor. |
9494 | ElementCount UserVF = Hints.getWidth(); |
9495 | |
9496 | CM.collectElementTypesForWidening(); |
9497 | |
9498 | // Plan how to best vectorize, return the best VF and its cost. |
9499 | const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); |
9500 | |
9501 | // If we are stress testing VPlan builds, do not attempt to generate vector |
9502 | // code. Masked vector code generation support will follow soon. |
9503 | // Also, do not attempt to vectorize if no vector code will be produced. |
9504 | if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) |
9505 | return false; |
9506 | |
9507 | VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width); |
9508 | |
9509 | { |
9510 | bool AddBranchWeights = |
9511 | hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator()); |
9512 | GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), |
9513 | AddBranchWeights, CM.CostKind); |
9514 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, |
9515 | VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan); |
9516 | LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" |
9517 | << L->getHeader()->getParent()->getName() << "\"\n" ); |
9518 | LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false); |
9519 | } |
9520 | |
9521 | reportVectorization(ORE, TheLoop: L, VF, IC: 1); |
9522 | |
9523 | // Mark the loop as already vectorized to avoid vectorizing again. |
9524 | Hints.setAlreadyVectorized(); |
9525 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); |
9526 | return true; |
9527 | } |
9528 | |
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point
// conversions, there will be a performance penalty from the conversion
// overhead and the change in the vector width.
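// For example (illustrative source, not taken from an actual test case), a
// loop such as
//   void scale(float *A, double D, int N) {
//     for (int I = 0; I < N; ++I)
//       A[I] *= D; // A[I] is fpext'd to double, fptrunc'd back on the store
//   }
// stores floats whose computation required an fpext, so the remark emitted
// below points out that mixing float and double halves the effective vector
// width.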
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9534 | SmallVector<Instruction *, 4> Worklist; |
9535 | for (BasicBlock *BB : L->getBlocks()) { |
9536 | for (Instruction &Inst : *BB) { |
9537 | if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) { |
9538 | if (S->getValueOperand()->getType()->isFloatTy()) |
9539 | Worklist.push_back(Elt: S); |
9540 | } |
9541 | } |
9542 | } |
9543 | |
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
9546 | SmallPtrSet<const Instruction *, 4> Visited; |
  SmallPtrSet<const Instruction *, 4> EmittedRemark;
9548 | while (!Worklist.empty()) { |
9549 | auto *I = Worklist.pop_back_val(); |
9550 | if (!L->contains(Inst: I)) |
9551 | continue; |
9552 | if (!Visited.insert(Ptr: I).second) |
9553 | continue; |
9554 | |
9555 | // Emit a remark if the floating point store required a floating |
9556 | // point conversion. |
9557 | // TODO: More work could be done to identify the root cause such as a |
9558 | // constant or a function return type and point the user to it. |
9559 | if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second) |
9560 | ORE->emit(RemarkBuilder: [&]() { |
9561 | return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision" , |
9562 | I->getDebugLoc(), L->getHeader()) |
9563 | << "floating point conversion changes vector width. " |
9564 | << "Mixed floating point precision requires an up/down " |
9565 | << "cast that will negatively impact performance." ; |
9566 | }); |
9567 | |
9568 | for (Use &Op : I->operands()) |
9569 | if (auto *OpI = dyn_cast<Instruction>(Val&: Op)) |
9570 | Worklist.push_back(Elt: OpI); |
9571 | } |
9572 | } |
9573 | |
9574 | /// For loops with uncountable early exits, find the cost of doing work when |
9575 | /// exiting the loop early, such as calculating the final exit values of |
9576 | /// variables used outside the loop. |
9577 | /// TODO: This is currently overly pessimistic because the loop may not take |
9578 | /// the early exit, but better to keep this conservative for now. In future, |
9579 | /// it might be possible to relax this by using branch probabilities. |
9580 | static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx, |
9581 | VPlan &Plan, ElementCount VF) { |
9582 | InstructionCost Cost = 0; |
9583 | for (auto *ExitVPBB : Plan.getExitBlocks()) { |
9584 | for (auto *PredVPBB : ExitVPBB->getPredecessors()) { |
9585 | // If the predecessor is not the middle.block, then it must be the |
9586 | // vector.early.exit block, which may contain work to calculate the exit |
9587 | // values of variables used outside the loop. |
9588 | if (PredVPBB != Plan.getMiddleBlock()) { |
9589 | LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block " |
9590 | << PredVPBB->getName() << ":\n" ); |
9591 | Cost += PredVPBB->cost(VF, Ctx&: CostCtx); |
9592 | } |
9593 | } |
9594 | } |
9595 | return Cost; |
9596 | } |
9597 | |
9598 | /// This function determines whether or not it's still profitable to vectorize |
9599 | /// the loop given the extra work we have to do outside of the loop: |
9600 | /// 1. Perform the runtime checks before entering the loop to ensure it's safe |
9601 | /// to vectorize. |
9602 | /// 2. In the case of loops with uncountable early exits, we may have to do |
9603 | /// extra work when exiting the loop early, such as calculating the final |
9604 | /// exit values of variables used outside the loop. |
9605 | static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, |
9606 | VectorizationFactor &VF, Loop *L, |
9607 | PredicatedScalarEvolution &PSE, |
9608 | VPCostContext &CostCtx, VPlan &Plan, |
9609 | ScalarEpilogueLowering SEL, |
9610 | std::optional<unsigned> VScale) { |
9611 | InstructionCost TotalCost = Checks.getCost(); |
9612 | if (!TotalCost.isValid()) |
9613 | return false; |
9614 | |
9615 | // Add on the cost of any work required in the vector early exit block, if |
9616 | // one exists. |
9617 | TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width); |
9618 | |
  // When only interleaving (VF = 1), the scalar and vector costs are equal,
  // which in turn would lead to a divide by 0 below. Fall back to a hard
  // threshold.
9621 | if (VF.Width.isScalar()) { |
9622 | // TODO: Should we rename VectorizeMemoryCheckThreshold? |
9623 | if (TotalCost > VectorizeMemoryCheckThreshold) { |
9624 | LLVM_DEBUG( |
9625 | dbgs() |
9626 | << "LV: Interleaving only is not profitable due to runtime checks\n" ); |
9627 | return false; |
9628 | } |
9629 | return true; |
9630 | } |
9631 | |
9632 | // The scalar cost should only be 0 when vectorizing with a user specified |
9633 | // VF/IC. In those cases, runtime checks should always be generated. |
9634 | uint64_t ScalarC = VF.ScalarCost.getValue(); |
9635 | if (ScalarC == 0) |
9636 | return true; |
9637 | |
9638 | // First, compute the minimum iteration count required so that the vector |
9639 | // loop outperforms the scalar loop. |
9640 | // The total cost of the scalar loop is |
9641 | // ScalarC * TC |
9642 | // where |
9643 | // * TC is the actual trip count of the loop. |
9644 | // * ScalarC is the cost of a single scalar iteration. |
9645 | // |
9646 | // The total cost of the vector loop is |
9647 | // RtC + VecC * (TC / VF) + EpiC |
9648 | // where |
9649 | // * RtC is the cost of the generated runtime checks plus the cost of |
9650 | // performing any additional work in the vector.early.exit block for loops |
9651 | // with uncountable early exits. |
9652 | // * VecC is the cost of a single vector iteration. |
9653 | // * TC is the actual trip count of the loop |
9654 | // * VF is the vectorization factor |
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
9657 | // |
9658 | // Vectorization is profitable once the total vector cost is less than the |
9659 | // total scalar cost: |
9660 | // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC |
9661 | // |
9662 | // Now we can compute the minimum required trip count TC as |
9663 | // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC |
9664 | // |
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the division below is rounded up, hence we get an upper estimate of the
  // minimum required TC.
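  // As a purely illustrative example (the costs are made up): with ScalarC =
  // 4, VecC = 10, RtC = 30 and an estimated runtime VF of 4, Div = 4 * 4 - 10
  // = 6 and MinTC1 = ceil(30 * 4 / 6) = 20, i.e. the loop must run at least
  // 20 iterations before the runtime-check overhead is amortized by the
  // cheaper vector iterations.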
9668 | unsigned IntVF = getEstimatedRuntimeVF(VF: VF.Width, VScale); |
9669 | uint64_t RtC = TotalCost.getValue(); |
9670 | uint64_t Div = ScalarC * IntVF - VF.Cost.getValue(); |
9671 | uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div); |
9672 | |
9673 | // Second, compute a minimum iteration count so that the cost of the |
9674 | // runtime checks is only a fraction of the total scalar loop cost. This |
9675 | // adds a loop-dependent bound on the overhead incurred if the runtime |
9676 | // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC |
9677 | // * TC. To bound the runtime check to be a fraction 1/X of the scalar |
9678 | // cost, compute |
9679 | // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC |
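  // Continuing the illustrative example above with X = 10 (the hard-coded
  // factor below), MinTC2 = ceil(30 * 10 / 4) = 75, so in that example the
  // bound on runtime-check overhead, not the amortization bound, determines
  // the minimum profitable trip count.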
9680 | uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC); |
9681 | |
9682 | // Now pick the larger minimum. If it is not a multiple of VF and a scalar |
9683 | // epilogue is allowed, choose the next closest multiple of VF. This should |
9684 | // partly compensate for ignoring the epilogue cost. |
9685 | uint64_t MinTC = std::max(a: MinTC1, b: MinTC2); |
9686 | if (SEL == CM_ScalarEpilogueAllowed) |
9687 | MinTC = alignTo(Value: MinTC, Align: IntVF); |
9688 | VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC); |
9689 | |
9690 | LLVM_DEBUG( |
9691 | dbgs() << "LV: Minimum required TC for runtime checks to be profitable:" |
9692 | << VF.MinProfitableTripCount << "\n" ); |
9693 | |
9694 | // Skip vectorization if the expected trip count is less than the minimum |
9695 | // required trip count. |
9696 | if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { |
9697 | if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) { |
9698 | LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " |
9699 | "trip count < minimum profitable VF (" |
9700 | << *ExpectedTC << " < " << VF.MinProfitableTripCount |
9701 | << ")\n" ); |
9702 | |
9703 | return false; |
9704 | } |
9705 | } |
9706 | return true; |
9707 | } |
9708 | |
9709 | LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts) |
9710 | : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced || |
9711 | !EnableLoopInterleaving), |
9712 | VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || |
9713 | !EnableLoopVectorization) {} |
9714 | |
9715 | /// Prepare \p MainPlan for vectorizing the main vector loop during epilogue |
9716 | /// vectorization. Remove ResumePhis from \p MainPlan for inductions that |
9717 | /// don't have a corresponding wide induction in \p EpiPlan. |
9718 | static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { |
9719 | // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those |
9720 | // will need their resume-values computed in the main vector loop. Others |
9721 | // can be removed from the main VPlan. |
9722 | SmallPtrSet<PHINode *, 2> EpiWidenedPhis; |
9723 | for (VPRecipeBase &R : |
9724 | EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
9725 | if (isa<VPCanonicalIVPHIRecipe>(Val: &R)) |
9726 | continue; |
9727 | EpiWidenedPhis.insert( |
9728 | Ptr: cast<PHINode>(Val: R.getVPSingleValue()->getUnderlyingValue())); |
9729 | } |
9730 | for (VPRecipeBase &R : |
9731 | make_early_inc_range(Range: MainPlan.getScalarHeader()->phis())) { |
9732 | auto *VPIRInst = cast<VPIRPhi>(Val: &R); |
9733 | if (EpiWidenedPhis.contains(Ptr: &VPIRInst->getIRPhi())) |
9734 | continue; |
9735 | // There is no corresponding wide induction in the epilogue plan that would |
9736 | // need a resume value. Remove the VPIRInst wrapping the scalar header phi |
9737 | // together with the corresponding ResumePhi. The resume values for the |
9738 | // scalar loop will be created during execution of EpiPlan. |
9739 | VPRecipeBase *ResumePhi = VPIRInst->getOperand(N: 0)->getDefiningRecipe(); |
9740 | VPIRInst->eraseFromParent(); |
9741 | ResumePhi->eraseFromParent(); |
9742 | } |
9743 | VPlanTransforms::runPass(Fn: VPlanTransforms::removeDeadRecipes, Plan&: MainPlan); |
9744 | |
9745 | using namespace VPlanPatternMatch; |
9746 | // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can |
9747 | // introduce multiple uses of undef/poison. If the reduction start value may |
9748 | // be undef or poison it needs to be frozen and the frozen start has to be |
9749 | // used when computing the reduction result. We also need to use the frozen |
9750 | // value in the resume phi generated by the main vector loop, as this is also |
9751 | // used to compute the reduction result after the epilogue vector loop. |
9752 | auto AddFreezeForFindLastIVReductions = [](VPlan &Plan, |
9753 | bool UpdateResumePhis) { |
9754 | VPBuilder Builder(Plan.getEntry()); |
9755 | for (VPRecipeBase &R : *Plan.getMiddleBlock()) { |
9756 | auto *VPI = dyn_cast<VPInstruction>(Val: &R); |
9757 | if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult) |
9758 | continue; |
9759 | VPValue *OrigStart = VPI->getOperand(N: 1); |
9760 | if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue())) |
9761 | continue; |
9762 | VPInstruction *Freeze = |
9763 | Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, Inst: {}, Name: "fr" ); |
9764 | VPI->setOperand(I: 1, New: Freeze); |
9765 | if (UpdateResumePhis) |
9766 | OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) { |
9767 | return Freeze != &U && isa<VPPhi>(Val: &U); |
9768 | }); |
9769 | } |
9770 | }; |
9771 | AddFreezeForFindLastIVReductions(MainPlan, true); |
9772 | AddFreezeForFindLastIVReductions(EpiPlan, false); |
9773 | |
9774 | VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); |
9775 | VPValue *VectorTC = &MainPlan.getVectorTripCount(); |
9776 | // If there is a suitable resume value for the canonical induction in the |
9777 | // scalar (which will become vector) epilogue loop we are done. Otherwise |
9778 | // create it below. |
9779 | if (any_of(Range&: *MainScalarPH, P: [VectorTC](VPRecipeBase &R) { |
9780 | return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Op0: m_Specific(VPV: VectorTC), |
9781 | Op1: m_SpecificInt(V: 0))); |
9782 | })) |
9783 | return; |
9784 | VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin()); |
9785 | ScalarPHBuilder.createScalarPhi( |
9786 | IncomingValues: {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, DL: {}, |
9787 | Name: "vec.epilog.resume.val" ); |
9788 | } |
9789 | |
9790 | /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded |
9791 | /// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. |
9792 | static void |
9793 | preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, |
9794 | const SCEV2ValueTy &ExpandedSCEVs, |
9795 | const EpilogueLoopVectorizationInfo &EPI) { |
9796 | VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); |
  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9798 | Header->setName("vec.epilog.vector.body" ); |
9799 | |
9800 | DenseMap<Value *, Value *> ToFrozen; |
9801 | // Ensure that the start values for all header phi recipes are updated before |
9802 | // vectorizing the epilogue loop. |
9803 | for (VPRecipeBase &R : Header->phis()) { |
9804 | if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &R)) { |
9805 | // When vectorizing the epilogue loop, the canonical induction start |
9806 | // value needs to be changed from zero to the value after the main |
9807 | // vector loop. Find the resume value created during execution of the main |
9808 | // VPlan. |
9809 | // FIXME: Improve modeling for canonical IV start values in the epilogue |
9810 | // loop. |
9811 | using namespace llvm::PatternMatch; |
9812 | Type *IdxTy = IV->getScalarType(); |
9813 | PHINode *EPResumeVal = find_singleton<PHINode>( |
9814 | Range: L->getLoopPreheader()->phis(), |
9815 | P: [&EPI, IdxTy](PHINode &P, bool) -> PHINode * { |
9816 | if (P.getType() == IdxTy && |
9817 | match( |
9818 | V: P.getIncomingValueForBlock(BB: EPI.MainLoopIterationCountCheck), |
9819 | P: m_SpecificInt(V: 0)) && |
9820 | all_of(Range: P.incoming_values(), P: [&EPI](Value *Inc) { |
9821 | return Inc == EPI.VectorTripCount || |
9822 | match(V: Inc, P: m_SpecificInt(V: 0)); |
9823 | })) |
9824 | return &P; |
9825 | return nullptr; |
9826 | }); |
9827 | assert(EPResumeVal && "must have a resume value for the canonical IV" ); |
9828 | VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal); |
9829 | assert(all_of(IV->users(), |
9830 | [](const VPUser *U) { |
9831 | return isa<VPScalarIVStepsRecipe>(U) || |
9832 | isa<VPDerivedIVRecipe>(U) || |
9833 | cast<VPRecipeBase>(U)->isScalarCast() || |
9834 | cast<VPInstruction>(U)->getOpcode() == |
9835 | Instruction::Add; |
9836 | }) && |
9837 | "the canonical IV should only be used by its increment or " |
9838 | "ScalarIVSteps when resetting the start value" ); |
9839 | IV->setOperand(I: 0, New: VPV); |
9840 | continue; |
9841 | } |
9842 | |
9843 | Value *ResumeV = nullptr; |
9844 | // TODO: Move setting of resume values to prepareToExecute. |
9845 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) { |
9846 | auto *RdxResult = |
9847 | cast<VPInstruction>(Val: *find_if(Range: ReductionPhi->users(), P: [](VPUser *U) { |
9848 | auto *VPI = dyn_cast<VPInstruction>(Val: U); |
9849 | return VPI && |
9850 | (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult || |
9851 | VPI->getOpcode() == VPInstruction::ComputeReductionResult || |
9852 | VPI->getOpcode() == VPInstruction::ComputeFindIVResult); |
9853 | })); |
9854 | ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr()) |
9855 | ->getIncomingValueForBlock(BB: L->getLoopPreheader()); |
9856 | const RecurrenceDescriptor &RdxDesc = |
9857 | ReductionPhi->getRecurrenceDescriptor(); |
9858 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
9859 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) { |
9860 | Value *StartV = RdxResult->getOperand(N: 1)->getLiveInIRValue(); |
9861 | assert(RdxDesc.getRecurrenceStartValue() == StartV && |
9862 | "start value from ComputeAnyOfResult must match" ); |
9863 | |
9864 | // VPReductionPHIRecipes for AnyOf reductions expect a boolean as |
9865 | // start value; compare the final value from the main vector loop |
9866 | // to the start value. |
9867 | BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent(); |
9868 | IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt()); |
9869 | ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV); |
9870 | } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK)) { |
9871 | Value *StartV = getStartValueFromReductionResult(RdxResult); |
9872 | assert(RdxDesc.getRecurrenceStartValue() == StartV && |
9873 | "start value from ComputeFinIVResult must match" ); |
9874 | |
9875 | ToFrozen[StartV] = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock( |
9876 | BB: EPI.MainLoopIterationCountCheck); |
9877 | |
9878 | // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires |
9879 | // an adjustment to the resume value. The resume value is adjusted to |
9880 | // the sentinel value when the final value from the main vector loop |
9881 | // equals the start value. This ensures correctness when the start value |
9882 | // might not be less than the minimum value of a monotonically |
9883 | // increasing induction variable. |
9884 | BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent(); |
9885 | IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt()); |
9886 | Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: ToFrozen[StartV]); |
9887 | Value *Sentinel = RdxResult->getOperand(N: 2)->getLiveInIRValue(); |
9888 | ResumeV = Builder.CreateSelect(C: Cmp, True: Sentinel, False: ResumeV); |
9889 | } else { |
9890 | VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV); |
9891 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
9892 | if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) { |
9893 | assert(VPI->getOpcode() == VPInstruction::ReductionStartVector && |
9894 | "unexpected start value" ); |
9895 | VPI->setOperand(I: 0, New: StartVal); |
9896 | continue; |
9897 | } |
9898 | } |
9899 | } else { |
9900 | // Retrieve the induction resume values for wide inductions from |
9901 | // their original phi nodes in the scalar loop. |
9902 | PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode(); |
9903 | // Hook up to the PHINode generated by a ResumePhi recipe of main |
9904 | // loop VPlan, which feeds the scalar loop. |
9905 | ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader()); |
9906 | } |
9907 | assert(ResumeV && "Must have a resume value" ); |
9908 | VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV); |
9909 | cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal); |
9910 | } |
9911 | |
9912 | // For some VPValues in the epilogue plan we must re-use the generated IR |
9913 | // values from the main plan. Replace them with live-in VPValues. |
9914 | // TODO: This is a workaround needed for epilogue vectorization and it |
9915 | // should be removed once induction resume value creation is done |
9916 | // directly in VPlan. |
9917 | for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) { |
9918 | // Re-use frozen values from the main plan for Freeze VPInstructions in the |
9919 | // epilogue plan. This ensures all users use the same frozen value. |
9920 | auto *VPI = dyn_cast<VPInstruction>(Val: &R); |
9921 | if (VPI && VPI->getOpcode() == Instruction::Freeze) { |
9922 | VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn( |
9923 | V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue()))); |
9924 | continue; |
9925 | } |
9926 | |
    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs them as values that dominate both the scalar
    // and vector epilogue loops.
9930 | auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R); |
9931 | if (!ExpandR) |
9932 | continue; |
9933 | VPValue *ExpandedVal = |
9934 | Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV())); |
9935 | ExpandR->replaceAllUsesWith(New: ExpandedVal); |
9936 | if (Plan.getTripCount() == ExpandR) |
9937 | Plan.resetTripCount(NewTripCount: ExpandedVal); |
9938 | ExpandR->eraseFromParent(); |
9939 | } |
9940 | } |
9941 | |
// Generate bypass values from the additional bypass block. Note that when the
// vectorized epilogue is skipped due to its iteration count check, the resume
// value for the induction variable comes from the trip count of the main
// vector loop, passed in as MainVectorTripCount.
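// For example (illustrative), for a non-primary integer induction
//   %j = phi i64 [ 7, %preheader ], [ %j.next, %latch ]   ; step of 3
// the additional bypass value is computed below via emitTransformedIndex as
//   ind.end = 7 + MainVectorTripCount * 3
// whereas the primary induction can reuse MainVectorTripCount directly.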
9946 | static Value *createInductionAdditionalBypassValues( |
9947 | PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder, |
9948 | const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount, |
9949 | Instruction *OldInduction) { |
9950 | Value *Step = getExpandedStep(ID: II, ExpandedSCEVs); |
9951 | // For the primary induction the additional bypass end value is known. |
9952 | // Otherwise it is computed. |
9953 | Value *EndValueFromAdditionalBypass = MainVectorTripCount; |
9954 | if (OrigPhi != OldInduction) { |
9955 | auto *BinOp = II.getInductionBinOp(); |
9956 | // Fast-math-flags propagate from the original induction instruction. |
9957 | if (isa_and_nonnull<FPMathOperator>(Val: BinOp)) |
9958 | BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags()); |
9959 | |
9960 | // Compute the end value for the additional bypass. |
9961 | EndValueFromAdditionalBypass = |
9962 | emitTransformedIndex(B&: BypassBuilder, Index: MainVectorTripCount, |
9963 | StartValue: II.getStartValue(), Step, InductionKind: II.getKind(), InductionBinOp: BinOp); |
9964 | EndValueFromAdditionalBypass->setName("ind.end" ); |
9965 | } |
9966 | return EndValueFromAdditionalBypass; |
9967 | } |
9968 | |
9969 | bool LoopVectorizePass::processLoop(Loop *L) { |
9970 | assert((EnableVPlanNativePath || L->isInnermost()) && |
9971 | "VPlan-native path is not enabled. Only process inner loops." ); |
9972 | |
9973 | LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" |
9974 | << L->getHeader()->getParent()->getName() << "' from " |
9975 | << L->getLocStr() << "\n" ); |
9976 | |
9977 | LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); |
9978 | |
9979 | LLVM_DEBUG( |
9980 | dbgs() << "LV: Loop hints:" |
9981 | << " force=" |
9982 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled |
9983 | ? "disabled" |
9984 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled |
9985 | ? "enabled" |
9986 | : "?" )) |
9987 | << " width=" << Hints.getWidth() |
9988 | << " interleave=" << Hints.getInterleave() << "\n" ); |
9989 | |
9990 | // Function containing loop |
9991 | Function *F = L->getHeader()->getParent(); |
9992 | |
9993 | // Looking at the diagnostic output is the only way to determine if a loop |
9994 | // was vectorized (other than looking at the IR or machine code), so it |
9995 | // is important to generate an optimization remark for each loop. Most of |
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
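  // (With clang, these remarks can be surfaced via -Rpass=loop-vectorize,
  // -Rpass-missed=loop-vectorize and -Rpass-analysis=loop-vectorize.)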
10000 | |
10001 | if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { |
10002 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n" ); |
10003 | return false; |
10004 | } |
10005 | |
10006 | PredicatedScalarEvolution PSE(*SE, *L); |
10007 | |
10008 | // Check if it is legal to vectorize the loop. |
10009 | LoopVectorizationRequirements Requirements; |
10010 | LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, |
10011 | &Requirements, &Hints, DB, AC, BFI, PSI); |
10012 | if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) { |
10013 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n" ); |
10014 | Hints.emitRemarkWithHints(); |
10015 | return false; |
10016 | } |
10017 | |
10018 | if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) { |
10019 | reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable " |
10020 | "early exit is not enabled" , |
10021 | ORETag: "UncountableEarlyExitLoopsDisabled" , ORE, TheLoop: L); |
10022 | return false; |
10023 | } |
10024 | |
10025 | // Entrance to the VPlan-native vectorization path. Outer loops are processed |
10026 | // here. They may require CFG and instruction level transformations before |
10027 | // even evaluating whether vectorization is profitable. Since we cannot modify |
10028 | // the incoming IR, we need to build VPlan upfront in the vectorization |
10029 | // pipeline. |
10030 | if (!L->isInnermost()) |
10031 | return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC, |
10032 | ORE, BFI, PSI, Hints, Requirements); |
10033 | |
10034 | assert(L->isInnermost() && "Inner loop expected." ); |
10035 | |
10036 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); |
10037 | bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); |
10038 | |
10039 | // If an override option has been passed in for interleaved accesses, use it. |
10040 | if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) |
10041 | UseInterleaved = EnableInterleavedMemAccesses; |
10042 | |
10043 | // Analyze interleaved memory accesses. |
10044 | if (UseInterleaved) |
10045 | IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI)); |
10046 | |
10047 | if (LVL.hasUncountableEarlyExit()) { |
10048 | BasicBlock *LoopLatch = L->getLoopLatch(); |
10049 | if (IAI.requiresScalarEpilogue() || |
10050 | any_of(Range: LVL.getCountableExitingBlocks(), |
10051 | P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) { |
10052 | reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops " |
10053 | "requiring a scalar epilogue is unsupported" , |
10054 | ORETag: "UncountableEarlyExitUnsupported" , ORE, TheLoop: L); |
10055 | return false; |
10056 | } |
10057 | } |
10058 | |
10059 | // Check the function attributes and profiles to find out if this function |
10060 | // should be optimized for size. |
10061 | ScalarEpilogueLowering SEL = |
10062 | getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI); |
10063 | |
10064 | // Check the loop for a trip count threshold: vectorize loops with a tiny trip |
10065 | // count by optimizing for size, to minimize overheads. |
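  // For example, if profile data or SCEV shows that the loop runs only a
  // handful of iterations (below TinyTripCountVectorThreshold) and
  // vectorization was not explicitly forced, the policy is tightened below so
  // that any vector loop must cover the entire trip count without a scalar
  // remainder loop.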
10066 | auto ExpectedTC = getSmallBestKnownTC(PSE, L); |
10067 | if (ExpectedTC && ExpectedTC->isFixed() && |
10068 | ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) { |
10069 | LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " |
10070 | << "This loop is worth vectorizing only if no scalar " |
10071 | << "iteration overheads are incurred." ); |
10072 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
10073 | LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n" ); |
10074 | else { |
10075 | LLVM_DEBUG(dbgs() << "\n" ); |
      // Predicated, tail-folded loops are efficient even when the loop
10077 | // iteration count is low. However, setting the epilogue policy to |
10078 | // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops |
10079 | // with runtime checks. It's more effective to let |
10080 | // `isOutsideLoopWorkProfitable` determine if vectorization is |
10081 | // beneficial for the loop. |
10082 | if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) |
10083 | SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; |
10084 | } |
10085 | } |
10086 | |
10087 | // Check the function attributes to see if implicit floats or vectors are |
10088 | // allowed. |
10089 | if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) { |
10090 | reportVectorizationFailure( |
10091 | DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used" , |
10092 | OREMsg: "loop not vectorized due to NoImplicitFloat attribute" , |
10093 | ORETag: "NoImplicitFloat" , ORE, TheLoop: L); |
10094 | Hints.emitRemarkWithHints(); |
10095 | return false; |
10096 | } |
10097 | |
10098 | // Check if the target supports potentially unsafe FP vectorization. |
10099 | // FIXME: Add a check for the type of safety issue (denormal, signaling) |
10100 | // for the target we're vectorizing for, to make sure none of the |
10101 | // additional fp-math flags can help. |
10102 | if (Hints.isPotentiallyUnsafe() && |
10103 | TTI->isFPVectorizationPotentiallyUnsafe()) { |
10104 | reportVectorizationFailure( |
10105 | DebugMsg: "Potentially unsafe FP op prevents vectorization" , |
10106 | OREMsg: "loop not vectorized due to unsafe FP support." , |
10107 | ORETag: "UnsafeFP" , ORE, TheLoop: L); |
10108 | Hints.emitRemarkWithHints(); |
10109 | return false; |
10110 | } |
10111 | |
10112 | bool AllowOrderedReductions; |
10113 | // If the flag is set, use that instead and override the TTI behaviour. |
10114 | if (ForceOrderedReductions.getNumOccurrences() > 0) |
10115 | AllowOrderedReductions = ForceOrderedReductions; |
10116 | else |
10117 | AllowOrderedReductions = TTI->enableOrderedReductions(); |
10118 | if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) { |
10119 | ORE->emit(RemarkBuilder: [&]() { |
10120 | auto *ExactFPMathInst = Requirements.getExactFPInst(); |
10121 | return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps" , |
10122 | ExactFPMathInst->getDebugLoc(), |
10123 | ExactFPMathInst->getParent()) |
10124 | << "loop not vectorized: cannot prove it is safe to reorder " |
10125 | "floating-point operations" ; |
10126 | }); |
10127 | LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " |
10128 | "reorder floating-point operations\n" ); |
10129 | Hints.emitRemarkWithHints(); |
10130 | return false; |
10131 | } |
10132 | |
10133 | // Use the cost model. |
10134 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, |
10135 | F, &Hints, IAI, PSI, BFI); |
10136 | // Use the planner for vectorization. |
10137 | LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, |
10138 | ORE); |
10139 | |
10140 | // Get user vectorization factor and interleave count. |
10141 | ElementCount UserVF = Hints.getWidth(); |
10142 | unsigned UserIC = Hints.getInterleave(); |
10143 | if (LVL.hasUncountableEarlyExit() && UserIC != 1 && |
10144 | !VectorizerParams::isInterleaveForced()) { |
10145 | UserIC = 1; |
10146 | reportVectorizationInfo(Msg: "Interleaving not supported for loops " |
10147 | "with uncountable early exits" , |
10148 | ORETag: "InterleaveEarlyExitDisabled" , ORE, TheLoop: L); |
10149 | } |
10150 | |
10151 | // Plan how to best vectorize. |
10152 | LVP.plan(UserVF, UserIC); |
10153 | VectorizationFactor VF = LVP.computeBestVF(); |
10154 | unsigned IC = 1; |
10155 | |
10156 | if (ORE->allowExtraAnalysis(LV_NAME)) |
10157 | LVP.emitInvalidCostRemarks(ORE); |
10158 | |
10159 | bool AddBranchWeights = |
10160 | hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator()); |
10161 | GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), |
10162 | AddBranchWeights, CM.CostKind); |
10163 | if (LVP.hasPlanWithVF(VF: VF.Width)) { |
10164 | // Select the interleave count. |
10165 | IC = CM.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost); |
10166 | |
10167 | unsigned SelectedIC = std::max(a: IC, b: UserIC); |
10168 | // Optimistically generate runtime checks if they are needed. Drop them if |
10169 | // they turn out to not be profitable. |
10170 | if (VF.Width.isVector() || SelectedIC > 1) |
10171 | Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC); |
10172 | |
10173 | // Check if it is profitable to vectorize with runtime checks. |
10174 | bool ForceVectorization = |
10175 | Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
10176 | VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), |
10177 | CM, CM.CostKind); |
10178 | if (!ForceVectorization && |
10179 | !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, |
10180 | Plan&: LVP.getPlanFor(VF: VF.Width), SEL, |
10181 | VScale: CM.getVScaleForTuning())) { |
10182 | ORE->emit(RemarkBuilder: [&]() { |
10183 | return OptimizationRemarkAnalysisAliasing( |
10184 | DEBUG_TYPE, "CantReorderMemOps" , L->getStartLoc(), |
10185 | L->getHeader()) |
10186 | << "loop not vectorized: cannot prove it is safe to reorder " |
10187 | "memory operations" ; |
10188 | }); |
10189 | LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n" ); |
10190 | Hints.emitRemarkWithHints(); |
10191 | return false; |
10192 | } |
10193 | } |
10194 | |
10195 | // Identify the diagnostic messages that should be produced. |
10196 | std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; |
10197 | bool VectorizeLoop = true, InterleaveLoop = true; |
10198 | if (VF.Width.isScalar()) { |
10199 | LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n" ); |
10200 | VecDiagMsg = { |
10201 | "VectorizationNotBeneficial" , |
10202 | "the cost-model indicates that vectorization is not beneficial" }; |
10203 | VectorizeLoop = false; |
10204 | } |
10205 | |
10206 | if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) { |
10207 | // Tell the user interleaving was avoided up-front, despite being explicitly |
10208 | // requested. |
10209 | LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " |
10210 | "interleaving should be avoided up front\n" ); |
10211 | IntDiagMsg = {"InterleavingAvoided" , |
10212 | "Ignoring UserIC, because interleaving was avoided up front" }; |
10213 | InterleaveLoop = false; |
10214 | } else if (IC == 1 && UserIC <= 1) { |
10215 | // Tell the user interleaving is not beneficial. |
10216 | LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n" ); |
10217 | IntDiagMsg = { |
10218 | "InterleavingNotBeneficial" , |
10219 | "the cost-model indicates that interleaving is not beneficial" }; |
10220 | InterleaveLoop = false; |
10221 | if (UserIC == 1) { |
10222 | IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled" ; |
10223 | IntDiagMsg.second += |
10224 | " and is explicitly disabled or interleave count is set to 1" ; |
10225 | } |
10226 | } else if (IC > 1 && UserIC == 1) { |
    // Tell the user interleaving is beneficial but is explicitly disabled.
10228 | LLVM_DEBUG( |
10229 | dbgs() << "LV: Interleaving is beneficial but is explicitly disabled." ); |
10230 | IntDiagMsg = {"InterleavingBeneficialButDisabled" , |
10231 | "the cost-model indicates that interleaving is beneficial " |
10232 | "but is explicitly disabled or interleave count is set to 1" }; |
10233 | InterleaveLoop = false; |
10234 | } |
10235 | |
10236 | // If there is a histogram in the loop, do not just interleave without |
10237 | // vectorizing. The order of operations will be incorrect without the |
10238 | // histogram intrinsics, which are only used for recipes with VF > 1. |
10239 | if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) { |
10240 | LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due " |
10241 | << "to histogram operations.\n" ); |
10242 | IntDiagMsg = { |
10243 | "HistogramPreventsScalarInterleaving" , |
10244 | "Unable to interleave without vectorization due to constraints on " |
10245 | "the order of histogram operations" }; |
10246 | InterleaveLoop = false; |
10247 | } |
10248 | |
10249 | // Override IC if user provided an interleave count. |
10250 | IC = UserIC > 0 ? UserIC : IC; |
10251 | |
10252 | // Emit diagnostic messages, if any. |
10253 | const char *VAPassName = Hints.vectorizeAnalysisPassName(); |
10254 | if (!VectorizeLoop && !InterleaveLoop) { |
    // Do not vectorize or interleave the loop.
10256 | ORE->emit(RemarkBuilder: [&]() { |
10257 | return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, |
10258 | L->getStartLoc(), L->getHeader()) |
10259 | << VecDiagMsg.second; |
10260 | }); |
10261 | ORE->emit(RemarkBuilder: [&]() { |
10262 | return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, |
10263 | L->getStartLoc(), L->getHeader()) |
10264 | << IntDiagMsg.second; |
10265 | }); |
10266 | return false; |
10267 | } |
10268 | |
10269 | if (!VectorizeLoop && InterleaveLoop) { |
10270 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
10271 | ORE->emit(RemarkBuilder: [&]() { |
10272 | return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, |
10273 | L->getStartLoc(), L->getHeader()) |
10274 | << VecDiagMsg.second; |
10275 | }); |
10276 | } else if (VectorizeLoop && !InterleaveLoop) { |
10277 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
10278 | << ") in " << L->getLocStr() << '\n'); |
10279 | ORE->emit(RemarkBuilder: [&]() { |
10280 | return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, |
10281 | L->getStartLoc(), L->getHeader()) |
10282 | << IntDiagMsg.second; |
10283 | }); |
10284 | } else if (VectorizeLoop && InterleaveLoop) { |
10285 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
10286 | << ") in " << L->getLocStr() << '\n'); |
10287 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
10288 | } |
10289 | |
10290 | bool DisableRuntimeUnroll = false; |
10291 | MDNode *OrigLoopID = L->getLoopID(); |
10292 | { |
10293 | using namespace ore; |
10294 | if (!VectorizeLoop) { |
10295 | assert(IC > 1 && "interleave count should not be 1 or 0" ); |
10296 | // If we decided that it is not legal to vectorize the loop, then |
10297 | // interleave it. |
10298 | VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width); |
10299 | InnerLoopVectorizer Unroller( |
10300 | L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(MinVal: 1), |
10301 | ElementCount::getFixed(MinVal: 1), IC, &CM, BFI, PSI, Checks, BestPlan); |
10302 | |
10303 | LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, VectorizingEpilogue: false); |
10304 | |
10305 | ORE->emit(RemarkBuilder: [&]() { |
10306 | return OptimizationRemark(LV_NAME, "Interleaved" , L->getStartLoc(), |
10307 | L->getHeader()) |
10308 | << "interleaved loop (interleaved count: " |
10309 | << NV("InterleaveCount" , IC) << ")" ; |
10310 | }); |
10311 | } else { |
10312 | // If we decided that it is *legal* to vectorize the loop, then do it. |
10313 | |
10314 | VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width); |
10315 | // Consider vectorizing the epilogue too if it's profitable. |
10316 | VectorizationFactor EpilogueVF = |
10317 | LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC); |
10318 | if (EpilogueVF.Width.isVector()) { |
10319 | std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate()); |
10320 | |
10321 | // The first pass vectorizes the main loop and creates a scalar epilogue |
10322 | // to be vectorized by executing the plan (potentially with a different |
10323 | // factor) again shortly afterwards. |
10324 | VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width); |
10325 | BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block" ); |
10326 | preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan); |
10327 | EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, |
10328 | BestEpiPlan); |
10329 | EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, |
10330 | EPI, &CM, BFI, PSI, Checks, |
10331 | *BestMainPlan); |
10332 | auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, |
10333 | BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false); |
10334 | ++LoopsVectorized; |
10335 | |
10336 | // Second pass vectorizes the epilogue and adjusts the control flow |
10337 | // edges from the first pass. |
10338 | EPI.MainLoopVF = EPI.EpilogueVF; |
10339 | EPI.MainLoopUF = EPI.EpilogueUF; |
10340 | EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, |
10341 | ORE, EPI, &CM, BFI, PSI, |
10342 | Checks, BestEpiPlan); |
10343 | EpilogILV.setTripCount(MainILV.getTripCount()); |
10344 | preparePlanForEpilogueVectorLoop(Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI); |
10345 | |
10346 | LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, |
10347 | DT, VectorizingEpilogue: true); |
10348 | |
10349 | // Fix induction resume values from the additional bypass block. |
10350 | BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock(); |
10351 | IRBuilder<> BypassBuilder(BypassBlock, |
10352 | BypassBlock->getFirstInsertionPt()); |
10353 | BasicBlock *PH = L->getLoopPreheader(); |
10354 | for (const auto &[IVPhi, II] : LVL.getInductionVars()) { |
10355 | auto *Inc = cast<PHINode>(Val: IVPhi->getIncomingValueForBlock(BB: PH)); |
10356 | Value *V = createInductionAdditionalBypassValues( |
10357 | OrigPhi: IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount: EPI.VectorTripCount, |
10358 | OldInduction: LVL.getPrimaryInduction()); |
10359 | // TODO: Directly add as extra operand to the VPResumePHI recipe. |
10360 | Inc->setIncomingValueForBlock(BB: BypassBlock, V); |
10361 | } |
10362 | ++LoopsEpilogueVectorized; |
10363 | |
10364 | if (!Checks.hasChecks()) |
10365 | DisableRuntimeUnroll = true; |
10366 | } else { |
10367 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, |
10368 | VF.MinProfitableTripCount, IC, &CM, BFI, PSI, |
10369 | Checks, BestPlan); |
10370 | LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false); |
10371 | ++LoopsVectorized; |
10372 | |
        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar
        // loop that is rarely used is not worth unrolling.
10376 | if (!Checks.hasChecks()) |
10377 | DisableRuntimeUnroll = true; |
10378 | } |
10379 | // Report the vectorization decision. |
10380 | reportVectorization(ORE, TheLoop: L, VF, IC); |
10381 | } |
10382 | |
10383 | if (ORE->allowExtraAnalysis(LV_NAME)) |
10384 | checkMixedPrecision(L, ORE); |
10385 | } |
10386 | |
10387 | assert(DT->verify(DominatorTree::VerificationLevel::Fast) && |
10388 | "DT not preserved correctly" ); |
10389 | |
10390 | std::optional<MDNode *> RemainderLoopID = |
10391 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
10392 | LLVMLoopVectorizeFollowupEpilogue}); |
10393 | if (RemainderLoopID) { |
10394 | L->setLoopID(*RemainderLoopID); |
10395 | } else { |
10396 | if (DisableRuntimeUnroll) |
10397 | addRuntimeUnrollDisableMetaData(L); |
10398 | |
10399 | // Mark the loop as already vectorized to avoid vectorizing again. |
10400 | Hints.setAlreadyVectorized(); |
10401 | } |
10402 | |
10403 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); |
10404 | return true; |
10405 | } |
10406 | |
10407 | LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) { |
10408 | |
10409 | // Don't attempt if |
10410 | // 1. the target claims to have no vector registers, and |
10411 | // 2. interleaving won't help ILP. |
10412 | // |
10413 | // The second condition is necessary because, even if the target has no |
10414 | // vector registers, loop vectorization may still enable scalar |
10415 | // interleaving. |
10416 | if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) && |
10417 | TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2) |
10418 | return LoopVectorizeResult(false, false); |
10419 | |
10420 | bool Changed = false, CFGChanged = false; |
10421 | |
10422 | // The vectorizer requires loops to be in simplified form. |
10423 | // Since simplification may add new inner loops, it has to run before the |
10424 | // legality and profitability checks. This means running the loop vectorizer |
  // will simplify all loops, regardless of whether anything ends up being
10426 | // vectorized. |
10427 | for (const auto &L : *LI) |
10428 | Changed |= CFGChanged |= |
10429 | simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
10430 | |
10431 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
10432 | // the act of vectorizing or partially unrolling a loop creates new loops |
10433 | // and can invalidate iterators across the loops. |
10434 | SmallVector<Loop *, 8> Worklist; |
10435 | |
10436 | for (Loop *L : *LI) |
10437 | collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist); |
10438 | |
10439 | LoopsAnalyzed += Worklist.size(); |
10440 | |
10441 | // Now walk the identified inner loops. |
10442 | while (!Worklist.empty()) { |
10443 | Loop *L = Worklist.pop_back_val(); |
10444 | |
10445 | // For the inner loops we actually process, form LCSSA to simplify the |
10446 | // transform. |
10447 | Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE); |
10448 | |
10449 | Changed |= CFGChanged |= processLoop(L); |
10450 | |
10451 | if (Changed) { |
10452 | LAIs->clear(); |
10453 | |
10454 | #ifndef NDEBUG |
10455 | if (VerifySCEV) |
10456 | SE->verify(); |
10457 | #endif |
10458 | } |
10459 | } |
10460 | |
10461 | // Process each loop nest in the function. |
10462 | return LoopVectorizeResult(Changed, CFGChanged); |
10463 | } |
10464 | |
10465 | PreservedAnalyses LoopVectorizePass::run(Function &F, |
10466 | FunctionAnalysisManager &AM) { |
10467 | LI = &AM.getResult<LoopAnalysis>(IR&: F); |
10468 | // There are no loops in the function. Return before computing other |
10469 | // expensive analyses. |
10470 | if (LI->empty()) |
10471 | return PreservedAnalyses::all(); |
10472 | SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
10473 | TTI = &AM.getResult<TargetIRAnalysis>(IR&: F); |
10474 | DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F); |
10475 | TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F); |
10476 | AC = &AM.getResult<AssumptionAnalysis>(IR&: F); |
10477 | DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F); |
10478 | ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
10479 | LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F); |
10480 | |
10481 | auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); |
10482 | PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
10483 | BFI = nullptr; |
10484 | if (PSI && PSI->hasProfileSummary()) |
10485 | BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F); |
10486 | LoopVectorizeResult Result = runImpl(F); |
10487 | if (!Result.MadeAnyChange) |
10488 | return PreservedAnalyses::all(); |
10489 | PreservedAnalyses PA; |
10490 | |
10491 | if (isAssignmentTrackingEnabled(M: *F.getParent())) { |
10492 | for (auto &BB : F) |
10493 | RemoveRedundantDbgInstrs(BB: &BB); |
10494 | } |
10495 | |
10496 | PA.preserve<LoopAnalysis>(); |
10497 | PA.preserve<DominatorTreeAnalysis>(); |
10498 | PA.preserve<ScalarEvolutionAnalysis>(); |
10499 | PA.preserve<LoopAccessAnalysis>(); |
10500 | |
10501 | if (Result.MadeCFGChange) { |
10502 | // Making CFG changes likely means a loop got vectorized. Indicate that |
10503 | // extra simplification passes should be run. |
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10505 | // be run if runtime checks have been added. |
10506 | AM.getResult<ShouldRunExtraVectorPasses>(IR&: F); |
10507 | PA.preserve<ShouldRunExtraVectorPasses>(); |
10508 | } else { |
10509 | PA.preserveSet<CFGAnalyses>(); |
10510 | } |
10511 | return PA; |
10512 | } |
10513 | |
10514 | void LoopVectorizePass::printPipeline( |
10515 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
10516 | static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( |
10517 | OS, MapClassName2PassName); |
10518 | |
10519 | OS << '<'; |
10520 | OS << (InterleaveOnlyWhenForced ? "" : "no-" ) << "interleave-forced-only;" ; |
10521 | OS << (VectorizeOnlyWhenForced ? "" : "no-" ) << "vectorize-forced-only;" ; |
10522 | OS << '>'; |
10523 | } |
10524 | |