LoopVectorizationLegality.cpp source code [llvm_projects/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp]

1	//===- LoopVectorizationLegality.cpp --------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file provides loop vectorization legality analysis. Original code
10	// resided in LoopVectorize.cpp for a long time.
11	//
12	// At this point, it is implemented as a utility class, not as an analysis
13	// pass. It should be easy to create an analysis pass around it if there
14	// is a need (but D45420 needs to happen first).
15	//
16
17	#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18	#include "llvm/Analysis/Loads.h"
19	#include "llvm/Analysis/LoopInfo.h"
20	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
21	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
22	#include "llvm/Analysis/TargetLibraryInfo.h"
23	#include "llvm/Analysis/TargetTransformInfo.h"
24	#include "llvm/Analysis/ValueTracking.h"
25	#include "llvm/Analysis/VectorUtils.h"
26	#include "llvm/IR/IntrinsicInst.h"
27	#include "llvm/IR/PatternMatch.h"
28	#include "llvm/Transforms/Utils/SizeOpts.h"
29	#include "llvm/Transforms/Vectorize/LoopVectorize.h"
30
31	using namespace llvm;
32	using namespace PatternMatch;
33
34	#define LV_NAME "loop-vectorize"
35	#define DEBUG_TYPE LV_NAME
36
37	static cl::opt<bool>
38	EnableIfConversion("enable-if-conversion", cl::init(Val: true), cl::Hidden,
39	cl::desc ("Enable if-conversion during vectorization."));
40
41	static cl::opt<bool>
42	AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(Val: false), cl::Hidden,
43	cl::desc ("Enable recognition of non-constant strided "
44	"pointer induction variables."));
45
46	static cl::opt<bool>
47	HintsAllowReordering("hints-allow-reordering", cl::init(Val: true), cl::Hidden,
48	cl::desc ("Allow enabling loop hints to reorder "
49	"FP operations during vectorization."));
50
51	// TODO: Move size-based thresholds out of legality checking, make cost based
52	// decisions instead of hard thresholds.
53	static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
54	"vectorize-scev-check-threshold", cl::init(Val: `16`), cl::Hidden,
55	cl::desc ("The maximum number of SCEV checks allowed."));
56
57	static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
58	"pragma-vectorize-scev-check-threshold", cl::init(Val: `128`), cl::Hidden,
59	cl::desc ("The maximum number of SCEV checks allowed with a "
60	"vectorize(enable) pragma"));
61
62	static cl::opt<LoopVectorizeHints::ScalableForceKind>
63	ForceScalableVectorization(
64	"scalable-vectorization", cl::init(Val: LoopVectorizeHints::SK_Unspecified),
65	cl::Hidden,
66	cl::desc ("Control whether the compiler can use scalable vectors to "
67	"vectorize a loop"),
68	cl::values(
69	clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
70	"Scalable vectorization is disabled."),
71	clEnumValN(
72	LoopVectorizeHints::SK_PreferScalable, "preferred",
73	"Scalable vectorization is available and favored when the "
74	"cost is inconclusive."),
75	clEnumValN(
76	LoopVectorizeHints::SK_PreferScalable, "on",
77	"Scalable vectorization is available and favored when the "
78	"cost is inconclusive.")));
79
80	static cl::opt<bool> EnableHistogramVectorization(
81	"enable-histogram-loop-vectorization", cl::init(Val: false), cl::Hidden,
82	cl::desc ("Enables autovectorization of some loops containing histograms"));
83
84	/// Maximum vectorization interleave count.
85	static const unsigned MaxInterleaveFactor = `16`;
86
87	namespace llvm {
88
89	bool LoopVectorizeHints::Hint::validate(unsigned Val) {
90	switch (Kind) {
91	case HK_WIDTH:
92	return isPowerOf2_32(Value: Val) && Val <= VectorizerParams::MaxVectorWidth;
93	case HK_INTERLEAVE:
94	return isPowerOf2_32(Value: Val) && Val <= MaxInterleaveFactor;
95	case HK_FORCE:
96	return (Val <= `1`);
97	case HK_ISVECTORIZED:
98	case HK_PREDICATE:
99	case HK_SCALABLE:
100	return (Val == `0` \|\| Val == `1`);
101	}
102	return false;
103	}
104
105	LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
106	bool InterleaveOnlyWhenForced,
107	OptimizationRemarkEmitter &ORE,
108	const TargetTransformInfo *TTI)
109	: Width ("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
110	Interleave ("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
111	Force ("vectorize.enable", FK_Undefined, HK_FORCE),
112	IsVectorized ("isvectorized", `0`, HK_ISVECTORIZED),
113	Predicate ("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
114	Scalable ("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
115	TheLoop(L), ORE(ORE) {
116	// Populate values with existing loop metadata.
117	getHintsFromMetadata();
118
119	// force-vector-interleave overrides DisableInterleaving.
120	if (VectorizerParams::isInterleaveForced())
121	Interleave.Value = VectorizerParams::VectorizationInterleave;
122
123	// If the metadata doesn't explicitly specify whether to enable scalable
124	// vectorization, then decide based on the following criteria (increasing
125	// level of priority):
126	// - Target default
127	// - Metadata width
128	// - Force option (always overrides)
129	if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
130	if (TTI)
131	Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
132	: SK_FixedWidthOnly;
133
134	if (Width.Value)
135	// If the width is set, but the metadata says nothing about the scalable
136	// property, then assume it concerns only a fixed-width UserVF.
137	// If width is not set, the flag takes precedence.
138	Scalable.Value = SK_FixedWidthOnly;
139	}
140
141	// If the flag is set to force any use of scalable vectors, override the loop
142	// hints.
143	if (ForceScalableVectorization.getValue() !=
144	LoopVectorizeHints::SK_Unspecified)
145	Scalable.Value = ForceScalableVectorization.getValue();
146
147	// Scalable vectorization is disabled if no preference is specified.
148	if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
149	Scalable.Value = SK_FixedWidthOnly;
150
151	if (IsVectorized.Value != `1`)
152	// If the vectorization width and interleaving count are both 1 then
153	// consider the loop to have been already vectorized because there's
154	// nothing more that we can do.
155	IsVectorized.Value =
156	getWidth() == ElementCount::getFixed(MinVal: `1`) && getInterleave() == `1`;
157	LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == `1`) dbgs()
158	<< "LV: Interleaving disabled by the pass manager\n");
159	}
160
161	void LoopVectorizeHints::setAlreadyVectorized() {
162	LLVMContext &Context = TheLoop->getHeader()->getContext();
163
164	MDNode *IsVectorizedMD = MDNode::get(
165	Context,
166	MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized"),
167	ConstantAsMetadata::get(C: ConstantInt::get(Context, V: APInt (`32`, `1`)))});
168	MDNode *LoopID = TheLoop->getLoopID();
169	MDNode *NewLoopID =
170	makePostTransformationMetadata(Context, OrigLoopID: LoopID,
171	RemovePrefixes: {Twine (Prefix(), "vectorize.").str(),
172	Twine (Prefix(), "interleave.").str()},
173	AddAttrs: {IsVectorizedMD});
174	TheLoop->setLoopID(NewLoopID);
175
176	// Update internal cache.
177	IsVectorized.Value = `1`;
178	}
179
180	bool LoopVectorizeHints::allowVectorization(
181	Function F, Loop L, bool VectorizeOnlyWhenForced) const {
182	if (getForce() == LoopVectorizeHints::FK_Disabled) {
183	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
184	emitRemarkWithHints();
185	return false;
186	}
187
188	if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
189	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
190	emitRemarkWithHints();
191	return false;
192	}
193
194	if (getIsVectorized() == `1`) {
195	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
196	// FIXME: Add interleave.disable metadata. This will allow
197	// vectorize.disable to be used without disabling the pass and errors
198	// to differentiate between disabled vectorization and a width of 1.
199	ORE.emit(RemarkBuilder: [&]() {
200	return OptimizationRemarkAnalysis (vectorizeAnalysisPassName(),
201	"AllDisabled", L->getStartLoc(),
202	L->getHeader())
203	<< "loop not vectorized: vectorization and interleaving are "
204	"explicitly disabled, or the loop has already been "
205	"vectorized";
206	});
207	return false;
208	}
209
210	return true;
211	}
212
213	void LoopVectorizeHints::emitRemarkWithHints() const {
214	using namespace ore;
215
216	ORE.emit(RemarkBuilder: [&]() {
217	if (Force.Value == LoopVectorizeHints::FK_Disabled)
218	return OptimizationRemarkMissed (LV_NAME, "MissedExplicitlyDisabled",
219	TheLoop->getStartLoc(),
220	TheLoop->getHeader())
221	<< "loop not vectorized: vectorization is explicitly disabled";
222
223	OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
224	TheLoop->getHeader());
225	R << "loop not vectorized";
226	if (Force.Value == LoopVectorizeHints::FK_Enabled) {
227	R << " (Force=" << NV ("Force", true);
228	if (Width.Value != `0`)
229	R << ", Vector Width=" << NV ("VectorWidth", getWidth());
230	if (getInterleave() != `0`)
231	R << ", Interleave Count=" << NV ("InterleaveCount", getInterleave());
232	R << ")";
233	}
234	return R;
235	});
236	}
237
238	const char LoopVectorizeHints::vectorizeAnalysisPassName() const* {
239	if (getWidth() == ElementCount::getFixed(MinVal: `1`))
240	return LV_NAME;
241	if (getForce() == LoopVectorizeHints::FK_Disabled)
242	return LV_NAME;
243	if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
244	return LV_NAME;
245	return OptimizationRemarkAnalysis::AlwaysPrint;
246	}
247
248	bool LoopVectorizeHints::allowReordering() const {
249	// Allow the vectorizer to change the order of operations if enabling
250	// loop hints are provided
251	ElementCount EC = getWidth();
252	return HintsAllowReordering &&
253	(getForce() == LoopVectorizeHints::FK_Enabled \|\|
254	EC.getKnownMinValue() > `1`);
255	}
256
257	void LoopVectorizeHints::getHintsFromMetadata() {
258	MDNode *LoopID = TheLoop->getLoopID();
259	if (!LoopID)
260	return;
261
262	// First operand should refer to the loop id itself.
263	assert(LoopID->getNumOperands() > `0` && "requires at least one operand");
264	assert(LoopID->getOperand(`0`) == LoopID && "invalid loop id");
265
266	for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
267	const MDString S = nullptr*;
268	SmallVector<Metadata *, `4`> Args;
269
270	// The expected hint is either a MDString or a MDNode with the first
271	// operand a MDString.
272	if (const MDNode *MD = dyn_cast<MDNode>(Val: MDO)) {
273	if (!MD \|\| MD->getNumOperands() == `0`)
274	continue;
275	S = dyn_cast<MDString>(Val: MD->getOperand(I: `0`));
276	for (unsigned Idx = `1`; Idx < MD->getNumOperands(); ++Idx)
277	Args.push_back(Elt: MD->getOperand(I: Idx));
278	} else {
279	S = dyn_cast<MDString>(Val: MDO);
280	assert(Args.size() == `0` && "too many arguments for MDString");
281	}
282
283	if (!S)
284	continue;
285
286	// Check if the hint starts with the loop metadata prefix.
287	StringRef Name = S->getString();
288	if (Args.size() == `1`)
289	setHint(Name, Arg: Args [`0`]);
290	}
291	}
292
293	void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
294	if (!Name.starts_with(Prefix: Prefix()))
295	return;
296	Name = Name.substr(Start: Prefix().size(), N: StringRef::npos);
297
298	const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(MD&: Arg);
299	if (!C)
300	return;
301	unsigned Val = C->getZExtValue();
302
303	Hint *Hints[] = {&Width, &Interleave, &Force,
304	&IsVectorized, &Predicate, &Scalable};
305	for (auto *H : Hints) {
306	if (Name == H->Name) {
307	if (H->validate(Val))
308	H->Value = Val;
309	else
310	LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
311	break;
312	}
313	}
314	}
315
316	// Return true if the inner loop \p Lp is uniform with regard to the outer loop
317	// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
318	// executing the inner loop will execute the same iterations). This check is
319	// very constrained for now but it will be relaxed in the future. \p Lp is
320	// considered uniform if it meets all the following conditions:
321	// 1) it has a canonical IV (starting from 0 and with stride 1),
322	// 2) its latch terminator is a conditional branch and,
323	// 3) its latch condition is a compare instruction whose operands are the
324	// canonical IV and an OuterLp invariant.
325	// This check doesn't take into account the uniformity of other conditions not
326	// related to the loop latch because they don't affect the loop uniformity.
327	//
328	// NOTE: We decided to keep all these checks and its associated documentation
329	// together so that we can easily have a picture of the current supported loop
330	// nests. However, some of the current checks don't depend on \p OuterLp and
331	// would be redundantly executed for each \p Lp if we invoked this function for
332	// different candidate outer loops. This is not the case for now because we
333	// don't currently have the infrastructure to evaluate multiple candidate outer
334	// loops and \p OuterLp will be a fixed parameter while we only support explicit
335	// outer loop vectorization. It's also very likely that these checks go away
336	// before introducing the aforementioned infrastructure. However, if this is not
337	// the case, we should move the \p OuterLp independent checks to a separate
338	// function that is only executed once for each \p Lp.
339	static bool isUniformLoop(Loop Lp, Loop OuterLp) {
340	assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
341
342	// If Lp is the outer loop, it's uniform by definition.
343	if (Lp == OuterLp)
344	return true;
345	assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
346
347	// 1.
348	PHINode *IV = Lp->getCanonicalInductionVariable();
349	if (!IV) {
350	LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
351	return false;
352	}
353
354	// 2.
355	BasicBlock *Latch = Lp->getLoopLatch();
356	auto *LatchBr = dyn_cast<BranchInst>(Val: Latch->getTerminator());
357	if (!LatchBr \|\| LatchBr->isUnconditional()) {
358	LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
359	return false;
360	}
361
362	// 3.
363	auto *LatchCmp = dyn_cast<CmpInst>(Val: LatchBr->getCondition());
364	if (!LatchCmp) {
365	LLVM_DEBUG(
366	dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
367	return false;
368	}
369
370	Value *CondOp0 = LatchCmp->getOperand(i_nocapture: `0`);
371	Value *CondOp1 = LatchCmp->getOperand(i_nocapture: `1`);
372	Value *IVUpdate = IV->getIncomingValueForBlock(BB: Latch);
373	if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp1)) &&
374	!(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp0))) {
375	LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
376	return false;
377	}
378
379	return true;
380	}
381
382	// Return true if \p Lp and all its nested loops are uniform with regard to \p
383	// OuterLp.
384	static bool isUniformLoopNest(Loop Lp, Loop OuterLp) {
385	if (!isUniformLoop(Lp, OuterLp))
386	return false;
387
388	// Check if nested loops are uniform.
389	for (Loop SubLp : Lp)
390	if (!isUniformLoopNest(Lp: SubLp, OuterLp))
391	return false;
392
393	return true;
394	}
395
396	static IntegerType getInductionIntegerTy(const* DataLayout &DL, Type *Ty) {
397	assert(Ty->isIntOrPtrTy() && "Expected integer or pointer type");
398
399	if (Ty->isPointerTy())
400	return DL.getIntPtrType(C&: Ty->getContext(), AddressSpace: Ty->getPointerAddressSpace());
401
402	// It is possible that char's or short's overflow when we ask for the loop's
403	// trip count, work around this by changing the type size.
404	if (Ty->getScalarSizeInBits() < `32`)
405	return Type::getInt32Ty(C&: Ty->getContext());
406
407	return cast<IntegerType>(Val: Ty);
408	}
409
410	static IntegerType getWiderInductionTy(const* DataLayout &DL, Type *Ty0,
411	Type *Ty1) {
412	IntegerType *TyA = getInductionIntegerTy(DL, Ty: Ty0);
413	IntegerType *TyB = getInductionIntegerTy(DL, Ty: Ty1);
414	return TyA->getScalarSizeInBits() > TyB->getScalarSizeInBits() ? TyA : TyB;
415	}
416
417	/// Check that the instruction has outside loop users and is not an
418	/// identified reduction variable.
419	static bool hasOutsideLoopUser(const Loop TheLoop, Instruction Inst,
420	SmallPtrSetImpl<Value *> &AllowedExit) {
421	// Reductions, Inductions and non-header phis are allowed to have exit users. All
422	// other instructions must not have external users.
423	if (!AllowedExit.count(Ptr: Inst))
424	// Check that all of the users of the loop are inside the BB.
425	for (User *U : Inst->users()) {
426	Instruction *UI = cast<Instruction>(Val: U);
427	// This user may be a reduction exit value.
428	if (!TheLoop->contains(Inst: UI)) {
429	LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << `'\n'`);
430	return true;
431	}
432	}
433	return false;
434	}
435
436	/// Returns true if A and B have same pointer operands or same SCEVs addresses
437	static bool storeToSameAddress(ScalarEvolution SE, StoreInst A,
438	StoreInst *B) {
439	// Compare store
440	if (A == B)
441	return true;
442
443	// Otherwise Compare pointers
444	Value *APtr = A->getPointerOperand();
445	Value *BPtr = B->getPointerOperand();
446	if (APtr == BPtr)
447	return true;
448
449	// Otherwise compare address SCEVs
450	return SE->getSCEV(V: APtr) == SE->getSCEV(V: BPtr);
451	}
452
453	int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
454	Value Ptr) const* {
455	// FIXME: Currently, the set of symbolic strides is sometimes queried before
456	// it's collected. This happens from canVectorizeWithIfConvert, when the
457	// pointer is checked to reference consecutive elements suitable for a
458	// masked access.
459	const auto &Strides =
460	LAI ? LAI->getSymbolicStrides() : DenseMap<Value , const* SCEV *>();
461
462	bool CanAddPredicate = !llvm::shouldOptimizeForSize(
463	BB: TheLoop->getHeader(), PSI, BFI, QueryType: PGSOQueryType::IRPass);
464	int Stride = getPtrStride(PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides,
465	Assume: CanAddPredicate, ShouldCheckWrap: false).value_or(u: `0`);
466	if (Stride == `1` \|\| Stride == -`1`)
467	return Stride;
468	return `0`;
469	}
470
471	bool LoopVectorizationLegality::isInvariant(Value V) const* {
472	return LAI->isInvariant(V);
473	}
474
475	namespace {
476	/// A rewriter to build the SCEVs for each of the VF lanes in the expected
477	/// vectorized loop, which can then be compared to detect their uniformity. This
478	/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
479	/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
480	/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
481	/// uniformity.
482	class SCEVAddRecForUniformityRewriter
483	: public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
484	/// Multiplier to be applied to the step of AddRecs in TheLoop.
485	unsigned StepMultiplier;
486
487	/// Offset to be added to the AddRecs in TheLoop.
488	unsigned Offset;
489
490	/// Loop for which to rewrite AddRecsFor.
491	Loop *TheLoop;
492
493	/// Is any sub-expressions not analyzable w.r.t. uniformity?
494	bool CannotAnalyze = false;
495
496	bool canAnalyze() const { return !CannotAnalyze; }
497
498	public:
499	SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
500	unsigned Offset, Loop *TheLoop)
501	: SCEVRewriteVisitor (SE), StepMultiplier(StepMultiplier), Offset(Offset),
502	TheLoop(TheLoop) {}
503
504	const SCEV visitAddRecExpr(const* SCEVAddRecExpr *Expr) {
505	assert(Expr->getLoop() == TheLoop &&
506	"addrec outside of TheLoop must be invariant and should have been "
507	"handled earlier");
508	// Build a new AddRec by multiplying the step by StepMultiplier and
509	// incrementing the start by Offset step.*
510	Type *Ty = Expr->getType();
511	const SCEV *Step = Expr->getStepRecurrence(SE);
512	if (!SE.isLoopInvariant(S: Step, L: TheLoop)) {
513	CannotAnalyze = true;
514	return Expr;
515	}
516	const SCEV *NewStep =
517	SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: StepMultiplier));
518	const SCEV *ScaledOffset = SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: Offset));
519	const SCEV *NewStart = SE.getAddExpr(LHS: Expr->getStart(), RHS: ScaledOffset);
520	return SE.getAddRecExpr(Start: NewStart, Step: NewStep, L: TheLoop, Flags: SCEV::FlagAnyWrap);
521	}
522
523	const SCEV visit(const* SCEV *S) {
524	if (CannotAnalyze \|\| SE.isLoopInvariant(S, L: TheLoop))
525	return S;
526	return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
527	}
528
529	const SCEV visitUnknown(const* SCEVUnknown *S) {
530	if (SE.isLoopInvariant(S, L: TheLoop))
531	return S;
532	// The value could vary across iterations.
533	CannotAnalyze = true;
534	return S;
535	}
536
537	const SCEV visitCouldNotCompute(const* SCEVCouldNotCompute *S) {
538	// Could not analyze the expression.
539	CannotAnalyze = true;
540	return S;
541	}
542
543	static const SCEV rewrite(const* SCEV *S, ScalarEvolution &SE,
544	unsigned StepMultiplier, unsigned Offset,
545	Loop *TheLoop) {
546	/// Bail out if the expression does not contain an UDiv expression.
547	/// Uniform values which are not loop invariant require operations to strip
548	/// out the lowest bits. For now just look for UDivs and use it to avoid
549	/// re-writing UDIV-free expressions for other lanes to limit compile time.
550	if (!SCEVExprContains(Root: S,
551	Pred: [](const SCEV S) { return* isa<SCEVUDivExpr>(Val: S); }))
552	return SE.getCouldNotCompute();
553
554	SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
555	TheLoop);
556	const SCEV *Result = Rewriter.visit(S);
557
558	if (Rewriter.canAnalyze())
559	return Result;
560	return SE.getCouldNotCompute();
561	}
562	};
563
564	} // namespace
565
566	bool LoopVectorizationLegality::isUniform(Value V, ElementCount VF) const* {
567	if (isInvariant(V))
568	return true;
569	if (VF.isScalable())
570	return false;
571	if (VF.isScalar())
572	return true;
573
574	// Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
575	// never considered uniform.
576	auto *SE = PSE.getSE();
577	if (!SE->isSCEVable(Ty: V->getType()))
578	return false;
579	const SCEV *S = SE->getSCEV(V);
580
581	// Rewrite AddRecs in TheLoop to step by VF and check if the expression for
582	// lane 0 matches the expressions for all other lanes.
583	unsigned FixedVF = VF.getKnownMinValue();
584	const SCEV *FirstLaneExpr =
585	SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: `0`, TheLoop);
586	if (isa<SCEVCouldNotCompute>(Val: FirstLaneExpr))
587	return false;
588
589	// Make sure the expressions for lanes FixedVF-1..1 match the expression for
590	// lane 0. We check lanes in reverse order for compile-time, as frequently
591	// checking the last lane is sufficient to rule out uniformity.
592	return all_of(Range: reverse(C: seq<unsigned>(Begin: `1`, End: FixedVF)), P: [&](unsigned I) {
593	const SCEV *IthLaneExpr =
594	SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: I, TheLoop);
595	return FirstLaneExpr == IthLaneExpr;
596	});
597	}
598
599	bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
600	ElementCount VF) const {
601	Value *Ptr = getLoadStorePointerOperand(V: &I);
602	if (!Ptr)
603	return false;
604	// Note: There's nothing inherent which prevents predicated loads and
605	// stores from being uniform. The current lowering simply doesn't handle
606	// it; in particular, the cost model distinguishes scatter/gather from
607	// scalar w/predication, and we currently rely on the scalar path.
608	return isUniform(V: Ptr, VF) && !blockNeedsPredication(BB: I.getParent());
609	}
610
611	bool LoopVectorizationLegality::canVectorizeOuterLoop() {
612	assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
613	// Store the result and return it at the end instead of exiting early, in case
614	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
615	bool Result = true;
616	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
617
618	for (BasicBlock *BB : TheLoop->blocks()) {
619	// Check whether the BB terminator is a BranchInst. Any other terminator is
620	// not supported yet.
621	auto *Br = dyn_cast<BranchInst>(Val: BB->getTerminator());
622	if (!Br) {
623	reportVectorizationFailure(DebugMsg: "Unsupported basic block terminator",
624	OREMsg: "loop control flow is not understood by vectorizer",
625	ORETag: "CFGNotUnderstood", ORE, TheLoop);
626	if (DoExtraAnalysis)
627	Result = false;
628	else
629	return false;
630	}
631
632	// Check whether the BranchInst is a supported one. Only unconditional
633	// branches, conditional branches with an outer loop invariant condition or
634	// backedges are supported.
635	// FIXME: We skip these checks when VPlan predication is enabled as we
636	// want to allow divergent branches. This whole check will be removed
637	// once VPlan predication is on by default.
638	if (Br && Br->isConditional() &&
639	!TheLoop->isLoopInvariant(V: Br->getCondition()) &&
640	!LI->isLoopHeader(BB: Br->getSuccessor(i: `0`)) &&
641	!LI->isLoopHeader(BB: Br->getSuccessor(i: `1`))) {
642	reportVectorizationFailure(DebugMsg: "Unsupported conditional branch",
643	OREMsg: "loop control flow is not understood by vectorizer",
644	ORETag: "CFGNotUnderstood", ORE, TheLoop);
645	if (DoExtraAnalysis)
646	Result = false;
647	else
648	return false;
649	}
650	}
651
652	// Check whether inner loops are uniform. At this point, we only support
653	// simple outer loops scenarios with uniform nested loops.
654	if (!isUniformLoopNest(Lp: TheLoop /loop nest/,
655	OuterLp: TheLoop /context outer loop/)) {
656	reportVectorizationFailure(DebugMsg: "Outer loop contains divergent loops",
657	OREMsg: "loop control flow is not understood by vectorizer",
658	ORETag: "CFGNotUnderstood", ORE, TheLoop);
659	if (DoExtraAnalysis)
660	Result = false;
661	else
662	return false;
663	}
664
665	// Check whether we are able to set up outer loop induction.
666	if (!setupOuterLoopInductions()) {
667	reportVectorizationFailure(DebugMsg: "Unsupported outer loop Phi(s)",
668	ORETag: "UnsupportedPhi", ORE, TheLoop);
669	if (DoExtraAnalysis)
670	Result = false;
671	else
672	return false;
673	}
674
675	return Result;
676	}
677
678	void LoopVectorizationLegality::addInductionPhi(
679	PHINode Phi, const* InductionDescriptor &ID,
680	SmallPtrSetImpl<Value *> &AllowedExit) {
681	Inductions [Phi] = ID;
682
683	// In case this induction also comes with casts that we know we can ignore
684	// in the vectorized loop body, record them here. All casts could be recorded
685	// here for ignoring, but suffices to record only the first (as it is the
686	// only one that may bw used outside the cast sequence).
687	const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
688	if (!Casts.empty())
689	InductionCastsToIgnore.insert(Ptr: *Casts.begin());
690
691	Type *PhiTy = Phi->getType();
692	const DataLayout &DL = Phi->getDataLayout();
693
694	assert((PhiTy->isIntOrPtrTy() \|\| PhiTy->isFloatingPointTy()) &&
695	"Expected int, ptr, or FP induction phi type");
696
697	// Get the widest type.
698	if (PhiTy->isIntOrPtrTy()) {
699	if (!WidestIndTy)
700	WidestIndTy = getInductionIntegerTy(DL, Ty: PhiTy);
701	else
702	WidestIndTy = getWiderInductionTy(DL, Ty0: PhiTy, Ty1: WidestIndTy);
703	}
704
705	// Int inductions are special because we only allow one IV.
706	if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
707	ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
708	isa<Constant>(Val: ID.getStartValue()) &&
709	cast<Constant>(Val: ID.getStartValue())->isNullValue()) {
710
711	// Use the phi node with the widest type as induction. Use the last
712	// one if there are multiple (no good reason for doing this other
713	// than it is expedient). We've checked that it begins at zero and
714	// steps by one, so this is a canonical induction variable.
715	if (!PrimaryInduction \|\| PhiTy == WidestIndTy)
716	PrimaryInduction = Phi;
717	}
718
719	// Both the PHI node itself, and the "post-increment" value feeding
720	// back into the PHI node may have external users.
721	// We can allow those uses, except if the SCEVs we have for them rely
722	// on predicates that only hold within the loop, since allowing the exit
723	// currently means re-using this SCEV outside the loop (see PR33706 for more
724	// details).
725	if (PSE.getPredicate().isAlwaysTrue()) {
726	AllowedExit.insert(Ptr: Phi);
727	AllowedExit.insert(Ptr: Phi->getIncomingValueForBlock(BB: TheLoop->getLoopLatch()));
728	}
729
730	LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
731	}
732
733	bool LoopVectorizationLegality::setupOuterLoopInductions() {
734	BasicBlock *Header = TheLoop->getHeader();
735
736	// Returns true if a given Phi is a supported induction.
737	auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
738	InductionDescriptor ID;
739	if (InductionDescriptor::isInductionPHI(Phi: &Phi, L: TheLoop, PSE, D&: ID) &&
740	ID.getKind() == InductionDescriptor::IK_IntInduction) {
741	addInductionPhi(Phi: &Phi, ID, AllowedExit);
742	return true;
743	}
744	// Bail out for any Phi in the outer loop header that is not a supported
745	// induction.
746	LLVM_DEBUG(
747	dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
748	return false;
749	};
750
751	return llvm::all_of(Range: Header->phis(), P: IsSupportedPhi);
752	}
753
754	/// Checks if a function is scalarizable according to the TLI, in
755	/// the sense that it should be vectorized and then expanded in
756	/// multiple scalar calls. This is represented in the
757	/// TLI via mappings that do not specify a vector name, as in the
758	/// following example:
759	///
760	/// const VecDesc VecIntrinsics[] = {
761	/// {"llvm.phx.abs.i32", "", 4}
762	/// };
763	static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
764	const StringRef ScalarName = CI.getCalledFunction()->getName();
765	bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName);
766	// Check that all known VFs are not associated to a vector
767	// function, i.e. the vector name is emty.
768	if (Scalarize) {
769	ElementCount WidestFixedVF, WidestScalableVF;
770	TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF);
771	for (ElementCount VF = ElementCount::getFixed(MinVal: `2`);
772	ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= `2`)
773	Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
774	for (ElementCount VF = ElementCount::getScalable(MinVal: `1`);
775	ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= `2`)
776	Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
777	assert((WidestScalableVF.isZero() \|\| !Scalarize) &&
778	"Caller may decide to scalarize a variant using a scalable VF");
779	}
780	return Scalarize;
781	}
782
783	/// Returns true if the call return type `Ty` can be widened by the loop
784	/// vectorizer.
785	static bool canWidenCallReturnType(Type *Ty) {
786	auto *StructTy = dyn_cast<StructType>(Val: Ty);
787	// TODO: Remove the homogeneous types restriction. This is just an initial
788	// simplification. When we want to support things like the overflow intrinsics
789	// we will have to lift this restriction.
790	if (StructTy && !StructTy->containsHomogeneousTypes())
791	return false;
792	return canVectorizeTy(Ty: StructTy);
793	}
794
795	bool LoopVectorizationLegality::canVectorizeInstrs() {
796	BasicBlock *Header = TheLoop->getHeader();
797
798	// For each block in the loop.
799	for (BasicBlock *BB : TheLoop->blocks()) {
800	// Scan the instructions in the block and look for hazards.
801	for (Instruction &I : *BB) {
802	if (auto *Phi = dyn_cast<PHINode>(Val: &I)) {
803	Type *PhiTy = Phi->getType();
804	// Check that this PHI type is allowed.
805	if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
806	!PhiTy->isPointerTy()) {
807	reportVectorizationFailure(DebugMsg: "Found a non-int non-pointer PHI",
808	OREMsg: "loop control flow is not understood by vectorizer",
809	ORETag: "CFGNotUnderstood", ORE, TheLoop);
810	return false;
811	}
812
813	// If this PHINode is not in the header block, then we know that we
814	// can convert it to select during if-conversion. No need to check if
815	// the PHIs in this block are induction or reduction variables.
816	if (BB != Header) {
817	// Non-header phi nodes that have outside uses can be vectorized. Add
818	// them to the list of allowed exits.
819	// Unsafe cyclic dependencies with header phis are identified during
820	// legalization for reduction, induction and fixed order
821	// recurrences.
822	AllowedExit.insert(Ptr: &I);
823	continue;
824	}
825
826	// We only allow if-converted PHIs with exactly two incoming values.
827	if (Phi->getNumIncomingValues() != `2`) {
828	reportVectorizationFailure(DebugMsg: "Found an invalid PHI",
829	OREMsg: "loop control flow is not understood by vectorizer",
830	ORETag: "CFGNotUnderstood", ORE, TheLoop, I: Phi);
831	return false;
832	}
833
834	RecurrenceDescriptor RedDes;
835	if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
836	DT, SE: PSE.getSE())) {
837	Requirements->addExactFPMathInst(I: RedDes.getExactFPMathInst());
838	AllowedExit.insert(Ptr: RedDes.getLoopExitInstr());
839	Reductions [Phi] = RedDes;
840	continue;
841	}
842
843	// We prevent matching non-constant strided pointer IVS to preserve
844	// historical vectorizer behavior after a generalization of the
845	// IVDescriptor code. The intent is to remove this check, but we
846	// have to fix issues around code quality for such loops first.
847	auto IsDisallowedStridedPointerInduction =
848	[](const InductionDescriptor &ID) {
849	if (AllowStridedPointerIVs)
850	return false;
851	return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
852	ID.getConstIntStepValue() == nullptr;
853	};
854
855	// TODO: Instead of recording the AllowedExit, it would be good to
856	// record the complementary set: NotAllowedExit. These include (but may
857	// not be limited to):
858	// 1. Reduction phis as they represent the one-before-last value, which
859	// is not available when vectorized
860	// 2. Induction phis and increment when SCEV predicates cannot be used
861	// outside the loop - see addInductionPhi
862	// 3. Non-Phis with outside uses when SCEV predicates cannot be used
863	// outside the loop - see call to hasOutsideLoopUser in the non-phi
864	// handling below
865	// 4. FixedOrderRecurrence phis that can possibly be handled by
866	// extraction.
867	// By recording these, we can then reason about ways to vectorize each
868	// of these NotAllowedExit.
869	InductionDescriptor ID;
870	if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID) &&
871	!IsDisallowedStridedPointerInduction (ID)) {
872	addInductionPhi(Phi, ID, AllowedExit);
873	Requirements->addExactFPMathInst(I: ID.getExactFPMathInst());
874	continue;
875	}
876
877	if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
878	AllowedExit.insert(Ptr: Phi);
879	FixedOrderRecurrences.insert(Ptr: Phi);
880	continue;
881	}
882
883	// As a last resort, coerce the PHI to a AddRec expression
884	// and re-try classifying it a an induction PHI.
885	if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID, Assume: true) &&
886	!IsDisallowedStridedPointerInduction (ID)) {
887	addInductionPhi(Phi, ID, AllowedExit);
888	continue;
889	}
890
891	reportVectorizationFailure(DebugMsg: "Found an unidentified PHI",
892	OREMsg: "value that could not be identified as "
893	"reduction is used outside the loop",
894	ORETag: "NonReductionValueUsedOutsideLoop", ORE, TheLoop, I: Phi);
895	return false;
896	} // end of PHI handling
897
898	// We handle calls that:
899	// Have a mapping to an IR intrinsic.*
900	// Have a vector version available.*
901	auto *CI = dyn_cast<CallInst>(Val: &I);
902
903	if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
904	!(CI->getCalledFunction() && TLI &&
905	(!VFDatabase::getMappings(CI: *CI).empty() \|\|
906	isTLIScalarize(TLI: TLI, CI: CI)))) {
907	// If the call is a recognized math libary call, it is likely that
908	// we can vectorize it given loosened floating-point constraints.
909	LibFunc Func;
910	bool IsMathLibCall =
911	TLI && CI->getCalledFunction() &&
912	CI->getType()->isFloatingPointTy() &&
913	TLI->getLibFunc(funcName: CI->getCalledFunction()->getName(), F&: Func) &&
914	TLI->hasOptimizedCodeGen(F: Func);
915
916	if (IsMathLibCall) {
917	// TODO: Ideally, we should not use clang-specific language here,
918	// but it's hard to provide meaningful yet generic advice.
919	// Also, should this be guarded by allowExtraAnalysis() and/or be part
920	// of the returned info from isFunctionVectorizable()?
921	reportVectorizationFailure(
922	DebugMsg: "Found a non-intrinsic callsite",
923	OREMsg: "library call cannot be vectorized. "
924	"Try compiling with -fno-math-errno, -ffast-math, "
925	"or similar flags",
926	ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
927	} else {
928	reportVectorizationFailure(DebugMsg: "Found a non-intrinsic callsite",
929	OREMsg: "call instruction cannot be vectorized",
930	ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
931	}
932	return false;
933	}
934
935	// Some intrinsics have scalar arguments and should be same in order for
936	// them to be vectorized (i.e. loop invariant).
937	if (CI) {
938	auto *SE = PSE.getSE();
939	Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
940	for (unsigned Idx = `0`; Idx < CI->arg_size(); ++Idx)
941	if (isVectorIntrinsicWithScalarOpAtArg(ID: IntrinID, ScalarOpdIdx: Idx, TTI)) {
942	if (!SE->isLoopInvariant(S: PSE.getSCEV(V: CI->getOperand(i_nocapture: Idx)),
943	L: TheLoop)) {
944	reportVectorizationFailure(DebugMsg: "Found unvectorizable intrinsic",
945	OREMsg: "intrinsic instruction cannot be vectorized",
946	ORETag: "CantVectorizeIntrinsic", ORE, TheLoop, I: CI);
947	return false;
948	}
949	}
950	}
951
952	// If we found a vectorized variant of a function, note that so LV can
953	// make better decisions about maximum VF.
954	if (CI && !VFDatabase::getMappings(CI: *CI).empty())
955	VecCallVariantsFound = true;
956
957	auto CanWidenInstructionTy = [](Instruction const &Inst) {
958	Type *InstTy = Inst.getType();
959	if (!isa<StructType>(Val: InstTy))
960	return canVectorizeTy(Ty: InstTy);
961
962	// For now, we only recognize struct values returned from calls where
963	// all users are extractvalue as vectorizable. All element types of the
964	// struct must be types that can be widened.
965	return isa<CallInst>(Val: Inst) && canWidenCallReturnType(Ty: InstTy) &&
966	all_of(Range: Inst.users(), P: IsaPred<ExtractValueInst>);
967	};
968
969	// Check that the instruction return type is vectorizable.
970	// We can't vectorize casts from vector type to scalar type.
971	// Also, we can't vectorize extractelement instructions.
972	if (!CanWidenInstructionTy (I) \|\|
973	(isa<CastInst>(Val: I) &&
974	!VectorType::isValidElementType(ElemTy: I.getOperand(i: `0`)->getType())) \|\|
975	isa<ExtractElementInst>(Val: I)) {
976	reportVectorizationFailure(DebugMsg: "Found unvectorizable type",
977	OREMsg: "instruction return type cannot be vectorized",
978	ORETag: "CantVectorizeInstructionReturnType", ORE, TheLoop, I: &I);
979	return false;
980	}
981
982	// Check that the stored type is vectorizable.
983	if (auto *ST = dyn_cast<StoreInst>(Val: &I)) {
984	Type *T = ST->getValueOperand()->getType();
985	if (!VectorType::isValidElementType(ElemTy: T)) {
986	reportVectorizationFailure(DebugMsg: "Store instruction cannot be vectorized",
987	ORETag: "CantVectorizeStore", ORE, TheLoop, I: ST);
988	return false;
989	}
990
991	// For nontemporal stores, check that a nontemporal vector version is
992	// supported on the target.
993	if (ST->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
994	// Arbitrarily try a vector of 2 elements.
995	auto VecTy = FixedVectorType::get(ElementType: T, /NumElts=/*`2`);
996	assert(VecTy && "did not find vectorized version of stored type");
997	if (!TTI->isLegalNTStore(DataType: VecTy, Alignment: ST->getAlign())) {
998	reportVectorizationFailure(
999	DebugMsg: "nontemporal store instruction cannot be vectorized",
1000	ORETag: "CantVectorizeNontemporalStore", ORE, TheLoop, I: ST);
1001	return false;
1002	}
1003	}
1004
1005	} else if (auto *LD = dyn_cast<LoadInst>(Val: &I)) {
1006	if (LD->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
1007	// For nontemporal loads, check that a nontemporal vector version is
1008	// supported on the target (arbitrarily try a vector of 2 elements).
1009	auto VecTy = FixedVectorType::get(ElementType: I.getType(), /NumElts=/*`2`);
1010	assert(VecTy && "did not find vectorized version of load type");
1011	if (!TTI->isLegalNTLoad(DataType: VecTy, Alignment: LD->getAlign())) {
1012	reportVectorizationFailure(
1013	DebugMsg: "nontemporal load instruction cannot be vectorized",
1014	ORETag: "CantVectorizeNontemporalLoad", ORE, TheLoop, I: LD);
1015	return false;
1016	}
1017	}
1018
1019	// FP instructions can allow unsafe algebra, thus vectorizable by
1020	// non-IEEE-754 compliant SIMD units.
1021	// This applies to floating-point math operations and calls, not memory
1022	// operations, shuffles, or casts, as they don't change precision or
1023	// semantics.
1024	} else if (I.getType()->isFloatingPointTy() && (CI \|\| I.isBinaryOp()) &&
1025	!I.isFast()) {
1026	LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
1027	Hints->setPotentiallyUnsafe();
1028	}
1029
1030	// Reduction instructions are allowed to have exit users.
1031	// All other instructions must not have external users.
1032	if (hasOutsideLoopUser(TheLoop, Inst: &I, AllowedExit)) {
1033	// We can safely vectorize loops where instructions within the loop are
1034	// used outside the loop only if the SCEV predicates within the loop is
1035	// same as outside the loop. Allowing the exit means reusing the SCEV
1036	// outside the loop.
1037	if (PSE.getPredicate().isAlwaysTrue()) {
1038	AllowedExit.insert(Ptr: &I);
1039	continue;
1040	}
1041	reportVectorizationFailure(DebugMsg: "Value cannot be used outside the loop",
1042	ORETag: "ValueUsedOutsideLoop", ORE, TheLoop, I: &I);
1043	return false;
1044	}
1045	} // next instr.
1046	}
1047
1048	if (!PrimaryInduction) {
1049	if (Inductions.empty()) {
1050	reportVectorizationFailure(DebugMsg: "Did not find one integer induction var",
1051	OREMsg: "loop induction variable could not be identified",
1052	ORETag: "NoInductionVariable", ORE, TheLoop);
1053	return false;
1054	}
1055	if (!WidestIndTy) {
1056	reportVectorizationFailure(DebugMsg: "Did not find one integer induction var",
1057	OREMsg: "integer loop induction variable could not be identified",
1058	ORETag: "NoIntegerInductionVariable", ORE, TheLoop);
1059	return false;
1060	}
1061	LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
1062	}
1063
1064	// Now we know the widest induction type, check if our found induction
1065	// is the same size. If it's not, unset it here and InnerLoopVectorizer
1066	// will create another.
1067	if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
1068	PrimaryInduction = nullptr;
1069
1070	return true;
1071	}
1072
1073	/// Find histogram operations that match high-level code in loops:
1074	/// \code
1075	/// buckets[indices[i]]+=step;
1076	/// \endcode
1077	///
1078	/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
1079	/// array the computed histogram. It uses a BinOp to sum all counts, storing
1080	/// them using a loop-variant index Load from the 'indices' input array.
1081	///
1082	/// On successful matches it updates the STATISTIC 'HistogramsDetected',
1083	/// regardless of hardware support. When there is support, it additionally
1084	/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
1085	/// used to update histogram in \p HistogramPtrs.
1086	static bool findHistogram(LoadInst LI, StoreInst HSt, Loop *TheLoop,
1087	const PredicatedScalarEvolution &PSE,
1088	SmallVectorImpl<HistogramInfo> &Histograms) {
1089
1090	// Store value must come from a Binary Operation.
1091	Instruction HPtrInstr = nullptr*;
1092	BinaryOperator HBinOp = nullptr*;
1093	if (!match(V: HSt, P: m_Store(ValueOp: m_BinOp(I&: HBinOp), PointerOp: m_Instruction(I&: HPtrInstr))))
1094	return false;
1095
1096	// BinOp must be an Add or a Sub modifying the bucket value by a
1097	// loop invariant amount.
1098	// FIXME: We assume the loop invariant term is on the RHS.
1099	// Fine for an immediate/constant, but maybe not a generic value?
1100	Value HIncVal = nullptr*;
1101	if (!match(V: HBinOp, P: m_Add(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))) &&
1102	!match(V: HBinOp, P: m_Sub(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))))
1103	return false;
1104
1105	// Make sure the increment value is loop invariant.
1106	if (!TheLoop->isLoopInvariant(V: HIncVal))
1107	return false;
1108
1109	// The address to store is calculated through a GEP Instruction.
1110	GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: HPtrInstr);
1111	if (!GEP)
1112	return false;
1113
1114	// Restrict address calculation to constant indices except for the last term.
1115	Value HIdx = nullptr*;
1116	for (Value *Index : GEP->indices()) {
1117	if (HIdx)
1118	return false;
1119	if (!isa<ConstantInt>(Val: Index))
1120	HIdx = Index;
1121	}
1122
1123	if (!HIdx)
1124	return false;
1125
1126	// Check that the index is calculated by loading from another array. Ignore
1127	// any extensions.
1128	// FIXME: Support indices from other sources than a linear load from memory?
1129	// We're currently trying to match an operation looping over an array
1130	// of indices, but there could be additional levels of indirection
1131	// in place, or possibly some additional calculation to form the index
1132	// from the loaded data.
1133	Value *VPtrVal;
1134	if (!match(V: HIdx, P: m_ZExtOrSExtOrSelf(Op: m_Load(Op: m_Value(V&: VPtrVal)))))
1135	return false;
1136
1137	// Make sure the index address varies in this loop, not an outer loop.
1138	const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: VPtrVal));
1139	if (!AR \|\| AR->getLoop() != TheLoop)
1140	return false;
1141
1142	// Ensure we'll have the same mask by checking that all parts of the histogram
1143	// (gather load, update, scatter store) are in the same block.
1144	LoadInst *IndexedLoad = cast<LoadInst>(Val: HBinOp->getOperand(i_nocapture: `0`));
1145	BasicBlock *LdBB = IndexedLoad->getParent();
1146	if (LdBB != HBinOp->getParent() \|\| LdBB != HSt->getParent())
1147	return false;
1148
1149	LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
1150
1151	// Store the operations that make up the histogram.
1152	Histograms.emplace_back(Args&: IndexedLoad, Args&: HBinOp, Args&: HSt);
1153	return true;
1154	}
1155
1156	bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
1157	// For now, we only support an IndirectUnsafe dependency that calculates
1158	// a histogram
1159	if (!EnableHistogramVectorization)
1160	return false;
1161
1162	// Find a single IndirectUnsafe dependency.
1163	const MemoryDepChecker::Dependence IUDep = nullptr*;
1164	const MemoryDepChecker &DepChecker = LAI->getDepChecker();
1165	const auto *Deps = DepChecker.getDependences();
1166	// If there were too many dependences, LAA abandons recording them. We can't
1167	// proceed safely if we don't know what the dependences are.
1168	if (!Deps)
1169	return false;
1170
1171	for (const MemoryDepChecker::Dependence &Dep : *Deps) {
1172	// Ignore dependencies that are either known to be safe or can be
1173	// checked at runtime.
1174	if (MemoryDepChecker::Dependence::isSafeForVectorization(Type: Dep.Type) !=
1175	MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
1176	continue;
1177
1178	// We're only interested in IndirectUnsafe dependencies here, where the
1179	// address might come from a load from memory. We also only want to handle
1180	// one such dependency, at least for now.
1181	if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe \|\| IUDep)
1182	return false;
1183
1184	IUDep = &Dep;
1185	}
1186	if (!IUDep)
1187	return false;
1188
1189	// For now only normal loads and stores are supported.
1190	LoadInst *LI = dyn_cast<LoadInst>(Val: IUDep->getSource(DepChecker));
1191	StoreInst *SI = dyn_cast<StoreInst>(Val: IUDep->getDestination(DepChecker));
1192
1193	if (!LI \|\| !SI)
1194	return false;
1195
1196	LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
1197	return findHistogram(LI, HSt: SI, TheLoop, PSE: LAI->getPSE(), Histograms);
1198	}
1199
1200	bool LoopVectorizationLegality::canVectorizeMemory() {
1201	LAI = &LAIs.getInfo(L&: *TheLoop);
1202	const OptimizationRemarkAnalysis *LAR = LAI->getReport();
1203	if (LAR) {
1204	ORE->emit(RemarkBuilder: [&]() {
1205	return OptimizationRemarkAnalysis (Hints->vectorizeAnalysisPassName(),
1206	"loop not vectorized: ", *LAR);
1207	});
1208	}
1209
1210	if (!LAI->canVectorizeMemory())
1211	return canVectorizeIndirectUnsafeDependences();
1212
1213	if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
1214	reportVectorizationFailure(DebugMsg: "We don't allow storing to uniform addresses",
1215	OREMsg: "write to a loop invariant address could not "
1216	"be vectorized",
1217	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE,
1218	TheLoop);
1219	return false;
1220	}
1221
1222	// We can vectorize stores to invariant address when final reduction value is
1223	// guaranteed to be stored at the end of the loop. Also, if decision to
1224	// vectorize loop is made, runtime checks are added so as to make sure that
1225	// invariant address won't alias with any other objects.
1226	if (!LAI->getStoresToInvariantAddresses().empty()) {
1227	// For each invariant address, check if last stored value is unconditional
1228	// and the address is not calculated inside the loop.
1229	for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1230	if (!isInvariantStoreOfReduction(SI))
1231	continue;
1232
1233	if (blockNeedsPredication(BB: SI->getParent())) {
1234	reportVectorizationFailure(
1235	DebugMsg: "We don't allow storing to uniform addresses",
1236	OREMsg: "write of conditional recurring variant value to a loop "
1237	"invariant address could not be vectorized",
1238	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1239	return false;
1240	}
1241
1242	// Invariant address should be defined outside of loop. LICM pass usually
1243	// makes sure it happens, but in rare cases it does not, we do not want
1244	// to overcomplicate vectorization to support this case.
1245	if (Instruction *Ptr = dyn_cast<Instruction>(Val: SI->getPointerOperand())) {
1246	if (TheLoop->contains(Inst: Ptr)) {
1247	reportVectorizationFailure(
1248	DebugMsg: "Invariant address is calculated inside the loop",
1249	OREMsg: "write to a loop invariant address could not "
1250	"be vectorized",
1251	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1252	return false;
1253	}
1254	}
1255	}
1256
1257	if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
1258	// For each invariant address, check its last stored value is the result
1259	// of one of our reductions.
1260	//
1261	// We do not check if dependence with loads exists because that is already
1262	// checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
1263	ScalarEvolution *SE = PSE.getSE();
1264	SmallVector<StoreInst *, `4`> UnhandledStores;
1265	for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1266	if (isInvariantStoreOfReduction(SI)) {
1267	// Earlier stores to this address are effectively deadcode.
1268	// With opaque pointers it is possible for one pointer to be used with
1269	// different sizes of stored values:
1270	// store i32 0, ptr %x
1271	// store i8 0, ptr %x
1272	// The latest store doesn't complitely overwrite the first one in the
1273	// example. That is why we have to make sure that types of stored
1274	// values are same.
1275	// TODO: Check that bitwidth of unhandled store is smaller then the
1276	// one that overwrites it and add a test.
1277	erase_if(C&: UnhandledStores, P: [SE, SI](StoreInst *I) {
1278	return storeToSameAddress(SE, A: SI, B: I) &&
1279	I->getValueOperand()->getType() ==
1280	SI->getValueOperand()->getType();
1281	});
1282	continue;
1283	}
1284	UnhandledStores.push_back(Elt: SI);
1285	}
1286
1287	bool IsOK = UnhandledStores.empty();
1288	// TODO: we should also validate against InvariantMemSets.
1289	if (!IsOK) {
1290	reportVectorizationFailure(
1291	DebugMsg: "We don't allow storing to uniform addresses",
1292	OREMsg: "write to a loop invariant address could not "
1293	"be vectorized",
1294	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1295	return false;
1296	}
1297	}
1298	}
1299
1300	PSE.addPredicate(Pred: LAI->getPSE().getPredicate());
1301	return true;
1302	}
1303
1304	bool LoopVectorizationLegality::canVectorizeFPMath(
1305	bool EnableStrictReductions) {
1306
1307	// First check if there is any ExactFP math or if we allow reassociations
1308	if (!Requirements->getExactFPInst() \|\| Hints->allowReordering())
1309	return true;
1310
1311	// If the above is false, we have ExactFPMath & do not allow reordering.
1312	// If the EnableStrictReductions flag is set, first check if we have any
1313	// Exact FP induction vars, which we cannot vectorize.
1314	if (!EnableStrictReductions \|\|
1315	any_of(Range: getInductionVars(), P: [&](auto &Induction) -> bool {
1316	InductionDescriptor IndDesc = Induction.second;
1317	return IndDesc.getExactFPMathInst();
1318	}))
1319	return false;
1320
1321	// We can now only vectorize if all reductions with Exact FP math also
1322	// have the isOrdered flag set, which indicates that we can move the
1323	// reduction operations in-loop.
1324	return (all_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1325	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1326	return !RdxDesc.hasExactFPMath() \|\| RdxDesc.isOrdered();
1327	}));
1328	}
1329
1330	bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1331	return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1332	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1333	return RdxDesc.IntermediateStore == SI;
1334	});
1335	}
1336
1337	bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1338	return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1339	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1340	if (!RdxDesc.IntermediateStore)
1341	return false;
1342
1343	ScalarEvolution *SE = PSE.getSE();
1344	Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1345	return V == InvariantAddress \|\|
1346	SE->getSCEV(V) == SE->getSCEV(V: InvariantAddress);
1347	});
1348	}
1349
1350	bool LoopVectorizationLegality::isInductionPhi(const Value V) const* {
1351	Value In0 = const_cast<Value >(V);
1352	PHINode *PN = dyn_cast_or_null<PHINode>(Val: In0);
1353	if (!PN)
1354	return false;
1355
1356	return Inductions.count(Key: PN);
1357	}
1358
1359	const InductionDescriptor *
1360	LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode Phi) const* {
1361	if (!isInductionPhi(V: Phi))
1362	return nullptr;
1363	auto &ID = getInductionVars().find(Key: Phi)->second;
1364	if (ID.getKind() == InductionDescriptor::IK_IntInduction \|\|
1365	ID.getKind() == InductionDescriptor::IK_FpInduction)
1366	return &ID;
1367	return nullptr;
1368	}
1369
1370	const InductionDescriptor *
1371	LoopVectorizationLegality::getPointerInductionDescriptor(PHINode Phi) const* {
1372	if (!isInductionPhi(V: Phi))
1373	return nullptr;
1374	auto &ID = getInductionVars().find(Key: Phi)->second;
1375	if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
1376	return &ID;
1377	return nullptr;
1378	}
1379
1380	bool LoopVectorizationLegality::isCastedInductionVariable(
1381	const Value V) const* {
1382	auto *Inst = dyn_cast<Instruction>(Val: V);
1383	return (Inst && InductionCastsToIgnore.count(Ptr: Inst));
1384	}
1385
1386	bool LoopVectorizationLegality::isInductionVariable(const Value V) const* {
1387	return isInductionPhi(V) \|\| isCastedInductionVariable(V);
1388	}
1389
1390	bool LoopVectorizationLegality::isFixedOrderRecurrence(
1391	const PHINode Phi) const* {
1392	return FixedOrderRecurrences.count(Ptr: Phi);
1393	}
1394
1395	bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock BB) const* {
1396	// When vectorizing early exits, create predicates for the latch block only.
1397	// The early exiting block must be a direct predecessor of the latch at the
1398	// moment.
1399	BasicBlock *Latch = TheLoop->getLoopLatch();
1400	if (hasUncountableEarlyExit()) {
1401	assert(
1402	is_contained(predecessors(Latch), getUncountableEarlyExitingBlock()) &&
1403	"Uncountable exiting block must be a direct predecessor of latch");
1404	return BB == Latch;
1405	}
1406	return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1407	}
1408
1409	bool LoopVectorizationLegality::blockCanBePredicated(
1410	BasicBlock BB, SmallPtrSetImpl<Value > &SafePtrs,
1411	SmallPtrSetImpl<const Instruction > &MaskedOp) const* {
1412	for (Instruction &I : *BB) {
1413	// We can predicate blocks with calls to assume, as long as we drop them in
1414	// case we flatten the CFG via predication.
1415	if (match(V: &I, P: m_Intrinsic<Intrinsic::assume>())) {
1416	MaskedOp.insert(Ptr: &I);
1417	continue;
1418	}
1419
1420	// Do not let llvm.experimental.noalias.scope.decl block the vectorization.
1421	// TODO: there might be cases that it should block the vectorization. Let's
1422	// ignore those for now.
1423	if (isa<NoAliasScopeDeclInst>(Val: &I))
1424	continue;
1425
1426	// We can allow masked calls if there's at least one vector variant, even
1427	// if we end up scalarizing due to the cost model calculations.
1428	// TODO: Allow other calls if they have appropriate attributes... readonly
1429	// and argmemonly?
1430	if (CallInst *CI = dyn_cast<CallInst>(Val: &I))
1431	if (VFDatabase::hasMaskedVariant(CI: *CI)) {
1432	MaskedOp.insert(Ptr: CI);
1433	continue;
1434	}
1435
1436	// Loads are handled via masking (or speculated if safe to do so.)
1437	if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1438	if (!SafePtrs.count(Ptr: LI->getPointerOperand()))
1439	MaskedOp.insert(Ptr: LI);
1440	continue;
1441	}
1442
1443	// Predicated store requires some form of masking:
1444	// 1) masked store HW instruction,
1445	// 2) emulation via load-blend-store (only if safe and legal to do so,
1446	// be aware on the race conditions), or
1447	// 3) element-by-element predicate check and scalar store.
1448	if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1449	MaskedOp.insert(Ptr: SI);
1450	continue;
1451	}
1452
1453	if (I.mayReadFromMemory() \|\| I.mayWriteToMemory() \|\| I.mayThrow())
1454	return false;
1455	}
1456
1457	return true;
1458	}
1459
1460	bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1461	if (!EnableIfConversion) {
1462	reportVectorizationFailure(DebugMsg: "If-conversion is disabled",
1463	ORETag: "IfConversionDisabled", ORE, TheLoop);
1464	return false;
1465	}
1466
1467	assert(TheLoop->getNumBlocks() > `1` && "Single block loops are vectorizable");
1468
1469	// A list of pointers which are known to be dereferenceable within scope of
1470	// the loop body for each iteration of the loop which executes. That is,
1471	// the memory pointed to can be dereferenced (with the access size implied by
1472	// the value's type) unconditionally within the loop header without
1473	// introducing a new fault.
1474	SmallPtrSet<Value *, `8`> SafePointers;
1475
1476	// Collect safe addresses.
1477	for (BasicBlock *BB : TheLoop->blocks()) {
1478	if (!blockNeedsPredication(BB)) {
1479	for (Instruction &I : *BB)
1480	if (auto *Ptr = getLoadStorePointerOperand(V: &I))
1481	SafePointers.insert(Ptr);
1482	continue;
1483	}
1484
1485	// For a block which requires predication, a address may be safe to access
1486	// in the loop w/o predication if we can prove dereferenceability facts
1487	// sufficient to ensure it'll never fault within the loop. For the moment,
1488	// we restrict this to loads; stores are more complicated due to
1489	// concurrency restrictions.
1490	ScalarEvolution &SE = *PSE.getSE();
1491	SmallVector<const SCEVPredicate *, `4`> Predicates;
1492	for (Instruction &I : *BB) {
1493	LoadInst *LI = dyn_cast<LoadInst>(Val: &I);
1494
1495	// Make sure we can execute all computations feeding into Ptr in the loop
1496	// w/o triggering UB and that none of the out-of-loop operands are poison.
1497	// We do not need to check if operations inside the loop can produce
1498	// poison due to flags (e.g. due to an inbounds GEP going out of bounds),
1499	// because flags will be dropped when executing them unconditionally.
1500	// TODO: Results could be improved by considering poison-propagation
1501	// properties of visited ops.
1502	auto CanSpeculatePointerOp = [this](Value *Ptr) {
1503	SmallVector<Value *> Worklist = {Ptr};
1504	SmallPtrSet<Value *, `4`> Visited;
1505	while (!Worklist.empty()) {
1506	Value *CurrV = Worklist.pop_back_val();
1507	if (!Visited.insert(Ptr: CurrV).second)
1508	continue;
1509
1510	auto *CurrI = dyn_cast<Instruction>(Val: CurrV);
1511	if (!CurrI \|\| !TheLoop->contains(Inst: CurrI)) {
1512	// If operands from outside the loop may be poison then Ptr may also
1513	// be poison.
1514	if (!isGuaranteedNotToBePoison(V: CurrV, AC,
1515	CtxI: TheLoop->getLoopPredecessor()
1516	->getTerminator()
1517	->getIterator()))
1518	return false;
1519	continue;
1520	}
1521
1522	// A loaded value may be poison, independent of any flags.
1523	if (isa<LoadInst>(Val: CurrI) && !isGuaranteedNotToBePoison(V: CurrV, AC))
1524	return false;
1525
1526	// For other ops, assume poison can only be introduced via flags,
1527	// which can be dropped.
1528	if (!isa<PHINode>(Val: CurrI) && !isSafeToSpeculativelyExecute(I: CurrI))
1529	return false;
1530	append_range(C&: Worklist, R: CurrI->operands());
1531	}
1532	return true;
1533	};
1534	// Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
1535	// that it will consider loops that need guarding by SCEV checks. The
1536	// vectoriser will generate these checks if we decide to vectorise.
1537	if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(LI: *LI) &&
1538	CanSpeculatePointerOp (LI->getPointerOperand()) &&
1539	isDereferenceableAndAlignedInLoop(LI, L: TheLoop, SE, DT&: *DT, AC,
1540	Predicates: &Predicates))
1541	SafePointers.insert(Ptr: LI->getPointerOperand());
1542	Predicates.clear();
1543	}
1544	}
1545
1546	// Collect the blocks that need predication.
1547	for (BasicBlock *BB : TheLoop->blocks()) {
1548	// We support only branches and switch statements as terminators inside the
1549	// loop.
1550	if (isa<SwitchInst>(Val: BB->getTerminator())) {
1551	if (TheLoop->isLoopExiting(BB)) {
1552	reportVectorizationFailure(DebugMsg: "Loop contains an unsupported switch",
1553	ORETag: "LoopContainsUnsupportedSwitch", ORE,
1554	TheLoop, I: BB->getTerminator());
1555	return false;
1556	}
1557	} else if (!isa<BranchInst>(Val: BB->getTerminator())) {
1558	reportVectorizationFailure(DebugMsg: "Loop contains an unsupported terminator",
1559	ORETag: "LoopContainsUnsupportedTerminator", ORE,
1560	TheLoop, I: BB->getTerminator());
1561	return false;
1562	}
1563
1564	// We must be able to predicate all blocks that need to be predicated.
1565	if (blockNeedsPredication(BB) &&
1566	!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp)) {
1567	reportVectorizationFailure(
1568	DebugMsg: "Control flow cannot be substituted for a select", ORETag: "NoCFGForSelect",
1569	ORE, TheLoop, I: BB->getTerminator());
1570	return false;
1571	}
1572	}
1573
1574	// We can if-convert this loop.
1575	return true;
1576	}
1577
1578	// Helper function to canVectorizeLoopNestCFG.
1579	bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1580	bool UseVPlanNativePath) {
1581	assert((UseVPlanNativePath \|\| Lp->isInnermost()) &&
1582	"VPlan-native path is not enabled.");
1583
1584	// TODO: ORE should be improved to show more accurate information when an
1585	// outer loop can't be vectorized because a nested loop is not understood or
1586	// legal. Something like: "outer_loop_location: loop not vectorized:
1587	// (inner_loop_location) loop control flow is not understood by vectorizer".
1588
1589	// Store the result and return it at the end instead of exiting early, in case
1590	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1591	bool Result = true;
1592	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1593
1594	// We must have a loop in canonical form. Loops with indirectbr in them cannot
1595	// be canonicalized.
1596	if (!Lp->getLoopPreheader()) {
1597	reportVectorizationFailure(DebugMsg: "Loop doesn't have a legal pre-header",
1598	OREMsg: "loop control flow is not understood by vectorizer",
1599	ORETag: "CFGNotUnderstood", ORE, TheLoop);
1600	if (DoExtraAnalysis)
1601	Result = false;
1602	else
1603	return false;
1604	}
1605
1606	// We must have a single backedge.
1607	if (Lp->getNumBackEdges() != `1`) {
1608	reportVectorizationFailure(DebugMsg: "The loop must have a single backedge",
1609	OREMsg: "loop control flow is not understood by vectorizer",
1610	ORETag: "CFGNotUnderstood", ORE, TheLoop);
1611	if (DoExtraAnalysis)
1612	Result = false;
1613	else
1614	return false;
1615	}
1616
1617	return Result;
1618	}
1619
1620	bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1621	Loop Lp, bool* UseVPlanNativePath) {
1622	// Store the result and return it at the end instead of exiting early, in case
1623	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1624	bool Result = true;
1625	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1626	if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1627	if (DoExtraAnalysis)
1628	Result = false;
1629	else
1630	return false;
1631	}
1632
1633	// Recursively check whether the loop control flow of nested loops is
1634	// understood.
1635	for (Loop SubLp : Lp)
1636	if (!canVectorizeLoopNestCFG(Lp: SubLp, UseVPlanNativePath)) {
1637	if (DoExtraAnalysis)
1638	Result = false;
1639	else
1640	return false;
1641	}
1642
1643	return Result;
1644	}
1645
1646	bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1647	BasicBlock *LatchBB = TheLoop->getLoopLatch();
1648	if (!LatchBB) {
1649	reportVectorizationFailure(DebugMsg: "Loop does not have a latch",
1650	OREMsg: "Cannot vectorize early exit loop",
1651	ORETag: "NoLatchEarlyExit", ORE, TheLoop);
1652	return false;
1653	}
1654
1655	if (Reductions.size() \|\| FixedOrderRecurrences.size()) {
1656	reportVectorizationFailure(
1657	DebugMsg: "Found reductions or recurrences in early-exit loop",
1658	OREMsg: "Cannot vectorize early exit loop with reductions or recurrences",
1659	ORETag: "RecurrencesInEarlyExitLoop", ORE, TheLoop);
1660	return false;
1661	}
1662
1663	SmallVector<BasicBlock *, `8`> ExitingBlocks;
1664	TheLoop->getExitingBlocks(ExitingBlocks);
1665
1666	// Keep a record of all the exiting blocks.
1667	SmallVector<const SCEVPredicate *, `4`> Predicates;
1668	std::optional<std::pair<BasicBlock , BasicBlock >> SingleUncountableEdge;
1669	for (BasicBlock *BB : ExitingBlocks) {
1670	const SCEV *EC =
1671	PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: BB, Predicates: &Predicates);
1672	if (isa<SCEVCouldNotCompute>(Val: EC)) {
1673	SmallVector<BasicBlock *, `2`> Succs(successors(BB));
1674	if (Succs.size() != `2`) {
1675	reportVectorizationFailure(
1676	DebugMsg: "Early exiting block does not have exactly two successors",
1677	OREMsg: "Incorrect number of successors from early exiting block",
1678	ORETag: "EarlyExitTooManySuccessors", ORE, TheLoop);
1679	return false;
1680	}
1681
1682	BasicBlock *ExitBlock;
1683	if (!TheLoop->contains(BB: Succs [`0`]))
1684	ExitBlock = Succs [`0`];
1685	else {
1686	assert(!TheLoop->contains(Succs[`1`]));
1687	ExitBlock = Succs [`1`];
1688	}
1689
1690	if (SingleUncountableEdge) {
1691	reportVectorizationFailure(
1692	DebugMsg: "Loop has too many uncountable exits",
1693	OREMsg: "Cannot vectorize early exit loop with more than one early exit",
1694	ORETag: "TooManyUncountableEarlyExits", ORE, TheLoop);
1695	return false;
1696	}
1697
1698	SingleUncountableEdge = {BB, ExitBlock};
1699	} else
1700	CountableExitingBlocks.push_back(Elt: BB);
1701	}
1702	// We can safely ignore the predicates here because when vectorizing the loop
1703	// the PredicatatedScalarEvolution class will keep track of all predicates
1704	// for each exiting block anyway. This happens when calling
1705	// PSE.getSymbolicMaxBackedgeTakenCount() below.
1706	Predicates.clear();
1707
1708	if (!SingleUncountableEdge) {
1709	LLVM_DEBUG(dbgs() << "LV: Cound not find any uncountable exits");
1710	return false;
1711	}
1712
1713	// The only supported early exit loops so far are ones where the early
1714	// exiting block is a unique predecessor of the latch block.
1715	BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
1716	if (LatchPredBB != SingleUncountableEdge ->first) {
1717	reportVectorizationFailure(DebugMsg: "Early exit is not the latch predecessor",
1718	OREMsg: "Cannot vectorize early exit loop",
1719	ORETag: "EarlyExitNotLatchPredecessor", ORE, TheLoop);
1720	return false;
1721	}
1722
1723	// The latch block must have a countable exit.
1724	if (isa<SCEVCouldNotCompute>(
1725	Val: PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: LatchBB, Predicates: &Predicates))) {
1726	reportVectorizationFailure(
1727	DebugMsg: "Cannot determine exact exit count for latch block",
1728	OREMsg: "Cannot vectorize early exit loop",
1729	ORETag: "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1730	return false;
1731	}
1732	assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
1733	"Latch block not found in list of countable exits!");
1734
1735	// Check to see if there are instructions that could potentially generate
1736	// exceptions or have side-effects.
1737	auto IsSafeOperation = [](Instruction I) -> bool* {
1738	switch (I->getOpcode()) {
1739	case Instruction::Load:
1740	case Instruction::Store:
1741	case Instruction::PHI:
1742	case Instruction::Br:
1743	// These are checked separately.
1744	return true;
1745	default:
1746	return isSafeToSpeculativelyExecute(I);
1747	}
1748	};
1749
1750	for (auto *BB : TheLoop->blocks())
1751	for (auto &I : *BB) {
1752	if (I.mayWriteToMemory()) {
1753	// We don't support writes to memory.
1754	reportVectorizationFailure(
1755	DebugMsg: "Writes to memory unsupported in early exit loops",
1756	OREMsg: "Cannot vectorize early exit loop with writes to memory",
1757	ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
1758	return false;
1759	} else if (!IsSafeOperation (&I)) {
1760	reportVectorizationFailure(DebugMsg: "Early exit loop contains operations that "
1761	"cannot be speculatively executed",
1762	ORETag: "UnsafeOperationsEarlyExitLoop", ORE,
1763	TheLoop);
1764	return false;
1765	}
1766	}
1767
1768	// The vectoriser cannot handle loads that occur after the early exit block.
1769	assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first &&
1770	"Expected latch predecessor to be the early exiting block");
1771
1772	// TODO: Handle loops that may fault.
1773	Predicates.clear();
1774	if (!isDereferenceableReadOnlyLoop(L: TheLoop, SE: PSE.getSE(), DT, AC,
1775	Predicates: &Predicates)) {
1776	reportVectorizationFailure(
1777	DebugMsg: "Loop may fault",
1778	OREMsg: "Cannot vectorize potentially faulting early exit loop",
1779	ORETag: "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1780	return false;
1781	}
1782
1783	[[maybe_unused]] const SCEV *SymbolicMaxBTC =
1784	PSE.getSymbolicMaxBackedgeTakenCount();
1785	// Since we have an exact exit count for the latch and the early exit
1786	// dominates the latch, then this should guarantee a computed SCEV value.
1787	assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1788	"Failed to get symbolic expression for backedge taken count");
1789	LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
1790	"backedge taken count: "
1791	<< *SymbolicMaxBTC << `'\n'`);
1792	UncountableEdge = SingleUncountableEdge;
1793	return true;
1794	}
1795
1796	bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1797	// Store the result and return it at the end instead of exiting early, in case
1798	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1799	bool Result = true;
1800
1801	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1802	// Check whether the loop-related control flow in the loop nest is expected by
1803	// vectorizer.
1804	if (!canVectorizeLoopNestCFG(Lp: TheLoop, UseVPlanNativePath)) {
1805	if (DoExtraAnalysis) {
1806	LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
1807	Result = false;
1808	} else {
1809	return false;
1810	}
1811	}
1812
1813	// We need to have a loop header.
1814	LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1815	<< `'\n'`);
1816
1817	// Specific checks for outer loops. We skip the remaining legal checks at this
1818	// point because they don't support outer loops.
1819	if (!TheLoop->isInnermost()) {
1820	assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1821
1822	if (!canVectorizeOuterLoop()) {
1823	reportVectorizationFailure(DebugMsg: "Unsupported outer loop",
1824	ORETag: "UnsupportedOuterLoop", ORE, TheLoop);
1825	// TODO: Implement DoExtraAnalysis when subsequent legal checks support
1826	// outer loops.
1827	return false;
1828	}
1829
1830	LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1831	return Result;
1832	}
1833
1834	assert(TheLoop->isInnermost() && "Inner loop expected.");
1835	// Check if we can if-convert non-single-bb loops.
1836	unsigned NumBlocks = TheLoop->getNumBlocks();
1837	if (NumBlocks != `1` && !canVectorizeWithIfConvert()) {
1838	LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1839	if (DoExtraAnalysis)
1840	Result = false;
1841	else
1842	return false;
1843	}
1844
1845	// Check if we can vectorize the instructions and CFG in this loop.
1846	if (!canVectorizeInstrs()) {
1847	LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1848	if (DoExtraAnalysis)
1849	Result = false;
1850	else
1851	return false;
1852	}
1853
1854	if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
1855	if (TheLoop->getExitingBlock()) {
1856	reportVectorizationFailure(DebugMsg: "Cannot vectorize uncountable loop",
1857	ORETag: "UnsupportedUncountableLoop", ORE, TheLoop);
1858	if (DoExtraAnalysis)
1859	Result = false;
1860	else
1861	return false;
1862	} else {
1863	if (!isVectorizableEarlyExitLoop()) {
1864	UncountableEdge = std::nullopt;
1865	if (DoExtraAnalysis)
1866	Result = false;
1867	else
1868	return false;
1869	}
1870	}
1871	}
1872
1873	// Go over each instruction and look at memory deps.
1874	if (!canVectorizeMemory()) {
1875	LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1876	if (DoExtraAnalysis)
1877	Result = false;
1878	else
1879	return false;
1880	}
1881
1882	if (Result) {
1883	LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1884	<< (LAI->getRuntimePointerChecking()->Need
1885	? " (with a runtime bound check)"
1886	: "")
1887	<< "!\n");
1888	}
1889
1890	unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
1891	if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
1892	SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
1893
1894	if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
1895	LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
1896	"due to SCEVThreshold");
1897	reportVectorizationFailure(DebugMsg: "Too many SCEV checks needed",
1898	OREMsg: "Too many SCEV assumptions need to be made and checked at runtime",
1899	ORETag: "TooManySCEVRunTimeChecks", ORE, TheLoop);
1900	if (DoExtraAnalysis)
1901	Result = false;
1902	else
1903	return false;
1904	}
1905
1906	// Okay! We've done all the tests. If any have failed, return false. Otherwise
1907	// we can vectorize, and at this point we don't have any other mem analysis
1908	// which may limit our maximum vectorization factor, so just return true with
1909	// no restrictions.
1910	return Result;
1911	}
1912
1913	bool LoopVectorizationLegality::canFoldTailByMasking() const {
1914	// The only loops we can vectorize without a scalar epilogue, are loops with
1915	// a bottom-test and a single exiting block. We'd have to handle the fact
1916	// that not every instruction executes on the last iteration. This will
1917	// require a lane mask which varies through the vector loop body. (TODO)
1918	if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1919	LLVM_DEBUG(
1920	dbgs()
1921	<< "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
1922	return false;
1923	}
1924
1925	LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1926
1927	SmallPtrSet<const Value *, `8`> ReductionLiveOuts;
1928
1929	for (const auto &Reduction : getReductionVars())
1930	ReductionLiveOuts.insert(Ptr: Reduction.second.getLoopExitInstr());
1931
1932	// TODO: handle non-reduction outside users when tail is folded by masking.
1933	for (auto *AE : AllowedExit) {
1934	// Check that all users of allowed exit values are inside the loop or
1935	// are the live-out of a reduction.
1936	if (ReductionLiveOuts.count(Ptr: AE))
1937	continue;
1938	for (User *U : AE->users()) {
1939	Instruction *UI = cast<Instruction>(Val: U);
1940	if (TheLoop->contains(Inst: UI))
1941	continue;
1942	LLVM_DEBUG(
1943	dbgs()
1944	<< "LV: Cannot fold tail by masking, loop has an outside user for "
1945	<< *UI << "\n");
1946	return false;
1947	}
1948	}
1949
1950	for (const auto &Entry : getInductionVars()) {
1951	PHINode *OrigPhi = Entry.first;
1952	for (User *U : OrigPhi->users()) {
1953	auto *UI = cast<Instruction>(Val: U);
1954	if (!TheLoop->contains(Inst: UI)) {
1955	LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
1956	"outside user for "
1957	<< *UI << "\n");
1958	return false;
1959	}
1960	}
1961	}
1962
1963	// The list of pointers that we can safely read and write to remains empty.
1964	SmallPtrSet<Value *, `8`> SafePointers;
1965
1966	// Check all blocks for predication, including those that ordinarily do not
1967	// need predication such as the header block.
1968	SmallPtrSet<const Instruction *, `8`> TmpMaskedOp;
1969	for (BasicBlock *BB : TheLoop->blocks()) {
1970	if (!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TmpMaskedOp)) {
1971	LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
1972	return false;
1973	}
1974	}
1975
1976	LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
1977
1978	return true;
1979	}
1980
1981	void LoopVectorizationLegality::prepareToFoldTailByMasking() {
1982	// The list of pointers that we can safely read and write to remains empty.
1983	SmallPtrSet<Value *, `8`> SafePointers;
1984
1985	// Mark all blocks for predication, including those that ordinarily do not
1986	// need predication such as the header block.
1987	for (BasicBlock *BB : TheLoop->blocks()) {
1988	[[maybe_unused]] bool R = blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp);
1989	assert(R && "Must be able to predicate block when tail-folding.");
1990	}
1991	}
1992
1993	} // namespace llvm
1994

Browse the source code of llvm_projects/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp