LoopVectorizationLegality.cpp source code [llvm_projects/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp]

1	//===- LoopVectorizationLegality.cpp --------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file provides loop vectorization legality analysis. Original code
10	// resided in LoopVectorize.cpp for a long time.
11	//
12	// At this point, it is implemented as a utility class, not as an analysis
13	// pass. It should be easy to create an analysis pass around it if there
14	// is a need (but D45420 needs to happen first).
15	//
16
17	#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18	#include "llvm/Analysis/AliasAnalysis.h"
19	#include "llvm/Analysis/Loads.h"
20	#include "llvm/Analysis/LoopInfo.h"
21	#include "llvm/Analysis/MustExecute.h"
22	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
23	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
24	#include "llvm/Analysis/TargetLibraryInfo.h"
25	#include "llvm/Analysis/TargetTransformInfo.h"
26	#include "llvm/Analysis/ValueTracking.h"
27	#include "llvm/Analysis/VectorUtils.h"
28	#include "llvm/IR/Dominators.h"
29	#include "llvm/IR/IntrinsicInst.h"
30	#include "llvm/IR/PatternMatch.h"
31	#include "llvm/Transforms/Utils/SizeOpts.h"
32	#include "llvm/Transforms/Vectorize/LoopVectorize.h"
33
34	using namespace llvm;
35	using namespace PatternMatch;
36
37	#define LV_NAME "loop-vectorize"
38	#define DEBUG_TYPE LV_NAME
39
40	static cl::opt<bool>
41	EnableIfConversion("enable-if-conversion", cl::init(Val: true), cl::Hidden,
42	cl::desc ("Enable if-conversion during vectorization."));
43
44	static cl::opt<bool>
45	AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(Val: false), cl::Hidden,
46	cl::desc ("Enable recognition of non-constant strided "
47	"pointer induction variables."));
48
49	static cl::opt<bool>
50	HintsAllowReordering("hints-allow-reordering", cl::init(Val: true), cl::Hidden,
51	cl::desc ("Allow enabling loop hints to reorder "
52	"FP operations during vectorization."));
53
54	// TODO: Move size-based thresholds out of legality checking, make cost based
55	// decisions instead of hard thresholds.
56	static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
57	"vectorize-scev-check-threshold", cl::init(Val: `16`), cl::Hidden,
58	cl::desc ("The maximum number of SCEV checks allowed."));
59
60	static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
61	"pragma-vectorize-scev-check-threshold", cl::init(Val: `128`), cl::Hidden,
62	cl::desc ("The maximum number of SCEV checks allowed with a "
63	"vectorize(enable) pragma"));
64
65	static cl::opt<LoopVectorizeHints::ScalableForceKind>
66	ForceScalableVectorization(
67	"scalable-vectorization", cl::init(Val: LoopVectorizeHints::SK_Unspecified),
68	cl::Hidden,
69	cl::desc ("Control whether the compiler can use scalable vectors to "
70	"vectorize a loop"),
71	cl::values(
72	clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
73	"Scalable vectorization is disabled."),
74	clEnumValN(
75	LoopVectorizeHints::SK_PreferScalable, "preferred",
76	"Scalable vectorization is available and favored when the "
77	"cost is inconclusive."),
78	clEnumValN(
79	LoopVectorizeHints::SK_PreferScalable, "on",
80	"Scalable vectorization is available and favored when the "
81	"cost is inconclusive.")));
82
83	static cl::opt<bool> EnableHistogramVectorization(
84	"enable-histogram-loop-vectorization", cl::init(Val: false), cl::Hidden,
85	cl::desc ("Enables autovectorization of some loops containing histograms"));
86
87	/// Maximum vectorization interleave count.
88	static const unsigned MaxInterleaveFactor = `16`;
89
90	namespace llvm {
91
92	bool LoopVectorizeHints::Hint::validate(unsigned Val) {
93	switch (Kind) {
94	case HK_WIDTH:
95	return isPowerOf2_32(Value: Val) && Val <= VectorizerParams::MaxVectorWidth;
96	case HK_INTERLEAVE:
97	return isPowerOf2_32(Value: Val) && Val <= MaxInterleaveFactor;
98	case HK_FORCE:
99	return (Val <= `1`);
100	case HK_ISVECTORIZED:
101	case HK_PREDICATE:
102	case HK_SCALABLE:
103	return (Val == `0` \|\| Val == `1`);
104	}
105	return false;
106	}
107
108	LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
109	bool InterleaveOnlyWhenForced,
110	OptimizationRemarkEmitter &ORE,
111	const TargetTransformInfo *TTI)
112	: Width ("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
113	Interleave ("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
114	Force ("vectorize.enable", FK_Undefined, HK_FORCE),
115	IsVectorized ("isvectorized", `0`, HK_ISVECTORIZED),
116	Predicate ("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
117	Scalable ("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
118	TheLoop(L), ORE(ORE) {
119	// Populate values with existing loop metadata.
120	getHintsFromMetadata();
121
122	// force-vector-interleave overrides DisableInterleaving.
123	if (VectorizerParams::isInterleaveForced())
124	Interleave.Value = VectorizerParams::VectorizationInterleave;
125
126	// If the metadata doesn't explicitly specify whether to enable scalable
127	// vectorization, then decide based on the following criteria (increasing
128	// level of priority):
129	// - Target default
130	// - Metadata width
131	// - Force option (always overrides)
132	if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
133	if (TTI)
134	Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
135	: SK_FixedWidthOnly;
136
137	if (Width.Value)
138	// If the width is set, but the metadata says nothing about the scalable
139	// property, then assume it concerns only a fixed-width UserVF.
140	// If width is not set, the flag takes precedence.
141	Scalable.Value = SK_FixedWidthOnly;
142	}
143
144	// If the flag is set to force any use of scalable vectors, override the loop
145	// hints.
146	if (ForceScalableVectorization.getValue() !=
147	LoopVectorizeHints::SK_Unspecified)
148	Scalable.Value = ForceScalableVectorization.getValue();
149
150	// Scalable vectorization is disabled if no preference is specified.
151	if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
152	Scalable.Value = SK_FixedWidthOnly;
153
154	if (IsVectorized.Value != `1`)
155	// If the vectorization width and interleaving count are both 1 then
156	// consider the loop to have been already vectorized because there's
157	// nothing more that we can do.
158	IsVectorized.Value =
159	getWidth() == ElementCount::getFixed(MinVal: `1`) && getInterleave() == `1`;
160	LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == `1`) dbgs()
161	<< "LV: Interleaving disabled by the pass manager\n");
162	}
163
164	void LoopVectorizeHints::setAlreadyVectorized() {
165	LLVMContext &Context = TheLoop->getHeader()->getContext();
166
167	MDNode *IsVectorizedMD = MDNode::get(
168	Context,
169	MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized"),
170	ConstantAsMetadata::get(C: ConstantInt::get(Context, V: APInt (`32`, `1`)))});
171	MDNode *LoopID = TheLoop->getLoopID();
172	MDNode *NewLoopID =
173	makePostTransformationMetadata(Context, OrigLoopID: LoopID,
174	RemovePrefixes: {Twine (Prefix(), "vectorize.").str(),
175	Twine (Prefix(), "interleave.").str()},
176	AddAttrs: {IsVectorizedMD});
177	TheLoop->setLoopID(NewLoopID);
178
179	// Update internal cache.
180	IsVectorized.Value = `1`;
181	}
182
183	void LoopVectorizeHints::reportDisallowedVectorization(
184	const StringRef DebugMsg, const StringRef RemarkName,
185	const StringRef RemarkMsg, const Loop L) const* {
186	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: " << DebugMsg << ".\n");
187	ORE.emit(OptDiag: OptimizationRemarkMissed (LV_NAME, RemarkName, L->getStartLoc(),
188	L->getHeader())
189	<< "loop not vectorized: " << RemarkMsg);
190	}
191
192	bool LoopVectorizeHints::allowVectorization(
193	Function F, Loop L, bool VectorizeOnlyWhenForced) const {
194	if (getForce() == LoopVectorizeHints::FK_Disabled) {
195	if (Force.Value == LoopVectorizeHints::FK_Disabled) {
196	reportDisallowedVectorization(DebugMsg: "#pragma vectorize disable",
197	RemarkName: "MissedExplicitlyDisabled",
198	RemarkMsg: "vectorization is explicitly disabled", L);
199	} else if (hasDisableAllTransformsHint(L)) {
200	reportDisallowedVectorization(DebugMsg: "loop hasDisableAllTransformsHint",
201	RemarkName: "MissedTransformsDisabled",
202	RemarkMsg: "loop transformations are disabled", L);
203	} else {
204	llvm_unreachable("loop vect disabled for an unknown reason");
205	}
206	return false;
207	}
208
209	if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
210	reportDisallowedVectorization(
211	DebugMsg: "VectorizeOnlyWhenForced is set, and no #pragma vectorize enable",
212	RemarkName: "MissedForceOnly", RemarkMsg: "only vectorizing loops that explicitly request it",
213	L);
214	return false;
215	}
216
217	if (getIsVectorized() == `1`) {
218	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
219	// FIXME: Add interleave.disable metadata. This will allow
220	// vectorize.disable to be used without disabling the pass and errors
221	// to differentiate between disabled vectorization and a width of 1.
222	ORE.emit(RemarkBuilder: [&]() {
223	return OptimizationRemarkAnalysis (vectorizeAnalysisPassName(),
224	"AllDisabled", L->getStartLoc(),
225	L->getHeader())
226	<< "loop not vectorized: vectorization and interleaving are "
227	"explicitly disabled, or the loop has already been "
228	"vectorized";
229	});
230	return false;
231	}
232
233	return true;
234	}
235
236	void LoopVectorizeHints::emitRemarkWithHints() const {
237	using namespace ore;
238
239	ORE.emit(RemarkBuilder: [&]() {
240	if (Force.Value == LoopVectorizeHints::FK_Disabled)
241	return OptimizationRemarkMissed (LV_NAME, "MissedExplicitlyDisabled",
242	TheLoop->getStartLoc(),
243	TheLoop->getHeader())
244	<< "loop not vectorized: vectorization is explicitly disabled";
245
246	OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
247	TheLoop->getHeader());
248	R << "loop not vectorized";
249	if (Force.Value == LoopVectorizeHints::FK_Enabled) {
250	R << " (Force=" << NV ("Force", true);
251	if (Width.Value != `0`)
252	R << ", Vector Width=" << NV ("VectorWidth", getWidth());
253	if (getInterleave() != `0`)
254	R << ", Interleave Count=" << NV ("InterleaveCount", getInterleave());
255	R << ")";
256	}
257	return R;
258	});
259	}
260
261	const char LoopVectorizeHints::vectorizeAnalysisPassName() const* {
262	if (getWidth() == ElementCount::getFixed(MinVal: `1`))
263	return LV_NAME;
264	if (getForce() == LoopVectorizeHints::FK_Disabled)
265	return LV_NAME;
266	if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
267	return LV_NAME;
268	return OptimizationRemarkAnalysis::AlwaysPrint;
269	}
270
271	bool LoopVectorizeHints::allowReordering() const {
272	// Allow the vectorizer to change the order of operations if enabling
273	// loop hints are provided
274	ElementCount EC = getWidth();
275	return HintsAllowReordering &&
276	(getForce() == LoopVectorizeHints::FK_Enabled \|\|
277	EC.getKnownMinValue() > `1`);
278	}
279
280	void LoopVectorizeHints::getHintsFromMetadata() {
281	MDNode *LoopID = TheLoop->getLoopID();
282	if (!LoopID)
283	return;
284
285	// First operand should refer to the loop id itself.
286	assert(LoopID->getNumOperands() > `0` && "requires at least one operand");
287	assert(LoopID->getOperand(`0`) == LoopID && "invalid loop id");
288
289	for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
290	const MDString S = nullptr*;
291	SmallVector<Metadata *, `4`> Args;
292
293	// The expected hint is either a MDString or a MDNode with the first
294	// operand a MDString.
295	if (const MDNode *MD = dyn_cast<MDNode>(Val: MDO)) {
296	if (!MD \|\| MD->getNumOperands() == `0`)
297	continue;
298	S = dyn_cast<MDString>(Val: MD->getOperand(I: `0`));
299	for (unsigned Idx = `1`; Idx < MD->getNumOperands(); ++Idx)
300	Args.push_back(Elt: MD->getOperand(I: Idx));
301	} else {
302	S = dyn_cast<MDString>(Val: MDO);
303	assert(Args.size() == `0` && "too many arguments for MDString");
304	}
305
306	if (!S)
307	continue;
308
309	// Check if the hint starts with the loop metadata prefix.
310	StringRef Name = S->getString();
311	if (Args.size() == `1`)
312	setHint(Name, Arg: Args [`0`]);
313	}
314	}
315
316	void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
317	if (!Name.consume_front(Prefix: Prefix()))
318	return;
319
320	const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(MD&: Arg);
321	if (!C)
322	return;
323	unsigned Val = C->getZExtValue();
324
325	Hint *Hints[] = {&Width, &Interleave, &Force,
326	&IsVectorized, &Predicate, &Scalable};
327	for (auto *H : Hints) {
328	if (Name == H->Name) {
329	if (H->validate(Val))
330	H->Value = Val;
331	else
332	LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
333	break;
334	}
335	}
336	}
337
338	// Return true if the inner loop \p Lp is uniform with regard to the outer loop
339	// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
340	// executing the inner loop will execute the same iterations). This check is
341	// very constrained for now but it will be relaxed in the future. \p Lp is
342	// considered uniform if it meets all the following conditions:
343	// 1) it has a canonical IV (starting from 0 and with stride 1),
344	// 2) its latch terminator is a conditional branch and,
345	// 3) its latch condition is a compare instruction whose operands are the
346	// canonical IV and an OuterLp invariant.
347	// This check doesn't take into account the uniformity of other conditions not
348	// related to the loop latch because they don't affect the loop uniformity.
349	//
350	// NOTE: We decided to keep all these checks and its associated documentation
351	// together so that we can easily have a picture of the current supported loop
352	// nests. However, some of the current checks don't depend on \p OuterLp and
353	// would be redundantly executed for each \p Lp if we invoked this function for
354	// different candidate outer loops. This is not the case for now because we
355	// don't currently have the infrastructure to evaluate multiple candidate outer
356	// loops and \p OuterLp will be a fixed parameter while we only support explicit
357	// outer loop vectorization. It's also very likely that these checks go away
358	// before introducing the aforementioned infrastructure. However, if this is not
359	// the case, we should move the \p OuterLp independent checks to a separate
360	// function that is only executed once for each \p Lp.
361	static bool isUniformLoop(Loop Lp, Loop OuterLp) {
362	assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
363
364	// If Lp is the outer loop, it's uniform by definition.
365	if (Lp == OuterLp)
366	return true;
367	assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
368
369	// 1.
370	PHINode *IV = Lp->getCanonicalInductionVariable();
371	if (!IV) {
372	LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
373	return false;
374	}
375
376	// 2.
377	BasicBlock *Latch = Lp->getLoopLatch();
378	auto *LatchBr = dyn_cast<CondBrInst>(Val: Latch->getTerminator());
379	if (!LatchBr) {
380	LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
381	return false;
382	}
383
384	// 3.
385	auto *LatchCmp = dyn_cast<CmpInst>(Val: LatchBr->getCondition());
386	if (!LatchCmp) {
387	LLVM_DEBUG(
388	dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
389	return false;
390	}
391
392	Value *CondOp0 = LatchCmp->getOperand(i_nocapture: `0`);
393	Value *CondOp1 = LatchCmp->getOperand(i_nocapture: `1`);
394	Value *IVUpdate = IV->getIncomingValueForBlock(BB: Latch);
395	if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp1)) &&
396	!(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp0))) {
397	LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
398	return false;
399	}
400
401	return true;
402	}
403
404	// Return true if \p Lp and all its nested loops are uniform with regard to \p
405	// OuterLp.
406	static bool isUniformLoopNest(Loop Lp, Loop OuterLp) {
407	if (!isUniformLoop(Lp, OuterLp))
408	return false;
409
410	// Check if nested loops are uniform.
411	for (Loop SubLp : Lp)
412	if (!isUniformLoopNest(Lp: SubLp, OuterLp))
413	return false;
414
415	return true;
416	}
417
418	static IntegerType getInductionIntegerTy(const* DataLayout &DL, Type *Ty) {
419	assert(Ty->isIntOrPtrTy() && "Expected integer or pointer type");
420
421	if (Ty->isPointerTy())
422	return DL.getIntPtrType(C&: Ty->getContext(), AddressSpace: Ty->getPointerAddressSpace());
423
424	// It is possible that char's or short's overflow when we ask for the loop's
425	// trip count, work around this by changing the type size.
426	if (Ty->getScalarSizeInBits() < `32`)
427	return Type::getInt32Ty(C&: Ty->getContext());
428
429	return cast<IntegerType>(Val: Ty);
430	}
431
432	static IntegerType getWiderInductionTy(const* DataLayout &DL, Type *Ty0,
433	Type *Ty1) {
434	IntegerType *TyA = getInductionIntegerTy(DL, Ty: Ty0);
435	IntegerType *TyB = getInductionIntegerTy(DL, Ty: Ty1);
436	return TyA->getScalarSizeInBits() > TyB->getScalarSizeInBits() ? TyA : TyB;
437	}
438
439	/// Check that the instruction has outside loop users and is not an
440	/// identified reduction variable.
441	static bool hasOutsideLoopUser(const Loop TheLoop, Instruction Inst,
442	SmallPtrSetImpl<Value *> &AllowedExit) {
443	// Reductions, Inductions and non-header phis are allowed to have exit users. All
444	// other instructions must not have external users.
445	if (!AllowedExit.count(Ptr: Inst))
446	// Check that all of the users of the loop are inside the BB.
447	for (User *U : Inst->users()) {
448	Instruction *UI = cast<Instruction>(Val: U);
449	// This user may be a reduction exit value.
450	if (!TheLoop->contains(Inst: UI)) {
451	LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << `'\n'`);
452	return true;
453	}
454	}
455	return false;
456	}
457
458	/// Returns true if A and B have same pointer operands or same SCEVs addresses
459	static bool storeToSameAddress(ScalarEvolution SE, StoreInst A,
460	StoreInst *B) {
461	// Compare store
462	if (A == B)
463	return true;
464
465	// Otherwise Compare pointers
466	Value *APtr = A->getPointerOperand();
467	Value *BPtr = B->getPointerOperand();
468	if (APtr == BPtr)
469	return true;
470
471	// Otherwise compare address SCEVs
472	return SE->getSCEV(V: APtr) == SE->getSCEV(V: BPtr);
473	}
474
475	int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
476	Value Ptr) const* {
477	// FIXME: Currently, the set of symbolic strides is sometimes queried before
478	// it's collected. This happens from canVectorizeWithIfConvert, when the
479	// pointer is checked to reference consecutive elements suitable for a
480	// masked access.
481	const auto &Strides =
482	LAI ? LAI->getSymbolicStrides() : DenseMap<Value , const* SCEV *>();
483
484	int Stride = getPtrStride(PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: Strides,
485	Assume: AllowRuntimeSCEVChecks, ShouldCheckWrap: false)
486	.value_or(u: `0`);
487	if (Stride == `1` \|\| Stride == -`1`)
488	return Stride;
489	return `0`;
490	}
491
492	bool LoopVectorizationLegality::isInvariant(Value V) const* {
493	return LAI->isInvariant(V);
494	}
495
496	namespace {
497	/// A rewriter to build the SCEVs for each of the VF lanes in the expected
498	/// vectorized loop, which can then be compared to detect their uniformity. This
499	/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
500	/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
501	/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
502	/// uniformity.
503	class SCEVAddRecForUniformityRewriter
504	: public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
505	/// Multiplier to be applied to the step of AddRecs in TheLoop.
506	unsigned StepMultiplier;
507
508	/// Offset to be added to the AddRecs in TheLoop.
509	unsigned Offset;
510
511	/// Loop for which to rewrite AddRecsFor.
512	Loop *TheLoop;
513
514	/// Is any sub-expressions not analyzable w.r.t. uniformity?
515	bool CannotAnalyze = false;
516
517	bool canAnalyze() const { return !CannotAnalyze; }
518
519	public:
520	SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
521	unsigned Offset, Loop *TheLoop)
522	: SCEVRewriteVisitor (SE), StepMultiplier(StepMultiplier), Offset(Offset),
523	TheLoop(TheLoop) {}
524
525	const SCEV visitAddRecExpr(const* SCEVAddRecExpr *Expr) {
526	assert(Expr->getLoop() == TheLoop &&
527	"addrec outside of TheLoop must be invariant and should have been "
528	"handled earlier");
529	// Build a new AddRec by multiplying the step by StepMultiplier and
530	// incrementing the start by Offset step.*
531	Type *Ty = Expr->getType();
532	const SCEV *Step = Expr->getStepRecurrence(SE);
533	if (!SE.isLoopInvariant(S: Step, L: TheLoop)) {
534	CannotAnalyze = true;
535	return Expr;
536	}
537	const SCEV *NewStep =
538	SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: StepMultiplier));
539	const SCEV *ScaledOffset = SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: Offset));
540	const SCEV *NewStart =
541	SE.getAddExpr(LHS: Expr->getStart(), RHS: SCEVUse (ScaledOffset));
542	return SE.getAddRecExpr(Start: NewStart, Step: NewStep, L: TheLoop, Flags: SCEV::FlagAnyWrap);
543	}
544
545	const SCEV visit(const* SCEV *S) {
546	if (CannotAnalyze \|\| SE.isLoopInvariant(S, L: TheLoop))
547	return S;
548	return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
549	}
550
551	const SCEV visitUnknown(const* SCEVUnknown *S) {
552	if (SE.isLoopInvariant(S, L: TheLoop))
553	return S;
554	// The value could vary across iterations.
555	CannotAnalyze = true;
556	return S;
557	}
558
559	const SCEV visitCouldNotCompute(const* SCEVCouldNotCompute *S) {
560	// Could not analyze the expression.
561	CannotAnalyze = true;
562	return S;
563	}
564
565	static const SCEV rewrite(const* SCEV *S, ScalarEvolution &SE,
566	unsigned StepMultiplier, unsigned Offset,
567	Loop *TheLoop) {
568	/// Bail out if the expression does not contain an UDiv expression.
569	/// Uniform values which are not loop invariant require operations to strip
570	/// out the lowest bits. For now just look for UDivs and use it to avoid
571	/// re-writing UDIV-free expressions for other lanes to limit compile time.
572	if (!SCEVExprContains(Root: S,
573	Pred: [](const SCEV S) { return* isa<SCEVUDivExpr>(Val: S); }))
574	return SE.getCouldNotCompute();
575
576	SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
577	TheLoop);
578	const SCEV *Result = Rewriter.visit(S);
579
580	if (Rewriter.canAnalyze())
581	return Result;
582	return SE.getCouldNotCompute();
583	}
584	};
585
586	} // namespace
587
588	bool LoopVectorizationLegality::isUniform(Value V, ElementCount VF) const* {
589	if (isInvariant(V))
590	return true;
591	if (VF.isScalable())
592	return false;
593	if (VF.isScalar())
594	return true;
595
596	// Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
597	// never considered uniform.
598	auto *SE = PSE.getSE();
599	if (!SE->isSCEVable(Ty: V->getType()))
600	return false;
601	const SCEV *S = SE->getSCEV(V);
602
603	// Rewrite AddRecs in TheLoop to step by VF and check if the expression for
604	// lane 0 matches the expressions for all other lanes.
605	unsigned FixedVF = VF.getKnownMinValue();
606	const SCEV *FirstLaneExpr =
607	SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: `0`, TheLoop);
608	if (isa<SCEVCouldNotCompute>(Val: FirstLaneExpr))
609	return false;
610
611	// Make sure the expressions for lanes FixedVF-1..1 match the expression for
612	// lane 0. We check lanes in reverse order for compile-time, as frequently
613	// checking the last lane is sufficient to rule out uniformity.
614	return all_of(Range: reverse(C: seq<unsigned>(Begin: `1`, End: FixedVF)), P: [&](unsigned I) {
615	const SCEV *IthLaneExpr =
616	SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: I, TheLoop);
617	return FirstLaneExpr == IthLaneExpr;
618	});
619	}
620
621	bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
622	ElementCount VF) const {
623	Value *Ptr = getLoadStorePointerOperand(V: &I);
624	if (!Ptr)
625	return false;
626	// Note: There's nothing inherent which prevents predicated loads and
627	// stores from being uniform. The current lowering simply doesn't handle
628	// it; in particular, the cost model distinguishes scatter/gather from
629	// scalar w/predication, and we currently rely on the scalar path.
630	return isUniform(V: Ptr, VF) && !blockNeedsPredication(BB: I.getParent());
631	}
632
633	bool LoopVectorizationLegality::canVectorizeOuterLoop() {
634	assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
635	// Store the result and return it at the end instead of exiting early, in case
636	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
637	bool Result = true;
638	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
639
640	for (BasicBlock *BB : TheLoop->blocks()) {
641	// Check whether the BB terminator is a branch. Any other terminator is
642	// not supported yet.
643	Instruction *Term = BB->getTerminator();
644	if (!isa<UncondBrInst, CondBrInst>(Val: Term)) {
645	reportVectorizationFailure(DebugMsg: "Unsupported basic block terminator",
646	OREMsg: "loop control flow is not understood by vectorizer",
647	ORETag: "CFGNotUnderstood", ORE, TheLoop);
648	if (DoExtraAnalysis)
649	Result = false;
650	else
651	return false;
652	}
653
654	// Check whether the branch is a supported one. Only unconditional
655	// branches, conditional branches with an outer loop invariant condition or
656	// backedges are supported.
657	// FIXME: We skip these checks when VPlan predication is enabled as we
658	// want to allow divergent branches. This whole check will be removed
659	// once VPlan predication is on by default.
660	auto *Br = dyn_cast<CondBrInst>(Val: Term);
661	if (Br && !TheLoop->isLoopInvariant(V: Br->getCondition()) &&
662	!LI->isLoopHeader(BB: Br->getSuccessor(i: `0`)) &&
663	!LI->isLoopHeader(BB: Br->getSuccessor(i: `1`))) {
664	reportVectorizationFailure(DebugMsg: "Unsupported conditional branch",
665	OREMsg: "loop control flow is not understood by vectorizer",
666	ORETag: "CFGNotUnderstood", ORE, TheLoop);
667	if (DoExtraAnalysis)
668	Result = false;
669	else
670	return false;
671	}
672	}
673
674	// Check whether inner loops are uniform. At this point, we only support
675	// simple outer loops scenarios with uniform nested loops.
676	if (!isUniformLoopNest(Lp: TheLoop /loop nest/,
677	OuterLp: TheLoop /context outer loop/)) {
678	reportVectorizationFailure(DebugMsg: "Outer loop contains divergent loops",
679	OREMsg: "loop control flow is not understood by vectorizer",
680	ORETag: "CFGNotUnderstood", ORE, TheLoop);
681	if (DoExtraAnalysis)
682	Result = false;
683	else
684	return false;
685	}
686
687	// Check whether we are able to set up outer loop induction.
688	if (!setupOuterLoopInductions()) {
689	reportVectorizationFailure(DebugMsg: "Unsupported outer loop Phi(s)",
690	ORETag: "UnsupportedPhi", ORE, TheLoop);
691	if (DoExtraAnalysis)
692	Result = false;
693	else
694	return false;
695	}
696
697	return Result;
698	}
699
700	void LoopVectorizationLegality::addInductionPhi(
701	PHINode Phi, const* InductionDescriptor &ID,
702	SmallPtrSetImpl<Value *> &AllowedExit) {
703	Inductions [Phi] = ID;
704
705	// In case this induction also comes with casts that we know we can ignore
706	// in the vectorized loop body, record them here. All casts could be recorded
707	// here for ignoring, but suffices to record only the first (as it is the
708	// only one that may bw used outside the cast sequence).
709	ArrayRef<Instruction *> Casts = ID.getCastInsts();
710	if (!Casts.empty())
711	InductionCastsToIgnore.insert(Ptr: *Casts.begin());
712
713	Type *PhiTy = Phi->getType();
714	const DataLayout &DL = Phi->getDataLayout();
715
716	assert((PhiTy->isIntOrPtrTy() \|\| PhiTy->isFloatingPointTy()) &&
717	"Expected int, ptr, or FP induction phi type");
718
719	// Get the widest type.
720	if (PhiTy->isIntOrPtrTy()) {
721	if (!WidestIndTy)
722	WidestIndTy = getInductionIntegerTy(DL, Ty: PhiTy);
723	else
724	WidestIndTy = getWiderInductionTy(DL, Ty0: PhiTy, Ty1: WidestIndTy);
725	}
726
727	// Int inductions are special because we only allow one IV.
728	if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
729	ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
730	isa<Constant>(Val: ID.getStartValue()) &&
731	cast<Constant>(Val: ID.getStartValue())->isNullValue()) {
732
733	// Use the phi node with the widest type as induction. Use the last
734	// one if there are multiple (no good reason for doing this other
735	// than it is expedient). We've checked that it begins at zero and
736	// steps by one, so this is a canonical induction variable.
737	if (!PrimaryInduction \|\| PhiTy == WidestIndTy)
738	PrimaryInduction = Phi;
739	}
740
741	// Both the PHI node itself, and the "post-increment" value feeding
742	// back into the PHI node may have external users.
743	// We can allow those uses, except if the SCEVs we have for them rely
744	// on predicates that only hold within the loop, since allowing the exit
745	// currently means re-using this SCEV outside the loop (see PR33706 for more
746	// details).
747	if (PSE.getPredicate().isAlwaysTrue()) {
748	AllowedExit.insert(Ptr: Phi);
749	AllowedExit.insert(Ptr: Phi->getIncomingValueForBlock(BB: TheLoop->getLoopLatch()));
750	}
751
752	LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
753	}
754
755	bool LoopVectorizationLegality::setupOuterLoopInductions() {
756	BasicBlock *Header = TheLoop->getHeader();
757
758	// Returns true if a given Phi is a supported induction.
759	auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
760	InductionDescriptor ID;
761	if (InductionDescriptor::isInductionPHI(Phi: &Phi, L: TheLoop, PSE, D&: ID) &&
762	ID.getKind() == InductionDescriptor::IK_IntInduction) {
763	addInductionPhi(Phi: &Phi, ID, AllowedExit);
764	return true;
765	}
766	// Bail out for any Phi in the outer loop header that is not a supported
767	// induction.
768	LLVM_DEBUG(
769	dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
770	return false;
771	};
772
773	return llvm::all_of(Range: Header->phis(), P: IsSupportedPhi);
774	}
775
776	/// Checks if a function is scalarizable according to the TLI, in
777	/// the sense that it should be vectorized and then expanded in
778	/// multiple scalar calls. This is represented in the
779	/// TLI via mappings that do not specify a vector name, as in the
780	/// following example:
781	///
782	/// const VecDesc VecIntrinsics[] = {
783	/// {"llvm.phx.abs.i32", "", 4}
784	/// };
785	static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
786	const StringRef ScalarName = CI.getCalledFunction()->getName();
787	bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName);
788	// Check that all known VFs are not associated to a vector
789	// function, i.e. the vector name is emty.
790	if (Scalarize) {
791	ElementCount WidestFixedVF, WidestScalableVF;
792	TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF);
793	for (ElementCount VF = ElementCount::getFixed(MinVal: `2`);
794	ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= `2`)
795	Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
796	for (ElementCount VF = ElementCount::getScalable(MinVal: `1`);
797	ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= `2`)
798	Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
799	assert((WidestScalableVF.isZero() \|\| !Scalarize) &&
800	"Caller may decide to scalarize a variant using a scalable VF");
801	}
802	return Scalarize;
803	}
804
805	bool LoopVectorizationLegality::canVectorizeInstrs() {
806	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
807	bool Result = true;
808
809	// For each block in the loop.
810	for (BasicBlock *BB : TheLoop->blocks()) {
811	// Scan the instructions in the block and look for hazards.
812	for (Instruction &I : *BB) {
813	Result &= canVectorizeInstr(I);
814	if (!DoExtraAnalysis && !Result)
815	return false;
816	}
817	}
818
819	if (!PrimaryInduction) {
820	if (Inductions.empty()) {
821	reportVectorizationFailure(
822	DebugMsg: "Did not find one integer induction var",
823	OREMsg: "loop induction variable could not be identified",
824	ORETag: "NoInductionVariable", ORE, TheLoop);
825	return false;
826	}
827	if (!WidestIndTy) {
828	reportVectorizationFailure(
829	DebugMsg: "Did not find one integer induction var",
830	OREMsg: "integer loop induction variable could not be identified",
831	ORETag: "NoIntegerInductionVariable", ORE, TheLoop);
832	return false;
833	}
834	LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
835	}
836
837	// Now we know the widest induction type, check if our found induction
838	// is the same size. If it's not, unset it here and InnerLoopVectorizer
839	// will create another.
840	if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
841	PrimaryInduction = nullptr;
842
843	return Result;
844	}
845
846	bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
847	BasicBlock *BB = I.getParent();
848	BasicBlock *Header = TheLoop->getHeader();
849
850	if (auto *Phi = dyn_cast<PHINode>(Val: &I)) {
851	Type *PhiTy = Phi->getType();
852	// Check that this PHI type is allowed.
853	if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
854	!PhiTy->isPointerTy()) {
855	reportVectorizationFailure(
856	DebugMsg: "Found a non-int non-pointer PHI",
857	OREMsg: "loop control flow is not understood by vectorizer",
858	ORETag: "CFGNotUnderstood", ORE, TheLoop);
859	return false;
860	}
861
862	// If this PHINode is not in the header block, then we know that we
863	// can convert it to select during if-conversion. No need to check if
864	// the PHIs in this block are induction or reduction variables.
865	if (BB != Header) {
866	// Non-header phi nodes that have outside uses can be vectorized. Add
867	// them to the list of allowed exits.
868	// Unsafe cyclic dependencies with header phis are identified during
869	// legalization for reduction, induction and fixed order
870	// recurrences.
871	AllowedExit.insert(Ptr: &I);
872	return true;
873	}
874
875	// We only allow if-converted PHIs with exactly two incoming values.
876	if (Phi->getNumIncomingValues() != `2`) {
877	reportVectorizationFailure(
878	DebugMsg: "Found an invalid PHI",
879	OREMsg: "loop control flow is not understood by vectorizer",
880	ORETag: "CFGNotUnderstood", ORE, TheLoop, I: Phi);
881	return false;
882	}
883
884	RecurrenceDescriptor RedDes;
885	if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
886	SE: PSE.getSE())) {
887	Requirements->addExactFPMathInst(I: RedDes.getExactFPMathInst());
888	AllowedExit.insert(Ptr: RedDes.getLoopExitInstr());
889	Reductions [Phi] = std::move(RedDes);
890	assert((!RedDes.hasUsesOutsideReductionChain() \|\|
891	RecurrenceDescriptor::isMinMaxRecurrenceKind(
892	RedDes.getRecurrenceKind())) &&
893	"Only min/max recurrences are allowed to have multiple uses "
894	"currently");
895	return true;
896	}
897
898	// We prevent matching non-constant strided pointer IVS to preserve
899	// historical vectorizer behavior after a generalization of the
900	// IVDescriptor code. The intent is to remove this check, but we
901	// have to fix issues around code quality for such loops first.
902	auto IsDisallowedStridedPointerInduction =
903	[](const InductionDescriptor &ID) {
904	if (AllowStridedPointerIVs)
905	return false;
906	return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
907	ID.getConstIntStepValue() == nullptr;
908	};
909
910	// TODO: Instead of recording the AllowedExit, it would be good to
911	// record the complementary set: NotAllowedExit. These include (but may
912	// not be limited to):
913	// 1. Reduction phis as they represent the one-before-last value, which
914	// is not available when vectorized
915	// 2. Induction phis and increment when SCEV predicates cannot be used
916	// outside the loop - see addInductionPhi
917	// 3. Non-Phis with outside uses when SCEV predicates cannot be used
918	// outside the loop - see call to hasOutsideLoopUser in the non-phi
919	// handling below
920	// 4. FixedOrderRecurrence phis that can possibly be handled by
921	// extraction.
922	// By recording these, we can then reason about ways to vectorize each
923	// of these NotAllowedExit.
924	InductionDescriptor ID;
925	if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID) &&
926	!IsDisallowedStridedPointerInduction (ID)) {
927	addInductionPhi(Phi, ID, AllowedExit);
928	Requirements->addExactFPMathInst(I: ID.getExactFPMathInst());
929	return true;
930	}
931
932	if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
933	AllowedExit.insert(Ptr: Phi);
934	FixedOrderRecurrences.insert(Ptr: Phi);
935	return true;
936	}
937
938	// As a last resort, coerce the PHI to a AddRec expression
939	// and re-try classifying it a an induction PHI.
940	if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID, Assume: true) &&
941	!IsDisallowedStridedPointerInduction (ID)) {
942	addInductionPhi(Phi, ID, AllowedExit);
943	return true;
944	}
945
946	reportVectorizationFailure(DebugMsg: "Found an unidentified PHI",
947	OREMsg: "value that could not be identified as "
948	"reduction is used outside the loop",
949	ORETag: "NonReductionValueUsedOutsideLoop", ORE, TheLoop,
950	I: Phi);
951	return false;
952	} // end of PHI handling
953
954	// We handle calls that:
955	// Have a mapping to an IR intrinsic.*
956	// Have a vector version available.*
957	auto *CI = dyn_cast<CallInst>(Val: &I);
958
959	if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
960	!(CI->getCalledFunction() && TLI &&
961	(!VFDatabase::getMappings(CI: CI).empty() \|\| isTLIScalarize(TLI: TLI, CI: *CI)))) {
962	// If the call is a recognized math libary call, it is likely that
963	// we can vectorize it given loosened floating-point constraints.
964	LibFunc Func;
965	bool IsMathLibCall =
966	TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() &&
967	TLI->getLibFunc(funcName: CI->getCalledFunction()->getName(), F&: Func) &&
968	TLI->hasOptimizedCodeGen(F: Func);
969
970	if (IsMathLibCall) {
971	// TODO: Ideally, we should not use clang-specific language here,
972	// but it's hard to provide meaningful yet generic advice.
973	// Also, should this be guarded by allowExtraAnalysis() and/or be part
974	// of the returned info from isFunctionVectorizable()?
975	reportVectorizationFailure(
976	DebugMsg: "Found a non-intrinsic callsite",
977	OREMsg: "library call cannot be vectorized. "
978	"Try compiling with -fno-math-errno, -ffast-math, "
979	"or similar flags",
980	ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
981	} else {
982	reportVectorizationFailure(DebugMsg: "Found a non-intrinsic callsite",
983	OREMsg: "call instruction cannot be vectorized",
984	ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
985	}
986	return false;
987	}
988
989	// Some intrinsics have scalar arguments and should be same in order for
990	// them to be vectorized (i.e. loop invariant).
991	if (CI) {
992	auto *SE = PSE.getSE();
993	Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
994	for (unsigned Idx = `0`; Idx < CI->arg_size(); ++Idx)
995	if (isVectorIntrinsicWithScalarOpAtArg(ID: IntrinID, ScalarOpdIdx: Idx, TTI)) {
996	if (!SE->isLoopInvariant(S: PSE.getSCEV(V: CI->getOperand(i_nocapture: Idx)), L: TheLoop)) {
997	reportVectorizationFailure(
998	DebugMsg: "Found unvectorizable intrinsic",
999	OREMsg: "intrinsic instruction cannot be vectorized",
1000	ORETag: "CantVectorizeIntrinsic", ORE, TheLoop, I: CI);
1001	return false;
1002	}
1003	}
1004	}
1005
1006	// If we found a vectorized variant of a function, note that so LV can
1007	// make better decisions about maximum VF.
1008	if (CI && !VFDatabase::getMappings(CI: *CI).empty())
1009	VecCallVariantsFound = true;
1010
1011	auto CanWidenInstructionTy = [](Instruction const &Inst) {
1012	Type *InstTy = Inst.getType();
1013	if (!isa<StructType>(Val: InstTy))
1014	return canVectorizeTy(Ty: InstTy);
1015
1016	// For now, we only recognize struct values returned from calls where
1017	// all users are extractvalue as vectorizable. All element types of the
1018	// struct must be types that can be widened.
1019	return isa<CallInst>(Val: Inst) && canVectorizeTy(Ty: InstTy) &&
1020	all_of(Range: Inst.users(), P: IsaPred<ExtractValueInst>);
1021	};
1022
1023	// Check that the instruction return type is vectorizable.
1024	// We can't vectorize casts from vector type to scalar type.
1025	// Also, we can't vectorize extractelement instructions.
1026	if (!CanWidenInstructionTy (I) \|\|
1027	(isa<CastInst>(Val: I) &&
1028	!VectorType::isValidElementType(ElemTy: I.getOperand(i: `0`)->getType())) \|\|
1029	isa<ExtractElementInst>(Val: I)) {
1030	reportVectorizationFailure(DebugMsg: "Found unvectorizable type",
1031	OREMsg: "instruction return type cannot be vectorized",
1032	ORETag: "CantVectorizeInstructionReturnType", ORE,
1033	TheLoop, I: &I);
1034	return false;
1035	}
1036
1037	// Check that the stored type is vectorizable.
1038	if (auto *ST = dyn_cast<StoreInst>(Val: &I)) {
1039	Type *T = ST->getValueOperand()->getType();
1040	if (!VectorType::isValidElementType(ElemTy: T)) {
1041	reportVectorizationFailure(DebugMsg: "Store instruction cannot be vectorized",
1042	ORETag: "CantVectorizeStore", ORE, TheLoop, I: ST);
1043	return false;
1044	}
1045
1046	// For nontemporal stores, check that a nontemporal vector version is
1047	// supported on the target.
1048	if (ST->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
1049	// Arbitrarily try a vector of 2 elements.
1050	auto VecTy = FixedVectorType::get(ElementType: T, /NumElts=/*`2`);
1051	assert(VecTy && "did not find vectorized version of stored type");
1052	if (!TTI->isLegalNTStore(DataType: VecTy, Alignment: ST->getAlign())) {
1053	reportVectorizationFailure(
1054	DebugMsg: "nontemporal store instruction cannot be vectorized",
1055	ORETag: "CantVectorizeNontemporalStore", ORE, TheLoop, I: ST);
1056	return false;
1057	}
1058	}
1059
1060	} else if (auto *LD = dyn_cast<LoadInst>(Val: &I)) {
1061	if (LD->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
1062	// For nontemporal loads, check that a nontemporal vector version is
1063	// supported on the target (arbitrarily try a vector of 2 elements).
1064	auto VecTy = FixedVectorType::get(ElementType: I.getType(), /NumElts=/*`2`);
1065	assert(VecTy && "did not find vectorized version of load type");
1066	if (!TTI->isLegalNTLoad(DataType: VecTy, Alignment: LD->getAlign())) {
1067	reportVectorizationFailure(
1068	DebugMsg: "nontemporal load instruction cannot be vectorized",
1069	ORETag: "CantVectorizeNontemporalLoad", ORE, TheLoop, I: LD);
1070	return false;
1071	}
1072	}
1073
1074	// FP instructions can allow unsafe algebra, thus vectorizable by
1075	// non-IEEE-754 compliant SIMD units.
1076	// This applies to floating-point math operations and calls, not memory
1077	// operations, shuffles, or casts, as they don't change precision or
1078	// semantics.
1079	} else if (I.getType()->isFloatingPointTy() && (CI \|\| I.isBinaryOp()) &&
1080	!I.isFast()) {
1081	LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
1082	Hints->setPotentiallyUnsafe();
1083	}
1084
1085	// Reduction instructions are allowed to have exit users.
1086	// All other instructions must not have external users.
1087	if (hasOutsideLoopUser(TheLoop, Inst: &I, AllowedExit)) {
1088	// We can safely vectorize loops where instructions within the loop are
1089	// used outside the loop only if the SCEV predicates within the loop is
1090	// same as outside the loop. Allowing the exit means reusing the SCEV
1091	// outside the loop.
1092	if (PSE.getPredicate().isAlwaysTrue()) {
1093	AllowedExit.insert(Ptr: &I);
1094	return true;
1095	}
1096	reportVectorizationFailure(DebugMsg: "Value cannot be used outside the loop",
1097	ORETag: "ValueUsedOutsideLoop", ORE, TheLoop, I: &I);
1098	return false;
1099	}
1100
1101	return true;
1102	}
1103
1104	/// Find histogram operations that match high-level code in loops:
1105	/// \code
1106	/// buckets[indices[i]]+=step;
1107	/// \endcode
1108	///
1109	/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
1110	/// array the computed histogram. It uses a BinOp to sum all counts, storing
1111	/// them using a loop-variant index Load from the 'indices' input array.
1112	///
1113	/// On successful matches it updates the STATISTIC 'HistogramsDetected',
1114	/// regardless of hardware support. When there is support, it additionally
1115	/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
1116	/// used to update histogram in \p HistogramPtrs.
1117	static bool findHistogram(LoadInst LI, StoreInst HSt, Loop *TheLoop,
1118	const PredicatedScalarEvolution &PSE,
1119	SmallVectorImpl<HistogramInfo> &Histograms) {
1120
1121	// Store value must come from a Binary Operation.
1122	Instruction HPtrInstr = nullptr*;
1123	BinaryOperator HBinOp = nullptr*;
1124	if (!match(V: HSt, P: m_Store(ValueOp: m_BinOp(I&: HBinOp), PointerOp: m_Instruction(I&: HPtrInstr))))
1125	return false;
1126
1127	// BinOp must be an Add or a Sub modifying the bucket value by a
1128	// loop invariant amount.
1129	// FIXME: We assume the loop invariant term is on the RHS.
1130	// Fine for an immediate/constant, but maybe not a generic value?
1131	Value HIncVal = nullptr*;
1132	if (!match(V: HBinOp, P: m_Add(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))) &&
1133	!match(V: HBinOp, P: m_Sub(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))))
1134	return false;
1135
1136	// Make sure the increment value is loop invariant.
1137	if (!TheLoop->isLoopInvariant(V: HIncVal))
1138	return false;
1139
1140	// The address to store is calculated through a GEP Instruction.
1141	GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: HPtrInstr);
1142	if (!GEP)
1143	return false;
1144
1145	// Restrict address calculation to constant indices except for the last term.
1146	Value HIdx = nullptr*;
1147	for (Value *Index : GEP->indices()) {
1148	if (HIdx)
1149	return false;
1150	if (!isa<ConstantInt>(Val: Index))
1151	HIdx = Index;
1152	}
1153
1154	if (!HIdx)
1155	return false;
1156
1157	// Check that the index is calculated by loading from another array. Ignore
1158	// any extensions.
1159	// FIXME: Support indices from other sources than a linear load from memory?
1160	// We're currently trying to match an operation looping over an array
1161	// of indices, but there could be additional levels of indirection
1162	// in place, or possibly some additional calculation to form the index
1163	// from the loaded data.
1164	Value *VPtrVal;
1165	if (!match(V: HIdx, P: m_ZExtOrSExtOrSelf(Op: m_Load(Op: m_Value(V&: VPtrVal)))))
1166	return false;
1167
1168	// Make sure the index address varies in this loop, not an outer loop.
1169	const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: VPtrVal));
1170	if (!AR \|\| AR->getLoop() != TheLoop)
1171	return false;
1172
1173	// Ensure we'll have the same mask by checking that all parts of the histogram
1174	// (gather load, update, scatter store) are in the same block.
1175	LoadInst *IndexedLoad = cast<LoadInst>(Val: HBinOp->getOperand(i_nocapture: `0`));
1176	BasicBlock *LdBB = IndexedLoad->getParent();
1177	if (LdBB != HBinOp->getParent() \|\| LdBB != HSt->getParent())
1178	return false;
1179
1180	LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
1181
1182	// Store the operations that make up the histogram.
1183	Histograms.emplace_back(Args&: IndexedLoad, Args&: HBinOp, Args&: HSt);
1184	return true;
1185	}
1186
1187	bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
1188	// For now, we only support an IndirectUnsafe dependency that calculates
1189	// a histogram
1190	if (!EnableHistogramVectorization)
1191	return false;
1192
1193	// Find a single IndirectUnsafe dependency.
1194	const MemoryDepChecker::Dependence IUDep = nullptr*;
1195	const MemoryDepChecker &DepChecker = LAI->getDepChecker();
1196	const auto *Deps = DepChecker.getDependences();
1197	// If there were too many dependences, LAA abandons recording them. We can't
1198	// proceed safely if we don't know what the dependences are.
1199	if (!Deps)
1200	return false;
1201
1202	for (const MemoryDepChecker::Dependence &Dep : *Deps) {
1203	// Ignore dependencies that are either known to be safe or can be
1204	// checked at runtime.
1205	if (MemoryDepChecker::Dependence::isSafeForVectorization(Type: Dep.Type) !=
1206	MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
1207	continue;
1208
1209	// We're only interested in IndirectUnsafe dependencies here, where the
1210	// address might come from a load from memory. We also only want to handle
1211	// one such dependency, at least for now.
1212	if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe \|\| IUDep)
1213	return false;
1214
1215	IUDep = &Dep;
1216	}
1217	if (!IUDep)
1218	return false;
1219
1220	// For now only normal loads and stores are supported.
1221	LoadInst *LI = dyn_cast<LoadInst>(Val: IUDep->getSource(DepChecker));
1222	StoreInst *SI = dyn_cast<StoreInst>(Val: IUDep->getDestination(DepChecker));
1223
1224	if (!LI \|\| !SI)
1225	return false;
1226
1227	LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
1228	return findHistogram(LI, HSt: SI, TheLoop, PSE: LAI->getPSE(), Histograms);
1229	}
1230
1231	bool LoopVectorizationLegality::canVectorizeMemory() {
1232	LAI = &LAIs.getInfo(L&: *TheLoop);
1233	const OptimizationRemarkAnalysis *LAR = LAI->getReport();
1234	if (LAR) {
1235	ORE->emit(RemarkBuilder: [&]() {
1236	return OptimizationRemarkAnalysis (Hints->vectorizeAnalysisPassName(),
1237	"loop not vectorized: ", *LAR);
1238	});
1239	}
1240
1241	if (!LAI->canVectorizeMemory()) {
1242	if (hasUncountableExitWithSideEffects()) {
1243	reportVectorizationFailure(
1244	DebugMsg: "Cannot vectorize unsafe dependencies in uncountable exit loop with "
1245	"side effects",
1246	ORETag: "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
1247	TheLoop);
1248	return false;
1249	}
1250
1251	return canVectorizeIndirectUnsafeDependences();
1252	}
1253
1254	if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
1255	reportVectorizationFailure(DebugMsg: "We don't allow storing to uniform addresses",
1256	OREMsg: "write to a loop invariant address could not "
1257	"be vectorized",
1258	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE,
1259	TheLoop);
1260	return false;
1261	}
1262
1263	// We can vectorize stores to invariant address when final reduction value is
1264	// guaranteed to be stored at the end of the loop. Also, if decision to
1265	// vectorize loop is made, runtime checks are added so as to make sure that
1266	// invariant address won't alias with any other objects.
1267	if (!LAI->getStoresToInvariantAddresses().empty()) {
1268	// For each invariant address, check if last stored value is unconditional
1269	// and the address is not calculated inside the loop.
1270	for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1271	if (!isInvariantStoreOfReduction(SI))
1272	continue;
1273
1274	if (blockNeedsPredication(BB: SI->getParent())) {
1275	reportVectorizationFailure(
1276	DebugMsg: "We don't allow storing to uniform addresses",
1277	OREMsg: "write of conditional recurring variant value to a loop "
1278	"invariant address could not be vectorized",
1279	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1280	return false;
1281	}
1282
1283	// Invariant address should be defined outside of loop. LICM pass usually
1284	// makes sure it happens, but in rare cases it does not, we do not want
1285	// to overcomplicate vectorization to support this case.
1286	if (Instruction *Ptr = dyn_cast<Instruction>(Val: SI->getPointerOperand())) {
1287	if (TheLoop->contains(Inst: Ptr)) {
1288	reportVectorizationFailure(
1289	DebugMsg: "Invariant address is calculated inside the loop",
1290	OREMsg: "write to a loop invariant address could not "
1291	"be vectorized",
1292	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1293	return false;
1294	}
1295	}
1296	}
1297
1298	if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
1299	// For each invariant address, check its last stored value is the result
1300	// of one of our reductions.
1301	//
1302	// We do not check if dependence with loads exists because that is already
1303	// checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
1304	ScalarEvolution *SE = PSE.getSE();
1305	SmallVector<StoreInst *, `4`> UnhandledStores;
1306	for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1307	if (isInvariantStoreOfReduction(SI)) {
1308	// Earlier stores to this address are effectively deadcode.
1309	// With opaque pointers it is possible for one pointer to be used with
1310	// different sizes of stored values:
1311	// store i32 0, ptr %x
1312	// store i8 0, ptr %x
1313	// The latest store doesn't complitely overwrite the first one in the
1314	// example. That is why we have to make sure that types of stored
1315	// values are same.
1316	// TODO: Check that bitwidth of unhandled store is smaller then the
1317	// one that overwrites it and add a test.
1318	erase_if(C&: UnhandledStores, P: [SE, SI](StoreInst *I) {
1319	return storeToSameAddress(SE, A: SI, B: I) &&
1320	I->getValueOperand()->getType() ==
1321	SI->getValueOperand()->getType();
1322	});
1323	continue;
1324	}
1325	UnhandledStores.push_back(Elt: SI);
1326	}
1327
1328	bool IsOK = UnhandledStores.empty();
1329	// TODO: we should also validate against InvariantMemSets.
1330	if (!IsOK) {
1331	reportVectorizationFailure(
1332	DebugMsg: "We don't allow storing to uniform addresses",
1333	OREMsg: "write to a loop invariant address could not "
1334	"be vectorized",
1335	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1336	return false;
1337	}
1338	}
1339	}
1340
1341	PSE.addPredicate(Pred: LAI->getPSE().getPredicate());
1342	return true;
1343	}
1344
1345	bool LoopVectorizationLegality::canVectorizeFPMath(
1346	bool EnableStrictReductions) {
1347
1348	// First check if there is any ExactFP math or if we allow reassociations
1349	if (!Requirements->getExactFPInst() \|\| Hints->allowReordering())
1350	return true;
1351
1352	// If the above is false, we have ExactFPMath & do not allow reordering.
1353	// If the EnableStrictReductions flag is set, first check if we have any
1354	// Exact FP induction vars, which we cannot vectorize.
1355	if (!EnableStrictReductions \|\|
1356	any_of(Range: getInductionVars(), P: [&](auto &Induction) -> bool {
1357	InductionDescriptor IndDesc = Induction.second;
1358	return IndDesc.getExactFPMathInst();
1359	}))
1360	return false;
1361
1362	// We can now only vectorize if all reductions with Exact FP math also
1363	// have the isOrdered flag set, which indicates that we can move the
1364	// reduction operations in-loop.
1365	return (all_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1366	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1367	return !RdxDesc.hasExactFPMath() \|\| RdxDesc.isOrdered();
1368	}));
1369	}
1370
1371	bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1372	return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1373	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1374	return RdxDesc.IntermediateStore == SI;
1375	});
1376	}
1377
1378	bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1379	return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1380	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1381	if (!RdxDesc.IntermediateStore)
1382	return false;
1383
1384	ScalarEvolution *SE = PSE.getSE();
1385	Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1386	return V == InvariantAddress \|\|
1387	SE->getSCEV(V) == SE->getSCEV(V: InvariantAddress);
1388	});
1389	}
1390
1391	bool LoopVectorizationLegality::isInductionPhi(const Value V) const* {
1392	Value In0 = const_cast<Value >(V);
1393	PHINode *PN = dyn_cast_or_null<PHINode>(Val: In0);
1394	if (!PN)
1395	return false;
1396
1397	return Inductions.count(Key: PN);
1398	}
1399
1400	const InductionDescriptor *
1401	LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode Phi) const* {
1402	if (!isInductionPhi(V: Phi))
1403	return nullptr;
1404	auto &ID = getInductionVars().find(Key: Phi)->second;
1405	if (ID.getKind() == InductionDescriptor::IK_IntInduction \|\|
1406	ID.getKind() == InductionDescriptor::IK_FpInduction)
1407	return &ID;
1408	return nullptr;
1409	}
1410
1411	const InductionDescriptor *
1412	LoopVectorizationLegality::getPointerInductionDescriptor(PHINode Phi) const* {
1413	if (!isInductionPhi(V: Phi))
1414	return nullptr;
1415	auto &ID = getInductionVars().find(Key: Phi)->second;
1416	if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
1417	return &ID;
1418	return nullptr;
1419	}
1420
1421	bool LoopVectorizationLegality::isCastedInductionVariable(
1422	const Value V) const* {
1423	auto *Inst = dyn_cast<Instruction>(Val: V);
1424	return (Inst && InductionCastsToIgnore.count(Ptr: Inst));
1425	}
1426
1427	bool LoopVectorizationLegality::isInductionVariable(const Value V) const* {
1428	return isInductionPhi(V) \|\| isCastedInductionVariable(V);
1429	}
1430
1431	bool LoopVectorizationLegality::isFixedOrderRecurrence(
1432	const PHINode Phi) const* {
1433	return FixedOrderRecurrences.count(Ptr: Phi);
1434	}
1435
1436	bool LoopVectorizationLegality::blockNeedsPredication(
1437	const BasicBlock BB) const* {
1438	// When vectorizing early exits, create predicates for the latch block only.
1439	// For a single early exit, it must be a direct predecessor of the latch.
1440	// For multiple early exits, they form a chain where each exiting block
1441	// dominates all subsequent blocks up to the latch.
1442	BasicBlock *Latch = TheLoop->getLoopLatch();
1443	if (hasUncountableEarlyExit())
1444	return BB == Latch;
1445	return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1446	}
1447
1448	bool LoopVectorizationLegality::blockCanBePredicated(
1449	BasicBlock BB, SmallPtrSetImpl<Value > &SafePtrs,
1450	SmallPtrSetImpl<const Instruction > &MaskedOp) const* {
1451	for (Instruction &I : *BB) {
1452	// We can predicate blocks with calls to assume, as long as we drop them in
1453	// case we flatten the CFG via predication.
1454	if (match(V: &I, P: m_Intrinsic<Intrinsic::assume>())) {
1455	MaskedOp.insert(Ptr: &I);
1456	continue;
1457	}
1458
1459	// Do not let llvm.experimental.noalias.scope.decl block the vectorization.
1460	// TODO: there might be cases that it should block the vectorization. Let's
1461	// ignore those for now.
1462	if (isa<NoAliasScopeDeclInst>(Val: &I))
1463	continue;
1464
1465	// We can allow masked calls if there's at least one vector variant, even
1466	// if we end up scalarizing due to the cost model calculations.
1467	// TODO: Allow other calls if they have appropriate attributes... readonly
1468	// and argmemonly?
1469	if (CallInst *CI = dyn_cast<CallInst>(Val: &I))
1470	if (VFDatabase::hasMaskedVariant(CI: *CI)) {
1471	MaskedOp.insert(Ptr: CI);
1472	continue;
1473	}
1474
1475	// Loads are handled via masking (or speculated if safe to do so.)
1476	if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1477	if (!SafePtrs.count(Ptr: LI->getPointerOperand()))
1478	MaskedOp.insert(Ptr: LI);
1479	continue;
1480	}
1481
1482	// Predicated store requires some form of masking:
1483	// 1) masked store HW instruction,
1484	// 2) emulation via load-blend-store (only if safe and legal to do so,
1485	// be aware on the race conditions), or
1486	// 3) element-by-element predicate check and scalar store.
1487	if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1488	MaskedOp.insert(Ptr: SI);
1489	continue;
1490	}
1491
1492	if (I.mayReadFromMemory() \|\| I.mayWriteToMemory() \|\| I.mayThrow())
1493	return false;
1494	}
1495
1496	return true;
1497	}
1498
1499	bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1500	if (!EnableIfConversion) {
1501	reportVectorizationFailure(DebugMsg: "If-conversion is disabled",
1502	ORETag: "IfConversionDisabled", ORE, TheLoop);
1503	return false;
1504	}
1505
1506	assert(TheLoop->getNumBlocks() > `1` && "Single block loops are vectorizable");
1507
1508	// A list of pointers which are known to be dereferenceable within scope of
1509	// the loop body for each iteration of the loop which executes. That is,
1510	// the memory pointed to can be dereferenced (with the access size implied by
1511	// the value's type) unconditionally within the loop header without
1512	// introducing a new fault.
1513	SmallPtrSet<Value *, `8`> SafePointers;
1514
1515	// Collect safe addresses.
1516	for (BasicBlock *BB : TheLoop->blocks()) {
1517	if (!blockNeedsPredication(BB)) {
1518	for (Instruction &I : *BB)
1519	if (auto *Ptr = getLoadStorePointerOperand(V: &I))
1520	SafePointers.insert(Ptr);
1521	continue;
1522	}
1523
1524	// For a block which requires predication, a address may be safe to access
1525	// in the loop w/o predication if we can prove dereferenceability facts
1526	// sufficient to ensure it'll never fault within the loop. For the moment,
1527	// we restrict this to loads; stores are more complicated due to
1528	// concurrency restrictions.
1529	ScalarEvolution &SE = *PSE.getSE();
1530	SmallVector<const SCEVPredicate *, `4`> Predicates;
1531	for (Instruction &I : *BB) {
1532	LoadInst *LI = dyn_cast<LoadInst>(Val: &I);
1533
1534	// Make sure we can execute all computations feeding into Ptr in the loop
1535	// w/o triggering UB and that none of the out-of-loop operands are poison.
1536	// We do not need to check if operations inside the loop can produce
1537	// poison due to flags (e.g. due to an inbounds GEP going out of bounds),
1538	// because flags will be dropped when executing them unconditionally.
1539	// TODO: Results could be improved by considering poison-propagation
1540	// properties of visited ops.
1541	auto CanSpeculatePointerOp = [this](Value *Ptr) {
1542	SmallVector<Value *> Worklist = {Ptr};
1543	SmallPtrSet<Value *, `4`> Visited;
1544	while (!Worklist.empty()) {
1545	Value *CurrV = Worklist.pop_back_val();
1546	if (!Visited.insert(Ptr: CurrV).second)
1547	continue;
1548
1549	auto *CurrI = dyn_cast<Instruction>(Val: CurrV);
1550	if (!CurrI \|\| !TheLoop->contains(Inst: CurrI)) {
1551	// If operands from outside the loop may be poison then Ptr may also
1552	// be poison.
1553	if (!isGuaranteedNotToBePoison(V: CurrV, AC,
1554	CtxI: TheLoop->getLoopPredecessor()
1555	->getTerminator()
1556	->getIterator(),
1557	DT))
1558	return false;
1559	continue;
1560	}
1561
1562	// A loaded value may be poison, independent of any flags.
1563	if (isa<LoadInst>(Val: CurrI) && !isGuaranteedNotToBePoison(V: CurrV, AC))
1564	return false;
1565
1566	// For other ops, assume poison can only be introduced via flags,
1567	// which can be dropped.
1568	if (!isa<PHINode>(Val: CurrI) && !isSafeToSpeculativelyExecute(I: CurrI))
1569	return false;
1570	append_range(C&: Worklist, R: CurrI->operands());
1571	}
1572	return true;
1573	};
1574	// Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
1575	// that it will consider loops that need guarding by SCEV checks. The
1576	// vectoriser will generate these checks if we decide to vectorise.
1577	if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(LI: *LI) &&
1578	CanSpeculatePointerOp (LI->getPointerOperand()) &&
1579	isDereferenceableAndAlignedInLoop(LI, L: TheLoop, SE, DT&: *DT, AC,
1580	Predicates: &Predicates))
1581	SafePointers.insert(Ptr: LI->getPointerOperand());
1582	Predicates.clear();
1583	}
1584	}
1585
1586	// Collect the blocks that need predication.
1587	for (BasicBlock *BB : TheLoop->blocks()) {
1588	// We support only branches and switch statements as terminators inside the
1589	// loop.
1590	if (isa<SwitchInst>(Val: BB->getTerminator())) {
1591	if (TheLoop->isLoopExiting(BB)) {
1592	reportVectorizationFailure(DebugMsg: "Loop contains an unsupported switch",
1593	ORETag: "LoopContainsUnsupportedSwitch", ORE,
1594	TheLoop, I: BB->getTerminator());
1595	return false;
1596	}
1597	} else if (!isa<UncondBrInst, CondBrInst>(Val: BB->getTerminator())) {
1598	reportVectorizationFailure(DebugMsg: "Loop contains an unsupported terminator",
1599	ORETag: "LoopContainsUnsupportedTerminator", ORE,
1600	TheLoop, I: BB->getTerminator());
1601	return false;
1602	}
1603
1604	// We must be able to predicate all blocks that need to be predicated.
1605	if (blockNeedsPredication(BB) &&
1606	!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp)) {
1607	reportVectorizationFailure(
1608	DebugMsg: "Control flow cannot be substituted for a select", ORETag: "NoCFGForSelect",
1609	ORE, TheLoop, I: BB->getTerminator());
1610	return false;
1611	}
1612	}
1613
1614	// We can if-convert this loop.
1615	return true;
1616	}
1617
1618	// Helper function to canVectorizeLoopNestCFG.
1619	bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1620	bool UseVPlanNativePath) {
1621	assert((UseVPlanNativePath \|\| Lp->isInnermost()) &&
1622	"VPlan-native path is not enabled.");
1623
1624	// TODO: ORE should be improved to show more accurate information when an
1625	// outer loop can't be vectorized because a nested loop is not understood or
1626	// legal. Something like: "outer_loop_location: loop not vectorized:
1627	// (inner_loop_location) loop control flow is not understood by vectorizer".
1628
1629	// Store the result and return it at the end instead of exiting early, in case
1630	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1631	bool Result = true;
1632	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1633
1634	// We must have a loop in canonical form. Loops with indirectbr in them cannot
1635	// be canonicalized.
1636	if (!Lp->getLoopPreheader()) {
1637	reportVectorizationFailure(DebugMsg: "Loop doesn't have a legal pre-header",
1638	OREMsg: "loop control flow is not understood by vectorizer",
1639	ORETag: "CFGNotUnderstood", ORE, TheLoop);
1640	if (DoExtraAnalysis)
1641	Result = false;
1642	else
1643	return false;
1644	}
1645
1646	// We must have a single backedge.
1647	if (Lp->getNumBackEdges() != `1`) {
1648	reportVectorizationFailure(DebugMsg: "The loop must have a single backedge",
1649	OREMsg: "loop control flow is not understood by vectorizer",
1650	ORETag: "CFGNotUnderstood", ORE, TheLoop);
1651	if (DoExtraAnalysis)
1652	Result = false;
1653	else
1654	return false;
1655	}
1656
1657	// The latch must be terminated by a branch.
1658	BasicBlock *Latch = Lp->getLoopLatch();
1659	if (Latch && !isa<UncondBrInst, CondBrInst>(Val: Latch->getTerminator())) {
1660	reportVectorizationFailure(
1661	DebugMsg: "The loop latch terminator is not a UncondBrInst/CondBrInst",
1662	OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
1663	ORE, TheLoop);
1664	if (DoExtraAnalysis)
1665	Result = false;
1666	else
1667	return false;
1668	}
1669
1670	return Result;
1671	}
1672
1673	bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1674	Loop Lp, bool* UseVPlanNativePath) {
1675	// Store the result and return it at the end instead of exiting early, in case
1676	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1677	bool Result = true;
1678	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1679	if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1680	if (DoExtraAnalysis)
1681	Result = false;
1682	else
1683	return false;
1684	}
1685
1686	// Recursively check whether the loop control flow of nested loops is
1687	// understood.
1688	for (Loop SubLp : Lp)
1689	if (!canVectorizeLoopNestCFG(Lp: SubLp, UseVPlanNativePath)) {
1690	if (DoExtraAnalysis)
1691	Result = false;
1692	else
1693	return false;
1694	}
1695
1696	return Result;
1697	}
1698
1699	bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1700	BasicBlock *LatchBB = TheLoop->getLoopLatch();
1701	if (!LatchBB) {
1702	reportVectorizationFailure(DebugMsg: "Loop does not have a latch",
1703	OREMsg: "Cannot vectorize early exit loop",
1704	ORETag: "NoLatchEarlyExit", ORE, TheLoop);
1705	return false;
1706	}
1707
1708	if (Reductions.size() \|\| FixedOrderRecurrences.size()) {
1709	reportVectorizationFailure(
1710	DebugMsg: "Found reductions or recurrences in early-exit loop",
1711	OREMsg: "Cannot vectorize early exit loop with reductions or recurrences",
1712	ORETag: "RecurrencesInEarlyExitLoop", ORE, TheLoop);
1713	return false;
1714	}
1715
1716	SmallVector<BasicBlock *, `8`> ExitingBlocks;
1717	TheLoop->getExitingBlocks(ExitingBlocks);
1718
1719	// Keep a record of all the exiting blocks.
1720	SmallVector<const SCEVPredicate *, `4`> Predicates;
1721	SmallVector<BasicBlock *> UncountableExitingBlocks;
1722	for (BasicBlock *BB : ExitingBlocks) {
1723	const SCEV *EC =
1724	PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: BB, Predicates: &Predicates);
1725	if (isa<SCEVCouldNotCompute>(Val: EC)) {
1726	if (size(Range: successors(BB)) != `2`) {
1727	reportVectorizationFailure(
1728	DebugMsg: "Early exiting block does not have exactly two successors",
1729	OREMsg: "Incorrect number of successors from early exiting block",
1730	ORETag: "EarlyExitTooManySuccessors", ORE, TheLoop);
1731	return false;
1732	}
1733
1734	UncountableExitingBlocks.push_back(Elt: BB);
1735	} else
1736	CountableExitingBlocks.push_back(Elt: BB);
1737	}
1738	// We can safely ignore the predicates here because when vectorizing the loop
1739	// the PredicatatedScalarEvolution class will keep track of all predicates
1740	// for each exiting block anyway. This happens when calling
1741	// PSE.getSymbolicMaxBackedgeTakenCount() below.
1742	Predicates.clear();
1743
1744	if (UncountableExitingBlocks.empty()) {
1745	LLVM_DEBUG(dbgs() << "LV: Could not find any uncountable exits");
1746	return false;
1747	}
1748
1749	// The latch block must have a countable exit.
1750	if (isa<SCEVCouldNotCompute>(
1751	Val: PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: LatchBB, Predicates: &Predicates))) {
1752	reportVectorizationFailure(
1753	DebugMsg: "Cannot determine exact exit count for latch block",
1754	OREMsg: "Cannot vectorize early exit loop",
1755	ORETag: "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1756	return false;
1757	}
1758	assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
1759	"Latch block not found in list of countable exits!");
1760
1761	// Check to see if there are instructions that could potentially generate
1762	// exceptions or have side-effects.
1763	auto IsSafeOperation = [](Instruction I) -> bool* {
1764	switch (I->getOpcode()) {
1765	case Instruction::Load:
1766	case Instruction::Store:
1767	case Instruction::PHI:
1768	case Instruction::UncondBr:
1769	case Instruction::CondBr:
1770	// These are checked separately.
1771	return true;
1772	default:
1773	return isSafeToSpeculativelyExecute(I);
1774	}
1775	};
1776
1777	bool HasSideEffects = false;
1778	for (auto *BB : TheLoop->blocks())
1779	for (auto &I : *BB) {
1780	if (I.mayWriteToMemory()) {
1781	if (isa<StoreInst>(Val: &I) && cast<StoreInst>(Val: &I)->isSimple()) {
1782	HasSideEffects = true;
1783	continue;
1784	}
1785
1786	// We don't support complex writes to memory.
1787	reportVectorizationFailure(
1788	DebugMsg: "Complex writes to memory unsupported in early exit loops",
1789	OREMsg: "Cannot vectorize early exit loop with complex writes to memory",
1790	ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
1791	return false;
1792	}
1793
1794	if (!IsSafeOperation (&I)) {
1795	reportVectorizationFailure(DebugMsg: "Early exit loop contains operations that "
1796	"cannot be speculatively executed",
1797	ORETag: "UnsafeOperationsEarlyExitLoop", ORE,
1798	TheLoop);
1799	return false;
1800	}
1801	}
1802
1803	SmallVector<LoadInst *, `4`> NonDerefLoads;
1804	// TODO: Handle loops that may fault.
1805	if (!HasSideEffects) {
1806	// Read-only loop.
1807	Predicates.clear();
1808	if (!isReadOnlyLoop(L: TheLoop, SE: PSE.getSE(), DT, AC, NonDereferenceableAndAlignedLoads&: NonDerefLoads,
1809	Predicates: &Predicates)) {
1810	reportVectorizationFailure(
1811	DebugMsg: "Loop may fault", OREMsg: "Cannot vectorize non-read-only early exit loop",
1812	ORETag: "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1813	return false;
1814	}
1815	} else {
1816	// Check all uncountable exiting blocks for movable loads.
1817	for (BasicBlock *ExitingBB : UncountableExitingBlocks) {
1818	if (!canUncountableExitConditionLoadBeMoved(ExitingBlock: ExitingBB))
1819	return false;
1820	}
1821	}
1822
1823	// Check non-dereferenceable loads if any.
1824	for (LoadInst *LI : NonDerefLoads) {
1825	// Only support unit-stride access for now.
1826	int Stride = isConsecutivePtr(AccessTy: LI->getType(), Ptr: LI->getPointerOperand());
1827	if (Stride != `1`) {
1828	reportVectorizationFailure(
1829	DebugMsg: "Loop contains potentially faulting strided load",
1830	OREMsg: "Cannot vectorize early exit loop with "
1831	"strided fault-only-first load",
1832	ORETag: "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
1833	return false;
1834	}
1835	PotentiallyFaultingLoads.insert(Ptr: LI);
1836	LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI
1837	<< "\n");
1838	}
1839
1840	[[maybe_unused]] const SCEV *SymbolicMaxBTC =
1841	PSE.getSymbolicMaxBackedgeTakenCount();
1842	// Since we have an exact exit count for the latch and the early exit
1843	// dominates the latch, then this should guarantee a computed SCEV value.
1844	assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1845	"Failed to get symbolic expression for backedge taken count");
1846	LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
1847	"backedge taken count: "
1848	<< *SymbolicMaxBTC << `'\n'`);
1849	HasUncountableEarlyExit = true;
1850	UncountableExitWithSideEffects = HasSideEffects;
1851	return true;
1852	}
1853
1854	bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
1855	BasicBlock *ExitingBlock) {
1856	// Try to find a load in the critical path for the uncountable exit condition.
1857	// This is currently matching about the simplest form we can, expecting
1858	// only one in-loop load, the result of which is directly compared against
1859	// a loop-invariant value.
1860	// FIXME: We're insisting on a single use for now, because otherwise we will
1861	// need to make PHI nodes for other users. That can be done once the initial
1862	// transform code lands.
1863	auto *Br = cast<CondBrInst>(Val: ExitingBlock->getTerminator());
1864
1865	using namespace llvm::PatternMatch;
1866	Instruction L = nullptr*;
1867	Value Ptr = nullptr*;
1868	Value R = nullptr*;
1869	if (!match(V: Br->getCondition(),
1870	P: m_OneUse(SubPattern: m_ICmp(L: m_OneUse(SubPattern: m_Instruction(I&: L, Match: m_Load(Op: m_Value(V&: Ptr)))),
1871	R: m_Value(V&: R))))) {
1872	reportVectorizationFailure(
1873	DebugMsg: "Early exit loop with store but no supported condition load",
1874	ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1875	return false;
1876	}
1877
1878	// FIXME: Don't rely on operand ordering for the comparison.
1879	if (!TheLoop->isLoopInvariant(V: R)) {
1880	reportVectorizationFailure(
1881	DebugMsg: "Early exit loop with store but no supported condition load",
1882	ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1883	return false;
1884	}
1885
1886	// Make sure that the load address is not loop invariant; we want an
1887	// address calculation that we can rotate to the next vector iteration.
1888	const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: Ptr));
1889	if (!AR \|\| AR->getLoop() != TheLoop \|\| !AR->isAffine()) {
1890	reportVectorizationFailure(
1891	DebugMsg: "Uncountable exit condition depends on load with an address that is "
1892	"not an add recurrence in the loop",
1893	ORETag: "EarlyExitLoadInvariantAddress", ORE, TheLoop);
1894	return false;
1895	}
1896
1897	// FIXME: Support gathers after first-faulting load support lands.
1898	SmallVector<const SCEVPredicate *, `4`> Predicates;
1899	LoadInst *Load = cast<LoadInst>(Val: L);
1900	if (!isDereferenceableAndAlignedInLoop(LI: Load, L: TheLoop, SE&: PSE.getSE(), DT&: DT, AC,
1901	Predicates: &Predicates)) {
1902	reportVectorizationFailure(
1903	DebugMsg: "Loop may fault",
1904	OREMsg: "Cannot vectorize potentially faulting early exit loop",
1905	ORETag: "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1906	return false;
1907	}
1908
1909	ICFLoopSafetyInfo SafetyInfo;
1910	SafetyInfo.computeLoopSafetyInfo(CurLoop: TheLoop);
1911	// We need to know that load will be executed before we can hoist a
1912	// copy out to run just before the first iteration.
1913	if (!SafetyInfo.isGuaranteedToExecute(Inst: *Load, DT, CurLoop: TheLoop)) {
1914	reportVectorizationFailure(
1915	DebugMsg: "Load for uncountable exit not guaranteed to execute",
1916	ORETag: "ConditionalUncountableExitLoad", ORE, TheLoop);
1917	return false;
1918	}
1919
1920	// Prohibit any potential aliasing with any instruction in the loop which
1921	// might store to memory.
1922	// FIXME: Relax this constraint where possible.
1923	for (auto *BB : TheLoop->blocks()) {
1924	for (auto &I : *BB) {
1925	if (&I == Load)
1926	continue;
1927
1928	if (I.mayWriteToMemory()) {
1929	if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1930	AliasResult AR = AA->alias(V1: Ptr, V2: SI->getPointerOperand());
1931	if (AR == AliasResult::NoAlias)
1932	continue;
1933	}
1934
1935	reportVectorizationFailure(
1936	DebugMsg: "Cannot determine whether critical uncountable exit load address "
1937	"does not alias with a memory write",
1938	ORETag: "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
1939	return false;
1940	}
1941	}
1942	}
1943
1944	return true;
1945	}
1946
1947	bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1948	// Store the result and return it at the end instead of exiting early, in case
1949	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1950	bool Result = true;
1951
1952	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1953	// Check whether the loop-related control flow in the loop nest is expected by
1954	// vectorizer.
1955	if (!canVectorizeLoopNestCFG(Lp: TheLoop, UseVPlanNativePath)) {
1956	if (DoExtraAnalysis) {
1957	LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
1958	Result = false;
1959	} else {
1960	return false;
1961	}
1962	}
1963
1964	// We need to have a loop header.
1965	LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1966	<< `'\n'`);
1967
1968	// Specific checks for outer loops. We skip the remaining legal checks at this
1969	// point because they don't support outer loops.
1970	if (!TheLoop->isInnermost()) {
1971	assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1972
1973	if (!canVectorizeOuterLoop()) {
1974	reportVectorizationFailure(DebugMsg: "Unsupported outer loop",
1975	ORETag: "UnsupportedOuterLoop", ORE, TheLoop);
1976	// TODO: Implement DoExtraAnalysis when subsequent legal checks support
1977	// outer loops.
1978	return false;
1979	}
1980
1981	LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1982	return Result;
1983	}
1984
1985	assert(TheLoop->isInnermost() && "Inner loop expected.");
1986	// Check if we can if-convert non-single-bb loops.
1987	unsigned NumBlocks = TheLoop->getNumBlocks();
1988	if (NumBlocks != `1` && !canVectorizeWithIfConvert()) {
1989	LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1990	if (DoExtraAnalysis)
1991	Result = false;
1992	else
1993	return false;
1994	}
1995
1996	// Check if we can vectorize the instructions and CFG in this loop.
1997	if (!canVectorizeInstrs()) {
1998	LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1999	if (DoExtraAnalysis)
2000	Result = false;
2001	else
2002	return false;
2003	}
2004
2005	if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
2006	if (TheLoop->getExitingBlock()) {
2007	reportVectorizationFailure(DebugMsg: "Cannot vectorize uncountable loop",
2008	ORETag: "UnsupportedUncountableLoop", ORE, TheLoop);
2009	if (DoExtraAnalysis)
2010	Result = false;
2011	else
2012	return false;
2013	} else {
2014	if (!isVectorizableEarlyExitLoop()) {
2015	assert(!hasUncountableEarlyExit() &&
2016	!hasUncountableExitWithSideEffects() &&
2017	"Must be false without vectorizable early-exit loop");
2018	if (DoExtraAnalysis)
2019	Result = false;
2020	else
2021	return false;
2022	}
2023	}
2024	}
2025
2026	// Go over each instruction and look at memory deps.
2027	if (!canVectorizeMemory()) {
2028	LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
2029	if (DoExtraAnalysis)
2030	Result = false;
2031	else
2032	return false;
2033	}
2034
2035	// Bail out for state-changing loops with uncountable exits for now.
2036	if (UncountableExitWithSideEffects) {
2037	reportVectorizationFailure(
2038	DebugMsg: "Writes to memory unsupported in early exit loops",
2039	OREMsg: "Cannot vectorize early exit loop with writes to memory",
2040	ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
2041	return false;
2042	}
2043
2044	if (Result) {
2045	LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
2046	<< (LAI->getRuntimePointerChecking()->Need
2047	? " (with a runtime bound check)"
2048	: "")
2049	<< "!\n");
2050	}
2051
2052	unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
2053	if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
2054	SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
2055
2056	if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
2057	LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
2058	"due to SCEVThreshold");
2059	reportVectorizationFailure(DebugMsg: "Too many SCEV checks needed",
2060	OREMsg: "Too many SCEV assumptions need to be made and checked at runtime",
2061	ORETag: "TooManySCEVRunTimeChecks", ORE, TheLoop);
2062	if (DoExtraAnalysis)
2063	Result = false;
2064	else
2065	return false;
2066	}
2067
2068	// Okay! We've done all the tests. If any have failed, return false. Otherwise
2069	// we can vectorize, and at this point we don't have any other mem analysis
2070	// which may limit our maximum vectorization factor, so just return true with
2071	// no restrictions.
2072	return Result;
2073	}
2074
2075	bool LoopVectorizationLegality::canFoldTailByMasking() const {
2076	// The only loops we can vectorize without a scalar epilogue, are loops with
2077	// a bottom-test and a single exiting block. We'd have to handle the fact
2078	// that not every instruction executes on the last iteration. This will
2079	// require a lane mask which varies through the vector loop body. (TODO)
2080	if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
2081	LLVM_DEBUG(
2082	dbgs()
2083	<< "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
2084	return false;
2085	}
2086
2087	LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
2088
2089	SmallPtrSet<const Value *, `8`> ReductionLiveOuts;
2090
2091	for (const auto &Reduction : getReductionVars())
2092	ReductionLiveOuts.insert(Ptr: Reduction.second.getLoopExitInstr());
2093
2094	// The list of pointers that we can safely read and write to remains empty.
2095	SmallPtrSet<Value *, `8`> SafePointers;
2096
2097	// Check all blocks for predication, including those that ordinarily do not
2098	// need predication such as the header block.
2099	SmallPtrSet<const Instruction *, `8`> TmpMaskedOp;
2100	for (BasicBlock *BB : TheLoop->blocks()) {
2101	if (!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TmpMaskedOp)) {
2102	LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
2103	return false;
2104	}
2105	}
2106
2107	LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
2108
2109	return true;
2110	}
2111
2112	void LoopVectorizationLegality::prepareToFoldTailByMasking() {
2113	// The list of pointers that we can safely read and write to remains empty.
2114	SmallPtrSet<Value *, `8`> SafePointers;
2115
2116	// Mark all blocks for predication, including those that ordinarily do not
2117	// need predication such as the header block.
2118	for (BasicBlock *BB : TheLoop->blocks()) {
2119	[[maybe_unused]] bool R = blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp);
2120	assert(R && "Must be able to predicate block when tail-folding.");
2121	}
2122	}
2123
2124	} // namespace llvm
2125

Browse the source code of llvm_projects/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp