//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
    AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false),
                           cl::Hidden,
                           cl::desc("Enable recognition of non-constant strided "
                                    "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
}

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  // - Target default
  // - Metadata width
  // - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

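// After setAlreadyVectorized() runs, the loop ID carries metadata of roughly
// the following shape (illustrative IR, not taken from a specific test):
//
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.isvectorized", i32 1}
//
// with any pre-existing llvm.loop.vectorize.* and llvm.loop.interleave.*
// hints stripped, so later runs of the vectorizer leave the loop alone.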
void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";
    else {
      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
                                 TheLoop->getStartLoc(), TheLoop->getHeader());
      R << "loop not vectorized";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
          R << ", Vector Width=" << NV("VectorWidth", getWidth());
        if (getInterleave() != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
        R << ")";
      }
      return R;
    }
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if loop hints
  // that enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

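// As an illustrative example (hypothetical IR), getHintsFromMetadata() below
// consumes loop metadata of this shape:
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.interleave.count", i32 2}
//
// Each operand is forwarded to setHint(), which strips the "llvm.loop."
// prefix and records Width = 4 and Interleave = 2.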
void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
        Args.push_back(MD->getOperand(i));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width, &Interleave, &Force,
                   &IsVectorized, &Predicate, &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch, and
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is not
// the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
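//
// For example, in the following nest the inner loop is uniform with regard to
// the outer loop: its latch compares the canonical IV 'j' against 'N', which
// is invariant in the outer loop, so every vector lane of the outer loop runs
// the same inner iterations (illustrative C, not from a specific test):
//
//   for (int i = 0; i < M; ++i)   // OuterLp
//     for (int j = 0; j < N; ++j) // Lp: uniform w.r.t. OuterLp
//       A[i][j] = 0;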
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

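// Illustrative mapping (assuming a target with 64-bit pointers): ptr -> i64,
// i8 -> i32, i16 -> i32, i32 -> i32, i64 -> i64.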
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that char's or short's overflow when we ask for the loop's
  // trip count, work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit
  // users. All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or the same SCEV
/// for their addresses.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare store instructions.
  if (A == B)
    return true;

  // Otherwise compare pointers.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs.
  if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
    return true;

  return false;
}

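// Illustrative examples for isConsecutivePtr() (hypothetical accesses, with
// 'i' the loop induction variable): 'A[i]' has stride 1, 'A[n - i]' has
// stride -1, and a strided access such as 'A[2 * i]' is neither, so it
// yields 0.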
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  Function *F = TheLoop->getHeader()->getParent();
  bool OptForSize = F->hasOptSize() ||
                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
                                                PGSOQueryType::IRPass);
  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
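///
/// For example, with VF = 4 the canonical IV's AddRec {0,+,1} is rewritten to
/// {0,+,4} for lane 0 (StepMultiplier = 4, Offset = 0) and to {3,+,4} for
/// lane 3 (Offset = 3): the step is scaled by the VF and the start is
/// advanced by Offset * Step. A value is uniform if all lanes' rewritten
/// expressions fold to the same SCEV.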
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Set if any sub-expression is not analyzable w.r.t. uniformity.
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    auto *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain a UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use it to avoid
    /// re-writing UDIV-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

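// A "uniform mem op" is a load or store whose address is the same for every
// lane; e.g. (illustratively) an unpredicated 'sum += *p' with loop-invariant
// 'p' reads a single address per vector iteration rather than a gather's
// worth of addresses.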
bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure(
          "Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure(
          "Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure(
        "Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

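// For example (illustrative C), in 'for (int i = 0; i < n; ++i)' the phi for
// 'i' is an integer induction starting at zero with a step of one, so it is a
// candidate for the primary induction below; a pointer phi that advances
// through an array each iteration would instead be recorded as a pointer
// induction.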
void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    } else {
      // Bail out for any Phi in the outer loop header that is not a supported
      // induction.
      LLVM_DEBUG(
          dbgs()
          << "LV: Found unsupported PHI for outer loop vectorization.\n");
      return false;
    }
  };

  return llvm::all_of(Header->phis(), isSupportedPhi);
}

758 | |
759 | /// Checks if a function is scalarizable according to the TLI, in |
760 | /// the sense that it should be vectorized and then expanded in |
761 | /// multiple scalar calls. This is represented in the |
762 | /// TLI via mappings that do not specify a vector name, as in the |
763 | /// following example: |
764 | /// |
765 | /// const VecDesc VecIntrinsics[] = { |
766 | /// {"llvm.phx.abs.i32", "", 4} |
767 | /// }; |
768 | static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) { |
769 | const StringRef ScalarName = CI.getCalledFunction()->getName(); |
770 | bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName); |
771 | // Check that all known VFs are not associated to a vector |
772 | // function, i.e. the vector name is emty. |
773 | if (Scalarize) { |
774 | ElementCount WidestFixedVF, WidestScalableVF; |
775 | TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF); |
776 | for (ElementCount VF = ElementCount::getFixed(MinVal: 2); |
777 | ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= 2) |
778 | Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF); |
779 | for (ElementCount VF = ElementCount::getScalable(MinVal: 1); |
780 | ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= 2) |
781 | Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF); |
782 | assert((WidestScalableVF.isZero() || !Scalarize) && |
783 | "Caller may decide to scalarize a variant using a scalable VF" ); |
784 | } |
785 | return Scalarize; |
786 | } |
787 | |
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure(
              "Found a non-int non-pointer PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure(
              "Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto isDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        // is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        // outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        // outside the loop - see call to hasOutsideLoopUser in the non-phi
        // handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        // extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !isDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !isDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure(
            "Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be
          // part of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments, which must stay scalar (and
      // hence loop invariant) for the call to be vectorizable.
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)),
                                     TheLoop)) {
              reportVectorizationFailure(
                  "Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure(
            "Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    } else if (!WidestIndTy) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    } else {
      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    }
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return false;

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added to make
  // sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if the last stored value is
    // unconditional and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // An invariant address should be defined outside of the loop. The LICM
      // pass usually makes sure of that, but in rare cases it does not; we do
      // not want to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check that its last stored value is the
      // result of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is
      // already checked via
      // hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used
          // with different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in
          // the example. That is why we have to make sure that the types of
          // the stored values are the same.
          // TODO: Check that the bitwidth of an unhandled store is smaller
          // than the one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

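// For example (illustrative C), a floating-point sum such as
//   for (int i = 0; i < n; ++i) s += a[i];
// compiled without fast-math carries exact-FP reduction semantics: it can
// only be vectorized here if strict (in-loop, ordered) reductions are
// enabled, since reassociating the additions would change the result.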
bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}

bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return RdxDesc.IntermediateStore == SI;
  });
}

bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    if (!RdxDesc.IntermediateStore)
      return false;

    ScalarEvolution *SE = PSE.getSE();
    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
    return V == InvariantAddress ||
           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  });
}

bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

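// For example (illustrative C), in
//   for (int i = 0; i < n; ++i)
//     if (c[i]) a[i] = b[i];
// the conditional block can be predicated: the load of b[i] becomes a masked
// load (or is speculated if b[i] is known to be dereferenceable) and the
// store to a[i] becomes a masked store.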
bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases where it should block the vectorization.
    // Let's ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated, if it is safe to do so).
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // A predicated store requires some form of masking:
    // 1) a masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so;
    //    beware of the race conditions), or
    // 3) an element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}

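// Returns true if the multi-block loop body can be flattened into a single
// basic block via if-conversion: every block must end in a branch, and every
// block whose execution is conditional must be predicable given the set of
// pointers proven safe to access unconditionally.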
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A set of pointers known to be dereferenceable within the scope of the
  // loop body on every iteration of the loop that executes. That is, the
  // memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to
    // access in the loop w/o predication if we can prove dereferenceability
    // facts sufficient to ensure it'll never fault within the loop. For the
    // moment, we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      if (LI && !LI->getType()->isVectorTy() &&
          !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
        SafePointers.insert(LI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains a switch statement",
                                 "loop contains a switch statement",
                                 "LoopContainsSwitch", ORE, TheLoop,
                                 BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure(
        "Loop doesn't have a legal pre-header",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure(
        "The loop must have a single backedge",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

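// Checks the control flow of Lp itself and then recurses into every loop
// nested inside it.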
bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
    Loop *Lp, bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Recursively check whether the loop control flow of nested loops is
  // understood.
  for (Loop *SubLp : *Lp)
    if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

  return Result;
}

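// Top-level legality query. The checks run in order: loop-nest CFG,
// if-conversion of multi-block bodies, the instructions themselves, memory
// dependences, a computable trip count, and finally the budget of runtime
// SCEV checks.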
bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is
  // understood by the vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at
  // this point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    reportVectorizationFailure("could not determine number of loop iterations",
                               "could not determine number of loop iterations",
                               "CantComputeNumberOfIterations", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    reportVectorizationFailure(
        "Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false.
  // Otherwise we can vectorize, and at this point we don't have any other
  // memory analysis which may limit our maximum vectorization factor, so
  // just return true with no restrictions.
  return Result;
}

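// Folding the tail by masking lets the vector loop execute the final,
// partial iterations under a mask instead of falling back to a scalar
// epilogue. This is only legal if the loop has no live-outs other than
// reduction results and every block can be predicated.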
bool LoopVectorizationLegality::canFoldTailByMasking() const {
  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}

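// Called once the decision to fold the tail has been made. Unlike
// canFoldTailByMasking, which uses a throwaway set, this records the masked
// operations in the member MaskedOp so later stages know what to predicate.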
void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");
  }
}

} // namespace llvm