1//===- LoopVectorizationLegality.cpp --------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file provides loop vectorization legality analysis. Original code
10// resided in LoopVectorize.cpp for a long time.
11//
12// At this point, it is implemented as a utility class, not as an analysis
13// pass. It should be easy to create an analysis pass around it if there
14// is a need (but D45420 needs to happen first).
15//
16
17#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18#include "llvm/Analysis/AliasAnalysis.h"
19#include "llvm/Analysis/Loads.h"
20#include "llvm/Analysis/LoopInfo.h"
21#include "llvm/Analysis/MustExecute.h"
22#include "llvm/Analysis/OptimizationRemarkEmitter.h"
23#include "llvm/Analysis/ScalarEvolutionExpressions.h"
24#include "llvm/Analysis/TargetLibraryInfo.h"
25#include "llvm/Analysis/TargetTransformInfo.h"
26#include "llvm/Analysis/ValueTracking.h"
27#include "llvm/Analysis/VectorUtils.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/IntrinsicInst.h"
30#include "llvm/IR/PatternMatch.h"
31#include "llvm/Transforms/Utils/SizeOpts.h"
32#include "llvm/Transforms/Vectorize/LoopVectorize.h"
33
34using namespace llvm;
35using namespace PatternMatch;
36
37#define LV_NAME "loop-vectorize"
38#define DEBUG_TYPE LV_NAME
39
// File-local command-line knobs for legality checking. All are cl::Hidden,
// i.e. tuning/debugging options rather than user-facing flags.

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(Val: true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

// Allows pointer IVs whose stride is loop-invariant but not a compile-time
// constant to be recognized as inductions.
static cl::opt<bool>
AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(Val: false), cl::Hidden,
                       cl::desc("Enable recognition of non-constant strided "
                                "pointer induction variables."));

// When false, user hints (e.g. #pragma vectorize) do not grant permission to
// reorder floating-point operations; see allowReordering().
static cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(Val: true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(Val: 16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

// A more generous limit applies when the user explicitly requested
// vectorization via pragma.
static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(Val: 128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

// Global override for scalable vectorization; when not SK_Unspecified it
// takes precedence over loop metadata (see the LoopVectorizeHints ctor).
static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(Val: LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

static cl::opt<bool> EnableHistogramVectorization(
    "enable-histogram-loop-vectorization", cl::init(Val: false), cl::Hidden,
    cl::desc("Enables autovectorization of some loops containing histograms"));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;
89
90namespace llvm {
91
92bool LoopVectorizeHints::Hint::validate(unsigned Val) {
93 switch (Kind) {
94 case HK_WIDTH:
95 return isPowerOf2_32(Value: Val) && Val <= VectorizerParams::MaxVectorWidth;
96 case HK_INTERLEAVE:
97 return isPowerOf2_32(Value: Val) && Val <= MaxInterleaveFactor;
98 case HK_FORCE:
99 return (Val <= 1);
100 case HK_ISVECTORIZED:
101 case HK_PREDICATE:
102 case HK_SCALABLE:
103 return (Val == 0 || Val == 1);
104 }
105 return false;
106}
107
/// Collect vectorization hints for loop \p L from its metadata and resolve
/// them against command-line overrides and (optionally) target defaults.
/// Resolution order for the scalable-vectorization preference, from lowest to
/// highest priority: target default, metadata width, -scalable-vectorization
/// flag.
LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //   - Target default
  //   - Metadata width
  //   - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(MinVal: 1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}
163
/// Mark TheLoop as vectorized: attach llvm.loop.isvectorized metadata, strip
/// the now-consumed vectorize.* / interleave.* hints, and update the cached
/// IsVectorized hint so subsequent queries agree with the IR.
void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized"),
           ConstantAsMetadata::get(C: ConstantInt::get(Context, V: APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  // Drop the vectorize./interleave. prefixed hints so the transform is not
  // re-applied, and add the isvectorized marker.
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, OrigLoopID: LoopID,
                                     RemovePrefixes: {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     AddAttrs: {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}
182
/// Emit both a -debug trace line (\p DebugMsg) and a missed-optimization
/// remark (\p RemarkName / \p RemarkMsg) explaining why vectorization of
/// \p L was disallowed.
void LoopVectorizeHints::reportDisallowedVectorization(
    const StringRef DebugMsg, const StringRef RemarkName,
    const StringRef RemarkMsg, const Loop *L) const {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: " << DebugMsg << ".\n");
  ORE.emit(OptDiag: OptimizationRemarkMissed(LV_NAME, RemarkName, L->getStartLoc(),
                                           L->getHeader())
           << "loop not vectorized: " << RemarkMsg);
}
191
/// Returns true if the hints permit vectorizing \p L. Emits a remark
/// explaining the reason whenever the answer is false: explicit disablement,
/// forced-only mode without a force hint, or an already-vectorized loop.
bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    // Distinguish the two ways getForce() can report FK_Disabled so the
    // remark names the actual cause.
    if (Force.Value == LoopVectorizeHints::FK_Disabled) {
      reportDisallowedVectorization(DebugMsg: "#pragma vectorize disable",
                                    RemarkName: "MissedExplicitlyDisabled",
                                    RemarkMsg: "vectorization is explicitly disabled", L);
    } else if (hasDisableAllTransformsHint(L)) {
      reportDisallowedVectorization(DebugMsg: "loop hasDisableAllTransformsHint",
                                    RemarkName: "MissedTransformsDisabled",
                                    RemarkMsg: "loop transformations are disabled", L);
    } else {
      llvm_unreachable("loop vect disabled for an unknown reason");
    }
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    reportDisallowedVectorization(
        DebugMsg: "VectorizeOnlyWhenForced is set, and no #pragma vectorize enable",
        RemarkName: "MissedForceOnly", RemarkMsg: "only vectorizing loops that explicitly request it",
        L);
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit(RemarkBuilder: [&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}
235
/// Emit a missed-optimization remark for TheLoop, listing the user-provided
/// hints (force/width/interleave) that were in effect when vectorization did
/// not happen.
void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit(RemarkBuilder: [&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    // When vectorization was forced, append the explicit hint values so the
    // user can see what was requested.
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}
260
261const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
262 if (getWidth() == ElementCount::getFixed(MinVal: 1))
263 return LV_NAME;
264 if (getForce() == LoopVectorizeHints::FK_Disabled)
265 return LV_NAME;
266 if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
267 return LV_NAME;
268 return OptimizationRemarkAnalysis::AlwaysPrint;
269}
270
271bool LoopVectorizeHints::allowReordering() const {
272 // Allow the vectorizer to change the order of operations if enabling
273 // loop hints are provided
274 ElementCount EC = getWidth();
275 return HintsAllowReordering &&
276 (getForce() == LoopVectorizeHints::FK_Enabled ||
277 EC.getKnownMinValue() > 1);
278}
279
/// Walk TheLoop's llvm.loop metadata and record every recognized hint via
/// setHint(). Hints that do not carry exactly one argument are ignored.
void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(Val: MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
      // Remaining operands are the hint's arguments.
      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
        Args.push_back(Elt: MD->getOperand(I: Idx));
    } else {
      S = dyn_cast<MDString>(Val: MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Arg: Args[0]);
  }
}
315
/// Record a single metadata hint \p Name with integer argument \p Arg into
/// the matching Hint member. Hints without the expected prefix, non-integer
/// arguments, and out-of-range values are silently ignored (the last with a
/// -debug trace).
void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.consume_front(Prefix: Prefix()))
    return;

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(MD&: Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  // Match the hint name against each known hint and store the value if it
  // validates for that hint kind.
  Hint *Hints[] = {&Width, &Interleave, &Force,
                   &IsVectorized, &Predicate, &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}
337
// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch and,
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and its associated documentation
// together so that we can easily have a picture of the current supported loop
// nests. However, some of the current checks don't depend on \p OuterLp and
// would be redundantly executed for each \p Lp if we invoked this function for
// different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is not
// the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1. The loop must have a canonical IV (starts at 0, steps by 1).
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2. The latch must terminate in a conditional branch.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<CondBrInst>(Val: Latch->getTerminator());
  if (!LatchBr) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3. The latch condition must be a compare instruction.
  auto *LatchCmp = dyn_cast<CmpInst>(Val: LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  // The compare must be between the IV update value and something invariant
  // in OuterLp (in either operand order).
  Value *CondOp0 = LatchCmp->getOperand(i_nocapture: 0);
  Value *CondOp1 = LatchCmp->getOperand(i_nocapture: 1);
  Value *IVUpdate = IV->getIncomingValueForBlock(BB: Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}
403
404// Return true if \p Lp and all its nested loops are uniform with regard to \p
405// OuterLp.
406static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
407 if (!isUniformLoop(Lp, OuterLp))
408 return false;
409
410 // Check if nested loops are uniform.
411 for (Loop *SubLp : *Lp)
412 if (!isUniformLoopNest(Lp: SubLp, OuterLp))
413 return false;
414
415 return true;
416}
417
418static IntegerType *getInductionIntegerTy(const DataLayout &DL, Type *Ty) {
419 assert(Ty->isIntOrPtrTy() && "Expected integer or pointer type");
420
421 if (Ty->isPointerTy())
422 return DL.getIntPtrType(C&: Ty->getContext(), AddressSpace: Ty->getPointerAddressSpace());
423
424 // It is possible that char's or short's overflow when we ask for the loop's
425 // trip count, work around this by changing the type size.
426 if (Ty->getScalarSizeInBits() < 32)
427 return Type::getInt32Ty(C&: Ty->getContext());
428
429 return cast<IntegerType>(Val: Ty);
430}
431
432static IntegerType *getWiderInductionTy(const DataLayout &DL, Type *Ty0,
433 Type *Ty1) {
434 IntegerType *TyA = getInductionIntegerTy(DL, Ty: Ty0);
435 IntegerType *TyB = getInductionIntegerTy(DL, Ty: Ty1);
436 return TyA->getScalarSizeInBits() > TyB->getScalarSizeInBits() ? TyA : TyB;
437}
438
439/// Check that the instruction has outside loop users and is not an
440/// identified reduction variable.
441static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
442 SmallPtrSetImpl<Value *> &AllowedExit) {
443 // Reductions, Inductions and non-header phis are allowed to have exit users. All
444 // other instructions must not have external users.
445 if (!AllowedExit.count(Ptr: Inst))
446 // Check that all of the users of the loop are inside the BB.
447 for (User *U : Inst->users()) {
448 Instruction *UI = cast<Instruction>(Val: U);
449 // This user may be a reduction exit value.
450 if (!TheLoop->contains(Inst: UI)) {
451 LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
452 return true;
453 }
454 }
455 return false;
456}
457
458/// Returns true if A and B have same pointer operands or same SCEVs addresses
459static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
460 StoreInst *B) {
461 // Compare store
462 if (A == B)
463 return true;
464
465 // Otherwise Compare pointers
466 Value *APtr = A->getPointerOperand();
467 Value *BPtr = B->getPointerOperand();
468 if (APtr == BPtr)
469 return true;
470
471 // Otherwise compare address SCEVs
472 return SE->getSCEV(V: APtr) == SE->getSCEV(V: BPtr);
473}
474
/// Returns 1 if \p Ptr strides forward by one element per iteration, -1 if it
/// strides backward by one element, and 0 otherwise (non-unit or unknown
/// stride).
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  // Stride is absent (0) when getPtrStride cannot determine it.
  int Stride = getPtrStride(PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: Strides,
                            Assume: AllowRuntimeSCEVChecks, ShouldCheckWrap: false)
                   .value_or(u: 0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}
491
/// Returns true if \p V is invariant across the loop according to the
/// loop-access analysis.
bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}
495
namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity. This
/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
/// uniformity.
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecsFor.
  Loop *TheLoop;

  /// Is any sub-expressions not analyzable w.r.t. uniformity?
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  /// Rewrite {Start,+,Step} as {Start + Offset*Step,+,Step*StepMultiplier};
  /// marks the expression unanalyzable if Step is not loop-invariant.
  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    const SCEV *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(S: Step, L: TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    const SCEV *NewStep =
        SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: StepMultiplier));
    const SCEV *ScaledOffset = SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: Offset));
    const SCEV *NewStart =
        SE.getAddExpr(LHS: Expr->getStart(), RHS: SCEVUse(ScaledOffset));
    return SE.getAddRecExpr(Start: NewStart, Step: NewStep, L: TheLoop, Flags: SCEV::FlagAnyWrap);
  }

  /// Loop-invariant sub-expressions are returned unchanged; everything else
  /// is visited recursively (short-circuits once analysis has failed).
  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, L: TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, L: TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  /// Entry point: rewrite \p S for lane \p Offset with the given
  /// \p StepMultiplier, returning CouldNotCompute when the expression is not
  /// analyzable (or contains no UDiv, see below).
  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain an UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use it to avoid
    /// re-writing UDIV-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(Root: S,
                          Pred: [](const SCEV *S) { return isa<SCEVUDivExpr>(Val: S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace
587
/// Returns true if \p V takes the same value on every lane of a vector
/// iteration with factor \p VF. Invariant values are trivially uniform;
/// scalable VFs are conservatively treated as non-uniform (beyond
/// invariance); a scalar VF makes everything uniform.
bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(Ty: V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(Val: FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(Range: reverse(C: seq<unsigned>(Begin: 1, End: FixedVF)), P: [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}
620
/// Returns true if \p I is a load/store whose address is uniform across
/// lanes at \p VF and whose block does not need predication.
bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(V: &I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(V: Ptr, VF) && !blockNeedsPredication(BB: I.getParent());
}
632
/// Legality checks specific to outer-loop vectorization: supported
/// terminators and branch conditions, a uniform inner-loop nest, and
/// supported outer-loop inductions. When extra analysis is allowed, all
/// failures are reported before returning.
bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in case
  // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a branch. Any other terminator is
    // not supported yet.
    Instruction *Term = BB->getTerminator();
    if (!isa<UncondBrInst, CondBrInst>(Val: Term)) {
      reportVectorizationFailure(DebugMsg: "Unsupported basic block terminator",
          OREMsg: "loop control flow is not understood by vectorizer",
          ORETag: "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the branch is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    auto *Br = dyn_cast<CondBrInst>(Val: Term);
    if (Br && !TheLoop->isLoopInvariant(V: Br->getCondition()) &&
        !LI->isLoopHeader(BB: Br->getSuccessor(i: 0)) &&
        !LI->isLoopHeader(BB: Br->getSuccessor(i: 1))) {
      reportVectorizationFailure(DebugMsg: "Unsupported conditional branch",
          OREMsg: "loop control flow is not understood by vectorizer",
          ORETag: "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loops scenarios with uniform nested loops.
  if (!isUniformLoopNest(Lp: TheLoop /*loop nest*/,
                         OuterLp: TheLoop /*context outer loop*/)) {
    reportVectorizationFailure(DebugMsg: "Outer loop contains divergent loops",
        OREMsg: "loop control flow is not understood by vectorizer",
        ORETag: "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure(DebugMsg: "Unsupported outer loop Phi(s)",
                               ORETag: "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}
699
/// Record \p Phi as an induction described by \p ID: track its casts, widen
/// WidestIndTy, possibly promote it to the primary induction, and allow its
/// exit uses when the SCEV predicate is unconditional.
void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  ArrayRef<Instruction *> Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(Ptr: *Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  assert((PhiTy->isIntOrPtrTy() || PhiTy->isFloatingPointTy()) &&
         "Expected int, ptr, or FP induction phi type");

  // Get the widest type.
  if (PhiTy->isIntOrPtrTy()) {
    if (!WidestIndTy)
      WidestIndTy = getInductionIntegerTy(DL, Ty: PhiTy);
    else
      WidestIndTy = getWiderInductionTy(DL, Ty0: PhiTy, Ty1: WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(Val: ID.getStartValue()) &&
      cast<Constant>(Val: ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Ptr: Phi);
    AllowedExit.insert(Ptr: Phi->getIncomingValueForBlock(BB: TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}
754
/// For outer-loop vectorization, every header phi must be an integer
/// induction; record each one via addInductionPhi. Returns false (bailing
/// out) on the first unsupported phi.
bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(Phi: &Phi, L: TheLoop, PSE, D&: ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(Phi: &Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Range: Header->phis(), P: IsSupportedPhi);
}
775
/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
/// const VecDesc VecIntrinsics[] = {
///   {"llvm.phx.abs.i32", "", 4}
/// };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName);
  // Check that all known VFs are not associated to a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF);
    // Walk all power-of-two VFs up to the widest known fixed/scalable VF; a
    // single vectorizable VF disqualifies scalarization.
    for (ElementCount VF = ElementCount::getFixed(MinVal: 2);
         ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(MinVal: 1);
         ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}
804
/// Check every instruction in the loop for vectorization hazards via
/// canVectorizeInstr, then verify an integer induction variable was found.
/// With extra analysis enabled, all instructions are checked so multiple
/// failures can be reported; otherwise the first failure returns early.
bool LoopVectorizationLegality::canVectorizeInstrs() {
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  bool Result = true;

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      Result &= canVectorizeInstr(I);
      if (!DoExtraAnalysis && !Result)
        return false;
    }
  }

  // A loop without any identified induction cannot be vectorized; an
  // induction with no integer type is equally unusable.
  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure(
          DebugMsg: "Did not find one integer induction var",
          OREMsg: "loop induction variable could not be identified",
          ORETag: "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure(
          DebugMsg: "Did not find one integer induction var",
          OREMsg: "integer loop induction variable could not be identified",
          ORETag: "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return Result;
}
845
846bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
847 BasicBlock *BB = I.getParent();
848 BasicBlock *Header = TheLoop->getHeader();
849
850 if (auto *Phi = dyn_cast<PHINode>(Val: &I)) {
851 Type *PhiTy = Phi->getType();
852 // Check that this PHI type is allowed.
853 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
854 !PhiTy->isPointerTy()) {
855 reportVectorizationFailure(
856 DebugMsg: "Found a non-int non-pointer PHI",
857 OREMsg: "loop control flow is not understood by vectorizer",
858 ORETag: "CFGNotUnderstood", ORE, TheLoop);
859 return false;
860 }
861
862 // If this PHINode is not in the header block, then we know that we
863 // can convert it to select during if-conversion. No need to check if
864 // the PHIs in this block are induction or reduction variables.
865 if (BB != Header) {
866 // Non-header phi nodes that have outside uses can be vectorized. Add
867 // them to the list of allowed exits.
868 // Unsafe cyclic dependencies with header phis are identified during
869 // legalization for reduction, induction and fixed order
870 // recurrences.
871 AllowedExit.insert(Ptr: &I);
872 return true;
873 }
874
875 // We only allow if-converted PHIs with exactly two incoming values.
876 if (Phi->getNumIncomingValues() != 2) {
877 reportVectorizationFailure(
878 DebugMsg: "Found an invalid PHI",
879 OREMsg: "loop control flow is not understood by vectorizer",
880 ORETag: "CFGNotUnderstood", ORE, TheLoop, I: Phi);
881 return false;
882 }
883
884 RecurrenceDescriptor RedDes;
885 if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
886 SE: PSE.getSE())) {
887 Requirements->addExactFPMathInst(I: RedDes.getExactFPMathInst());
888 AllowedExit.insert(Ptr: RedDes.getLoopExitInstr());
889 Reductions[Phi] = std::move(RedDes);
890 assert((!RedDes.hasUsesOutsideReductionChain() ||
891 RecurrenceDescriptor::isMinMaxRecurrenceKind(
892 RedDes.getRecurrenceKind())) &&
893 "Only min/max recurrences are allowed to have multiple uses "
894 "currently");
895 return true;
896 }
897
898 // We prevent matching non-constant strided pointer IVS to preserve
899 // historical vectorizer behavior after a generalization of the
900 // IVDescriptor code. The intent is to remove this check, but we
901 // have to fix issues around code quality for such loops first.
902 auto IsDisallowedStridedPointerInduction =
903 [](const InductionDescriptor &ID) {
904 if (AllowStridedPointerIVs)
905 return false;
906 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
907 ID.getConstIntStepValue() == nullptr;
908 };
909
910 // TODO: Instead of recording the AllowedExit, it would be good to
911 // record the complementary set: NotAllowedExit. These include (but may
912 // not be limited to):
913 // 1. Reduction phis as they represent the one-before-last value, which
914 // is not available when vectorized
915 // 2. Induction phis and increment when SCEV predicates cannot be used
916 // outside the loop - see addInductionPhi
917 // 3. Non-Phis with outside uses when SCEV predicates cannot be used
918 // outside the loop - see call to hasOutsideLoopUser in the non-phi
919 // handling below
920 // 4. FixedOrderRecurrence phis that can possibly be handled by
921 // extraction.
922 // By recording these, we can then reason about ways to vectorize each
923 // of these NotAllowedExit.
924 InductionDescriptor ID;
925 if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID) &&
926 !IsDisallowedStridedPointerInduction(ID)) {
927 addInductionPhi(Phi, ID, AllowedExit);
928 Requirements->addExactFPMathInst(I: ID.getExactFPMathInst());
929 return true;
930 }
931
932 if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
933 AllowedExit.insert(Ptr: Phi);
934 FixedOrderRecurrences.insert(Ptr: Phi);
935 return true;
936 }
937
938 // As a last resort, coerce the PHI to a AddRec expression
939 // and re-try classifying it a an induction PHI.
940 if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID, Assume: true) &&
941 !IsDisallowedStridedPointerInduction(ID)) {
942 addInductionPhi(Phi, ID, AllowedExit);
943 return true;
944 }
945
946 reportVectorizationFailure(DebugMsg: "Found an unidentified PHI",
947 OREMsg: "value that could not be identified as "
948 "reduction is used outside the loop",
949 ORETag: "NonReductionValueUsedOutsideLoop", ORE, TheLoop,
950 I: Phi);
951 return false;
952 } // end of PHI handling
953
954 // We handle calls that:
955 // * Have a mapping to an IR intrinsic.
956 // * Have a vector version available.
957 auto *CI = dyn_cast<CallInst>(Val: &I);
958
959 if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
960 !(CI->getCalledFunction() && TLI &&
961 (!VFDatabase::getMappings(CI: *CI).empty() || isTLIScalarize(TLI: *TLI, CI: *CI)))) {
962 // If the call is a recognized math libary call, it is likely that
963 // we can vectorize it given loosened floating-point constraints.
964 LibFunc Func;
965 bool IsMathLibCall =
966 TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() &&
967 TLI->getLibFunc(funcName: CI->getCalledFunction()->getName(), F&: Func) &&
968 TLI->hasOptimizedCodeGen(F: Func);
969
970 if (IsMathLibCall) {
971 // TODO: Ideally, we should not use clang-specific language here,
972 // but it's hard to provide meaningful yet generic advice.
973 // Also, should this be guarded by allowExtraAnalysis() and/or be part
974 // of the returned info from isFunctionVectorizable()?
975 reportVectorizationFailure(
976 DebugMsg: "Found a non-intrinsic callsite",
977 OREMsg: "library call cannot be vectorized. "
978 "Try compiling with -fno-math-errno, -ffast-math, "
979 "or similar flags",
980 ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
981 } else {
982 reportVectorizationFailure(DebugMsg: "Found a non-intrinsic callsite",
983 OREMsg: "call instruction cannot be vectorized",
984 ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
985 }
986 return false;
987 }
988
989 // Some intrinsics have scalar arguments and should be same in order for
990 // them to be vectorized (i.e. loop invariant).
991 if (CI) {
992 auto *SE = PSE.getSE();
993 Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
994 for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
995 if (isVectorIntrinsicWithScalarOpAtArg(ID: IntrinID, ScalarOpdIdx: Idx, TTI)) {
996 if (!SE->isLoopInvariant(S: PSE.getSCEV(V: CI->getOperand(i_nocapture: Idx)), L: TheLoop)) {
997 reportVectorizationFailure(
998 DebugMsg: "Found unvectorizable intrinsic",
999 OREMsg: "intrinsic instruction cannot be vectorized",
1000 ORETag: "CantVectorizeIntrinsic", ORE, TheLoop, I: CI);
1001 return false;
1002 }
1003 }
1004 }
1005
1006 // If we found a vectorized variant of a function, note that so LV can
1007 // make better decisions about maximum VF.
1008 if (CI && !VFDatabase::getMappings(CI: *CI).empty())
1009 VecCallVariantsFound = true;
1010
1011 auto CanWidenInstructionTy = [](Instruction const &Inst) {
1012 Type *InstTy = Inst.getType();
1013 if (!isa<StructType>(Val: InstTy))
1014 return canVectorizeTy(Ty: InstTy);
1015
1016 // For now, we only recognize struct values returned from calls where
1017 // all users are extractvalue as vectorizable. All element types of the
1018 // struct must be types that can be widened.
1019 return isa<CallInst>(Val: Inst) && canVectorizeTy(Ty: InstTy) &&
1020 all_of(Range: Inst.users(), P: IsaPred<ExtractValueInst>);
1021 };
1022
1023 // Check that the instruction return type is vectorizable.
1024 // We can't vectorize casts from vector type to scalar type.
1025 // Also, we can't vectorize extractelement instructions.
1026 if (!CanWidenInstructionTy(I) ||
1027 (isa<CastInst>(Val: I) &&
1028 !VectorType::isValidElementType(ElemTy: I.getOperand(i: 0)->getType())) ||
1029 isa<ExtractElementInst>(Val: I)) {
1030 reportVectorizationFailure(DebugMsg: "Found unvectorizable type",
1031 OREMsg: "instruction return type cannot be vectorized",
1032 ORETag: "CantVectorizeInstructionReturnType", ORE,
1033 TheLoop, I: &I);
1034 return false;
1035 }
1036
1037 // Check that the stored type is vectorizable.
1038 if (auto *ST = dyn_cast<StoreInst>(Val: &I)) {
1039 Type *T = ST->getValueOperand()->getType();
1040 if (!VectorType::isValidElementType(ElemTy: T)) {
1041 reportVectorizationFailure(DebugMsg: "Store instruction cannot be vectorized",
1042 ORETag: "CantVectorizeStore", ORE, TheLoop, I: ST);
1043 return false;
1044 }
1045
1046 // For nontemporal stores, check that a nontemporal vector version is
1047 // supported on the target.
1048 if (ST->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
1049 // Arbitrarily try a vector of 2 elements.
1050 auto *VecTy = FixedVectorType::get(ElementType: T, /*NumElts=*/2);
1051 assert(VecTy && "did not find vectorized version of stored type");
1052 if (!TTI->isLegalNTStore(DataType: VecTy, Alignment: ST->getAlign())) {
1053 reportVectorizationFailure(
1054 DebugMsg: "nontemporal store instruction cannot be vectorized",
1055 ORETag: "CantVectorizeNontemporalStore", ORE, TheLoop, I: ST);
1056 return false;
1057 }
1058 }
1059
1060 } else if (auto *LD = dyn_cast<LoadInst>(Val: &I)) {
1061 if (LD->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
1062 // For nontemporal loads, check that a nontemporal vector version is
1063 // supported on the target (arbitrarily try a vector of 2 elements).
1064 auto *VecTy = FixedVectorType::get(ElementType: I.getType(), /*NumElts=*/2);
1065 assert(VecTy && "did not find vectorized version of load type");
1066 if (!TTI->isLegalNTLoad(DataType: VecTy, Alignment: LD->getAlign())) {
1067 reportVectorizationFailure(
1068 DebugMsg: "nontemporal load instruction cannot be vectorized",
1069 ORETag: "CantVectorizeNontemporalLoad", ORE, TheLoop, I: LD);
1070 return false;
1071 }
1072 }
1073
1074 // FP instructions can allow unsafe algebra, thus vectorizable by
1075 // non-IEEE-754 compliant SIMD units.
1076 // This applies to floating-point math operations and calls, not memory
1077 // operations, shuffles, or casts, as they don't change precision or
1078 // semantics.
1079 } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
1080 !I.isFast()) {
1081 LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
1082 Hints->setPotentiallyUnsafe();
1083 }
1084
1085 // Reduction instructions are allowed to have exit users.
1086 // All other instructions must not have external users.
1087 if (hasOutsideLoopUser(TheLoop, Inst: &I, AllowedExit)) {
1088 // We can safely vectorize loops where instructions within the loop are
1089 // used outside the loop only if the SCEV predicates within the loop is
1090 // same as outside the loop. Allowing the exit means reusing the SCEV
1091 // outside the loop.
1092 if (PSE.getPredicate().isAlwaysTrue()) {
1093 AllowedExit.insert(Ptr: &I);
1094 return true;
1095 }
1096 reportVectorizationFailure(DebugMsg: "Value cannot be used outside the loop",
1097 ORETag: "ValueUsedOutsideLoop", ORE, TheLoop, I: &I);
1098 return false;
1099 }
1100
1101 return true;
1102}
1103
/// Find histogram operations that match high-level code in loops:
/// \code
/// buckets[indices[i]]+=step;
/// \endcode
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses a BinOp to sum all counts, storing
/// them using a loop-variant index Load from the 'indices' input array.
///
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
/// regardless of hardware support. When there is support, it additionally
/// stores the BinOp/Load pairs in \p HistogramCounts, as well as the pointers
/// used to update the histogram in \p HistogramPtrs.
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                          const PredicatedScalarEvolution &PSE,
                          SmallVectorImpl<HistogramInfo> &Histograms) {

  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(V: HSt, P: m_Store(ValueOp: m_BinOp(I&: HBinOp), PointerOp: m_Instruction(I&: HPtrInstr))))
    return false;

  // BinOp must be an Add or a Sub modifying the bucket value by a
  // loop invariant amount.
  // FIXME: We assume the loop invariant term is on the RHS.
  //        Fine for an immediate/constant, but maybe not a generic value?
  Value *HIncVal = nullptr;
  if (!match(V: HBinOp, P: m_Add(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))) &&
      !match(V: HBinOp, P: m_Sub(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))))
    return false;

  // Make sure the increment value is loop invariant.
  if (!TheLoop->isLoopInvariant(V: HIncVal))
    return false;

  // The address to store is calculated through a GEP Instruction.
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: HPtrInstr);
  if (!GEP)
    return false;

  // Restrict address calculation to constant indices except for the last term.
  Value *HIdx = nullptr;
  for (Value *Index : GEP->indices()) {
    // More than one non-constant index is not supported.
    if (HIdx)
      return false;
    if (!isa<ConstantInt>(Val: Index))
      HIdx = Index;
  }

  if (!HIdx)
    return false;

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  //        We're currently trying to match an operation looping over an array
  //        of indices, but there could be additional levels of indirection
  //        in place, or possibly some additional calculation to form the index
  //        from the loaded data.
  Value *VPtrVal;
  if (!match(V: HIdx, P: m_ZExtOrSExtOrSelf(Op: m_Load(Op: m_Value(V&: VPtrVal)))))
    return false;

  // Make sure the index address varies in this loop, not an outer loop.
  const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: VPtrVal));
  if (!AR || AR->getLoop() != TheLoop)
    return false;

  // Ensure we'll have the same mask by checking that all parts of the histogram
  // (gather load, update, scatter store) are in the same block.
  LoadInst *IndexedLoad = cast<LoadInst>(Val: HBinOp->getOperand(i_nocapture: 0));
  BasicBlock *LdBB = IndexedLoad->getParent();
  if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");

  // Store the operations that make up the histogram.
  Histograms.emplace_back(Args&: IndexedLoad, Args&: HBinOp, Args&: HSt);
  return true;
}
1186
1187bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
1188 // For now, we only support an IndirectUnsafe dependency that calculates
1189 // a histogram
1190 if (!EnableHistogramVectorization)
1191 return false;
1192
1193 // Find a single IndirectUnsafe dependency.
1194 const MemoryDepChecker::Dependence *IUDep = nullptr;
1195 const MemoryDepChecker &DepChecker = LAI->getDepChecker();
1196 const auto *Deps = DepChecker.getDependences();
1197 // If there were too many dependences, LAA abandons recording them. We can't
1198 // proceed safely if we don't know what the dependences are.
1199 if (!Deps)
1200 return false;
1201
1202 for (const MemoryDepChecker::Dependence &Dep : *Deps) {
1203 // Ignore dependencies that are either known to be safe or can be
1204 // checked at runtime.
1205 if (MemoryDepChecker::Dependence::isSafeForVectorization(Type: Dep.Type) !=
1206 MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
1207 continue;
1208
1209 // We're only interested in IndirectUnsafe dependencies here, where the
1210 // address might come from a load from memory. We also only want to handle
1211 // one such dependency, at least for now.
1212 if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
1213 return false;
1214
1215 IUDep = &Dep;
1216 }
1217 if (!IUDep)
1218 return false;
1219
1220 // For now only normal loads and stores are supported.
1221 LoadInst *LI = dyn_cast<LoadInst>(Val: IUDep->getSource(DepChecker));
1222 StoreInst *SI = dyn_cast<StoreInst>(Val: IUDep->getDestination(DepChecker));
1223
1224 if (!LI || !SI)
1225 return false;
1226
1227 LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
1228 return findHistogram(LI, HSt: SI, TheLoop, PSE: LAI->getPSE(), Histograms);
1229}
1230
/// Check memory-access legality using LoopAccessInfo: unsafe dependences,
/// and stores to loop-invariant addresses (which are only allowed when they
/// store the final value of a reduction). Any SCEV predicates LAA required
/// are merged into our PSE on success.
bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(L&: *TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory()) {
    if (hasUncountableExitWithSideEffects()) {
      reportVectorizationFailure(
          DebugMsg: "Cannot vectorize unsafe dependencies in uncountable exit loop with "
          "side effects",
          ORETag: "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
          TheLoop);
      return false;
    }

    // LAA said no, but a single histogram-style dependence may still be
    // vectorizable.
    return canVectorizeIndirectUnsafeDependences();
  }

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure(DebugMsg: "We don't allow storing to uniform addresses",
                               OREMsg: "write to a loop invariant address could not "
                               "be vectorized",
                               ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to invariant address when final reduction value is
  // guaranteed to be stored at the end of the loop. Also, if decision to
  // vectorize loop is made, runtime checks are added so as to make sure that
  // invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if last stored value is unconditional
    // and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(BB: SI->getParent())) {
        reportVectorizationFailure(
            DebugMsg: "We don't allow storing to uniform addresses",
            OREMsg: "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // Invariant address should be defined outside of loop. LICM pass usually
      // makes sure it happens, but in rare cases it does not, we do not want
      // to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(Val: SI->getPointerOperand())) {
        if (TheLoop->contains(Inst: Ptr)) {
          reportVectorizationFailure(
              DebugMsg: "Invariant address is calculated inside the loop",
              OREMsg: "write to a loop invariant address could not "
              "be vectorized",
              ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check its last stored value is the result
      // of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is already
      // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively deadcode.
          // With opaque pointers it is possible for one pointer to be used with
          // different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in the
          // example. That is why we have to make sure that types of stored
          // values are same.
          // TODO: Check that bitwidth of unhandled store is smaller than the
          // one that overwrites it and add a test.
          erase_if(C&: UnhandledStores, P: [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, A: SI, B: I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(Elt: SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            DebugMsg: "We don't allow storing to uniform addresses",
            OREMsg: "write to a loop invariant address could not "
            "be vectorized",
            ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  // Propagate the SCEV predicates LAA needed into our predicated SCEV.
  PSE.addPredicate(Pred: LAI->getPSE().getPredicate());
  return true;
}
1344
1345bool LoopVectorizationLegality::canVectorizeFPMath(
1346 bool EnableStrictReductions) {
1347
1348 // First check if there is any ExactFP math or if we allow reassociations
1349 if (!Requirements->getExactFPInst() || Hints->allowReordering())
1350 return true;
1351
1352 // If the above is false, we have ExactFPMath & do not allow reordering.
1353 // If the EnableStrictReductions flag is set, first check if we have any
1354 // Exact FP induction vars, which we cannot vectorize.
1355 if (!EnableStrictReductions ||
1356 any_of(Range: getInductionVars(), P: [&](auto &Induction) -> bool {
1357 InductionDescriptor IndDesc = Induction.second;
1358 return IndDesc.getExactFPMathInst();
1359 }))
1360 return false;
1361
1362 // We can now only vectorize if all reductions with Exact FP math also
1363 // have the isOrdered flag set, which indicates that we can move the
1364 // reduction operations in-loop.
1365 return (all_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1366 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1367 return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
1368 }));
1369}
1370
1371bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1372 return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1373 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1374 return RdxDesc.IntermediateStore == SI;
1375 });
1376}
1377
1378bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1379 return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1380 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1381 if (!RdxDesc.IntermediateStore)
1382 return false;
1383
1384 ScalarEvolution *SE = PSE.getSE();
1385 Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1386 return V == InvariantAddress ||
1387 SE->getSCEV(V) == SE->getSCEV(V: InvariantAddress);
1388 });
1389}
1390
1391bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
1392 Value *In0 = const_cast<Value *>(V);
1393 PHINode *PN = dyn_cast_or_null<PHINode>(Val: In0);
1394 if (!PN)
1395 return false;
1396
1397 return Inductions.count(Key: PN);
1398}
1399
1400const InductionDescriptor *
1401LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
1402 if (!isInductionPhi(V: Phi))
1403 return nullptr;
1404 auto &ID = getInductionVars().find(Key: Phi)->second;
1405 if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
1406 ID.getKind() == InductionDescriptor::IK_FpInduction)
1407 return &ID;
1408 return nullptr;
1409}
1410
1411const InductionDescriptor *
1412LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
1413 if (!isInductionPhi(V: Phi))
1414 return nullptr;
1415 auto &ID = getInductionVars().find(Key: Phi)->second;
1416 if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
1417 return &ID;
1418 return nullptr;
1419}
1420
1421bool LoopVectorizationLegality::isCastedInductionVariable(
1422 const Value *V) const {
1423 auto *Inst = dyn_cast<Instruction>(Val: V);
1424 return (Inst && InductionCastsToIgnore.count(Ptr: Inst));
1425}
1426
1427bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
1428 return isInductionPhi(V) || isCastedInductionVariable(V);
1429}
1430
1431bool LoopVectorizationLegality::isFixedOrderRecurrence(
1432 const PHINode *Phi) const {
1433 return FixedOrderRecurrences.count(Ptr: Phi);
1434}
1435
1436bool LoopVectorizationLegality::blockNeedsPredication(
1437 const BasicBlock *BB) const {
1438 // When vectorizing early exits, create predicates for the latch block only.
1439 // For a single early exit, it must be a direct predecessor of the latch.
1440 // For multiple early exits, they form a chain where each exiting block
1441 // dominates all subsequent blocks up to the latch.
1442 BasicBlock *Latch = TheLoop->getLoopLatch();
1443 if (hasUncountableEarlyExit())
1444 return BB == Latch;
1445 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1446}
1447
/// Return true if every instruction in \p BB can be predicated (if-converted).
/// \p SafePtrs holds pointers known to be dereferenceable without a mask;
/// instructions that will require a mask are recorded in \p MaskedOp.
bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(V: &I, P: m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(Ptr: &I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases that it should block the vectorization. Let's
    // ignore those for now.
    if (isa<NoAliasScopeDeclInst>(Val: &I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(Val: &I))
      if (VFDatabase::hasMaskedVariant(CI: *CI)) {
        MaskedOp.insert(Ptr: CI);
        continue;
      }

    // Loads are handled via masking (or speculated if safe to do so.)
    if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
      if (!SafePtrs.count(Ptr: LI->getPointerOperand()))
        MaskedOp.insert(Ptr: LI);
      continue;
    }

    // Predicated store requires some form of masking:
    // 1) masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so,
    //    be aware on the race conditions), or
    // 3) element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
      MaskedOp.insert(Ptr: SI);
      continue;
    }

    // Anything else that touches memory or may throw cannot be predicated.
    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}
1498
/// Check whether this multi-block loop can be if-converted: every terminator
/// must be a supported kind, and every block that needs predication must be
/// predicable given the set of provably-safe pointers collected below.
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure(DebugMsg: "If-conversion is disabled",
                               ORETag: "IfConversionDisabled", ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A list of pointers which are known to be dereferenceable within scope of
  // the loop body for each iteration of the loop which executes. That is,
  // the memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      // Unpredicated blocks execute on every iteration, so any pointer they
      // load from or store to is safe by definition.
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(V: &I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to access
    // in the loop w/o predication if we can prove dereferenceability facts
    // sufficient to ensure it'll never fault within the loop. For the moment,
    // we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    SmallVector<const SCEVPredicate *, 4> Predicates;
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(Val: &I);

      // Make sure we can execute all computations feeding into Ptr in the loop
      // w/o triggering UB and that none of the out-of-loop operands are poison.
      // We do not need to check if operations inside the loop can produce
      // poison due to flags (e.g. due to an inbounds GEP going out of bounds),
      // because flags will be dropped when executing them unconditionally.
      // TODO: Results could be improved by considering poison-propagation
      // properties of visited ops.
      auto CanSpeculatePointerOp = [this](Value *Ptr) {
        // Worklist-based walk of the address computation's operand tree.
        SmallVector<Value *> Worklist = {Ptr};
        SmallPtrSet<Value *, 4> Visited;
        while (!Worklist.empty()) {
          Value *CurrV = Worklist.pop_back_val();
          if (!Visited.insert(Ptr: CurrV).second)
            continue;

          auto *CurrI = dyn_cast<Instruction>(Val: CurrV);
          if (!CurrI || !TheLoop->contains(Inst: CurrI)) {
            // If operands from outside the loop may be poison then Ptr may also
            // be poison.
            if (!isGuaranteedNotToBePoison(V: CurrV, AC,
                                           CtxI: TheLoop->getLoopPredecessor()
                                               ->getTerminator()
                                               ->getIterator(),
                                           DT))
              return false;
            continue;
          }

          // A loaded value may be poison, independent of any flags.
          if (isa<LoadInst>(Val: CurrI) && !isGuaranteedNotToBePoison(V: CurrV, AC))
            return false;

          // For other ops, assume poison can only be introduced via flags,
          // which can be dropped.
          if (!isa<PHINode>(Val: CurrI) && !isSafeToSpeculativelyExecute(I: CurrI))
            return false;
          append_range(C&: Worklist, R: CurrI->operands());
        }
        return true;
      };
      // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
      // that it will consider loops that need guarding by SCEV checks. The
      // vectoriser will generate these checks if we decide to vectorise.
      if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(LI: *LI) &&
          CanSpeculatePointerOp(LI->getPointerOperand()) &&
          isDereferenceableAndAlignedInLoop(LI, L: TheLoop, SE, DT&: *DT, AC,
                                            Predicates: &Predicates))
        SafePointers.insert(Ptr: LI->getPointerOperand());
      Predicates.clear();
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We support only branches and switch statements as terminators inside the
    // loop.
    if (isa<SwitchInst>(Val: BB->getTerminator())) {
      // Exiting switches are not supported; internal ones are fine.
      if (TheLoop->isLoopExiting(BB)) {
        reportVectorizationFailure(DebugMsg: "Loop contains an unsupported switch",
                                   ORETag: "LoopContainsUnsupportedSwitch", ORE,
                                   TheLoop, I: BB->getTerminator());
        return false;
      }
    } else if (!isa<UncondBrInst, CondBrInst>(Val: BB->getTerminator())) {
      reportVectorizationFailure(DebugMsg: "Loop contains an unsupported terminator",
                                 ORETag: "LoopContainsUnsupportedTerminator", ORE,
                                 TheLoop, I: BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          DebugMsg: "Control flow cannot be substituted for a select", ORETag: "NoCFGForSelect",
          ORE, TheLoop, I: BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}
1617
1618// Helper function to canVectorizeLoopNestCFG.
1619bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1620 bool UseVPlanNativePath) {
1621 assert((UseVPlanNativePath || Lp->isInnermost()) &&
1622 "VPlan-native path is not enabled.");
1623
1624 // TODO: ORE should be improved to show more accurate information when an
1625 // outer loop can't be vectorized because a nested loop is not understood or
1626 // legal. Something like: "outer_loop_location: loop not vectorized:
1627 // (inner_loop_location) loop control flow is not understood by vectorizer".
1628
1629 // Store the result and return it at the end instead of exiting early, in case
1630 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1631 bool Result = true;
1632 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1633
1634 // We must have a loop in canonical form. Loops with indirectbr in them cannot
1635 // be canonicalized.
1636 if (!Lp->getLoopPreheader()) {
1637 reportVectorizationFailure(DebugMsg: "Loop doesn't have a legal pre-header",
1638 OREMsg: "loop control flow is not understood by vectorizer",
1639 ORETag: "CFGNotUnderstood", ORE, TheLoop);
1640 if (DoExtraAnalysis)
1641 Result = false;
1642 else
1643 return false;
1644 }
1645
1646 // We must have a single backedge.
1647 if (Lp->getNumBackEdges() != 1) {
1648 reportVectorizationFailure(DebugMsg: "The loop must have a single backedge",
1649 OREMsg: "loop control flow is not understood by vectorizer",
1650 ORETag: "CFGNotUnderstood", ORE, TheLoop);
1651 if (DoExtraAnalysis)
1652 Result = false;
1653 else
1654 return false;
1655 }
1656
1657 // The latch must be terminated by a branch.
1658 BasicBlock *Latch = Lp->getLoopLatch();
1659 if (Latch && !isa<UncondBrInst, CondBrInst>(Val: Latch->getTerminator())) {
1660 reportVectorizationFailure(
1661 DebugMsg: "The loop latch terminator is not a UncondBrInst/CondBrInst",
1662 OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
1663 ORE, TheLoop);
1664 if (DoExtraAnalysis)
1665 Result = false;
1666 else
1667 return false;
1668 }
1669
1670 return Result;
1671}
1672
1673bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1674 Loop *Lp, bool UseVPlanNativePath) {
1675 // Store the result and return it at the end instead of exiting early, in case
1676 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1677 bool Result = true;
1678 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1679 if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1680 if (DoExtraAnalysis)
1681 Result = false;
1682 else
1683 return false;
1684 }
1685
1686 // Recursively check whether the loop control flow of nested loops is
1687 // understood.
1688 for (Loop *SubLp : *Lp)
1689 if (!canVectorizeLoopNestCFG(Lp: SubLp, UseVPlanNativePath)) {
1690 if (DoExtraAnalysis)
1691 Result = false;
1692 else
1693 return false;
1694 }
1695
1696 return Result;
1697}
1698
1699bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1700 BasicBlock *LatchBB = TheLoop->getLoopLatch();
1701 if (!LatchBB) {
1702 reportVectorizationFailure(DebugMsg: "Loop does not have a latch",
1703 OREMsg: "Cannot vectorize early exit loop",
1704 ORETag: "NoLatchEarlyExit", ORE, TheLoop);
1705 return false;
1706 }
1707
1708 if (Reductions.size() || FixedOrderRecurrences.size()) {
1709 reportVectorizationFailure(
1710 DebugMsg: "Found reductions or recurrences in early-exit loop",
1711 OREMsg: "Cannot vectorize early exit loop with reductions or recurrences",
1712 ORETag: "RecurrencesInEarlyExitLoop", ORE, TheLoop);
1713 return false;
1714 }
1715
1716 SmallVector<BasicBlock *, 8> ExitingBlocks;
1717 TheLoop->getExitingBlocks(ExitingBlocks);
1718
1719 // Keep a record of all the exiting blocks.
1720 SmallVector<const SCEVPredicate *, 4> Predicates;
1721 SmallVector<BasicBlock *> UncountableExitingBlocks;
1722 for (BasicBlock *BB : ExitingBlocks) {
1723 const SCEV *EC =
1724 PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: BB, Predicates: &Predicates);
1725 if (isa<SCEVCouldNotCompute>(Val: EC)) {
1726 if (size(Range: successors(BB)) != 2) {
1727 reportVectorizationFailure(
1728 DebugMsg: "Early exiting block does not have exactly two successors",
1729 OREMsg: "Incorrect number of successors from early exiting block",
1730 ORETag: "EarlyExitTooManySuccessors", ORE, TheLoop);
1731 return false;
1732 }
1733
1734 UncountableExitingBlocks.push_back(Elt: BB);
1735 } else
1736 CountableExitingBlocks.push_back(Elt: BB);
1737 }
1738 // We can safely ignore the predicates here because when vectorizing the loop
1739 // the PredicatatedScalarEvolution class will keep track of all predicates
1740 // for each exiting block anyway. This happens when calling
1741 // PSE.getSymbolicMaxBackedgeTakenCount() below.
1742 Predicates.clear();
1743
1744 if (UncountableExitingBlocks.empty()) {
1745 LLVM_DEBUG(dbgs() << "LV: Could not find any uncountable exits");
1746 return false;
1747 }
1748
1749 // The latch block must have a countable exit.
1750 if (isa<SCEVCouldNotCompute>(
1751 Val: PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: LatchBB, Predicates: &Predicates))) {
1752 reportVectorizationFailure(
1753 DebugMsg: "Cannot determine exact exit count for latch block",
1754 OREMsg: "Cannot vectorize early exit loop",
1755 ORETag: "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1756 return false;
1757 }
1758 assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
1759 "Latch block not found in list of countable exits!");
1760
1761 // Check to see if there are instructions that could potentially generate
1762 // exceptions or have side-effects.
1763 auto IsSafeOperation = [](Instruction *I) -> bool {
1764 switch (I->getOpcode()) {
1765 case Instruction::Load:
1766 case Instruction::Store:
1767 case Instruction::PHI:
1768 case Instruction::UncondBr:
1769 case Instruction::CondBr:
1770 // These are checked separately.
1771 return true;
1772 default:
1773 return isSafeToSpeculativelyExecute(I);
1774 }
1775 };
1776
1777 bool HasSideEffects = false;
1778 for (auto *BB : TheLoop->blocks())
1779 for (auto &I : *BB) {
1780 if (I.mayWriteToMemory()) {
1781 if (isa<StoreInst>(Val: &I) && cast<StoreInst>(Val: &I)->isSimple()) {
1782 HasSideEffects = true;
1783 continue;
1784 }
1785
1786 // We don't support complex writes to memory.
1787 reportVectorizationFailure(
1788 DebugMsg: "Complex writes to memory unsupported in early exit loops",
1789 OREMsg: "Cannot vectorize early exit loop with complex writes to memory",
1790 ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
1791 return false;
1792 }
1793
1794 if (!IsSafeOperation(&I)) {
1795 reportVectorizationFailure(DebugMsg: "Early exit loop contains operations that "
1796 "cannot be speculatively executed",
1797 ORETag: "UnsafeOperationsEarlyExitLoop", ORE,
1798 TheLoop);
1799 return false;
1800 }
1801 }
1802
1803 SmallVector<LoadInst *, 4> NonDerefLoads;
1804 // TODO: Handle loops that may fault.
1805 if (!HasSideEffects) {
1806 // Read-only loop.
1807 Predicates.clear();
1808 if (!isReadOnlyLoop(L: TheLoop, SE: PSE.getSE(), DT, AC, NonDereferenceableAndAlignedLoads&: NonDerefLoads,
1809 Predicates: &Predicates)) {
1810 reportVectorizationFailure(
1811 DebugMsg: "Loop may fault", OREMsg: "Cannot vectorize non-read-only early exit loop",
1812 ORETag: "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1813 return false;
1814 }
1815 } else {
1816 // Check all uncountable exiting blocks for movable loads.
1817 for (BasicBlock *ExitingBB : UncountableExitingBlocks) {
1818 if (!canUncountableExitConditionLoadBeMoved(ExitingBlock: ExitingBB))
1819 return false;
1820 }
1821 }
1822
1823 // Check non-dereferenceable loads if any.
1824 for (LoadInst *LI : NonDerefLoads) {
1825 // Only support unit-stride access for now.
1826 int Stride = isConsecutivePtr(AccessTy: LI->getType(), Ptr: LI->getPointerOperand());
1827 if (Stride != 1) {
1828 reportVectorizationFailure(
1829 DebugMsg: "Loop contains potentially faulting strided load",
1830 OREMsg: "Cannot vectorize early exit loop with "
1831 "strided fault-only-first load",
1832 ORETag: "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
1833 return false;
1834 }
1835 PotentiallyFaultingLoads.insert(Ptr: LI);
1836 LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI
1837 << "\n");
1838 }
1839
1840 [[maybe_unused]] const SCEV *SymbolicMaxBTC =
1841 PSE.getSymbolicMaxBackedgeTakenCount();
1842 // Since we have an exact exit count for the latch and the early exit
1843 // dominates the latch, then this should guarantee a computed SCEV value.
1844 assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1845 "Failed to get symbolic expression for backedge taken count");
1846 LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
1847 "backedge taken count: "
1848 << *SymbolicMaxBTC << '\n');
1849 HasUncountableEarlyExit = true;
1850 UncountableExitWithSideEffects = HasSideEffects;
1851 return true;
1852}
1853
/// Check whether the load feeding the uncountable exit condition in
/// \p ExitingBlock can safely have a copy hoisted to run just before the
/// first iteration (and rotated to the next vector iteration). Reports a
/// vectorization failure and returns false if any requirement is not met.
bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
    BasicBlock *ExitingBlock) {
  // Try to find a load in the critical path for the uncountable exit condition.
  // This is currently matching about the simplest form we can, expecting
  // only one in-loop load, the result of which is directly compared against
  // a loop-invariant value.
  // FIXME: We're insisting on a single use for now, because otherwise we will
  // need to make PHI nodes for other users. That can be done once the initial
  // transform code lands.
  auto *Br = cast<CondBrInst>(Val: ExitingBlock->getTerminator());

  using namespace llvm::PatternMatch;
  Instruction *L = nullptr;
  Value *Ptr = nullptr;
  Value *R = nullptr;
  // Expected shape: icmp(load(Ptr), R), with both the icmp and the load
  // having a single use.
  if (!match(V: Br->getCondition(),
             P: m_OneUse(SubPattern: m_ICmp(L: m_OneUse(SubPattern: m_Instruction(I&: L, Match: m_Load(Op: m_Value(V&: Ptr)))),
                             R: m_Value(V&: R))))) {
    reportVectorizationFailure(
        DebugMsg: "Early exit loop with store but no supported condition load",
        ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  // FIXME: Don't rely on operand ordering for the comparison.
  // The non-load side of the comparison must not vary within the loop.
  if (!TheLoop->isLoopInvariant(V: R)) {
    reportVectorizationFailure(
        DebugMsg: "Early exit loop with store but no supported condition load",
        ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  // Make sure that the load address is not loop invariant; we want an
  // address calculation that we can rotate to the next vector iteration.
  // Require an affine add recurrence rooted at this loop.
  const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: Ptr));
  if (!AR || AR->getLoop() != TheLoop || !AR->isAffine()) {
    reportVectorizationFailure(
        DebugMsg: "Uncountable exit condition depends on load with an address that is "
        "not an add recurrence in the loop",
        ORETag: "EarlyExitLoadInvariantAddress", ORE, TheLoop);
    return false;
  }

  // FIXME: Support gathers after first-faulting load support lands.
  // The load must be provably in-bounds and aligned for every iteration,
  // since a hoisted copy executes speculatively ahead of the exit check.
  SmallVector<const SCEVPredicate *, 4> Predicates;
  LoadInst *Load = cast<LoadInst>(Val: L);
  if (!isDereferenceableAndAlignedInLoop(LI: Load, L: TheLoop, SE&: *PSE.getSE(), DT&: *DT, AC,
                                         Predicates: &Predicates)) {
    reportVectorizationFailure(
        DebugMsg: "Loop may fault",
        OREMsg: "Cannot vectorize potentially faulting early exit loop",
        ORETag: "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
    return false;
  }

  ICFLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop: TheLoop);
  // We need to know that load will be executed before we can hoist a
  // copy out to run just before the first iteration.
  if (!SafetyInfo.isGuaranteedToExecute(Inst: *Load, DT, CurLoop: TheLoop)) {
    reportVectorizationFailure(
        DebugMsg: "Load for uncountable exit not guaranteed to execute",
        ORETag: "ConditionalUncountableExitLoad", ORE, TheLoop);
    return false;
  }

  // Prohibit any potential aliasing with any instruction in the loop which
  // might store to memory.
  // FIXME: Relax this constraint where possible.
  for (auto *BB : TheLoop->blocks()) {
    for (auto &I : *BB) {
      if (&I == Load)
        continue;

      if (I.mayWriteToMemory()) {
        // Only a provable NoAlias with a plain store is acceptable; any other
        // memory-writing instruction (or a may-alias store) is rejected.
        if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
          AliasResult AR = AA->alias(V1: Ptr, V2: SI->getPointerOperand());
          if (AR == AliasResult::NoAlias)
            continue;
        }

        reportVectorizationFailure(
            DebugMsg: "Cannot determine whether critical uncountable exit load address "
            "does not alias with a memory write",
            ORETag: "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
        return false;
      }
    }
  }

  return true;
}
1946
1947bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1948 // Store the result and return it at the end instead of exiting early, in case
1949 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1950 bool Result = true;
1951
1952 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1953 // Check whether the loop-related control flow in the loop nest is expected by
1954 // vectorizer.
1955 if (!canVectorizeLoopNestCFG(Lp: TheLoop, UseVPlanNativePath)) {
1956 if (DoExtraAnalysis) {
1957 LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
1958 Result = false;
1959 } else {
1960 return false;
1961 }
1962 }
1963
1964 // We need to have a loop header.
1965 LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1966 << '\n');
1967
1968 // Specific checks for outer loops. We skip the remaining legal checks at this
1969 // point because they don't support outer loops.
1970 if (!TheLoop->isInnermost()) {
1971 assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1972
1973 if (!canVectorizeOuterLoop()) {
1974 reportVectorizationFailure(DebugMsg: "Unsupported outer loop",
1975 ORETag: "UnsupportedOuterLoop", ORE, TheLoop);
1976 // TODO: Implement DoExtraAnalysis when subsequent legal checks support
1977 // outer loops.
1978 return false;
1979 }
1980
1981 LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1982 return Result;
1983 }
1984
1985 assert(TheLoop->isInnermost() && "Inner loop expected.");
1986 // Check if we can if-convert non-single-bb loops.
1987 unsigned NumBlocks = TheLoop->getNumBlocks();
1988 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1989 LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1990 if (DoExtraAnalysis)
1991 Result = false;
1992 else
1993 return false;
1994 }
1995
1996 // Check if we can vectorize the instructions and CFG in this loop.
1997 if (!canVectorizeInstrs()) {
1998 LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1999 if (DoExtraAnalysis)
2000 Result = false;
2001 else
2002 return false;
2003 }
2004
2005 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
2006 if (TheLoop->getExitingBlock()) {
2007 reportVectorizationFailure(DebugMsg: "Cannot vectorize uncountable loop",
2008 ORETag: "UnsupportedUncountableLoop", ORE, TheLoop);
2009 if (DoExtraAnalysis)
2010 Result = false;
2011 else
2012 return false;
2013 } else {
2014 if (!isVectorizableEarlyExitLoop()) {
2015 assert(!hasUncountableEarlyExit() &&
2016 !hasUncountableExitWithSideEffects() &&
2017 "Must be false without vectorizable early-exit loop");
2018 if (DoExtraAnalysis)
2019 Result = false;
2020 else
2021 return false;
2022 }
2023 }
2024 }
2025
2026 // Go over each instruction and look at memory deps.
2027 if (!canVectorizeMemory()) {
2028 LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
2029 if (DoExtraAnalysis)
2030 Result = false;
2031 else
2032 return false;
2033 }
2034
2035 // Bail out for state-changing loops with uncountable exits for now.
2036 if (UncountableExitWithSideEffects) {
2037 reportVectorizationFailure(
2038 DebugMsg: "Writes to memory unsupported in early exit loops",
2039 OREMsg: "Cannot vectorize early exit loop with writes to memory",
2040 ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
2041 return false;
2042 }
2043
2044 if (Result) {
2045 LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
2046 << (LAI->getRuntimePointerChecking()->Need
2047 ? " (with a runtime bound check)"
2048 : "")
2049 << "!\n");
2050 }
2051
2052 unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
2053 if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
2054 SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
2055
2056 if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
2057 LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
2058 "due to SCEVThreshold");
2059 reportVectorizationFailure(DebugMsg: "Too many SCEV checks needed",
2060 OREMsg: "Too many SCEV assumptions need to be made and checked at runtime",
2061 ORETag: "TooManySCEVRunTimeChecks", ORE, TheLoop);
2062 if (DoExtraAnalysis)
2063 Result = false;
2064 else
2065 return false;
2066 }
2067
2068 // Okay! We've done all the tests. If any have failed, return false. Otherwise
2069 // we can vectorize, and at this point we don't have any other mem analysis
2070 // which may limit our maximum vectorization factor, so just return true with
2071 // no restrictions.
2072 return Result;
2073}
2074
2075bool LoopVectorizationLegality::canFoldTailByMasking() const {
2076 // The only loops we can vectorize without a scalar epilogue, are loops with
2077 // a bottom-test and a single exiting block. We'd have to handle the fact
2078 // that not every instruction executes on the last iteration. This will
2079 // require a lane mask which varies through the vector loop body. (TODO)
2080 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
2081 LLVM_DEBUG(
2082 dbgs()
2083 << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
2084 return false;
2085 }
2086
2087 LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
2088
2089 SmallPtrSet<const Value *, 8> ReductionLiveOuts;
2090
2091 for (const auto &Reduction : getReductionVars())
2092 ReductionLiveOuts.insert(Ptr: Reduction.second.getLoopExitInstr());
2093
2094 // The list of pointers that we can safely read and write to remains empty.
2095 SmallPtrSet<Value *, 8> SafePointers;
2096
2097 // Check all blocks for predication, including those that ordinarily do not
2098 // need predication such as the header block.
2099 SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
2100 for (BasicBlock *BB : TheLoop->blocks()) {
2101 if (!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TmpMaskedOp)) {
2102 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
2103 return false;
2104 }
2105 }
2106
2107 LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
2108
2109 return true;
2110}
2111
2112void LoopVectorizationLegality::prepareToFoldTailByMasking() {
2113 // The list of pointers that we can safely read and write to remains empty.
2114 SmallPtrSet<Value *, 8> SafePointers;
2115
2116 // Mark all blocks for predication, including those that ordinarily do not
2117 // need predication such as the header block.
2118 for (BasicBlock *BB : TheLoop->blocks()) {
2119 [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp);
2120 assert(R && "Must be able to predicate block when tail-folding.");
2121 }
2122}
2123
2124} // namespace llvm
2125