1//===- LoopVectorizationLegality.cpp --------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file provides loop vectorization legality analysis. Original code
10// resided in LoopVectorize.cpp for a long time.
11//
12// At this point, it is implemented as a utility class, not as an analysis
13// pass. It should be easy to create an analysis pass around it if there
14// is a need (but D45420 needs to happen first).
15//
16
17#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18#include "LoopVectorizationPlanner.h"
19#include "llvm/Analysis/AliasAnalysis.h"
20#include "llvm/Analysis/Loads.h"
21#include "llvm/Analysis/LoopInfo.h"
22#include "llvm/Analysis/MustExecute.h"
23#include "llvm/Analysis/OptimizationRemarkEmitter.h"
24#include "llvm/Analysis/ScalarEvolutionExpressions.h"
25#include "llvm/Analysis/TargetLibraryInfo.h"
26#include "llvm/Analysis/TargetTransformInfo.h"
27#include "llvm/Analysis/ValueTracking.h"
28#include "llvm/Analysis/VectorUtils.h"
29#include "llvm/IR/Dominators.h"
30#include "llvm/IR/IntrinsicInst.h"
31#include "llvm/IR/PatternMatch.h"
32#include "llvm/Transforms/Utils/SizeOpts.h"
33#include "llvm/Transforms/Vectorize/LoopVectorize.h"
34
35using namespace llvm;
36using namespace PatternMatch;
37using namespace LoopVectorizationUtils;
38
39#define LV_NAME "loop-vectorize"
40#define DEBUG_TYPE LV_NAME
41
42static cl::opt<bool>
43 EnableIfConversion("enable-if-conversion", cl::init(Val: true), cl::Hidden,
44 cl::desc("Enable if-conversion during vectorization."));
45
46static cl::opt<bool>
47AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(Val: false), cl::Hidden,
48 cl::desc("Enable recognition of non-constant strided "
49 "pointer induction variables."));
50
51static cl::opt<bool>
52 HintsAllowReordering("hints-allow-reordering", cl::init(Val: true), cl::Hidden,
53 cl::desc("Allow enabling loop hints to reorder "
54 "FP operations during vectorization."));
55
56static cl::opt<LoopVectorizeHints::ScalableForceKind>
57 ForceScalableVectorization(
58 "scalable-vectorization", cl::init(Val: LoopVectorizeHints::SK_Unspecified),
59 cl::Hidden,
60 cl::desc("Control whether the compiler can use scalable vectors to "
61 "vectorize a loop"),
62 cl::values(
63 clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
64 "Scalable vectorization is disabled."),
65 clEnumValN(
66 LoopVectorizeHints::SK_PreferScalable, "preferred",
67 "Scalable vectorization is available and favored when the "
68 "cost is inconclusive."),
69 clEnumValN(
70 LoopVectorizeHints::SK_PreferScalable, "on",
71 "Scalable vectorization is available and favored when the "
72 "cost is inconclusive."),
73 clEnumValN(
74 LoopVectorizeHints::SK_AlwaysScalable, "always",
75 "Scalable vectorization is available and always favored when "
76 "feasible")));
77
78static cl::opt<bool> EnableHistogramVectorization(
79 "enable-histogram-loop-vectorization", cl::init(Val: false), cl::Hidden,
80 cl::desc("Enables autovectorization of some loops containing histograms"));
81
82/// Maximum vectorization interleave count.
83static const unsigned MaxInterleaveFactor = 16;
84
85namespace llvm {
86
87bool LoopVectorizeHints::Hint::validate(unsigned Val) {
88 switch (Kind) {
89 case HK_WIDTH:
90 return isPowerOf2_32(Value: Val) && Val <= VectorizerParams::MaxVectorWidth;
91 case HK_INTERLEAVE:
92 return isPowerOf2_32(Value: Val) && Val <= MaxInterleaveFactor;
93 case HK_FORCE:
94 return (Val <= 1);
95 case HK_ISVECTORIZED:
96 case HK_PREDICATE:
97 case HK_SCALABLE:
98 return (Val == 0 || Val == 1);
99 }
100 return false;
101}
102
103LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
104 bool InterleaveOnlyWhenForced,
105 OptimizationRemarkEmitter &ORE,
106 const TargetTransformInfo *TTI)
107 : Width("vectorize.width",
108 VectorizerParams::VectorizationFactor.getKnownMinValue(), HK_WIDTH),
109 Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
110 Force("vectorize.enable", FK_Undefined, HK_FORCE),
111 IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
112 Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
113 Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
114 TheLoop(L), ORE(ORE) {
115 // Populate values with existing loop metadata.
116 getHintsFromMetadata();
117
118 // force-vector-interleave overrides DisableInterleaving.
119 if (VectorizerParams::isInterleaveForced())
120 Interleave.Value = VectorizerParams::VectorizationInterleave;
121
122 // If the metadata doesn't explicitly specify whether to enable scalable
123 // vectorization, then decide based on the following criteria (increasing
124 // level of priority):
125 // - Target default
126 // - Metadata width
127 // - Force option (always overrides)
128 if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
129 if (TTI)
130 Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
131 : SK_FixedWidthOnly;
132
133 if (Width.Value)
134 // If the width is set, but the metadata says nothing about the scalable
135 // property, then assume it concerns only a fixed-width UserVF.
136 // If width is not set, the flag takes precedence.
137 Scalable.Value = SK_FixedWidthOnly;
138 }
139
140 // If the flag is set to force any use of scalable vectors, override the loop
141 // hints.
142 if (ForceScalableVectorization.getValue() !=
143 LoopVectorizeHints::SK_Unspecified)
144 Scalable.Value = ForceScalableVectorization.getValue();
145
146 // If force-vector-width is scalable, force scalable vectorization.
147 if (VectorizerParams::VectorizationFactor.isScalable())
148 Scalable.Value = SK_AlwaysScalable;
149
150 // Scalable vectorization is disabled if no preference is specified.
151 if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
152 Scalable.Value = SK_FixedWidthOnly;
153
154 if (IsVectorized.Value != 1)
155 // If the vectorization width and interleaving count are both 1 then
156 // consider the loop to have been already vectorized because there's
157 // nothing more that we can do.
158 IsVectorized.Value =
159 getWidth() == ElementCount::getFixed(MinVal: 1) && getInterleave() == 1;
160 LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
161 << "LV: Interleaving disabled by the pass manager\n");
162}
163
164void LoopVectorizeHints::setAlreadyVectorized() {
165 TheLoop->addIntLoopAttribute(Name: "llvm.loop.isvectorized", Value: 1,
166 RemovePrefixes: {Twine(Prefix(), "vectorize.").str(),
167 Twine(Prefix(), "interleave.").str()});
168
169 // Update internal cache.
170 IsVectorized.Value = 1;
171}
172
173void LoopVectorizeHints::reportDisallowedVectorization(
174 const StringRef DebugMsg, const StringRef RemarkName,
175 const StringRef RemarkMsg, const Loop *L) const {
176 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: " << DebugMsg << ".\n");
177 ORE.emit(OptDiag: OptimizationRemarkMissed(LV_NAME, RemarkName, L->getStartLoc(),
178 L->getHeader())
179 << "loop not vectorized: " << RemarkMsg);
180}
181
182bool LoopVectorizeHints::allowVectorization(
183 Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
184 if (getForce() == LoopVectorizeHints::FK_Disabled) {
185 if (Force.Value == LoopVectorizeHints::FK_Disabled) {
186 reportDisallowedVectorization(DebugMsg: "#pragma vectorize disable",
187 RemarkName: "MissedExplicitlyDisabled",
188 RemarkMsg: "vectorization is explicitly disabled", L);
189 } else if (hasDisableAllTransformsHint(L)) {
190 reportDisallowedVectorization(DebugMsg: "loop hasDisableAllTransformsHint",
191 RemarkName: "MissedTransformsDisabled",
192 RemarkMsg: "loop transformations are disabled", L);
193 } else {
194 llvm_unreachable("loop vect disabled for an unknown reason");
195 }
196 return false;
197 }
198
199 if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
200 reportDisallowedVectorization(
201 DebugMsg: "VectorizeOnlyWhenForced is set, and no #pragma vectorize enable",
202 RemarkName: "MissedForceOnly", RemarkMsg: "only vectorizing loops that explicitly request it",
203 L);
204 return false;
205 }
206
207 if (getIsVectorized() == 1) {
208 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
209 // FIXME: Add interleave.disable metadata. This will allow
210 // vectorize.disable to be used without disabling the pass and errors
211 // to differentiate between disabled vectorization and a width of 1.
212 ORE.emit(RemarkBuilder: [&]() {
213 return OptimizationRemarkAnalysis(LV_NAME, "AllDisabled",
214 L->getStartLoc(), L->getHeader())
215 << "loop not vectorized: vectorization and interleaving are "
216 "explicitly disabled, or the loop has already been "
217 "vectorized";
218 });
219 return false;
220 }
221
222 return true;
223}
224
225void LoopVectorizeHints::emitRemarkWithHints() const {
226 using namespace ore;
227
228 ORE.emit(RemarkBuilder: [&]() {
229 if (Force.Value == LoopVectorizeHints::FK_Disabled)
230 return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
231 TheLoop->getStartLoc(),
232 TheLoop->getHeader())
233 << "loop not vectorized: vectorization is explicitly disabled";
234
235 OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
236 TheLoop->getHeader());
237 R << "loop not vectorized";
238 if (Force.Value == LoopVectorizeHints::FK_Enabled) {
239 R << " (Force=" << NV("Force", true);
240 if (Width.Value != 0)
241 R << ", Vector Width=" << NV("VectorWidth", getWidth());
242 if (getInterleave() != 0)
243 R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
244 R << ")";
245 }
246 return R;
247 });
248}
249
250bool LoopVectorizeHints::allowReordering() const {
251 // Allow the vectorizer to change the order of operations if enabling
252 // loop hints are provided
253 ElementCount EC = getWidth();
254 return HintsAllowReordering &&
255 (getForce() == LoopVectorizeHints::FK_Enabled ||
256 EC.getKnownMinValue() > 1);
257}
258
259void LoopVectorizeHints::getHintsFromMetadata() {
260 MDNode *LoopID = TheLoop->getLoopID();
261 if (!LoopID)
262 return;
263
264 // First operand should refer to the loop id itself.
265 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
266 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
267
268 for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
269 const MDString *S = nullptr;
270 SmallVector<Metadata *, 4> Args;
271
272 // The expected hint is either a MDString or a MDNode with the first
273 // operand a MDString.
274 if (const MDNode *MD = dyn_cast<MDNode>(Val: MDO)) {
275 if (!MD || MD->getNumOperands() == 0)
276 continue;
277 S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
278 for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
279 Args.push_back(Elt: MD->getOperand(I: Idx));
280 } else {
281 S = dyn_cast<MDString>(Val: MDO);
282 assert(Args.size() == 0 && "too many arguments for MDString");
283 }
284
285 if (!S)
286 continue;
287
288 // Check if the hint starts with the loop metadata prefix.
289 StringRef Name = S->getString();
290 if (Args.size() == 1)
291 setHint(Name, Arg: Args[0]);
292 }
293}
294
295void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
296 if (!Name.consume_front(Prefix: Prefix()))
297 return;
298
299 const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(MD&: Arg);
300 if (!C)
301 return;
302 unsigned Val = C->getZExtValue();
303
304 Hint *Hints[] = {&Width, &Interleave, &Force,
305 &IsVectorized, &Predicate, &Scalable};
306 for (auto *H : Hints) {
307 if (Name == H->Name) {
308 if (H->validate(Val))
309 H->Value = Val;
310 else
311 LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
312 break;
313 }
314 }
315}
316
317// Return true if the inner loop \p Lp is uniform with regard to the outer loop
318// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
319// executing the inner loop will execute the same iterations). This check is
320// very constrained for now but it will be relaxed in the future. \p Lp is
321// considered uniform if it meets all the following conditions:
322// 1) it has a canonical IV (starting from 0 and with stride 1),
323// 2) its latch terminator is a conditional branch and,
324// 3) its latch condition is a compare instruction whose operands are the
325// canonical IV and an OuterLp invariant.
326// This check doesn't take into account the uniformity of other conditions not
327// related to the loop latch because they don't affect the loop uniformity.
328//
329// NOTE: We decided to keep all these checks and its associated documentation
330// together so that we can easily have a picture of the current supported loop
331// nests. However, some of the current checks don't depend on \p OuterLp and
332// would be redundantly executed for each \p Lp if we invoked this function for
333// different candidate outer loops. This is not the case for now because we
334// don't currently have the infrastructure to evaluate multiple candidate outer
335// loops and \p OuterLp will be a fixed parameter while we only support explicit
336// outer loop vectorization. It's also very likely that these checks go away
337// before introducing the aforementioned infrastructure. However, if this is not
338// the case, we should move the \p OuterLp independent checks to a separate
339// function that is only executed once for each \p Lp.
340static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
341 assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
342
343 // If Lp is the outer loop, it's uniform by definition.
344 if (Lp == OuterLp)
345 return true;
346 assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
347
348 // 1.
349 PHINode *IV = Lp->getCanonicalInductionVariable();
350 if (!IV) {
351 LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
352 return false;
353 }
354
355 // 2.
356 BasicBlock *Latch = Lp->getLoopLatch();
357 auto *LatchBr = dyn_cast<CondBrInst>(Val: Latch->getTerminator());
358 if (!LatchBr) {
359 LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
360 return false;
361 }
362
363 // 3.
364 auto *LatchCmp = dyn_cast<CmpInst>(Val: LatchBr->getCondition());
365 if (!LatchCmp) {
366 LLVM_DEBUG(
367 dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
368 return false;
369 }
370
371 Value *CondOp0 = LatchCmp->getOperand(i_nocapture: 0);
372 Value *CondOp1 = LatchCmp->getOperand(i_nocapture: 1);
373 Value *IVUpdate = IV->getIncomingValueForBlock(BB: Latch);
374 if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp1)) &&
375 !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp0))) {
376 LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
377 return false;
378 }
379
380 return true;
381}
382
383// Return true if \p Lp and all its nested loops are uniform with regard to \p
384// OuterLp.
385static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
386 if (!isUniformLoop(Lp, OuterLp))
387 return false;
388
389 // Check if nested loops are uniform.
390 for (Loop *SubLp : *Lp)
391 if (!isUniformLoopNest(Lp: SubLp, OuterLp))
392 return false;
393
394 return true;
395}
396
397static IntegerType *getInductionIntegerTy(const DataLayout &DL, Type *Ty) {
398 assert(Ty->isIntOrPtrTy() && "Expected integer or pointer type");
399
400 if (Ty->isPointerTy())
401 return DL.getIntPtrType(C&: Ty->getContext(), AddressSpace: Ty->getPointerAddressSpace());
402
403 // It is possible that char's or short's overflow when we ask for the loop's
404 // trip count, work around this by changing the type size.
405 if (Ty->getScalarSizeInBits() < 32)
406 return Type::getInt32Ty(C&: Ty->getContext());
407
408 return cast<IntegerType>(Val: Ty);
409}
410
411static IntegerType *getWiderInductionTy(const DataLayout &DL, Type *Ty0,
412 Type *Ty1) {
413 IntegerType *TyA = getInductionIntegerTy(DL, Ty: Ty0);
414 IntegerType *TyB = getInductionIntegerTy(DL, Ty: Ty1);
415 return TyA->getScalarSizeInBits() > TyB->getScalarSizeInBits() ? TyA : TyB;
416}
417
418/// Returns true if A and B have same pointer operands or same SCEVs addresses
419static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
420 StoreInst *B) {
421 // Compare store
422 if (A == B)
423 return true;
424
425 // Otherwise Compare pointers
426 Value *APtr = A->getPointerOperand();
427 Value *BPtr = B->getPointerOperand();
428 if (APtr == BPtr)
429 return true;
430
431 // Otherwise compare address SCEVs
432 return SE->getSCEV(V: APtr) == SE->getSCEV(V: BPtr);
433}
434
435void LoopVectorizationLegality::collectUnitStridePredicates() const {
436 if (!AllowRuntimeSCEVChecks || !TheLoop->isInnermost())
437 return;
438
439 for (BasicBlock *BB : TheLoop->blocks())
440 for (Instruction &I : *BB)
441 if (Value *Ptr = getLoadStorePointerOperand(V: &I))
442 isConsecutivePtr(AccessTy: getLoadStoreType(I: &I), Ptr);
443}
444
445int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
446 Value *Ptr) const {
447 // FIXME: Currently, the set of symbolic strides is sometimes queried before
448 // it's collected. This happens from canVectorizeWithIfConvert, when the
449 // pointer is checked to reference consecutive elements suitable for a
450 // masked access.
451 // Stride versioning requires adding a SCEV equality predicate; only consult
452 // the symbolic strides when runtime SCEV checks are permitted.
453 const auto &Strides = LAI && AllowRuntimeSCEVChecks
454 ? LAI->getSymbolicStrides()
455 : DenseMap<Value *, const SCEV *>();
456 int Stride = getPtrStride(PSE, AccessTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: Strides,
457 Assume: AllowRuntimeSCEVChecks, ShouldCheckWrap: false)
458 .value_or(u: 0);
459 if (Stride == 1 || Stride == -1)
460 return Stride;
461 return 0;
462}
463
464bool LoopVectorizationLegality::isInvariant(Value *V) const {
465 return LAI->isInvariant(V);
466}
467
468namespace {
469/// A rewriter to build the SCEVs for each of the VF lanes in the expected
470/// vectorized loop, which can then be compared to detect their uniformity. This
471/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
472/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
473/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
474/// uniformity.
475class SCEVAddRecForUniformityRewriter
476 : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
477 /// Multiplier to be applied to the step of AddRecs in TheLoop.
478 unsigned StepMultiplier;
479
480 /// Offset to be added to the AddRecs in TheLoop.
481 unsigned Offset;
482
483 /// Loop for which to rewrite AddRecsFor.
484 Loop *TheLoop;
485
486 /// Is any sub-expressions not analyzable w.r.t. uniformity?
487 bool CannotAnalyze = false;
488
489 bool canAnalyze() const { return !CannotAnalyze; }
490
491public:
492 SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
493 unsigned Offset, Loop *TheLoop)
494 : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
495 TheLoop(TheLoop) {}
496
497 const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
498 assert(Expr->getLoop() == TheLoop &&
499 "addrec outside of TheLoop must be invariant and should have been "
500 "handled earlier");
501 // Build a new AddRec by multiplying the step by StepMultiplier and
502 // incrementing the start by Offset * step.
503 Type *Ty = Expr->getType();
504 const SCEV *Step = Expr->getStepRecurrence(SE);
505 if (!SE.isLoopInvariant(S: Step, L: TheLoop)) {
506 CannotAnalyze = true;
507 return Expr;
508 }
509 const SCEV *NewStep =
510 SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: StepMultiplier));
511 const SCEV *ScaledOffset = SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: Offset));
512 const SCEV *NewStart =
513 SE.getAddExpr(LHS: Expr->getStart(), RHS: SCEVUse(ScaledOffset));
514 return SE.getAddRecExpr(Start: NewStart, Step: NewStep, L: TheLoop, Flags: SCEV::FlagAnyWrap);
515 }
516
517 const SCEV *visit(const SCEV *S) {
518 if (CannotAnalyze || SE.isLoopInvariant(S, L: TheLoop))
519 return S;
520 return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
521 }
522
523 const SCEV *visitUnknown(const SCEVUnknown *S) {
524 if (SE.isLoopInvariant(S, L: TheLoop))
525 return S;
526 // The value could vary across iterations.
527 CannotAnalyze = true;
528 return S;
529 }
530
531 const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
532 // Could not analyze the expression.
533 CannotAnalyze = true;
534 return S;
535 }
536
537 static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
538 unsigned StepMultiplier, unsigned Offset,
539 Loop *TheLoop) {
540 /// Bail out if the expression does not contain an UDiv expression.
541 /// Uniform values which are not loop invariant require operations to strip
542 /// out the lowest bits. For now just look for UDivs and use it to avoid
543 /// re-writing UDIV-free expressions for other lanes to limit compile time.
544 if (!SCEVExprContains(Root: S,
545 Pred: [](const SCEV *S) { return isa<SCEVUDivExpr>(Val: S); }))
546 return SE.getCouldNotCompute();
547
548 SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
549 TheLoop);
550 const SCEV *Result = Rewriter.visit(S);
551
552 if (Rewriter.canAnalyze())
553 return Result;
554 return SE.getCouldNotCompute();
555 }
556};
557
558} // namespace
559
560bool LoopVectorizationLegality::isUniform(
561 Value *V, std::optional<ElementCount> VF) const {
562 if (isInvariant(V))
563 return true;
564 if (!VF || VF->isScalable())
565 return false;
566 if (VF->isScalar())
567 return true;
568
569 // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
570 // never considered uniform.
571 auto *SE = PSE.getSE();
572 if (!SE->isSCEVable(Ty: V->getType()))
573 return false;
574 const SCEV *S = SE->getSCEV(V);
575
576 // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
577 // lane 0 matches the expressions for all other lanes.
578 unsigned FixedVF = VF->getKnownMinValue();
579 const SCEV *FirstLaneExpr =
580 SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: 0, TheLoop);
581 if (isa<SCEVCouldNotCompute>(Val: FirstLaneExpr))
582 return false;
583
584 // Make sure the expressions for lanes FixedVF-1..1 match the expression for
585 // lane 0. We check lanes in reverse order for compile-time, as frequently
586 // checking the last lane is sufficient to rule out uniformity.
587 return all_of(Range: reverse(C: seq<unsigned>(Begin: 1, End: FixedVF)), P: [&](unsigned I) {
588 const SCEV *IthLaneExpr =
589 SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: I, TheLoop);
590 return FirstLaneExpr == IthLaneExpr;
591 });
592}
593
594bool LoopVectorizationLegality::isUniformMemOp(
595 Instruction &I, std::optional<ElementCount> VF) const {
596 Value *Ptr = getLoadStorePointerOperand(V: &I);
597 if (!Ptr)
598 return false;
599 // Note: There's nothing inherent which prevents predicated loads and
600 // stores from being uniform. The current lowering simply doesn't handle
601 // it; in particular, the cost model distinguishes scatter/gather from
602 // scalar w/predication, and we currently rely on the scalar path.
603 return isUniform(V: Ptr, VF) && !blockNeedsPredication(BB: I.getParent());
604}
605
606bool LoopVectorizationLegality::canVectorizeOuterLoop() {
607 assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
608 // Store the result and return it at the end instead of exiting early, in case
609 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
610 bool Result = true;
611 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
612
613 for (BasicBlock *BB : TheLoop->blocks()) {
614 // Check whether the BB terminator is a branch. Any other terminator is
615 // not supported yet.
616 Instruction *Term = BB->getTerminator();
617 if (!isa<UncondBrInst, CondBrInst>(Val: Term)) {
618 reportVectorizationFailure(
619 DebugMsg: "Unsupported basic block terminator",
620 OREMsg: "loop control flow is not understood by vectorizer",
621 ORETag: "CFGNotUnderstood", ORE, TheLoop);
622 if (DoExtraAnalysis)
623 Result = false;
624 else
625 return false;
626 }
627
628 // Check whether the branch is a supported one. Only unconditional
629 // branches, conditional branches with an outer loop invariant condition or
630 // backedges are supported.
631 // FIXME: We skip these checks when VPlan predication is enabled as we
632 // want to allow divergent branches. This whole check will be removed
633 // once VPlan predication is on by default.
634 auto *Br = dyn_cast<CondBrInst>(Val: Term);
635 if (Br && !TheLoop->isLoopInvariant(V: Br->getCondition()) &&
636 !LI->isLoopHeader(BB: Br->getSuccessor(i: 0)) &&
637 !LI->isLoopHeader(BB: Br->getSuccessor(i: 1))) {
638 reportVectorizationFailure(
639 DebugMsg: "Unsupported conditional branch",
640 OREMsg: "loop control flow is not understood by vectorizer",
641 ORETag: "CFGNotUnderstood", ORE, TheLoop);
642 if (DoExtraAnalysis)
643 Result = false;
644 else
645 return false;
646 }
647 }
648
649 // Check whether inner loops are uniform. At this point, we only support
650 // simple outer loops scenarios with uniform nested loops.
651 if (!isUniformLoopNest(Lp: TheLoop /*loop nest*/,
652 OuterLp: TheLoop /*context outer loop*/)) {
653 reportVectorizationFailure(
654 DebugMsg: "Outer loop contains divergent loops",
655 OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
656 ORE, TheLoop);
657 if (DoExtraAnalysis)
658 Result = false;
659 else
660 return false;
661 }
662
663 // Check whether we are able to set up outer loop induction.
664 if (!setupOuterLoopInductions()) {
665 reportVectorizationFailure(DebugMsg: "Unsupported outer loop Phi(s)",
666 ORETag: "UnsupportedPhi", ORE, TheLoop);
667 if (DoExtraAnalysis)
668 Result = false;
669 else
670 return false;
671 }
672
673 return Result;
674}
675
676void LoopVectorizationLegality::addInductionPhi(PHINode *Phi,
677 const InductionDescriptor &ID) {
678 Inductions[Phi] = ID;
679
680 // In case this induction also comes with casts that we know we can ignore
681 // in the vectorized loop body, record them here. All casts could be recorded
682 // here for ignoring, but suffices to record only the first (as it is the
683 // only one that may bw used outside the cast sequence).
684 ArrayRef<Instruction *> Casts = ID.getCastInsts();
685 if (!Casts.empty())
686 InductionCastsToIgnore.insert(Ptr: *Casts.begin());
687
688 Type *PhiTy = Phi->getType();
689 const DataLayout &DL = Phi->getDataLayout();
690
691 assert((PhiTy->isIntOrPtrTy() || PhiTy->isFloatingPointTy()) &&
692 "Expected int, ptr, or FP induction phi type");
693
694 // Get the widest type.
695 if (PhiTy->isIntOrPtrTy()) {
696 if (!WidestIndTy)
697 WidestIndTy = getInductionIntegerTy(DL, Ty: PhiTy);
698 else
699 WidestIndTy = getWiderInductionTy(DL, Ty0: PhiTy, Ty1: WidestIndTy);
700 }
701
702 // Int inductions are special because we only allow one IV.
703 if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
704 ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
705 isa<Constant>(Val: ID.getStartValue()) &&
706 cast<Constant>(Val: ID.getStartValue())->isNullValue()) {
707
708 // Use the phi node with the widest type as induction. Use the last
709 // one if there are multiple (no good reason for doing this other
710 // than it is expedient). We've checked that it begins at zero and
711 // steps by one, so this is a canonical induction variable.
712 if (!PrimaryInduction || PhiTy == WidestIndTy)
713 PrimaryInduction = Phi;
714 }
715
716 LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
717}
718
719bool LoopVectorizationLegality::setupOuterLoopInductions() {
720 BasicBlock *Header = TheLoop->getHeader();
721
722 // Returns true if a given Phi is a supported induction.
723 auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
724 InductionDescriptor ID;
725 if (InductionDescriptor::isInductionPHI(Phi: &Phi, L: TheLoop, PSE, D&: ID) &&
726 ID.getKind() == InductionDescriptor::IK_IntInduction) {
727 addInductionPhi(Phi: &Phi, ID);
728 return true;
729 }
730 // Bail out for any Phi in the outer loop header that is not a supported
731 // induction.
732 LLVM_DEBUG(
733 dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
734 return false;
735 };
736
737 return llvm::all_of(Range: Header->phis(), P: IsSupportedPhi);
738}
739
740/// Checks if a function is scalarizable according to the TLI, in
741/// the sense that it should be vectorized and then expanded in
742/// multiple scalar calls. This is represented in the
743/// TLI via mappings that do not specify a vector name, as in the
744/// following example:
745///
746/// const VecDesc VecIntrinsics[] = {
747/// {"llvm.phx.abs.i32", "", 4}
748/// };
749static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
750 const StringRef ScalarName = CI.getCalledFunction()->getName();
751 bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName);
752 // Check that all known VFs are not associated to a vector
753 // function, i.e. the vector name is emty.
754 if (Scalarize) {
755 ElementCount WidestFixedVF, WidestScalableVF;
756 TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF);
757 for (ElementCount VF = ElementCount::getFixed(MinVal: 2);
758 ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= 2)
759 Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
760 for (ElementCount VF = ElementCount::getScalable(MinVal: 1);
761 ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= 2)
762 Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
763 assert((WidestScalableVF.isZero() || !Scalarize) &&
764 "Caller may decide to scalarize a variant using a scalable VF");
765 }
766 return Scalarize;
767}
768
769bool LoopVectorizationLegality::canVectorizeInstrs() {
770 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
771 bool Result = true;
772
773 // For each block in the loop.
774 for (BasicBlock *BB : TheLoop->blocks()) {
775 // Scan the instructions in the block and look for hazards.
776 for (Instruction &I : *BB) {
777 Result &= canVectorizeInstr(I);
778 if (!DoExtraAnalysis && !Result)
779 return false;
780 }
781 }
782
783 if (!PrimaryInduction) {
784 if (Inductions.empty()) {
785 reportVectorizationFailure(
786 DebugMsg: "Did not find one integer induction var",
787 OREMsg: "loop induction variable could not be identified",
788 ORETag: "NoInductionVariable", ORE, TheLoop);
789 return false;
790 }
791 if (!WidestIndTy) {
792 reportVectorizationFailure(
793 DebugMsg: "Did not find one integer induction var",
794 OREMsg: "integer loop induction variable could not be identified",
795 ORETag: "NoIntegerInductionVariable", ORE, TheLoop);
796 return false;
797 }
798 LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
799 }
800
801 // Now we know the widest induction type, check if our found induction
802 // is the same size. If it's not, unset it here and InnerLoopVectorizer
803 // will create another.
804 if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
805 PrimaryInduction = nullptr;
806
807 return Result;
808}
809
810bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
811 BasicBlock *BB = I.getParent();
812 BasicBlock *Header = TheLoop->getHeader();
813
814 if (auto *Phi = dyn_cast<PHINode>(Val: &I)) {
815 Type *PhiTy = Phi->getType();
816 // Check that this PHI type is allowed.
817 if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
818 !PhiTy->isPointerTy()) {
819 reportVectorizationFailure(
820 DebugMsg: "Found a non-int non-pointer PHI",
821 OREMsg: "loop control flow is not understood by vectorizer",
822 ORETag: "CFGNotUnderstood", ORE, TheLoop);
823 return false;
824 }
825
826 // If this PHINode is not in the header block, then we know that we
827 // can convert it to select during if-conversion. No need to check if
828 // the PHIs in this block are induction or reduction variables.
829 if (BB != Header) {
830 // Non-header phi nodes that have outside uses can be vectorized. Unsafe
831 // cyclic dependencies with header phis are identified during legalization
832 // for reduction, induction and fixed order recurrences.
833 return true;
834 }
835
836 // We only allow if-converted PHIs with exactly two incoming values.
837 if (Phi->getNumIncomingValues() != 2) {
838 reportVectorizationFailure(
839 DebugMsg: "Found an invalid PHI",
840 OREMsg: "loop control flow is not understood by vectorizer",
841 ORETag: "CFGNotUnderstood", ORE, TheLoop, I: Phi);
842 return false;
843 }
844
845 RecurrenceDescriptor RedDes;
846 if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
847 SE: PSE.getSE())) {
848 Requirements->addExactFPMathInst(I: RedDes.getExactFPMathInst());
849 Reductions[Phi] = std::move(RedDes);
850 assert((!RedDes.hasUsesOutsideReductionChain() ||
851 RecurrenceDescriptor::isMinMaxRecurrenceKind(
852 RedDes.getRecurrenceKind())) &&
853 "Only min/max recurrences are allowed to have multiple uses "
854 "currently");
855 return true;
856 }
857
858 // We prevent matching non-constant strided pointer IVS to preserve
859 // historical vectorizer behavior after a generalization of the
860 // IVDescriptor code. The intent is to remove this check, but we
861 // have to fix issues around code quality for such loops first.
862 auto IsDisallowedStridedPointerInduction =
863 [](const InductionDescriptor &ID) {
864 if (AllowStridedPointerIVs)
865 return false;
866 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
867 ID.getConstIntStepValue() == nullptr;
868 };
869
870 InductionDescriptor ID;
871 if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID) &&
872 !IsDisallowedStridedPointerInduction(ID)) {
873 addInductionPhi(Phi, ID);
874 Requirements->addExactFPMathInst(I: ID.getExactFPMathInst());
875 return true;
876 }
877
878 if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
879 FixedOrderRecurrences.insert(Ptr: Phi);
880 return true;
881 }
882
883 // As a last resort, coerce the PHI to a AddRec expression
884 // and re-try classifying it a an induction PHI.
885 if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID, Assume: true) &&
886 !IsDisallowedStridedPointerInduction(ID)) {
887 addInductionPhi(Phi, ID);
888 return true;
889 }
890
891 reportVectorizationFailure(DebugMsg: "Found an unidentified PHI",
892 OREMsg: "value that could not be identified as "
893 "reduction is used outside the loop",
894 ORETag: "NonReductionValueUsedOutsideLoop", ORE, TheLoop,
895 I: Phi);
896 return false;
897 } // end of PHI handling
898
899 // We handle calls that:
900 // * Have a mapping to an IR intrinsic.
901 // * Have a vector version available.
902 auto *CI = dyn_cast<CallInst>(Val: &I);
903
904 if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
905 !(CI->getCalledFunction() && TLI &&
906 (!VFDatabase::getMappings(CI: *CI).empty() || isTLIScalarize(TLI: *TLI, CI: *CI)))) {
907 // If the call is a recognized math libary call, it is likely that
908 // we can vectorize it given loosened floating-point constraints.
909 LibFunc Func;
910 bool IsMathLibCall =
911 TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() &&
912 TLI->getLibFunc(funcName: CI->getCalledFunction()->getName(), F&: Func) &&
913 TLI->hasOptimizedCodeGen(F: Func);
914
915 if (IsMathLibCall) {
916 // TODO: Ideally, we should not use clang-specific language here,
917 // but it's hard to provide meaningful yet generic advice.
918 // Also, should this be guarded by allowExtraAnalysis() and/or be part
919 // of the returned info from isFunctionVectorizable()?
920 reportVectorizationFailure(
921 DebugMsg: "Found a non-intrinsic callsite",
922 OREMsg: "library call cannot be vectorized. "
923 "Try compiling with -fno-math-errno, -ffast-math, "
924 "or similar flags",
925 ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
926 } else {
927 reportVectorizationFailure(DebugMsg: "Found a non-intrinsic callsite",
928 OREMsg: "call instruction cannot be vectorized",
929 ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
930 }
931 return false;
932 }
933
934 // Some intrinsics have scalar arguments and should be same in order for
935 // them to be vectorized (i.e. loop invariant).
936 if (CI) {
937 auto *SE = PSE.getSE();
938 Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
939 for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
940 if (isVectorIntrinsicWithScalarOpAtArg(ID: IntrinID, ScalarOpdIdx: Idx, TTI)) {
941 if (!SE->isLoopInvariant(S: PSE.getSCEV(V: CI->getOperand(i_nocapture: Idx)), L: TheLoop)) {
942 reportVectorizationFailure(
943 DebugMsg: "Found unvectorizable intrinsic",
944 OREMsg: "intrinsic instruction cannot be vectorized",
945 ORETag: "CantVectorizeIntrinsic", ORE, TheLoop, I: CI);
946 return false;
947 }
948 }
949 }
950
951 // If we found a vectorized variant of a function, note that so LV can
952 // make better decisions about maximum VF.
953 if (CI && !VFDatabase::getMappings(CI: *CI).empty())
954 VecCallVariantsFound = true;
955
956 auto CanWidenInstructionTy = [](Instruction const &Inst) {
957 Type *InstTy = Inst.getType();
958 if (!isa<StructType>(Val: InstTy))
959 return canVectorizeTy(Ty: InstTy);
960
961 // For now, we only recognize struct values returned from calls where
962 // all users are extractvalue as vectorizable. All element types of the
963 // struct must be types that can be widened.
964 return isa<CallInst>(Val: Inst) && canVectorizeTy(Ty: InstTy) &&
965 all_of(Range: Inst.users(), P: IsaPred<ExtractValueInst>);
966 };
967
968 // Check that the instruction return type is vectorizable.
969 // We can't vectorize casts from vector type to scalar type.
970 // Also, we can't vectorize extractelement instructions.
971 if (!CanWidenInstructionTy(I) ||
972 (isa<CastInst>(Val: I) &&
973 !VectorType::isValidElementType(ElemTy: I.getOperand(i: 0)->getType())) ||
974 isa<ExtractElementInst>(Val: I)) {
975 reportVectorizationFailure(DebugMsg: "Found unvectorizable type",
976 OREMsg: "instruction return type cannot be vectorized",
977 ORETag: "CantVectorizeInstructionReturnType", ORE,
978 TheLoop, I: &I);
979 return false;
980 }
981
982 // Check that the stored type is vectorizable.
983 if (auto *ST = dyn_cast<StoreInst>(Val: &I)) {
984 Type *T = ST->getValueOperand()->getType();
985 if (!VectorType::isValidElementType(ElemTy: T)) {
986 reportVectorizationFailure(DebugMsg: "Store instruction cannot be vectorized",
987 ORETag: "CantVectorizeStore", ORE, TheLoop, I: ST);
988 return false;
989 }
990
991 // For nontemporal stores, check that a nontemporal vector version is
992 // supported on the target.
993 if (ST->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
994 // Arbitrarily try a vector of 2 elements.
995 auto *VecTy = FixedVectorType::get(ElementType: T, /*NumElts=*/2);
996 assert(VecTy && "did not find vectorized version of stored type");
997 if (!TTI->isLegalNTStore(DataType: VecTy, Alignment: ST->getAlign())) {
998 reportVectorizationFailure(
999 DebugMsg: "nontemporal store instruction cannot be vectorized",
1000 ORETag: "CantVectorizeNontemporalStore", ORE, TheLoop, I: ST);
1001 return false;
1002 }
1003 }
1004
1005 } else if (auto *LD = dyn_cast<LoadInst>(Val: &I)) {
1006 if (LD->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
1007 // For nontemporal loads, check that a nontemporal vector version is
1008 // supported on the target (arbitrarily try a vector of 2 elements).
1009 auto *VecTy = FixedVectorType::get(ElementType: I.getType(), /*NumElts=*/2);
1010 assert(VecTy && "did not find vectorized version of load type");
1011 if (!TTI->isLegalNTLoad(DataType: VecTy, Alignment: LD->getAlign())) {
1012 reportVectorizationFailure(
1013 DebugMsg: "nontemporal load instruction cannot be vectorized",
1014 ORETag: "CantVectorizeNontemporalLoad", ORE, TheLoop, I: LD);
1015 return false;
1016 }
1017 }
1018
1019 // FP instructions can allow unsafe algebra, thus vectorizable by
1020 // non-IEEE-754 compliant SIMD units.
1021 // This applies to floating-point math operations and calls, not memory
1022 // operations, shuffles, or casts, as they don't change precision or
1023 // semantics.
1024 } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
1025 !I.isFast()) {
1026 LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
1027 Hints->setPotentiallyUnsafe();
1028 }
1029
1030 return true;
1031}
1032
1033/// Find histogram operations that match high-level code in loops:
1034/// \code
1035/// buckets[indices[i]]+=step;
1036/// \endcode
1037///
1038/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
1039/// array the computed histogram. It uses a BinOp to sum all counts, storing
1040/// them using a loop-variant index Load from the 'indices' input array.
1041///
1042/// On successful matches it updates the STATISTIC 'HistogramsDetected',
1043/// regardless of hardware support. When there is support, it additionally
1044/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers
1045/// used to update histogram in \p HistogramPtrs.
1046static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
1047 const PredicatedScalarEvolution &PSE,
1048 SmallVectorImpl<HistogramInfo> &Histograms) {
1049
1050 // Store value must come from a Binary Operation.
1051 Instruction *HPtrInstr = nullptr;
1052 BinaryOperator *HBinOp = nullptr;
1053 if (!match(V: HSt, P: m_Store(ValueOp: m_BinOp(I&: HBinOp), PointerOp: m_Instruction(I&: HPtrInstr))))
1054 return false;
1055
1056 // BinOp must be an Add or a Sub modifying the bucket value by a
1057 // loop invariant amount.
1058 // FIXME: We assume the loop invariant term is on the RHS.
1059 // Fine for an immediate/constant, but maybe not a generic value?
1060 Value *HIncVal = nullptr;
1061 if (!match(V: HBinOp, P: m_Add(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))) &&
1062 !match(V: HBinOp, P: m_Sub(L: m_Load(Op: m_Specific(V: HPtrInstr)), R: m_Value(V&: HIncVal))))
1063 return false;
1064
1065 // Make sure the increment value is loop invariant.
1066 if (!TheLoop->isLoopInvariant(V: HIncVal))
1067 return false;
1068
1069 // The address to store is calculated through a GEP Instruction.
1070 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: HPtrInstr);
1071 if (!GEP)
1072 return false;
1073
1074 // Restrict address calculation to constant indices except for the last term.
1075 Value *HIdx = nullptr;
1076 for (Value *Index : GEP->indices()) {
1077 if (HIdx)
1078 return false;
1079 if (!isa<ConstantInt>(Val: Index))
1080 HIdx = Index;
1081 }
1082
1083 if (!HIdx)
1084 return false;
1085
1086 // Check that the index is calculated by loading from another array. Ignore
1087 // any extensions.
1088 // FIXME: Support indices from other sources than a linear load from memory?
1089 // We're currently trying to match an operation looping over an array
1090 // of indices, but there could be additional levels of indirection
1091 // in place, or possibly some additional calculation to form the index
1092 // from the loaded data.
1093 Value *VPtrVal;
1094 if (!match(V: HIdx, P: m_ZExtOrSExtOrSelf(Op: m_Load(Op: m_Value(V&: VPtrVal)))))
1095 return false;
1096
1097 // Make sure the index address varies in this loop, not an outer loop.
1098 const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: VPtrVal));
1099 if (!AR || AR->getLoop() != TheLoop)
1100 return false;
1101
1102 // Ensure we'll have the same mask by checking that all parts of the histogram
1103 // (gather load, update, scatter store) are in the same block.
1104 LoadInst *IndexedLoad = cast<LoadInst>(Val: HBinOp->getOperand(i_nocapture: 0));
1105 BasicBlock *LdBB = IndexedLoad->getParent();
1106 if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
1107 return false;
1108
1109 LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");
1110
1111 // Store the operations that make up the histogram.
1112 Histograms.emplace_back(Args&: IndexedLoad, Args&: HBinOp, Args&: HSt);
1113 return true;
1114}
1115
1116bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
1117 // For now, we only support an IndirectUnsafe dependency that calculates
1118 // a histogram
1119 if (!EnableHistogramVectorization)
1120 return false;
1121
1122 // Find a single IndirectUnsafe dependency.
1123 const MemoryDepChecker::Dependence *IUDep = nullptr;
1124 const MemoryDepChecker &DepChecker = LAI->getDepChecker();
1125 const auto *Deps = DepChecker.getDependences();
1126 // If there were too many dependences, LAA abandons recording them. We can't
1127 // proceed safely if we don't know what the dependences are.
1128 if (!Deps)
1129 return false;
1130
1131 for (const MemoryDepChecker::Dependence &Dep : *Deps) {
1132 // Ignore dependencies that are either known to be safe or can be
1133 // checked at runtime.
1134 if (MemoryDepChecker::Dependence::isSafeForVectorization(Type: Dep.Type) !=
1135 MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
1136 continue;
1137
1138 // We're only interested in IndirectUnsafe dependencies here, where the
1139 // address might come from a load from memory. We also only want to handle
1140 // one such dependency, at least for now.
1141 if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
1142 return false;
1143
1144 IUDep = &Dep;
1145 }
1146 if (!IUDep)
1147 return false;
1148
1149 // For now only normal loads and stores are supported.
1150 LoadInst *LI = dyn_cast<LoadInst>(Val: IUDep->getSource(DepChecker));
1151 StoreInst *SI = dyn_cast<StoreInst>(Val: IUDep->getDestination(DepChecker));
1152
1153 if (!LI || !SI)
1154 return false;
1155
1156 LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
1157 return findHistogram(LI, HSt: SI, TheLoop, PSE: LAI->getPSE(), Histograms);
1158}
1159
1160bool LoopVectorizationLegality::canVectorizeMemory() {
1161 LAI = &LAIs.getInfo(L&: *TheLoop);
1162 const OptimizationRemarkAnalysis *LAR = LAI->getReport();
1163 if (LAR) {
1164 ORE->emit(RemarkBuilder: [&]() {
1165 return OptimizationRemarkAnalysis(LV_NAME, "loop not vectorized: ", *LAR);
1166 });
1167 }
1168
1169 if (!LAI->canVectorizeMemory()) {
1170 if (hasUncountableExitWithSideEffects()) {
1171 reportVectorizationFailure(
1172 DebugMsg: "Cannot vectorize unsafe dependencies in uncountable exit loop with "
1173 "side effects",
1174 ORETag: "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
1175 TheLoop);
1176 return false;
1177 }
1178
1179 return canVectorizeIndirectUnsafeDependences();
1180 }
1181
1182 if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
1183 reportVectorizationFailure(DebugMsg: "We don't allow storing to uniform addresses",
1184 OREMsg: "write to a loop invariant address could not "
1185 "be vectorized",
1186 ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE,
1187 TheLoop);
1188 return false;
1189 }
1190
1191 // We can vectorize stores to invariant address when final reduction value is
1192 // guaranteed to be stored at the end of the loop. Also, if decision to
1193 // vectorize loop is made, runtime checks are added so as to make sure that
1194 // invariant address won't alias with any other objects.
1195 if (!LAI->getStoresToInvariantAddresses().empty()) {
1196 // For each invariant address, check if last stored value is unconditional
1197 // and the address is not calculated inside the loop.
1198 for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1199 if (!isInvariantStoreOfReduction(SI))
1200 continue;
1201
1202 if (blockNeedsPredication(BB: SI->getParent())) {
1203 reportVectorizationFailure(
1204 DebugMsg: "We don't allow storing to uniform addresses",
1205 OREMsg: "write of conditional recurring variant value to a loop "
1206 "invariant address could not be vectorized",
1207 ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1208 return false;
1209 }
1210
1211 // Invariant address should be defined outside of loop. LICM pass usually
1212 // makes sure it happens, but in rare cases it does not, we do not want
1213 // to overcomplicate vectorization to support this case.
1214 if (Instruction *Ptr = dyn_cast<Instruction>(Val: SI->getPointerOperand())) {
1215 if (TheLoop->contains(Inst: Ptr)) {
1216 reportVectorizationFailure(
1217 DebugMsg: "Invariant address is calculated inside the loop",
1218 OREMsg: "write to a loop invariant address could not "
1219 "be vectorized",
1220 ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1221 return false;
1222 }
1223 }
1224 }
1225
1226 if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
1227 // For each invariant address, check its last stored value is the result
1228 // of one of our reductions.
1229 //
1230 // We do not check if dependence with loads exists because that is already
1231 // checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress.
1232 ScalarEvolution *SE = PSE.getSE();
1233 SmallVector<StoreInst *, 4> UnhandledStores;
1234 for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1235 if (isInvariantStoreOfReduction(SI)) {
1236 // Earlier stores to this address are effectively deadcode.
1237 // With opaque pointers it is possible for one pointer to be used with
1238 // different sizes of stored values:
1239 // store i32 0, ptr %x
1240 // store i8 0, ptr %x
1241 // The latest store doesn't complitely overwrite the first one in the
1242 // example. That is why we have to make sure that types of stored
1243 // values are same.
1244 // TODO: Check that bitwidth of unhandled store is smaller then the
1245 // one that overwrites it and add a test.
1246 erase_if(C&: UnhandledStores, P: [SE, SI](StoreInst *I) {
1247 return storeToSameAddress(SE, A: SI, B: I) &&
1248 I->getValueOperand()->getType() ==
1249 SI->getValueOperand()->getType();
1250 });
1251 continue;
1252 }
1253 UnhandledStores.push_back(Elt: SI);
1254 }
1255
1256 bool IsOK = UnhandledStores.empty();
1257 // TODO: we should also validate against InvariantMemSets.
1258 if (!IsOK) {
1259 reportVectorizationFailure(
1260 DebugMsg: "We don't allow storing to uniform addresses",
1261 OREMsg: "write to a loop invariant address could not "
1262 "be vectorized",
1263 ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1264 return false;
1265 }
1266 }
1267 }
1268
1269 PSE.addPredicate(Pred: LAI->getPSE().getPredicate());
1270 return true;
1271}
1272
1273bool LoopVectorizationLegality::canVectorizeFPMath(
1274 bool EnableStrictReductions) {
1275
1276 // First check if there is any ExactFP math or if we allow reassociations
1277 if (!Requirements->getExactFPInst() || Hints->allowReordering())
1278 return true;
1279
1280 // If the above is false, we have ExactFPMath & do not allow reordering.
1281 // If the EnableStrictReductions flag is set, first check if we have any
1282 // Exact FP induction vars, which we cannot vectorize.
1283 if (!EnableStrictReductions ||
1284 any_of(Range: getInductionVars(), P: [&](auto &Induction) -> bool {
1285 InductionDescriptor IndDesc = Induction.second;
1286 return IndDesc.getExactFPMathInst();
1287 }))
1288 return false;
1289
1290 // We can now only vectorize if all reductions with Exact FP math also
1291 // have the isOrdered flag set, which indicates that we can move the
1292 // reduction operations in-loop.
1293 return (all_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1294 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1295 return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
1296 }));
1297}
1298
1299bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1300 return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1301 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1302 return RdxDesc.IntermediateStore == SI;
1303 });
1304}
1305
1306bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1307 return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1308 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1309 if (!RdxDesc.IntermediateStore)
1310 return false;
1311
1312 ScalarEvolution *SE = PSE.getSE();
1313 Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1314 return V == InvariantAddress ||
1315 SE->getSCEV(V) == SE->getSCEV(V: InvariantAddress);
1316 });
1317}
1318
1319bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
1320 Value *In0 = const_cast<Value *>(V);
1321 PHINode *PN = dyn_cast_or_null<PHINode>(Val: In0);
1322 if (!PN)
1323 return false;
1324
1325 return Inductions.count(Key: PN);
1326}
1327
1328bool LoopVectorizationLegality::isCastedInductionVariable(
1329 const Value *V) const {
1330 auto *Inst = dyn_cast<Instruction>(Val: V);
1331 return (Inst && InductionCastsToIgnore.count(Ptr: Inst));
1332}
1333
1334bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
1335 return isInductionPhi(V) || isCastedInductionVariable(V);
1336}
1337
1338bool LoopVectorizationLegality::isFixedOrderRecurrence(
1339 const PHINode *Phi) const {
1340 return FixedOrderRecurrences.count(Ptr: Phi);
1341}
1342
1343bool LoopVectorizationLegality::blockNeedsPredication(
1344 const BasicBlock *BB) const {
1345 BasicBlock *Latch = TheLoop->getLoopLatch();
1346
1347 // Without a latch, we cannot properly answer blockNeedsPredication,
1348 // return early.
1349 if (!Latch) {
1350 assert(ORE->allowExtraAnalysis(DEBUG_TYPE) &&
1351 !canVectorizeLoopCFG(TheLoop, /*UseVPlanNativePath=*/false) &&
1352 "Loop shape should have been rejected by earlier checks");
1353 return false;
1354 }
1355
1356 // When vectorizing early exits, create predicates for the latch block only.
1357 // For a single early exit, it must be a direct predecessor of the latch.
1358 // For multiple early exits, they form a chain where each exiting block
1359 // dominates all subsequent blocks up to the latch.
1360 if (hasUncountableEarlyExit())
1361 return BB == Latch;
1362 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1363}
1364
1365bool LoopVectorizationLegality::blockCanBePredicated(
1366 BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
1367 SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
1368 for (Instruction &I : *BB) {
1369 // We can predicate blocks with calls to assume, as long as we drop them in
1370 // case we flatten the CFG via predication.
1371 if (match(V: &I, P: m_Intrinsic<Intrinsic::assume>())) {
1372 MaskedOp.insert(Ptr: &I);
1373 continue;
1374 }
1375
1376 // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
1377 // TODO: there might be cases that it should block the vectorization. Let's
1378 // ignore those for now.
1379 if (isa<NoAliasScopeDeclInst>(Val: &I))
1380 continue;
1381
1382 // We can allow masked calls if there's at least one vector variant, even
1383 // if we end up scalarizing due to the cost model calculations.
1384 // TODO: Allow other calls if they have appropriate attributes... readonly
1385 // and argmemonly?
1386 if (CallInst *CI = dyn_cast<CallInst>(Val: &I))
1387 if (VFDatabase::hasMaskedVariant(CI: *CI)) {
1388 MaskedOp.insert(Ptr: CI);
1389 continue;
1390 }
1391
1392 // Loads are handled via masking (or speculated if safe to do so.)
1393 if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1394 if (!SafePtrs.count(Ptr: LI->getPointerOperand()))
1395 MaskedOp.insert(Ptr: LI);
1396 continue;
1397 }
1398
1399 // Predicated store requires some form of masking:
1400 // 1) masked store HW instruction,
1401 // 2) emulation via load-blend-store (only if safe and legal to do so,
1402 // be aware on the race conditions), or
1403 // 3) element-by-element predicate check and scalar store.
1404 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1405 MaskedOp.insert(Ptr: SI);
1406 continue;
1407 }
1408
1409 if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
1410 return false;
1411 }
1412
1413 return true;
1414}
1415
1416bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1417 if (!EnableIfConversion) {
1418 reportVectorizationFailure(DebugMsg: "If-conversion is disabled",
1419 ORETag: "IfConversionDisabled", ORE, TheLoop);
1420 return false;
1421 }
1422
1423 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
1424
1425 // A list of pointers which are known to be dereferenceable within scope of
1426 // the loop body for each iteration of the loop which executes. That is,
1427 // the memory pointed to can be dereferenced (with the access size implied by
1428 // the value's type) unconditionally within the loop header without
1429 // introducing a new fault.
1430 SmallPtrSet<Value *, 8> SafePointers;
1431
1432 // Collect safe addresses.
1433 for (BasicBlock *BB : TheLoop->blocks()) {
1434 if (!blockNeedsPredication(BB)) {
1435 for (Instruction &I : *BB)
1436 if (auto *Ptr = getLoadStorePointerOperand(V: &I))
1437 SafePointers.insert(Ptr);
1438 continue;
1439 }
1440
1441 // For a block which requires predication, a address may be safe to access
1442 // in the loop w/o predication if we can prove dereferenceability facts
1443 // sufficient to ensure it'll never fault within the loop. For the moment,
1444 // we restrict this to loads; stores are more complicated due to
1445 // concurrency restrictions.
1446 ScalarEvolution &SE = *PSE.getSE();
1447 SmallVector<const SCEVPredicate *, 4> Predicates;
1448 for (Instruction &I : *BB) {
1449 LoadInst *LI = dyn_cast<LoadInst>(Val: &I);
1450
1451 // Make sure we can execute all computations feeding into Ptr in the loop
1452 // w/o triggering UB and that none of the out-of-loop operands are poison.
1453 // We do not need to check if operations inside the loop can produce
1454 // poison due to flags (e.g. due to an inbounds GEP going out of bounds),
1455 // because flags will be dropped when executing them unconditionally.
1456 // TODO: Results could be improved by considering poison-propagation
1457 // properties of visited ops.
1458 auto CanSpeculatePointerOp = [this](Value *Ptr) {
1459 SmallVector<Value *> Worklist = {Ptr};
1460 SmallPtrSet<Value *, 4> Visited;
1461 while (!Worklist.empty()) {
1462 Value *CurrV = Worklist.pop_back_val();
1463 if (!Visited.insert(Ptr: CurrV).second)
1464 continue;
1465
1466 auto *CurrI = dyn_cast<Instruction>(Val: CurrV);
1467 if (!CurrI || !TheLoop->contains(Inst: CurrI)) {
1468 BasicBlock *LoopPred = TheLoop->getLoopPredecessor();
1469 Instruction *CtxI = LoopPred ? LoopPred->getTerminator() : nullptr;
1470 assert((CtxI || ORE->allowExtraAnalysis(DEBUG_TYPE)) &&
1471 "Loop with multiple predecessors should have been rejected "
1472 "early.");
1473 // If operands from outside the loop may be poison then Ptr may also
1474 // be poison.
1475 if (!isGuaranteedNotToBePoison(V: CurrV, AC, CtxI, DT))
1476 return false;
1477 continue;
1478 }
1479
1480 // A loaded value may be poison, independent of any flags.
1481 if (isa<LoadInst>(Val: CurrI) && !isGuaranteedNotToBePoison(V: CurrV, AC))
1482 return false;
1483
1484 // For other ops, assume poison can only be introduced via flags,
1485 // which can be dropped.
1486 if (!isa<PHINode>(Val: CurrI) && !isSafeToSpeculativelyExecute(I: CurrI))
1487 return false;
1488 append_range(C&: Worklist, R: CurrI->operands());
1489 }
1490 return true;
1491 };
1492 // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
1493 // that it will consider loops that need guarding by SCEV checks. The
1494 // vectoriser will generate these checks if we decide to vectorise.
1495 if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(LI: *LI) &&
1496 CanSpeculatePointerOp(LI->getPointerOperand()) &&
1497 isDereferenceableAndAlignedInLoop(LI, L: TheLoop, SE, DT&: *DT, AC,
1498 Predicates: &Predicates))
1499 SafePointers.insert(Ptr: LI->getPointerOperand());
1500 Predicates.clear();
1501 }
1502 }
1503
1504 // Collect the blocks that need predication.
1505 for (BasicBlock *BB : TheLoop->blocks()) {
1506 // We support only branches and switch statements as terminators inside the
1507 // loop.
1508 if (isa<SwitchInst>(Val: BB->getTerminator())) {
1509 if (TheLoop->isLoopExiting(BB)) {
1510 reportVectorizationFailure(DebugMsg: "Loop contains an unsupported switch",
1511 ORETag: "LoopContainsUnsupportedSwitch", ORE,
1512 TheLoop, I: BB->getTerminator());
1513 return false;
1514 }
1515 } else if (!isa<UncondBrInst, CondBrInst>(Val: BB->getTerminator())) {
1516 reportVectorizationFailure(DebugMsg: "Loop contains an unsupported terminator",
1517 ORETag: "LoopContainsUnsupportedTerminator", ORE,
1518 TheLoop, I: BB->getTerminator());
1519 return false;
1520 }
1521
1522 // We must be able to predicate all blocks that need to be predicated.
1523 if (blockNeedsPredication(BB) &&
1524 !blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: ConditionallyExecutedOps)) {
1525 reportVectorizationFailure(
1526 DebugMsg: "Control flow cannot be substituted for a select", ORETag: "NoCFGForSelect",
1527 ORE, TheLoop, I: BB->getTerminator());
1528 return false;
1529 }
1530 }
1531
1532 // We can if-convert this loop.
1533 return true;
1534}
1535
1536// Helper function to canVectorizeLoopNestCFG.
1537bool LoopVectorizationLegality::canVectorizeLoopCFG(
1538 Loop *Lp, bool UseVPlanNativePath) const {
1539 assert((UseVPlanNativePath || Lp->isInnermost()) &&
1540 "VPlan-native path is not enabled.");
1541
1542 // TODO: ORE should be improved to show more accurate information when an
1543 // outer loop can't be vectorized because a nested loop is not understood or
1544 // legal. Something like: "outer_loop_location: loop not vectorized:
1545 // (inner_loop_location) loop control flow is not understood by vectorizer".
1546
1547 // Store the result and return it at the end instead of exiting early, in case
1548 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1549 bool Result = true;
1550 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1551
1552 // We must have a loop in canonical form. Loops with indirectbr in them cannot
1553 // be canonicalized.
1554 if (!Lp->getLoopPreheader()) {
1555 reportVectorizationFailure(
1556 DebugMsg: "Loop doesn't have a legal pre-header",
1557 OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
1558 ORE, TheLoop);
1559 if (DoExtraAnalysis)
1560 Result = false;
1561 else
1562 return false;
1563 }
1564
1565 // We must have a single backedge.
1566 if (Lp->getNumBackEdges() != 1) {
1567 reportVectorizationFailure(
1568 DebugMsg: "The loop must have a single backedge",
1569 OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
1570 ORE, TheLoop);
1571 if (DoExtraAnalysis)
1572 Result = false;
1573 else
1574 return false;
1575 }
1576
1577 // The latch must be terminated by a branch.
1578 BasicBlock *Latch = Lp->getLoopLatch();
1579 if (Latch && !isa<UncondBrInst, CondBrInst>(Val: Latch->getTerminator())) {
1580 reportVectorizationFailure(
1581 DebugMsg: "The loop latch terminator is not a UncondBrInst/CondBrInst",
1582 OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
1583 ORE, TheLoop);
1584 if (DoExtraAnalysis)
1585 Result = false;
1586 else
1587 return false;
1588 }
1589
1590 return Result;
1591}
1592
1593bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1594 Loop *Lp, bool UseVPlanNativePath) {
1595 // Store the result and return it at the end instead of exiting early, in case
1596 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1597 bool Result = true;
1598 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1599 if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1600 if (DoExtraAnalysis)
1601 Result = false;
1602 else
1603 return false;
1604 }
1605
1606 // Recursively check whether the loop control flow of nested loops is
1607 // understood.
1608 for (Loop *SubLp : *Lp)
1609 if (!canVectorizeLoopNestCFG(Lp: SubLp, UseVPlanNativePath)) {
1610 if (DoExtraAnalysis)
1611 Result = false;
1612 else
1613 return false;
1614 }
1615
1616 return Result;
1617}
1618
1619bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1620 BasicBlock *LatchBB = TheLoop->getLoopLatch();
1621 if (!LatchBB) {
1622 reportVectorizationFailure(DebugMsg: "Loop does not have a latch",
1623 OREMsg: "Cannot vectorize early exit loop",
1624 ORETag: "NoLatchEarlyExit", ORE, TheLoop);
1625 return false;
1626 }
1627
1628 if (Reductions.size() || FixedOrderRecurrences.size()) {
1629 reportVectorizationFailure(
1630 DebugMsg: "Found reductions or recurrences in early-exit loop",
1631 OREMsg: "Cannot vectorize early exit loop with reductions or recurrences",
1632 ORETag: "RecurrencesInEarlyExitLoop", ORE, TheLoop);
1633 return false;
1634 }
1635
1636 SmallVector<BasicBlock *, 8> ExitingBlocks;
1637 TheLoop->getExitingBlocks(ExitingBlocks);
1638
1639 // Keep a record of all the exiting blocks.
1640 SmallVector<const SCEVPredicate *, 4> Predicates;
1641 SmallVector<BasicBlock *> UncountableExitingBlocks;
1642 for (BasicBlock *BB : ExitingBlocks) {
1643 const SCEV *EC =
1644 PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: BB, Predicates: &Predicates);
1645 if (isa<SCEVCouldNotCompute>(Val: EC)) {
1646 if (size(Range: successors(BB)) != 2) {
1647 reportVectorizationFailure(
1648 DebugMsg: "Early exiting block does not have exactly two successors",
1649 OREMsg: "Incorrect number of successors from early exiting block",
1650 ORETag: "EarlyExitTooManySuccessors", ORE, TheLoop);
1651 return false;
1652 }
1653
1654 UncountableExitingBlocks.push_back(Elt: BB);
1655 } else
1656 CountableExitingBlocks.push_back(Elt: BB);
1657 }
1658 // We can safely ignore the predicates here because when vectorizing the loop
1659 // the PredicatatedScalarEvolution class will keep track of all predicates
1660 // for each exiting block anyway. This happens when calling
1661 // PSE.getSymbolicMaxBackedgeTakenCount() below.
1662 Predicates.clear();
1663
1664 if (UncountableExitingBlocks.empty()) {
1665 LLVM_DEBUG(dbgs() << "LV: Could not find any uncountable exits");
1666 return false;
1667 }
1668
1669 // The latch block must have a countable exit.
1670 if (isa<SCEVCouldNotCompute>(
1671 Val: PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: LatchBB, Predicates: &Predicates))) {
1672 reportVectorizationFailure(
1673 DebugMsg: "Cannot determine exact exit count for latch block",
1674 OREMsg: "Cannot vectorize early exit loop",
1675 ORETag: "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1676 return false;
1677 }
1678 assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
1679 "Latch block not found in list of countable exits!");
1680
1681 // Check to see if there are instructions that could potentially generate
1682 // exceptions or have side-effects.
1683 auto IsSafeOperation = [](Instruction *I) -> bool {
1684 switch (I->getOpcode()) {
1685 case Instruction::Load:
1686 case Instruction::Store:
1687 case Instruction::PHI:
1688 case Instruction::UncondBr:
1689 case Instruction::CondBr:
1690 // These are checked separately.
1691 return true;
1692 default:
1693 return isSafeToSpeculativelyExecute(I);
1694 }
1695 };
1696
1697 bool HasSideEffects = false;
1698 for (auto *BB : TheLoop->blocks())
1699 for (auto &I : *BB) {
1700 if (I.mayWriteToMemory()) {
1701 if (isa<StoreInst>(Val: &I) && cast<StoreInst>(Val: &I)->isSimple()) {
1702 HasSideEffects = true;
1703 continue;
1704 }
1705
1706 // We don't support complex writes to memory.
1707 reportVectorizationFailure(
1708 DebugMsg: "Complex writes to memory unsupported in early exit loops",
1709 OREMsg: "Cannot vectorize early exit loop with complex writes to memory",
1710 ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
1711 return false;
1712 }
1713
1714 if (!IsSafeOperation(&I)) {
1715 reportVectorizationFailure(DebugMsg: "Early exit loop contains operations that "
1716 "cannot be speculatively executed",
1717 ORETag: "UnsafeOperationsEarlyExitLoop", ORE,
1718 TheLoop);
1719 return false;
1720 }
1721 }
1722
1723 SmallVector<LoadInst *, 4> NonDerefLoads;
1724 // TODO: Handle loops that may fault.
1725 if (!HasSideEffects) {
1726 // Read-only loop.
1727 Predicates.clear();
1728 if (!isReadOnlyLoop(L: TheLoop, SE: PSE.getSE(), DT, AC, NonDereferenceableAndAlignedLoads&: NonDerefLoads,
1729 Predicates: &Predicates)) {
1730 reportVectorizationFailure(
1731 DebugMsg: "Loop may fault", OREMsg: "Cannot vectorize non-read-only early exit loop",
1732 ORETag: "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1733 return false;
1734 }
1735 } else {
1736 // Check all uncountable exiting blocks for movable loads.
1737 for (BasicBlock *ExitingBB : UncountableExitingBlocks) {
1738 if (!canUncountableExitConditionLoadBeMoved(ExitingBlock: ExitingBB))
1739 return false;
1740 }
1741 }
1742
1743 // Check non-dereferenceable loads if any.
1744 for (LoadInst *LI : NonDerefLoads) {
1745 // Only support unit-stride access for now.
1746 int Stride = isConsecutivePtr(AccessTy: LI->getType(), Ptr: LI->getPointerOperand());
1747 if (Stride != 1) {
1748 reportVectorizationFailure(
1749 DebugMsg: "Loop contains potentially faulting strided load",
1750 OREMsg: "Cannot vectorize early exit loop with "
1751 "strided fault-only-first load",
1752 ORETag: "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
1753 return false;
1754 }
1755 }
1756
1757 [[maybe_unused]] const SCEV *SymbolicMaxBTC =
1758 PSE.getSymbolicMaxBackedgeTakenCount();
1759 // Since we have an exact exit count for the latch and the early exit
1760 // dominates the latch, then this should guarantee a computed SCEV value.
1761 assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1762 "Failed to get symbolic expression for backedge taken count");
1763 LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
1764 "backedge taken count: "
1765 << *SymbolicMaxBTC << '\n');
1766 UncountableExitType = HasSideEffects ? UncountableExitTrait::ReadWrite
1767 : UncountableExitTrait::ReadOnly;
1768 return true;
1769}
1770
1771bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
1772 BasicBlock *ExitingBlock) {
1773 // Try to find a load in the critical path for the uncountable exit condition.
1774 // This is currently matching about the simplest form we can, expecting
1775 // only one in-loop load, the result of which is directly compared against
1776 // a loop-invariant value.
1777 // FIXME: We're insisting on a single use for now, because otherwise we will
1778 // need to make PHI nodes for other users. That can be done once the initial
1779 // transform code lands.
1780 auto *Br = cast<CondBrInst>(Val: ExitingBlock->getTerminator());
1781
1782 using namespace llvm::PatternMatch;
1783 Instruction *L = nullptr;
1784 Value *Ptr = nullptr;
1785 Value *R = nullptr;
1786 // The exit-condition load can appear on either side of the icmp.
1787 if (!match(V: Br->getCondition(),
1788 P: m_OneUse(SubPattern: m_c_ICmp(L: m_OneUse(SubPattern: m_Instruction(I&: L, P: m_Load(Op: m_Value(V&: Ptr)))),
1789 R: m_Value(V&: R))))) {
1790 reportVectorizationFailure(
1791 DebugMsg: "Early exit loop with store but no supported condition load",
1792 ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1793 return false;
1794 }
1795
1796 if (!TheLoop->isLoopInvariant(V: R)) {
1797 reportVectorizationFailure(
1798 DebugMsg: "Early exit loop with store but no supported condition load",
1799 ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1800 return false;
1801 }
1802
1803 // Make sure that the load address is not loop invariant; we want an
1804 // address calculation that we can rotate to the next vector iteration.
1805 const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: Ptr));
1806 if (!AR || AR->getLoop() != TheLoop || !AR->isAffine()) {
1807 reportVectorizationFailure(
1808 DebugMsg: "Uncountable exit condition depends on load with an address that is "
1809 "not an add recurrence in the loop",
1810 ORETag: "EarlyExitLoadInvariantAddress", ORE, TheLoop);
1811 return false;
1812 }
1813
1814 ICFLoopSafetyInfo SafetyInfo;
1815 SafetyInfo.computeLoopSafetyInfo(CurLoop: TheLoop);
1816 LoadInst *Load = cast<LoadInst>(Val: L);
1817 // We need to know that load will be executed before we can hoist a
1818 // copy out to run just before the first iteration.
1819 if (!SafetyInfo.isGuaranteedToExecute(Inst: *Load, DT, CurLoop: TheLoop)) {
1820 reportVectorizationFailure(
1821 DebugMsg: "Load for uncountable exit not guaranteed to execute",
1822 ORETag: "ConditionalUncountableExitLoad", ORE, TheLoop);
1823 return false;
1824 }
1825
1826 // Prohibit any potential aliasing with any instruction in the loop which
1827 // might store to memory.
1828 // FIXME: Relax this constraint where possible.
1829 for (auto *BB : TheLoop->blocks()) {
1830 for (auto &I : *BB) {
1831 if (&I == Load)
1832 continue;
1833
1834 if (I.mayReadOrWriteMemory()) {
1835 // We need to mask all other memory ops.
1836 ConditionallyExecutedOps.insert(Ptr: &I);
1837 if (isa<LoadInst>(Val: &I))
1838 continue;
1839 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1840 AliasResult AR = AA->alias(V1: Ptr, V2: SI->getPointerOperand());
1841 if (AR == AliasResult::NoAlias)
1842 continue;
1843 }
1844
1845 reportVectorizationFailure(
1846 DebugMsg: "Cannot determine whether critical uncountable exit load address "
1847 "does not alias with a memory write",
1848 ORETag: "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
1849 return false;
1850 }
1851 }
1852 }
1853
1854 return true;
1855}
1856
1857bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1858 // Store the result and return it at the end instead of exiting early, in case
1859 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1860 bool Result = true;
1861
1862 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1863 // Check whether the loop-related control flow in the loop nest is expected by
1864 // vectorizer.
1865 if (!canVectorizeLoopNestCFG(Lp: TheLoop, UseVPlanNativePath)) {
1866 if (DoExtraAnalysis) {
1867 LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
1868 Result = false;
1869 } else {
1870 return false;
1871 }
1872 }
1873
1874 // We need to have a loop header.
1875 LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1876 << '\n');
1877
1878 // Specific checks for outer loops. We skip the remaining legal checks at this
1879 // point because they don't support outer loops.
1880 if (!TheLoop->isInnermost()) {
1881 assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1882
1883 if (!canVectorizeOuterLoop()) {
1884 reportVectorizationFailure(DebugMsg: "Unsupported outer loop",
1885 ORETag: "UnsupportedOuterLoop", ORE, TheLoop);
1886 // TODO: Implement DoExtraAnalysis when subsequent legal checks support
1887 // outer loops.
1888 return false;
1889 }
1890
1891 LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1892 return Result;
1893 }
1894
1895 assert(TheLoop->isInnermost() && "Inner loop expected.");
1896 // Check if we can if-convert non-single-bb loops.
1897 unsigned NumBlocks = TheLoop->getNumBlocks();
1898 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1899 LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1900 if (DoExtraAnalysis)
1901 Result = false;
1902 else
1903 return false;
1904 }
1905
1906 // Check if we can vectorize the instructions and CFG in this loop.
1907 if (!canVectorizeInstrs()) {
1908 LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1909 if (DoExtraAnalysis)
1910 Result = false;
1911 else
1912 return false;
1913 }
1914
1915 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
1916 if (TheLoop->getExitingBlock()) {
1917 reportVectorizationFailure(DebugMsg: "Cannot vectorize uncountable loop",
1918 ORETag: "UnsupportedUncountableLoop", ORE, TheLoop);
1919 if (DoExtraAnalysis)
1920 Result = false;
1921 else
1922 return false;
1923 } else {
1924 if (!isVectorizableEarlyExitLoop()) {
1925 assert(UncountableExitType == UncountableExitTrait::None &&
1926 "Must be false without vectorizable early-exit loop");
1927 if (DoExtraAnalysis)
1928 Result = false;
1929 else
1930 return false;
1931 }
1932 }
1933 }
1934
1935 // Go over each instruction and look at memory deps.
1936 if (!canVectorizeMemory()) {
1937 LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1938 if (DoExtraAnalysis)
1939 Result = false;
1940 else
1941 return false;
1942 }
1943
1944 // TODO: Remove this restriction, should be straightforward to support.
1945 if (UncountableExitType != UncountableExitTrait::None &&
1946 !LAI->getStoresToInvariantAddresses().empty()) {
1947 LLVM_DEBUG(dbgs() << "LV: Cannot vectorize early exit loops with stores to "
1948 "loop-invariant addresses\n");
1949 reportVectorizationFailure(DebugMsg: "Cannot vectorize early exit loops with stores "
1950 "to loop-invariant addresses",
1951 ORETag: "LoopInvariantStoresInEELoop", ORE, TheLoop);
1952 return false;
1953 }
1954
1955 if (Result) {
1956 LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1957 << (LAI->getRuntimePointerChecking()->Need
1958 ? " (with a runtime bound check)"
1959 : "")
1960 << "!\n");
1961 }
1962
1963 // Okay! We've done all the tests. If any have failed, return false. Otherwise
1964 // we can vectorize, and at this point we don't have any other mem analysis
1965 // which may limit our maximum vectorization factor, so just return true with
1966 // no restrictions.
1967 return Result;
1968}
1969
1970bool LoopVectorizationLegality::canFoldTailByMasking() const {
1971 // The only loops we can vectorize without a scalar epilogue, are loops with
1972 // a bottom-test and a single exiting block. We'd have to handle the fact
1973 // that not every instruction executes on the last iteration. This will
1974 // require a lane mask which varies through the vector loop body. (TODO)
1975 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1976 LLVM_DEBUG(
1977 dbgs()
1978 << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
1979 return false;
1980 }
1981
1982 LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1983
1984 // The list of pointers that we can safely read and write to remains empty.
1985 SmallPtrSet<Value *, 8> SafePointers;
1986
1987 // Check all blocks for predication, including those that ordinarily do not
1988 // need predication such as the header block.
1989 SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
1990 for (BasicBlock *BB : TheLoop->blocks()) {
1991 if (!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TmpMaskedOp)) {
1992 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
1993 return false;
1994 }
1995 }
1996
1997 LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
1998
1999 return true;
2000}
2001
2002void LoopVectorizationLegality::prepareToFoldTailByMasking() {
2003 // The list of pointers that we can safely read and write to remains empty.
2004 SmallPtrSet<Value *, 8> SafePointers;
2005
2006 // Mark all blocks for predication, including those that ordinarily do not
2007 // need predication such as the header block, and collect instructions needing
2008 // predication in TailFoldedMaskedOp.
2009 for (BasicBlock *BB : TheLoop->blocks()) {
2010 [[maybe_unused]] bool R =
2011 blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TailFoldedMaskedOp);
2012 assert(R && "Must be able to predicate block when tail-folding.");
2013 }
2014}
2015
2016} // namespace llvm
2017