//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden,
                       cl::desc("Enable recognition of non-constant strided "
                                "pointer induction variables."));

static cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

static cl::opt<bool> EnableHistogramVectorization(
    "enable-histogram-loop-vectorization", cl::init(false), cl::Hidden,
    cl::desc("Enables autovectorization of some loops containing histograms"));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  //  - Target default
  //  - Metadata width
  //  - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);
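
  // The resulting loop metadata looks roughly like this (a sketch; the node
  // numbering is illustrative):
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"llvm.loop.isvectorized", i32 1}
  // with any "llvm.loop.vectorize.*" and "llvm.loop.interleave.*" hints
  // stripped by makePostTransformationMetadata above.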

  // Update internal cache.
  IsVectorized.Value = 1;
}

void LoopVectorizeHints::reportDisallowedVectorization(
    const StringRef DebugMsg, const StringRef RemarkName,
    const StringRef RemarkMsg, const Loop *L) const {
  LLVM_DEBUG(dbgs() << "LV: Not vectorizing: " << DebugMsg << ".\n");
  ORE.emit(OptimizationRemarkMissed(LV_NAME, RemarkName, L->getStartLoc(),
                                    L->getHeader())
           << "loop not vectorized: " << RemarkMsg);
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    if (Force.Value == LoopVectorizeHints::FK_Disabled) {
      reportDisallowedVectorization("#pragma vectorize disable",
                                    "MissedExplicitlyDisabled",
                                    "vectorization is explicitly disabled", L);
    } else if (hasDisableAllTransformsHint(L)) {
      reportDisallowedVectorization("loop hasDisableAllTransformsHint",
                                    "MissedTransformsDisabled",
                                    "loop transformations are disabled", L);
    } else {
      llvm_unreachable("loop vect disabled for an unknown reason");
    }
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    reportDisallowedVectorization(
        "VectorizeOnlyWhenForced is set, and no #pragma vectorize enable",
        "MissedForceOnly", "only vectorizing loops that explicitly request it",
        L);
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";

    OptimizationRemarkMissed R(LV_NAME, "MissedDetails", TheLoop->getStartLoc(),
                               TheLoop->getHeader());
    R << "loop not vectorized";
    if (Force.Value == LoopVectorizeHints::FK_Enabled) {
      R << " (Force=" << NV("Force", true);
      if (Width.Value != 0)
        R << ", Vector Width=" << NV("VectorWidth", getWidth());
      if (getInterleave() != 0)
        R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
      R << ")";
    }
    return R;
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if enabling
  // loop hints are provided
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

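// The hints consumed below are encoded as loop metadata. For example, a loop
// annotated with '#pragma clang loop vectorize_width(8)' carries metadata
// roughly like this (a sketch; node numbers are illustrative):
//
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.vectorize.width", i32 8}
//
// Every operand after the self-referential first one is matched against the
// known hint names in setHint().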
void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned Idx = 1; Idx < MD->getNumOperands(); ++Idx)
        Args.push_back(MD->getOperand(Idx));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.consume_front(Prefix()))
    return;

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width, &Interleave, &Force,
                   &IsVectorized, &Predicate, &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch, and
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is not
// the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
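//
// For example, in the following nest the inner loop Lp is uniform with regard
// to OuterLp, assuming N is OuterLp-invariant (a sketch in C): every vector
// lane of the vectorized outer loop runs the same inner iterations.
//
//   for (int i = 0; i < M; ++i)   // OuterLp
//     for (int j = 0; j < N; ++j) // Lp: canonical IV j, latch compares j to N
//       A[i][j] += B[j];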
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

static IntegerType *getInductionIntegerTy(const DataLayout &DL, Type *Ty) {
  assert(Ty->isIntOrPtrTy() && "Expected integer or pointer type");

  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty->getContext(), Ty->getPointerAddressSpace());

  // It is possible that chars or shorts overflow when we ask for the loop's
  // trip count; work around this by changing the type size.
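  // For example, an i8 or i16 IV could not represent a trip count of 70000,
  // so such IVs are counted in i32 instead.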
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return cast<IntegerType>(Ty);
}

static IntegerType *getWiderInductionTy(const DataLayout &DL, Type *Ty0,
                                        Type *Ty1) {
  IntegerType *TyA = getInductionIntegerTy(DL, Ty0);
  IntegerType *TyB = getInductionIntegerTy(DL, Ty1);
  return TyA->getScalarSizeInBits() > TyB->getScalarSizeInBits() ? TyA : TyB;
}

/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit
  // users. All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operands or the same SCEV
/// addresses.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare store instructions.
  if (A == B)
    return true;

  // Otherwise compare pointers.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs.
  return SE->getSCEV(APtr) == SE->getSCEV(BPtr);
}

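// A consecutive pointer advances by exactly one element per iteration, e.g.
// 'a[i]' (stride 1) or 'a[n - i]' (stride -1); any other stride, such as
// 'a[2 * i]', is reported as 0 below.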
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides,
                            AllowRuntimeSCEVChecks, false)
                   .value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
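///
/// For example, with StepMultiplier = VF = 4 the rewriter turns the AddRec
/// {0,+,1} into {0,+,4} for lane 0 (Offset = 0) and into {1,+,4} for lane 1
/// (Offset = 1); a value is uniform if the rewritten expressions for all
/// lanes compare equal to the one for lane 0.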
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Is any sub-expression not analyzable w.r.t. uniformity?
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    const SCEV *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    const SCEV *NewStep =
        SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    const SCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    const SCEV *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain a UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use it to avoid
    /// re-writing UDIV-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure(
          "Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure(
          "Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure(
        "Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be
  // recorded here for ignoring, but it suffices to record only the first
  // (as it is the only one that may be used outside the cast sequence).
  ArrayRef<Instruction *> Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  assert((PhiTy->isIntOrPtrTy() || PhiTy->isFloatingPointTy()) &&
         "Expected int, ptr, or FP induction phi type");

  // Get the widest type.
  if (PhiTy->isIntOrPtrTy()) {
    if (!WidestIndTy)
      WidestIndTy = getInductionIntegerTy(DL, PhiTy);
    else
      WidestIndTy = getWiderInductionTy(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto IsSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    }
    // Bail out for any Phi in the outer loop header that is not a supported
    // induction.
    LLVM_DEBUG(
        dbgs() << "LV: Found unsupported PHI for outer loop vectorization.\n");
    return false;
  };

  return llvm::all_of(Header->phis(), IsSupportedPhi);
}

/// Checks if a function is scalarizable according to the TLI, in
/// the sense that it should be vectorized and then expanded in
/// multiple scalar calls. This is represented in the
/// TLI via mappings that do not specify a vector name, as in the
/// following example:
///
///    const VecDesc VecIntrinsics[] = {
///      {"llvm.phx.abs.i32", "", 4}
///    };
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
  const StringRef ScalarName = CI.getCalledFunction()->getName();
  bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
  // Check that all known VFs are not associated with a vector
  // function, i.e. the vector name is empty.
  if (Scalarize) {
    ElementCount WidestFixedVF, WidestScalableVF;
    TLI.getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
    for (ElementCount VF = ElementCount::getFixed(2);
         ElementCount::isKnownLE(VF, WidestFixedVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    for (ElementCount VF = ElementCount::getScalable(1);
         ElementCount::isKnownLE(VF, WidestScalableVF); VF *= 2)
      Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
    assert((WidestScalableVF.isZero() || !Scalarize) &&
           "Caller may decide to scalarize a variant using a scalable VF");
  }
  return Scalarize;
}

bool LoopVectorizationLegality::canVectorizeInstrs() {
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  bool Result = true;

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      Result &= canVectorizeInstr(I);
      if (!DoExtraAnalysis && !Result)
        return false;
    }
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    }
    if (!WidestIndTy) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    }
    LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return Result;
}

bool LoopVectorizationLegality::canVectorizeInstr(Instruction &I) {
  BasicBlock *BB = I.getParent();
  BasicBlock *Header = TheLoop->getHeader();

  if (auto *Phi = dyn_cast<PHINode>(&I)) {
    Type *PhiTy = Phi->getType();
    // Check that this PHI type is allowed.
    if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
        !PhiTy->isPointerTy()) {
      reportVectorizationFailure(
          "Found a non-int non-pointer PHI",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      return false;
    }

    // If this PHINode is not in the header block, then we know that we
    // can convert it to a select during if-conversion. No need to check if
    // the PHIs in this block are induction or reduction variables.
    if (BB != Header) {
      // Non-header phi nodes that have outside uses can be vectorized. Add
      // them to the list of allowed exits.
      // Unsafe cyclic dependencies with header phis are identified during
      // legalization for reduction, induction and fixed order
      // recurrences.
      AllowedExit.insert(&I);
      return true;
    }

    // We only allow if-converted PHIs with exactly two incoming values.
    if (Phi->getNumIncomingValues() != 2) {
      reportVectorizationFailure(
          "Found an invalid PHI",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop, Phi);
      return false;
    }

    RecurrenceDescriptor RedDes;
    if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC, DT,
                                             PSE.getSE())) {
      Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
      AllowedExit.insert(RedDes.getLoopExitInstr());
      assert((!RedDes.hasUsesOutsideReductionChain() ||
              RecurrenceDescriptor::isMinMaxRecurrenceKind(
                  RedDes.getRecurrenceKind())) &&
             "Only min/max recurrences are allowed to have multiple uses "
             "currently");
      Reductions[Phi] = std::move(RedDes);
      return true;
    }

    // We prevent matching non-constant strided pointer IVs to preserve
    // historical vectorizer behavior after a generalization of the
    // IVDescriptor code. The intent is to remove this check, but we
    // have to fix issues around code quality for such loops first.
    auto IsDisallowedStridedPointerInduction =
        [](const InductionDescriptor &ID) {
          if (AllowStridedPointerIVs)
            return false;
          return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                 ID.getConstIntStepValue() == nullptr;
        };

    // TODO: Instead of recording the AllowedExit, it would be good to
    // record the complementary set: NotAllowedExit. These include (but may
    // not be limited to):
    // 1. Reduction phis as they represent the one-before-last value, which
    //    is not available when vectorized
    // 2. Induction phis and increment when SCEV predicates cannot be used
    //    outside the loop - see addInductionPhi
    // 3. Non-Phis with outside uses when SCEV predicates cannot be used
    //    outside the loop - see call to hasOutsideLoopUser in the non-phi
    //    handling below
    // 4. FixedOrderRecurrence phis that can possibly be handled by
    //    extraction.
    // By recording these, we can then reason about ways to vectorize each
    // of these NotAllowedExit.
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
        !IsDisallowedStridedPointerInduction(ID)) {
      addInductionPhi(Phi, ID, AllowedExit);
      Requirements->addExactFPMathInst(ID.getExactFPMathInst());
      return true;
    }

    if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
      AllowedExit.insert(Phi);
      FixedOrderRecurrences.insert(Phi);
      return true;
    }

    // As a last resort, coerce the PHI to an AddRec expression
    // and re-try classifying it as an induction PHI.
    if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
        !IsDisallowedStridedPointerInduction(ID)) {
      addInductionPhi(Phi, ID, AllowedExit);
      return true;
    }

    reportVectorizationFailure("Found an unidentified PHI",
                               "value that could not be identified as "
                               "reduction is used outside the loop",
                               "NonReductionValueUsedOutsideLoop", ORE, TheLoop,
                               Phi);
    return false;
  } // end of PHI handling

  // We handle calls that:
  //   * Have a mapping to an IR intrinsic.
  //   * Have a vector version available.
  auto *CI = dyn_cast<CallInst>(&I);

  if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
      !(CI->getCalledFunction() && TLI &&
        (!VFDatabase::getMappings(*CI).empty() || isTLIScalarize(*TLI, *CI)))) {
    // If the call is a recognized math library call, it is likely that
    // we can vectorize it given loosened floating-point constraints.
    LibFunc Func;
    bool IsMathLibCall =
        TLI && CI->getCalledFunction() && CI->getType()->isFloatingPointTy() &&
        TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
        TLI->hasOptimizedCodeGen(Func);

    if (IsMathLibCall) {
      // TODO: Ideally, we should not use clang-specific language here,
      // but it's hard to provide meaningful yet generic advice.
      // Also, should this be guarded by allowExtraAnalysis() and/or be part
      // of the returned info from isFunctionVectorizable()?
      reportVectorizationFailure(
          "Found a non-intrinsic callsite",
          "library call cannot be vectorized. "
          "Try compiling with -fno-math-errno, -ffast-math, "
          "or similar flags",
          "CantVectorizeLibcall", ORE, TheLoop, CI);
    } else {
      reportVectorizationFailure("Found a non-intrinsic callsite",
                                 "call instruction cannot be vectorized",
                                 "CantVectorizeLibcall", ORE, TheLoop, CI);
    }
    return false;
  }

  // Some intrinsics have scalar arguments; such arguments must be loop
  // invariant for the call to be vectorizable, since the scalar operand is
  // shared by all lanes.
  if (CI) {
    auto *SE = PSE.getSE();
    Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
    for (unsigned Idx = 0; Idx < CI->arg_size(); ++Idx)
      if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, Idx, TTI)) {
        if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(Idx)), TheLoop)) {
          reportVectorizationFailure(
              "Found unvectorizable intrinsic",
              "intrinsic instruction cannot be vectorized",
              "CantVectorizeIntrinsic", ORE, TheLoop, CI);
          return false;
        }
      }
  }

  // If we found a vectorized variant of a function, note that so LV can
  // make better decisions about maximum VF.
  if (CI && !VFDatabase::getMappings(*CI).empty())
    VecCallVariantsFound = true;

  auto CanWidenInstructionTy = [](Instruction const &Inst) {
    Type *InstTy = Inst.getType();
    if (!isa<StructType>(InstTy))
      return canVectorizeTy(InstTy);

    // For now, we only recognize struct values returned from calls where
    // all users are extractvalue as vectorizable. All element types of the
    // struct must be types that can be widened.
    return isa<CallInst>(Inst) && canVectorizeTy(InstTy) &&
           all_of(Inst.users(), IsaPred<ExtractValueInst>);
  };

  // Check that the instruction return type is vectorizable.
  // We can't vectorize casts from vector type to scalar type.
  // Also, we can't vectorize extractelement instructions.
  if (!CanWidenInstructionTy(I) ||
      (isa<CastInst>(I) &&
       !VectorType::isValidElementType(I.getOperand(0)->getType())) ||
      isa<ExtractElementInst>(I)) {
    reportVectorizationFailure("Found unvectorizable type",
                               "instruction return type cannot be vectorized",
                               "CantVectorizeInstructionReturnType", ORE,
                               TheLoop, &I);
    return false;
  }

  // Check that the stored type is vectorizable.
  if (auto *ST = dyn_cast<StoreInst>(&I)) {
    Type *T = ST->getValueOperand()->getType();
    if (!VectorType::isValidElementType(T)) {
      reportVectorizationFailure("Store instruction cannot be vectorized",
                                 "CantVectorizeStore", ORE, TheLoop, ST);
      return false;
    }

    // For nontemporal stores, check that a nontemporal vector version is
    // supported on the target.
    if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
      // Arbitrarily try a vector of 2 elements.
      auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
      assert(VecTy && "did not find vectorized version of stored type");
      if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
        reportVectorizationFailure(
            "nontemporal store instruction cannot be vectorized",
            "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
        return false;
      }
    }

  } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
    if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
      // For nontemporal loads, check that a nontemporal vector version is
      // supported on the target (arbitrarily try a vector of 2 elements).
      auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
      assert(VecTy && "did not find vectorized version of load type");
      if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
        reportVectorizationFailure(
            "nontemporal load instruction cannot be vectorized",
            "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
        return false;
      }
    }

    // FP instructions can allow unsafe algebra, thus vectorizable by
    // non-IEEE-754 compliant SIMD units.
    // This applies to floating-point math operations and calls, not memory
    // operations, shuffles, or casts, as they don't change precision or
    // semantics.
  } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
             !I.isFast()) {
    LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
    Hints->setPotentiallyUnsafe();
  }

  // Reduction instructions are allowed to have exit users.
  // All other instructions must not have external users.
  if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
    // We can safely vectorize loops where instructions within the loop are
    // used outside the loop only if the SCEV predicates within the loop are
    // the same as outside the loop. Allowing the exit means reusing the SCEV
    // outside the loop.
    if (PSE.getPredicate().isAlwaysTrue()) {
      AllowedExit.insert(&I);
      return true;
    }
    reportVectorizationFailure("Value cannot be used outside the loop",
                               "ValueUsedOutsideLoop", ORE, TheLoop, &I);
    return false;
  }

  return true;
}

/// Find histogram operations that match high-level code in loops:
/// \code
/// buckets[indices[i]] += step;
/// \endcode
///
/// It matches a pattern starting from \p HSt, which Stores to the 'buckets'
/// array the computed histogram. It uses a BinOp to sum all counts, storing
/// them using a loop-variant index Load from the 'indices' input array.
///
/// On successful matches it updates the STATISTIC 'HistogramsDetected',
/// regardless of hardware support. When there is support, it additionally
/// stores the Load/BinOp/Store triples that make up the histogram in
/// \p Histograms.
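///
/// A sketch of the IR shape being matched (value names are hypothetical):
/// \code
///   %idx = load i32, ptr %gep.indices        ; loop-varying index
///   %ext = zext i32 %idx to i64
///   %gep.bucket = getelementptr i32, ptr %buckets, i64 %ext
///   %old = load i32, ptr %gep.bucket
///   %inc = add i32 %old, 1                   ; or sub, by an invariant amount
///   store i32 %inc, ptr %gep.bucket
/// \endcode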
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop,
                          const PredicatedScalarEvolution &PSE,
                          SmallVectorImpl<HistogramInfo> &Histograms) {

  // Store value must come from a Binary Operation.
  Instruction *HPtrInstr = nullptr;
  BinaryOperator *HBinOp = nullptr;
  if (!match(HSt, m_Store(m_BinOp(HBinOp), m_Instruction(HPtrInstr))))
    return false;

  // BinOp must be an Add or a Sub modifying the bucket value by a
  // loop invariant amount.
  // FIXME: We assume the loop invariant term is on the RHS.
  //        Fine for an immediate/constant, but maybe not a generic value?
  Value *HIncVal = nullptr;
  if (!match(HBinOp, m_Add(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))) &&
      !match(HBinOp, m_Sub(m_Load(m_Specific(HPtrInstr)), m_Value(HIncVal))))
    return false;

  // Make sure the increment value is loop invariant.
  if (!TheLoop->isLoopInvariant(HIncVal))
    return false;

  // The address to store is calculated through a GEP Instruction.
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(HPtrInstr);
  if (!GEP)
    return false;

  // Restrict address calculation to constant indices except for the last term.
  Value *HIdx = nullptr;
  for (Value *Index : GEP->indices()) {
    if (HIdx)
      return false;
    if (!isa<ConstantInt>(Index))
      HIdx = Index;
  }

  if (!HIdx)
    return false;

  // Check that the index is calculated by loading from another array. Ignore
  // any extensions.
  // FIXME: Support indices from other sources than a linear load from memory?
  //        We're currently trying to match an operation looping over an array
  //        of indices, but there could be additional levels of indirection
  //        in place, or possibly some additional calculation to form the index
  //        from the loaded data.
  Value *VPtrVal;
  if (!match(HIdx, m_ZExtOrSExtOrSelf(m_Load(m_Value(VPtrVal)))))
    return false;

  // Make sure the index address varies in this loop, not an outer loop.
  const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSE()->getSCEV(VPtrVal));
  if (!AR || AR->getLoop() != TheLoop)
    return false;

  // Ensure we'll have the same mask by checking that all parts of the
  // histogram (gather load, update, scatter store) are in the same block.
  LoadInst *IndexedLoad = cast<LoadInst>(HBinOp->getOperand(0));
  BasicBlock *LdBB = IndexedLoad->getParent();
  if (LdBB != HBinOp->getParent() || LdBB != HSt->getParent())
    return false;

  LLVM_DEBUG(dbgs() << "LV: Found histogram for: " << *HSt << "\n");

  // Store the operations that make up the histogram.
  Histograms.emplace_back(IndexedLoad, HBinOp, HSt);
  return true;
}

bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
  // For now, we only support an IndirectUnsafe dependency that calculates
  // a histogram.
  if (!EnableHistogramVectorization)
    return false;

  // Find a single IndirectUnsafe dependency.
  const MemoryDepChecker::Dependence *IUDep = nullptr;
  const MemoryDepChecker &DepChecker = LAI->getDepChecker();
  const auto *Deps = DepChecker.getDependences();
  // If there were too many dependences, LAA abandons recording them. We can't
  // proceed safely if we don't know what the dependences are.
  if (!Deps)
    return false;

  for (const MemoryDepChecker::Dependence &Dep : *Deps) {
    // Ignore dependencies that are either known to be safe or can be
    // checked at runtime.
    if (MemoryDepChecker::Dependence::isSafeForVectorization(Dep.Type) !=
        MemoryDepChecker::VectorizationSafetyStatus::Unsafe)
      continue;

    // We're only interested in IndirectUnsafe dependencies here, where the
    // address might come from a load from memory. We also only want to handle
    // one such dependency, at least for now.
    if (Dep.Type != MemoryDepChecker::Dependence::IndirectUnsafe || IUDep)
      return false;

    IUDep = &Dep;
  }
  if (!IUDep)
    return false;

  // For now only normal loads and stores are supported.
  LoadInst *LI = dyn_cast<LoadInst>(IUDep->getSource(DepChecker));
  StoreInst *SI = dyn_cast<StoreInst>(IUDep->getDestination(DepChecker));

  if (!LI || !SI)
    return false;

  LLVM_DEBUG(dbgs() << "LV: Checking for a histogram on: " << *SI << "\n");
  return findHistogram(LI, SI, TheLoop, LAI->getPSE(), Histograms);
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory()) {
    if (hasUncountableExitWithSideEffects()) {
      reportVectorizationFailure(
          "Cannot vectorize unsafe dependencies in uncountable exit loop with "
          "side effects",
          "CantVectorizeUnsafeDependencyForEELoopWithSideEffects", ORE,
          TheLoop);
      return false;
    }

    return canVectorizeIndirectUnsafeDependences();
  }

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added to make
  // sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if the last stored value is
    // unconditional and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // The invariant address should be defined outside of the loop. The LICM
      // pass usually makes sure this happens, but in rare cases it does not,
      // and we do not want to overcomplicate vectorization to support this
      // case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check that its last stored value is the
      // result of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is
      // already checked via
      // hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used
          // with different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in
          // the example. That is why we have to make sure that the types of
          // the stored values are the same.
          // TODO: Check that the bitwidth of an unhandled store is smaller
          // than the one that overwrites it, and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
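  // For example, 'for (i) sum += a[i];' compiled without reassociation can
  // still be vectorized as an in-loop (ordered) reduction that preserves the
  // sequential order of the FP additions.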
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}
1369
1370bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1371 return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1372 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1373 return RdxDesc.IntermediateStore == SI;
1374 });
1375}
1376
1377bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1378 return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1379 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1380 if (!RdxDesc.IntermediateStore)
1381 return false;
1382
1383 ScalarEvolution *SE = PSE.getSE();
1384 Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1385 return V == InvariantAddress ||
1386 SE->getSCEV(V) == SE->getSCEV(V: InvariantAddress);
1387 });
1388}
1389
1390bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
1391 Value *In0 = const_cast<Value *>(V);
1392 PHINode *PN = dyn_cast_or_null<PHINode>(Val: In0);
1393 if (!PN)
1394 return false;
1395
1396 return Inductions.count(Key: PN);
1397}
1398
1399const InductionDescriptor *
1400LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
1401 if (!isInductionPhi(V: Phi))
1402 return nullptr;
1403 auto &ID = getInductionVars().find(Key: Phi)->second;
1404 if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
1405 ID.getKind() == InductionDescriptor::IK_FpInduction)
1406 return &ID;
1407 return nullptr;
1408}
1409
1410const InductionDescriptor *
1411LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
1412 if (!isInductionPhi(V: Phi))
1413 return nullptr;
1414 auto &ID = getInductionVars().find(Key: Phi)->second;
1415 if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
1416 return &ID;
1417 return nullptr;
1418}
1419
1420bool LoopVectorizationLegality::isCastedInductionVariable(
1421 const Value *V) const {
1422 auto *Inst = dyn_cast<Instruction>(Val: V);
1423 return (Inst && InductionCastsToIgnore.count(Ptr: Inst));
1424}
1425
1426bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
1427 return isInductionPhi(V) || isCastedInductionVariable(V);
1428}
1429
1430bool LoopVectorizationLegality::isFixedOrderRecurrence(
1431 const PHINode *Phi) const {
1432 return FixedOrderRecurrences.count(Ptr: Phi);
1433}
1434
1435bool LoopVectorizationLegality::blockNeedsPredication(
1436 const BasicBlock *BB) const {
1437 // When vectorizing early exits, create predicates for the latch block only.
1438 // For a single early exit, it must be a direct predecessor of the latch.
1439 // For multiple early exits, they form a chain where each exiting block
1440 // dominates all subsequent blocks up to the latch.
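// Sketch of the single early-exit case (illustrative):
//   header -> early.exiting -> latch -> header
// Only the latch receives a predicate here; the uncountable exit
// condition itself is handled separately by the early-exit support.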
1441 BasicBlock *Latch = TheLoop->getLoopLatch();
1442 if (hasUncountableEarlyExit())
1443 return BB == Latch;
1444 return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1445}
1446
1447bool LoopVectorizationLegality::blockCanBePredicated(
1448 BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
1449 SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
1450 for (Instruction &I : *BB) {
1451 // We can predicate blocks with calls to assume, as long as we drop them in
1452 // case we flatten the CFG via predication.
1453 if (match(V: &I, P: m_Intrinsic<Intrinsic::assume>())) {
1454 MaskedOp.insert(Ptr: &I);
1455 continue;
1456 }
1457
1458 // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
1459 // TODO: there might be cases that it should block the vectorization. Let's
1460 // ignore those for now.
1461 if (isa<NoAliasScopeDeclInst>(Val: &I))
1462 continue;
1463
1464 // We can allow masked calls if there's at least one vector variant, even
1465 // if we end up scalarizing due to the cost model calculations.
1466 // TODO: Allow other calls if they have appropriate attributes... readonly
1467 // and argmemonly?
1468 if (CallInst *CI = dyn_cast<CallInst>(Val: &I))
1469 if (VFDatabase::hasMaskedVariant(CI: *CI)) {
1470 MaskedOp.insert(Ptr: CI);
1471 continue;
1472 }
1473
1474 // Loads are handled via masking (or speculated if safe to do so).
1475 if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1476 if (!SafePtrs.count(Ptr: LI->getPointerOperand()))
1477 MaskedOp.insert(Ptr: LI);
1478 continue;
1479 }
1480
1481 // Predicated store requires some form of masking:
1482 // 1) masked store HW instruction,
1483 // 2) emulation via load-blend-store (only if safe and legal to do so,
1484 // be aware on the race conditions), or
1485 // 3) element-by-element predicate check and scalar store.
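// Illustrative example of case 1: a conditional store such as
//   if (Cond[I]) A[I] = X;
// can be flattened into a masked store, e.g. for VF = 4:
//   call void @llvm.masked.store.v4i32.p0(<4 x i32> %x, ptr %a,
//                                         i32 4, <4 x i1> %mask)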
1486 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1487 MaskedOp.insert(Ptr: SI);
1488 continue;
1489 }
1490
1491 if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
1492 return false;
1493 }
1494
1495 return true;
1496}
1497
1498bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1499 if (!EnableIfConversion) {
1500 reportVectorizationFailure(DebugMsg: "If-conversion is disabled",
1501 ORETag: "IfConversionDisabled", ORE, TheLoop);
1502 return false;
1503 }
1504
1505 assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
1506
1507 // The set of pointers known to be dereferenceable within the scope of
1508 // the loop body on every iteration that executes. That is, the memory
1509 // pointed to can be dereferenced (with the access size implied by the
1510 // value's type) unconditionally within the loop header without
1511 // introducing a new fault.
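// Illustrative example (names made up): in
//   for (int I = 0; I < N; ++I)
//     if (Cond[I])
//       Sum += A[I];
// the load of A[I] may be speculated (and left unmasked) if A is known to
// be dereferenceable for the entire trip count, while Cond[I] is trivially
// safe because its block executes unconditionally.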
1512 SmallPtrSet<Value *, 8> SafePointers;
1513
1514 // Collect safe addresses.
1515 for (BasicBlock *BB : TheLoop->blocks()) {
1516 if (!blockNeedsPredication(BB)) {
1517 for (Instruction &I : *BB)
1518 if (auto *Ptr = getLoadStorePointerOperand(V: &I))
1519 SafePointers.insert(Ptr);
1520 continue;
1521 }
1522
1523 // For a block which requires predication, an address may be safe to access
1524 // in the loop w/o predication if we can prove dereferenceability facts
1525 // sufficient to ensure it'll never fault within the loop. For the moment,
1526 // we restrict this to loads; stores are more complicated due to
1527 // concurrency restrictions.
1528 ScalarEvolution &SE = *PSE.getSE();
1529 SmallVector<const SCEVPredicate *, 4> Predicates;
1530 for (Instruction &I : *BB) {
1531 LoadInst *LI = dyn_cast<LoadInst>(Val: &I);
1532
1533 // Make sure we can execute all computations feeding into Ptr in the loop
1534 // w/o triggering UB and that none of the out-of-loop operands are poison.
1535 // We do not need to check if operations inside the loop can produce
1536 // poison due to flags (e.g. due to an inbounds GEP going out of bounds),
1537 // because flags will be dropped when executing them unconditionally.
1538 // TODO: Results could be improved by considering poison-propagation
1539 // properties of visited ops.
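// Illustrative example: for
//   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
// the inbounds flag could make %gep poison, but the flag can simply be
// dropped when the address computation runs unconditionally; only
// out-of-loop operands such as %base must be proven non-poison.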
1540 auto CanSpeculatePointerOp = [this](Value *Ptr) {
1541 SmallVector<Value *> Worklist = {Ptr};
1542 SmallPtrSet<Value *, 4> Visited;
1543 while (!Worklist.empty()) {
1544 Value *CurrV = Worklist.pop_back_val();
1545 if (!Visited.insert(Ptr: CurrV).second)
1546 continue;
1547
1548 auto *CurrI = dyn_cast<Instruction>(Val: CurrV);
1549 if (!CurrI || !TheLoop->contains(Inst: CurrI)) {
1550 // If operands from outside the loop may be poison then Ptr may also
1551 // be poison.
1552 if (!isGuaranteedNotToBePoison(V: CurrV, AC,
1553 CtxI: TheLoop->getLoopPredecessor()
1554 ->getTerminator()
1555 ->getIterator(),
1556 DT))
1557 return false;
1558 continue;
1559 }
1560
1561 // A loaded value may be poison, independent of any flags.
1562 if (isa<LoadInst>(Val: CurrI) && !isGuaranteedNotToBePoison(V: CurrV, AC))
1563 return false;
1564
1565 // For other ops, assume poison can only be introduced via flags,
1566 // which can be dropped.
1567 if (!isa<PHINode>(Val: CurrI) && !isSafeToSpeculativelyExecute(I: CurrI))
1568 return false;
1569 append_range(C&: Worklist, R: CurrI->operands());
1570 }
1571 return true;
1572 };
1573 // Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so
1574 // that it will consider loops that need guarding by SCEV checks. The
1575 // vectorizer will generate these checks if we decide to vectorize.
1576 if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(LI: *LI) &&
1577 CanSpeculatePointerOp(LI->getPointerOperand()) &&
1578 isDereferenceableAndAlignedInLoop(LI, L: TheLoop, SE, DT&: *DT, AC,
1579 Predicates: &Predicates))
1580 SafePointers.insert(Ptr: LI->getPointerOperand());
1581 Predicates.clear();
1582 }
1583 }
1584
1585 // Collect the blocks that need predication.
1586 for (BasicBlock *BB : TheLoop->blocks()) {
1587 // We support only branches and switch statements as terminators inside the
1588 // loop.
1589 if (isa<SwitchInst>(Val: BB->getTerminator())) {
1590 if (TheLoop->isLoopExiting(BB)) {
1591 reportVectorizationFailure(DebugMsg: "Loop contains an unsupported switch",
1592 ORETag: "LoopContainsUnsupportedSwitch", ORE,
1593 TheLoop, I: BB->getTerminator());
1594 return false;
1595 }
1596 } else if (!isa<BranchInst>(Val: BB->getTerminator())) {
1597 reportVectorizationFailure(DebugMsg: "Loop contains an unsupported terminator",
1598 ORETag: "LoopContainsUnsupportedTerminator", ORE,
1599 TheLoop, I: BB->getTerminator());
1600 return false;
1601 }
1602
1603 // We must be able to predicate all blocks that need to be predicated.
1604 if (blockNeedsPredication(BB) &&
1605 !blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp)) {
1606 reportVectorizationFailure(
1607 DebugMsg: "Control flow cannot be substituted for a select", ORETag: "NoCFGForSelect",
1608 ORE, TheLoop, I: BB->getTerminator());
1609 return false;
1610 }
1611 }
1612
1613 // We can if-convert this loop.
1614 return true;
1615}
1616
1617 // Helper function for canVectorizeLoopNestCFG.
1618bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1619 bool UseVPlanNativePath) {
1620 assert((UseVPlanNativePath || Lp->isInnermost()) &&
1621 "VPlan-native path is not enabled.");
1622
1623 // TODO: ORE should be improved to show more accurate information when an
1624 // outer loop can't be vectorized because a nested loop is not understood or
1625 // legal. Something like: "outer_loop_location: loop not vectorized:
1626 // (inner_loop_location) loop control flow is not understood by vectorizer".
1627
1628 // Store the result and return it at the end instead of exiting early, in case
1629 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1630 bool Result = true;
1631 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1632
1633 // We must have a loop in canonical form. Loops with indirectbr in them cannot
1634 // be canonicalized.
1635 if (!Lp->getLoopPreheader()) {
1636 reportVectorizationFailure(DebugMsg: "Loop doesn't have a legal pre-header",
1637 OREMsg: "loop control flow is not understood by vectorizer",
1638 ORETag: "CFGNotUnderstood", ORE, TheLoop);
1639 if (DoExtraAnalysis)
1640 Result = false;
1641 else
1642 return false;
1643 }
1644
1645 // We must have a single backedge.
1646 if (Lp->getNumBackEdges() != 1) {
1647 reportVectorizationFailure(DebugMsg: "The loop must have a single backedge",
1648 OREMsg: "loop control flow is not understood by vectorizer",
1649 ORETag: "CFGNotUnderstood", ORE, TheLoop);
1650 if (DoExtraAnalysis)
1651 Result = false;
1652 else
1653 return false;
1654 }
1655
1656 // The latch must be terminated by a BranchInst.
1657 BasicBlock *Latch = Lp->getLoopLatch();
1658 if (Latch && !isa<BranchInst>(Val: Latch->getTerminator())) {
1659 reportVectorizationFailure(
1660 DebugMsg: "The loop latch terminator is not a BranchInst",
1661 OREMsg: "loop control flow is not understood by vectorizer", ORETag: "CFGNotUnderstood",
1662 ORE, TheLoop);
1663 if (DoExtraAnalysis)
1664 Result = false;
1665 else
1666 return false;
1667 }
1668
1669 return Result;
1670}
1671
1672bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1673 Loop *Lp, bool UseVPlanNativePath) {
1674 // Store the result and return it at the end instead of exiting early, in case
1675 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1676 bool Result = true;
1677 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1678 if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1679 if (DoExtraAnalysis)
1680 Result = false;
1681 else
1682 return false;
1683 }
1684
1685 // Recursively check whether the loop control flow of nested loops is
1686 // understood.
1687 for (Loop *SubLp : *Lp)
1688 if (!canVectorizeLoopNestCFG(Lp: SubLp, UseVPlanNativePath)) {
1689 if (DoExtraAnalysis)
1690 Result = false;
1691 else
1692 return false;
1693 }
1694
1695 return Result;
1696}
1697
1698bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1699 BasicBlock *LatchBB = TheLoop->getLoopLatch();
1700 if (!LatchBB) {
1701 reportVectorizationFailure(DebugMsg: "Loop does not have a latch",
1702 OREMsg: "Cannot vectorize early exit loop",
1703 ORETag: "NoLatchEarlyExit", ORE, TheLoop);
1704 return false;
1705 }
1706
1707 if (Reductions.size() || FixedOrderRecurrences.size()) {
1708 reportVectorizationFailure(
1709 DebugMsg: "Found reductions or recurrences in early-exit loop",
1710 OREMsg: "Cannot vectorize early exit loop with reductions or recurrences",
1711 ORETag: "RecurrencesInEarlyExitLoop", ORE, TheLoop);
1712 return false;
1713 }
1714
1715 SmallVector<BasicBlock *, 8> ExitingBlocks;
1716 TheLoop->getExitingBlocks(ExitingBlocks);
1717
1718 // Keep a record of all the exiting blocks.
1719 SmallVector<const SCEVPredicate *, 4> Predicates;
1720 SmallVector<BasicBlock *> UncountableExitingBlocks;
1721 for (BasicBlock *BB : ExitingBlocks) {
1722 const SCEV *EC =
1723 PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: BB, Predicates: &Predicates);
1724 if (isa<SCEVCouldNotCompute>(Val: EC)) {
1725 if (size(Range: successors(BB)) != 2) {
1726 reportVectorizationFailure(
1727 DebugMsg: "Early exiting block does not have exactly two successors",
1728 OREMsg: "Incorrect number of successors from early exiting block",
1729 ORETag: "EarlyExitTooManySuccessors", ORE, TheLoop);
1730 return false;
1731 }
1732
1733 UncountableExitingBlocks.push_back(Elt: BB);
1734 } else
1735 CountableExitingBlocks.push_back(Elt: BB);
1736 }
1737 // We can safely ignore the predicates here because when vectorizing the loop
1738 // the PredicatedScalarEvolution class will keep track of all predicates
1739 // for each exiting block anyway. This happens when calling
1740 // PSE.getSymbolicMaxBackedgeTakenCount() below.
1741 Predicates.clear();
1742
1743 if (UncountableExitingBlocks.empty()) {
1744 LLVM_DEBUG(dbgs() << "LV: Could not find any uncountable exits\n");
1745 return false;
1746 }
1747
1748 // Sort exiting blocks by dominance order to establish a clear chain.
1749 llvm::sort(C&: UncountableExitingBlocks, Comp: [this](BasicBlock *A, BasicBlock *B) {
1750 return DT->properlyDominates(A, B);
1751 });
1752
1753 // Verify that exits form a strict dominance chain: each block must
1754 // dominate the next. This ensures each exit is only dominated by its
1755 // predecessors in the chain.
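// Illustrative example of a valid chain:
//   for (I = 0; I < N; ++I) {
//     if (A[I] == X) break; // uncountable exit 1
//     if (B[I] == Y) break; // uncountable exit 2
//   }
// The block of exit 1 dominates the block of exit 2, which in turn
// dominates the latch, so the exits form the required chain.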
1756 for (unsigned I = 0; I + 1 < UncountableExitingBlocks.size(); ++I) {
1757 if (!DT->properlyDominates(A: UncountableExitingBlocks[I],
1758 B: UncountableExitingBlocks[I + 1])) {
1759 reportVectorizationFailure(
1760 DebugMsg: "Uncountable early exits do not form a dominance chain",
1761 OREMsg: "Cannot vectorize early exit loop with non-dominating exits",
1762 ORETag: "NonDominatingEarlyExits", ORE, TheLoop);
1763 return false;
1764 }
1765 }
1766
1767 BasicBlock *LatchPredBB = LatchBB->getUniquePredecessor();
1768 if (LatchPredBB != UncountableExitingBlocks.back()) {
1769 reportVectorizationFailure(
1770 DebugMsg: "Last early exiting block in the chain is not the latch predecessor",
1771 OREMsg: "Cannot vectorize early exit loop", ORETag: "EarlyExitNotLatchPredecessor", ORE,
1772 TheLoop);
1773 return false;
1774 }
1775
1776 // The latch block must have a countable exit.
1777 if (isa<SCEVCouldNotCompute>(
1778 Val: PSE.getSE()->getPredicatedExitCount(L: TheLoop, ExitingBlock: LatchBB, Predicates: &Predicates))) {
1779 reportVectorizationFailure(
1780 DebugMsg: "Cannot determine exact exit count for latch block",
1781 OREMsg: "Cannot vectorize early exit loop",
1782 ORETag: "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1783 return false;
1784 }
1785 assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
1786 "Latch block not found in list of countable exits!");
1787
1788 // Check to see if there are instructions that could potentially generate
1789 // exceptions or have side-effects.
1790 auto IsSafeOperation = [](Instruction *I) -> bool {
1791 switch (I->getOpcode()) {
1792 case Instruction::Load:
1793 case Instruction::Store:
1794 case Instruction::PHI:
1795 case Instruction::Br:
1796 // These are checked separately.
1797 return true;
1798 default:
1799 return isSafeToSpeculativelyExecute(I);
1800 }
1801 };
1802
1803 bool HasSideEffects = false;
1804 for (auto *BB : TheLoop->blocks())
1805 for (auto &I : *BB) {
1806 if (I.mayWriteToMemory()) {
1807 if (isa<StoreInst>(Val: &I) && cast<StoreInst>(Val: &I)->isSimple()) {
1808 HasSideEffects = true;
1809 continue;
1810 }
1811
1812 // We don't support complex writes to memory.
1813 reportVectorizationFailure(
1814 DebugMsg: "Complex writes to memory unsupported in early exit loops",
1815 OREMsg: "Cannot vectorize early exit loop with complex writes to memory",
1816 ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
1817 return false;
1818 }
1819
1820 if (!IsSafeOperation(&I)) {
1821 reportVectorizationFailure(DebugMsg: "Early exit loop contains operations that "
1822 "cannot be speculatively executed",
1823 ORETag: "UnsafeOperationsEarlyExitLoop", ORE,
1824 TheLoop);
1825 return false;
1826 }
1827 }
1828
1829 SmallVector<LoadInst *, 4> NonDerefLoads;
1830 // TODO: Handle loops that may fault.
1831 if (!HasSideEffects) {
1832 // Read-only loop.
1833 Predicates.clear();
1834 if (!isReadOnlyLoop(L: TheLoop, SE: PSE.getSE(), DT, AC, NonDereferenceableAndAlignedLoads&: NonDerefLoads,
1835 Predicates: &Predicates)) {
1836 reportVectorizationFailure(
1837 DebugMsg: "Loop may fault", OREMsg: "Cannot vectorize non-read-only early exit loop",
1838 ORETag: "NonReadOnlyEarlyExitLoop", ORE, TheLoop);
1839 return false;
1840 }
1841 } else {
1842 // Check all uncountable exiting blocks for movable loads.
1843 for (BasicBlock *ExitingBB : UncountableExitingBlocks) {
1844 if (!canUncountableExitConditionLoadBeMoved(ExitingBlock: ExitingBB))
1845 return false;
1846 }
1847 }
1848
1849 // Check the non-dereferenceable loads, if any.
1850 for (LoadInst *LI : NonDerefLoads) {
1851 // Only support unit-stride access for now.
1852 int Stride = isConsecutivePtr(AccessTy: LI->getType(), Ptr: LI->getPointerOperand());
1853 if (Stride != 1) {
1854 reportVectorizationFailure(
1855 DebugMsg: "Loop contains potentially faulting strided load",
1856 OREMsg: "Cannot vectorize early exit loop with "
1857 "strided fault-only-first load",
1858 ORETag: "EarlyExitLoopWithStridedFaultOnlyFirstLoad", ORE, TheLoop);
1859 return false;
1860 }
1861 PotentiallyFaultingLoads.insert(Ptr: LI);
1862 LLVM_DEBUG(dbgs() << "LV: Found potentially faulting load: " << *LI
1863 << "\n");
1864 }
1865
1866 [[maybe_unused]] const SCEV *SymbolicMaxBTC =
1867 PSE.getSymbolicMaxBackedgeTakenCount();
1868 // Since we have an exact exit count for the latch and the early exit
1869 // dominates the latch, this should guarantee a computed SCEV value.
1870 assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1871 "Failed to get symbolic expression for backedge taken count");
1872 LLVM_DEBUG(dbgs() << "LV: Found an early exit loop with symbolic max "
1873 "backedge taken count: "
1874 << *SymbolicMaxBTC << '\n');
1875 HasUncountableEarlyExit = true;
1876 UncountableExitWithSideEffects = HasSideEffects;
1877 return true;
1878}
1879
1880bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
1881 BasicBlock *ExitingBlock) {
1882 // Try to find a load in the critical path for the uncountable exit condition.
1883 // This currently matches roughly the simplest form we can handle: a single
1884 // in-loop load whose result is directly compared against a loop-invariant
1885 // value.
1886 // FIXME: We're insisting on a single use for now, because otherwise we will
1887 // need to make PHI nodes for other users. That can be done once the initial
1888 // transform code lands.
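// The accepted shape is roughly (illustrative IR, names made up):
//   %l = load i32, ptr %p        ; single in-loop load, single use
//   %c = icmp eq i32 %l, %inv    ; %inv is loop-invariant
//   br i1 %c, label %exit, label %next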
1889 auto *Br = cast<BranchInst>(Val: ExitingBlock->getTerminator());
1890
1891 using namespace llvm::PatternMatch;
1892 Instruction *L = nullptr;
1893 Value *Ptr = nullptr;
1894 Value *R = nullptr;
1895 if (!match(V: Br->getCondition(),
1896 P: m_OneUse(SubPattern: m_ICmp(L: m_OneUse(SubPattern: m_Instruction(I&: L, Match: m_Load(Op: m_Value(V&: Ptr)))),
1897 R: m_Value(V&: R))))) {
1898 reportVectorizationFailure(
1899 DebugMsg: "Early exit loop with store but no supported condition load",
1900 ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1901 return false;
1902 }
1903
1904 // FIXME: Don't rely on operand ordering for the comparison.
1905 if (!TheLoop->isLoopInvariant(V: R)) {
1906 reportVectorizationFailure(
1907 DebugMsg: "Early exit loop with store but no supported condition load",
1908 ORETag: "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
1909 return false;
1910 }
1911
1912 // Make sure that the load address is not loop invariant; we want an
1913 // address calculation that we can rotate to the next vector iteration.
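// Illustrative example: %p = getelementptr i32, ptr %base, i64 %iv has
// SCEV {%base,+,4}<%loop>, an affine add recurrence that can be advanced
// by VF * 4 bytes per vector iteration, whereas a loop-invariant address
// would keep re-reading the same element.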
1914 const auto *AR = dyn_cast<SCEVAddRecExpr>(Val: PSE.getSE()->getSCEV(V: Ptr));
1915 if (!AR || AR->getLoop() != TheLoop || !AR->isAffine()) {
1916 reportVectorizationFailure(
1917 DebugMsg: "Uncountable exit condition depends on load with an address that is "
1918 "not an add recurrence in the loop",
1919 ORETag: "EarlyExitLoadInvariantAddress", ORE, TheLoop);
1920 return false;
1921 }
1922
1923 // FIXME: Support gathers after first-faulting load support lands.
1924 SmallVector<const SCEVPredicate *, 4> Predicates;
1925 LoadInst *Load = cast<LoadInst>(Val: L);
1926 if (!isDereferenceableAndAlignedInLoop(LI: Load, L: TheLoop, SE&: *PSE.getSE(), DT&: *DT, AC,
1927 Predicates: &Predicates)) {
1928 reportVectorizationFailure(
1929 DebugMsg: "Loop may fault",
1930 OREMsg: "Cannot vectorize potentially faulting early exit loop",
1931 ORETag: "PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1932 return false;
1933 }
1934
1935 ICFLoopSafetyInfo SafetyInfo;
1936 SafetyInfo.computeLoopSafetyInfo(CurLoop: TheLoop);
1937 // We need to know that the load will be executed before we can hoist a
1938 // copy out to run just before the first iteration.
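// Illustrative example: in
//   for (...) { if (C[I]) { if (A[I] == X) break; } ... }
// the exit-condition load of A[I] executes only conditionally, so a copy
// hoisted ahead of the first iteration could observe a value the original
// loop would never have loaded.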
1939 if (!SafetyInfo.isGuaranteedToExecute(Inst: *Load, DT, CurLoop: TheLoop)) {
1940 reportVectorizationFailure(
1941 DebugMsg: "Load for uncountable exit not guaranteed to execute",
1942 ORETag: "ConditionalUncountableExitLoad", ORE, TheLoop);
1943 return false;
1944 }
1945
1946 // Prohibit any potential aliasing with any instruction in the loop which
1947 // might store to memory.
1948 // FIXME: Relax this constraint where possible.
1949 for (auto *BB : TheLoop->blocks()) {
1950 for (auto &I : *BB) {
1951 if (&I == Load)
1952 continue;
1953
1954 if (I.mayWriteToMemory()) {
1955 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1956 AliasResult AR = AA->alias(V1: Ptr, V2: SI->getPointerOperand());
1957 if (AR == AliasResult::NoAlias)
1958 continue;
1959 }
1960
1961 reportVectorizationFailure(
1962 DebugMsg: "Cannot determine whether critical uncountable exit load address "
1963 "does not alias with a memory write",
1964 ORETag: "CantVectorizeAliasWithCriticalUncountableExitLoad", ORE, TheLoop);
1965 return false;
1966 }
1967 }
1968 }
1969
1970 return true;
1971}
1972
1973bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1974 // Store the result and return it at the end instead of exiting early, in case
1975 // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1976 bool Result = true;
1977
1978 bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1979 // Check whether the loop-related control flow in the loop nest is understood
1980 // by the vectorizer.
1981 if (!canVectorizeLoopNestCFG(Lp: TheLoop, UseVPlanNativePath)) {
1982 if (DoExtraAnalysis) {
1983 LLVM_DEBUG(dbgs() << "LV: legality check failed: loop nest");
1984 Result = false;
1985 } else {
1986 return false;
1987 }
1988 }
1989
1990 // We need to have a loop header.
1991 LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1992 << '\n');
1993
1994 // Specific checks for outer loops. We skip the remaining legal checks at this
1995 // point because they don't support outer loops.
1996 if (!TheLoop->isInnermost()) {
1997 assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1998
1999 if (!canVectorizeOuterLoop()) {
2000 reportVectorizationFailure(DebugMsg: "Unsupported outer loop",
2001 ORETag: "UnsupportedOuterLoop", ORE, TheLoop);
2002 // TODO: Implement DoExtraAnalysis when subsequent legal checks support
2003 // outer loops.
2004 return false;
2005 }
2006
2007 LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
2008 return Result;
2009 }
2010
2011 assert(TheLoop->isInnermost() && "Inner loop expected.");
2012 // Check if we can if-convert non-single-bb loops.
2013 unsigned NumBlocks = TheLoop->getNumBlocks();
2014 if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
2015 LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
2016 if (DoExtraAnalysis)
2017 Result = false;
2018 else
2019 return false;
2020 }
2021
2022 // Check if we can vectorize the instructions and CFG in this loop.
2023 if (!canVectorizeInstrs()) {
2024 LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
2025 if (DoExtraAnalysis)
2026 Result = false;
2027 else
2028 return false;
2029 }
2030
2031 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
2032 if (TheLoop->getExitingBlock()) {
2033 reportVectorizationFailure(DebugMsg: "Cannot vectorize uncountable loop",
2034 ORETag: "UnsupportedUncountableLoop", ORE, TheLoop);
2035 if (DoExtraAnalysis)
2036 Result = false;
2037 else
2038 return false;
2039 } else {
2040 if (!isVectorizableEarlyExitLoop()) {
2041 assert(!hasUncountableEarlyExit() &&
2042 !hasUncountableExitWithSideEffects() &&
2043 "Must be false without vectorizable early-exit loop");
2044 if (DoExtraAnalysis)
2045 Result = false;
2046 else
2047 return false;
2048 }
2049 }
2050 }
2051
2052 // Go over each instruction and look at memory deps.
2053 if (!canVectorizeMemory()) {
2054 LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
2055 if (DoExtraAnalysis)
2056 Result = false;
2057 else
2058 return false;
2059 }
2060
2061 // Bail out for state-changing loops with uncountable exits for now.
2062 if (UncountableExitWithSideEffects) {
2063 reportVectorizationFailure(
2064 DebugMsg: "Writes to memory unsupported in early exit loops",
2065 OREMsg: "Cannot vectorize early exit loop with writes to memory",
2066 ORETag: "WritesInEarlyExitLoop", ORE, TheLoop);
2067 return false;
2068 }
2069
2070 if (Result) {
2071 LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
2072 << (LAI->getRuntimePointerChecking()->Need
2073 ? " (with a runtime bound check)"
2074 : "")
2075 << "!\n");
2076 }
2077
2078 unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
2079 if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
2080 SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
2081
2082 if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
2083 LLVM_DEBUG(dbgs() << "LV: Vectorization not profitable "
2084 "due to SCEVThreshold\n");
2085 reportVectorizationFailure(DebugMsg: "Too many SCEV checks needed",
2086 OREMsg: "Too many SCEV assumptions need to be made and checked at runtime",
2087 ORETag: "TooManySCEVRunTimeChecks", ORE, TheLoop);
2088 if (DoExtraAnalysis)
2089 Result = false;
2090 else
2091 return false;
2092 }
2093
2094 // Okay! We've done all the tests. If any have failed, return false. Otherwise
2095 // we can vectorize, and at this point we don't have any other mem analysis
2096 // which may limit our maximum vectorization factor, so just return true with
2097 // no restrictions.
2098 return Result;
2099}
2100
2101bool LoopVectorizationLegality::canFoldTailByMasking() const {
2102 // The only loops we can vectorize without a scalar epilogue are loops with
2103 // a bottom-test and a single exiting block. We'd have to handle the fact
2104 // that not every instruction executes on the last iteration. This will
2105 // require a lane mask which varies through the vector loop body. (TODO)
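// Illustrative example: with VF = 4 and a trip count of 10, a tail-folded
// loop runs 3 vector iterations under the lane masks
//   <1,1,1,1>, <1,1,1,1>, <1,1,0,0>
// so no scalar epilogue is required.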
2106 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
2107 LLVM_DEBUG(
2108 dbgs()
2109 << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
2110 return false;
2111 }
2112
2113 LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
2114
2115 SmallPtrSet<const Value *, 8> ReductionLiveOuts;
2116
2117 for (const auto &Reduction : getReductionVars())
2118 ReductionLiveOuts.insert(Ptr: Reduction.second.getLoopExitInstr());
2119
2120 for (const auto &Entry : getInductionVars()) {
2121 PHINode *OrigPhi = Entry.first;
2122 for (User *U : OrigPhi->users()) {
2123 auto *UI = cast<Instruction>(Val: U);
2124 if (!TheLoop->contains(Inst: UI)) {
2125 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
2126 "outside user for "
2127 << *UI << "\n");
2128 return false;
2129 }
2130 }
2131 }
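// Illustrative example of the rejected pattern (names made up):
//   for (I = 0; I < N; ++I) { ... }
//   use(I); // induction value consumed after the loop
// With tail folding, the final lane-masked iteration would make the exit
// value of I harder to reconstruct, so we bail out for now.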
2132
2133 // The list of pointers that we can safely read and write to remains empty.
2134 SmallPtrSet<Value *, 8> SafePointers;
2135
2136 // Check all blocks for predication, including those that ordinarily do not
2137 // need predication such as the header block.
2138 SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
2139 for (BasicBlock *BB : TheLoop->blocks()) {
2140 if (!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TmpMaskedOp)) {
2141 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
2142 return false;
2143 }
2144 }
2145
2146 LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
2147
2148 return true;
2149}
2150
2151void LoopVectorizationLegality::prepareToFoldTailByMasking() {
2152 // The list of pointers that we can safely read and write to remains empty.
2153 SmallPtrSet<Value *, 8> SafePointers;
2154
2155 // Mark all blocks for predication, including those that ordinarily do not
2156 // need predication such as the header block.
2157 for (BasicBlock *BB : TheLoop->blocks()) {
2158 [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp);
2159 assert(R && "Must be able to predicate block when tail-folding.");
2160 }
2161}
2162
2163} // namespace llvm
2164