//===- LoopVectorizationLegality.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides loop vectorization legality analysis. Original code
// resided in LoopVectorize.cpp for a long time.
//
// At this point, it is implemented as a utility class, not as an analysis
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//

#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

using namespace llvm;
using namespace PatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
static cl::opt<bool>
    EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
                       cl::desc("Enable if-conversion during vectorization."));

static cl::opt<bool>
    AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false),
                           cl::Hidden,
                           cl::desc("Enable recognition of non-constant strided "
                                    "pointer induction variables."));

namespace llvm {
cl::opt<bool>
    HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden,
                         cl::desc("Allow enabling loop hints to reorder "
                                  "FP operations during vectorization."));
}

// TODO: Move size-based thresholds out of legality checking, make cost based
// decisions instead of hard thresholds.
static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
    "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed."));

static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
    "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed with a "
             "vectorize(enable) pragma"));

static cl::opt<LoopVectorizeHints::ScalableForceKind>
    ForceScalableVectorization(
        "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified),
        cl::Hidden,
        cl::desc("Control whether the compiler can use scalable vectors to "
                 "vectorize a loop"),
        cl::values(
            clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
                       "Scalable vectorization is disabled."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "preferred",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive."),
            clEnumValN(
                LoopVectorizeHints::SK_PreferScalable, "on",
                "Scalable vectorization is available and favored when the "
                "cost is inconclusive.")));

/// Maximum vectorization interleave count.
static const unsigned MaxInterleaveFactor = 16;

namespace llvm {

bool LoopVectorizeHints::Hint::validate(unsigned Val) {
  switch (Kind) {
  case HK_WIDTH:
    return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
  case HK_INTERLEAVE:
    return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
  case HK_FORCE:
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
}

LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
                                       bool InterleaveOnlyWhenForced,
                                       OptimizationRemarkEmitter &ORE,
                                       const TargetTransformInfo *TTI)
    : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
      Scalable("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
      TheLoop(L), ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();

  // force-vector-interleave overrides DisableInterleaving.
  if (VectorizerParams::isInterleaveForced())
    Interleave.Value = VectorizerParams::VectorizationInterleave;

  // If the metadata doesn't explicitly specify whether to enable scalable
  // vectorization, then decide based on the following criteria (increasing
  // level of priority):
  // - Target default
  // - Metadata width
  // - Force option (always overrides)
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
    if (TTI)
      Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
                                                          : SK_FixedWidthOnly;

    if (Width.Value)
      // If the width is set, but the metadata says nothing about the scalable
      // property, then assume it concerns only a fixed-width UserVF.
      // If width is not set, the flag takes precedence.
      Scalable.Value = SK_FixedWidthOnly;
  }

  // If the flag is set to force any use of scalable vectors, override the loop
  // hints.
  if (ForceScalableVectorization.getValue() !=
      LoopVectorizeHints::SK_Unspecified)
    Scalable.Value = ForceScalableVectorization.getValue();

  // Scalable vectorization is disabled if no preference is specified.
  if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
    Scalable.Value = SK_FixedWidthOnly;

  if (IsVectorized.Value != 1)
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
    IsVectorized.Value =
        getWidth() == ElementCount::getFixed(1) && getInterleave() == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
}

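// After setAlreadyVectorized() runs, the loop ID carries metadata of roughly
// the following shape (illustrative IR, not taken from a specific test):
//
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.isvectorized", i32 1}
//
// with any pre-existing llvm.loop.vectorize.* and llvm.loop.interleave.*
// hints stripped, so later runs of the vectorizer leave the loop alone.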
void LoopVectorizeHints::setAlreadyVectorized() {
  LLVMContext &Context = TheLoop->getHeader()->getContext();

  MDNode *IsVectorizedMD = MDNode::get(
      Context,
      {MDString::get(Context, "llvm.loop.isvectorized"),
       ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
  MDNode *LoopID = TheLoop->getLoopID();
  MDNode *NewLoopID =
      makePostTransformationMetadata(Context, LoopID,
                                     {Twine(Prefix(), "vectorize.").str(),
                                      Twine(Prefix(), "interleave.").str()},
                                     {IsVectorizedMD});
  TheLoop->setLoopID(NewLoopID);

  // Update internal cache.
  IsVectorized.Value = 1;
}

bool LoopVectorizeHints::allowVectorization(
    Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
  if (getForce() == LoopVectorizeHints::FK_Disabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
    emitRemarkWithHints();
    return false;
  }

  if (getIsVectorized() == 1) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
    // FIXME: Add interleave.disable metadata. This will allow
    // vectorize.disable to be used without disabling the pass and errors
    // to differentiate between disabled vectorization and a width of 1.
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
                                        "AllDisabled", L->getStartLoc(),
                                        L->getHeader())
             << "loop not vectorized: vectorization and interleaving are "
                "explicitly disabled, or the loop has already been "
                "vectorized";
    });
    return false;
  }

  return true;
}

void LoopVectorizeHints::emitRemarkWithHints() const {
  using namespace ore;

  ORE.emit([&]() {
    if (Force.Value == LoopVectorizeHints::FK_Disabled)
      return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
                                      TheLoop->getStartLoc(),
                                      TheLoop->getHeader())
             << "loop not vectorized: vectorization is explicitly disabled";
    else {
      OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
                                 TheLoop->getStartLoc(), TheLoop->getHeader());
      R << "loop not vectorized";
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
          R << ", Vector Width=" << NV("VectorWidth", getWidth());
        if (getInterleave() != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", getInterleave());
        R << ")";
      }
      return R;
    }
  });
}

const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
}

bool LoopVectorizeHints::allowReordering() const {
  // Allow the vectorizer to change the order of operations if loop hints
  // that enable vectorization are provided.
  ElementCount EC = getWidth();
  return HintsAllowReordering &&
         (getForce() == LoopVectorizeHints::FK_Enabled ||
          EC.getKnownMinValue() > 1);
}

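// As an illustrative example (hypothetical IR), getHintsFromMetadata() below
// consumes loop metadata of this shape:
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.vectorize.width", i32 4}
//   !2 = !{!"llvm.loop.interleave.count", i32 2}
//
// Each operand is forwarded to setHint(), which strips the "llvm.loop."
// prefix and records Width = 4 and Interleave = 2.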
void LoopVectorizeHints::getHintsFromMetadata() {
  MDNode *LoopID = TheLoop->getLoopID();
  if (!LoopID)
    return;

  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (const MDOperand &MDO : llvm::drop_begin(LoopID->operands())) {
    const MDString *S = nullptr;
    SmallVector<Metadata *, 4> Args;

    // The expected hint is either a MDString or a MDNode with the first
    // operand a MDString.
    if (const MDNode *MD = dyn_cast<MDNode>(MDO)) {
      if (!MD || MD->getNumOperands() == 0)
        continue;
      S = dyn_cast<MDString>(MD->getOperand(0));
      for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
        Args.push_back(MD->getOperand(i));
    } else {
      S = dyn_cast<MDString>(MDO);
      assert(Args.size() == 0 && "too many arguments for MDString");
    }

    if (!S)
      continue;

    // Check if the hint starts with the loop metadata prefix.
    StringRef Name = S->getString();
    if (Args.size() == 1)
      setHint(Name, Args[0]);
  }
}

void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
  if (!Name.starts_with(Prefix()))
    return;
  Name = Name.substr(Prefix().size(), StringRef::npos);

  const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
  if (!C)
    return;
  unsigned Val = C->getZExtValue();

  Hint *Hints[] = {&Width, &Interleave, &Force,
                   &IsVectorized, &Predicate, &Scalable};
  for (auto *H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
        H->Value = Val;
      else
        LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
      break;
    }
  }
}

// Return true if the inner loop \p Lp is uniform with regard to the outer loop
// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
// executing the inner loop will execute the same iterations). This check is
// very constrained for now but it will be relaxed in the future. \p Lp is
// considered uniform if it meets all the following conditions:
//   1) it has a canonical IV (starting from 0 and with stride 1),
//   2) its latch terminator is a conditional branch, and
//   3) its latch condition is a compare instruction whose operands are the
//      canonical IV and an OuterLp invariant.
// This check doesn't take into account the uniformity of other conditions not
// related to the loop latch because they don't affect the loop uniformity.
//
// NOTE: We decided to keep all these checks and their associated documentation
// together so that we can easily have a picture of the currently supported
// loop nests. However, some of the current checks don't depend on \p OuterLp
// and would be redundantly executed for each \p Lp if we invoked this function
// for different candidate outer loops. This is not the case for now because we
// don't currently have the infrastructure to evaluate multiple candidate outer
// loops and \p OuterLp will be a fixed parameter while we only support explicit
// outer loop vectorization. It's also very likely that these checks go away
// before introducing the aforementioned infrastructure. However, if this is not
// the case, we should move the \p OuterLp independent checks to a separate
// function that is only executed once for each \p Lp.
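//
// For example, in the following nest the inner loop is uniform with regard to
// the outer loop: its latch compares the canonical IV 'j' against 'N', which
// is invariant in the outer loop, so every vector lane of the outer loop runs
// the same inner iterations (illustrative C, not from a specific test):
//
//   for (int i = 0; i < M; ++i)   // OuterLp
//     for (int j = 0; j < N; ++j) // Lp: uniform w.r.t. OuterLp
//       A[i][j] = 0;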
static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
  assert(Lp->getLoopLatch() && "Expected loop with a single latch.");

  // If Lp is the outer loop, it's uniform by definition.
  if (Lp == OuterLp)
    return true;
  assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");

  // 1.
  PHINode *IV = Lp->getCanonicalInductionVariable();
  if (!IV) {
    LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
    return false;
  }

  // 2.
  BasicBlock *Latch = Lp->getLoopLatch();
  auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBr || LatchBr->isUnconditional()) {
    LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
    return false;
  }

  // 3.
  auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
  if (!LatchCmp) {
    LLVM_DEBUG(
        dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
    return false;
  }

  Value *CondOp0 = LatchCmp->getOperand(0);
  Value *CondOp1 = LatchCmp->getOperand(1);
  Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
  if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
      !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
    LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
    return false;
  }

  return true;
}

// Return true if \p Lp and all its nested loops are uniform with regard to \p
// OuterLp.
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
  if (!isUniformLoop(Lp, OuterLp))
    return false;

  // Check if nested loops are uniform.
  for (Loop *SubLp : *Lp)
    if (!isUniformLoopNest(SubLp, OuterLp))
      return false;

  return true;
}

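// Illustrative mapping (assuming a target with 64-bit pointers): ptr -> i64,
// i8 -> i32, i16 -> i32, i32 -> i32, i64 -> i64.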
static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
  if (Ty->isPointerTy())
    return DL.getIntPtrType(Ty);

  // It is possible that char's or short's overflow when we ask for the loop's
  // trip count, work around this by changing the type size.
  if (Ty->getScalarSizeInBits() < 32)
    return Type::getInt32Ty(Ty->getContext());

  return Ty;
}

static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
  Ty0 = convertPointerToIntegerType(DL, Ty0);
  Ty1 = convertPointerToIntegerType(DL, Ty1);
  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
    return Ty0;
  return Ty1;
}

/// Check that the instruction has outside loop users and is not an
/// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
                               SmallPtrSetImpl<Value *> &AllowedExit) {
  // Reductions, inductions and non-header phis are allowed to have exit
  // users. All other instructions must not have external users.
  if (!AllowedExit.count(Inst))
    // Check that all of the users of the loop are inside the BB.
    for (User *U : Inst->users()) {
      Instruction *UI = cast<Instruction>(U);
      // This user may be a reduction exit value.
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
        return true;
      }
    }
  return false;
}

/// Returns true if A and B have the same pointer operand or the same SCEV
/// for their addresses.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A,
                               StoreInst *B) {
  // Compare store instructions.
  if (A == B)
    return true;

  // Otherwise compare pointers.
  Value *APtr = A->getPointerOperand();
  Value *BPtr = B->getPointerOperand();
  if (APtr == BPtr)
    return true;

  // Otherwise compare address SCEVs.
  if (SE->getSCEV(APtr) == SE->getSCEV(BPtr))
    return true;

  return false;
}

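// Illustrative examples for isConsecutivePtr() (hypothetical accesses, with
// 'i' the loop induction variable): 'A[i]' has stride 1, 'A[n - i]' has
// stride -1, and a strided access such as 'A[2 * i]' is neither, so it
// yields 0.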
int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
                                                Value *Ptr) const {
  // FIXME: Currently, the set of symbolic strides is sometimes queried before
  // it's collected. This happens from canVectorizeWithIfConvert, when the
  // pointer is checked to reference consecutive elements suitable for a
  // masked access.
  const auto &Strides =
      LAI ? LAI->getSymbolicStrides() : DenseMap<Value *, const SCEV *>();

  Function *F = TheLoop->getHeader()->getParent();
  bool OptForSize = F->hasOptSize() ||
                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
                                                PGSOQueryType::IRPass);
  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                            CanAddPredicate, false).value_or(0);
  if (Stride == 1 || Stride == -1)
    return Stride;
  return 0;
}

bool LoopVectorizationLegality::isInvariant(Value *V) const {
  return LAI->isInvariant(V);
}

namespace {
/// A rewriter to build the SCEVs for each of the VF lanes in the expected
/// vectorized loop, which can then be compared to detect their uniformity.
/// This is done by replacing the AddRec SCEVs of the original scalar loop
/// (TheLoop) with new AddRecs where the step is multiplied by StepMultiplier
/// and Offset * Step is added. Also checks if all sub-expressions are
/// analyzable w.r.t. uniformity.
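///
/// For example, with VF = 4 the canonical IV's AddRec {0,+,1} is rewritten to
/// {0,+,4} for lane 0 (StepMultiplier = 4, Offset = 0) and to {3,+,4} for
/// lane 3 (Offset = 3): the step is scaled by the VF and the start is
/// advanced by Offset * Step. A value is uniform if all lanes' rewritten
/// expressions fold to the same SCEV.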
class SCEVAddRecForUniformityRewriter
    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
  /// Multiplier to be applied to the step of AddRecs in TheLoop.
  unsigned StepMultiplier;

  /// Offset to be added to the AddRecs in TheLoop.
  unsigned Offset;

  /// Loop for which to rewrite AddRecs.
  Loop *TheLoop;

  /// Set if any sub-expression is not analyzable w.r.t. uniformity.
  bool CannotAnalyze = false;

  bool canAnalyze() const { return !CannotAnalyze; }

public:
  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
                                  unsigned Offset, Loop *TheLoop)
      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
        TheLoop(TheLoop) {}

  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
    assert(Expr->getLoop() == TheLoop &&
           "addrec outside of TheLoop must be invariant and should have been "
           "handled earlier");
    // Build a new AddRec by multiplying the step by StepMultiplier and
    // incrementing the start by Offset * step.
    Type *Ty = Expr->getType();
    auto *Step = Expr->getStepRecurrence(SE);
    if (!SE.isLoopInvariant(Step, TheLoop)) {
      CannotAnalyze = true;
      return Expr;
    }
    auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
    auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
    auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
  }

  const SCEV *visit(const SCEV *S) {
    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
      return S;
    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
  }

  const SCEV *visitUnknown(const SCEVUnknown *S) {
    if (SE.isLoopInvariant(S, TheLoop))
      return S;
    // The value could vary across iterations.
    CannotAnalyze = true;
    return S;
  }

  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
    // Could not analyze the expression.
    CannotAnalyze = true;
    return S;
  }

  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
                             unsigned StepMultiplier, unsigned Offset,
                             Loop *TheLoop) {
    /// Bail out if the expression does not contain a UDiv expression.
    /// Uniform values which are not loop invariant require operations to strip
    /// out the lowest bits. For now just look for UDivs and use it to avoid
    /// re-writing UDIV-free expressions for other lanes to limit compile time.
    if (!SCEVExprContains(S,
                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
      return SE.getCouldNotCompute();

    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
                                             TheLoop);
    const SCEV *Result = Rewriter.visit(S);

    if (Rewriter.canAnalyze())
      return Result;
    return SE.getCouldNotCompute();
  }
};

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
  if (isInvariant(V))
    return true;
  if (VF.isScalable())
    return false;
  if (VF.isScalar())
    return true;

  // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  // never considered uniform.
  auto *SE = PSE.getSE();
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);

  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
  // lane 0 matches the expressions for all other lanes.
  unsigned FixedVF = VF.getKnownMinValue();
  const SCEV *FirstLaneExpr =
      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
    return false;

  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
  // lane 0. We check lanes in reverse order for compile-time, as frequently
  // checking the last lane is sufficient to rule out uniformity.
  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
    const SCEV *IthLaneExpr =
        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
    return FirstLaneExpr == IthLaneExpr;
  });
}

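// A "uniform mem op" is a load or store whose address is the same for every
// lane; e.g. (illustratively) an unpredicated 'sum += *p' with loop-invariant
// 'p' reads a single address per vector iteration rather than a gather's
// worth of addresses.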
bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                               ElementCount VF) const {
  Value *Ptr = getLoadStorePointerOperand(&I);
  if (!Ptr)
    return false;
  // Note: There's nothing inherent which prevents predicated loads and
  // stores from being uniform. The current lowering simply doesn't handle
  // it; in particular, the cost model distinguishes scatter/gather from
  // scalar w/predication, and we currently rely on the scalar path.
  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
}

bool LoopVectorizationLegality::canVectorizeOuterLoop() {
  assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  for (BasicBlock *BB : TheLoop->blocks()) {
    // Check whether the BB terminator is a BranchInst. Any other terminator is
    // not supported yet.
    auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br) {
      reportVectorizationFailure(
          "Unsupported basic block terminator",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

    // Check whether the BranchInst is a supported one. Only unconditional
    // branches, conditional branches with an outer loop invariant condition or
    // backedges are supported.
    // FIXME: We skip these checks when VPlan predication is enabled as we
    // want to allow divergent branches. This whole check will be removed
    // once VPlan predication is on by default.
    if (Br && Br->isConditional() &&
        !TheLoop->isLoopInvariant(Br->getCondition()) &&
        !LI->isLoopHeader(Br->getSuccessor(0)) &&
        !LI->isLoopHeader(Br->getSuccessor(1))) {
      reportVectorizationFailure(
          "Unsupported conditional branch",
          "loop control flow is not understood by vectorizer",
          "CFGNotUnderstood", ORE, TheLoop);
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }
  }

  // Check whether inner loops are uniform. At this point, we only support
  // simple outer loop scenarios with uniform nested loops.
  if (!isUniformLoopNest(TheLoop /*loop nest*/,
                         TheLoop /*context outer loop*/)) {
    reportVectorizationFailure(
        "Outer loop contains divergent loops",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check whether we are able to set up outer loop induction.
  if (!setupOuterLoopInductions()) {
    reportVectorizationFailure("Unsupported outer loop Phi(s)",
                               "Unsupported outer loop Phi(s)",
                               "UnsupportedPhi", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

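// For example (illustrative C), in 'for (int i = 0; i < n; ++i)' the phi for
// 'i' is an integer induction starting at zero with a step of one, so it is a
// candidate for the primary induction below; a pointer phi that advances
// through an array each iteration would instead be recorded as a pointer
// induction.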
void LoopVectorizationLegality::addInductionPhi(
    PHINode *Phi, const InductionDescriptor &ID,
    SmallPtrSetImpl<Value *> &AllowedExit) {
  Inductions[Phi] = ID;

  // In case this induction also comes with casts that we know we can ignore
  // in the vectorized loop body, record them here. All casts could be recorded
  // here for ignoring, but it suffices to record only the first (as it is the
  // only one that may be used outside the cast sequence).
  const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
  if (!Casts.empty())
    InductionCastsToIgnore.insert(*Casts.begin());

  Type *PhiTy = Phi->getType();
  const DataLayout &DL = Phi->getDataLayout();

  // Get the widest type.
  if (!PhiTy->isFloatingPointTy()) {
    if (!WidestIndTy)
      WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
    else
      WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
  }

  // Int inductions are special because we only allow one IV.
  if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
      ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
      isa<Constant>(ID.getStartValue()) &&
      cast<Constant>(ID.getStartValue())->isNullValue()) {

    // Use the phi node with the widest type as induction. Use the last
    // one if there are multiple (no good reason for doing this other
    // than it is expedient). We've checked that it begins at zero and
    // steps by one, so this is a canonical induction variable.
    if (!PrimaryInduction || PhiTy == WidestIndTy)
      PrimaryInduction = Phi;
  }

  // Both the PHI node itself, and the "post-increment" value feeding
  // back into the PHI node may have external users.
  // We can allow those uses, except if the SCEVs we have for them rely
  // on predicates that only hold within the loop, since allowing the exit
  // currently means re-using this SCEV outside the loop (see PR33706 for more
  // details).
  if (PSE.getPredicate().isAlwaysTrue()) {
    AllowedExit.insert(Phi);
    AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
  }

  LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
}

bool LoopVectorizationLegality::setupOuterLoopInductions() {
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if a given Phi is a supported induction.
  auto isSupportedPhi = [&](PHINode &Phi) -> bool {
    InductionDescriptor ID;
    if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
        ID.getKind() == InductionDescriptor::IK_IntInduction) {
      addInductionPhi(&Phi, ID, AllowedExit);
      return true;
    } else {
      // Bail out for any Phi in the outer loop header that is not a supported
      // induction.
      LLVM_DEBUG(
          dbgs()
          << "LV: Found unsupported PHI for outer loop vectorization.\n");
      return false;
    }
  };

  return llvm::all_of(Header->phis(), isSupportedPhi);
}

758 | |
759 | /// Checks if a function is scalarizable according to the TLI, in |
760 | /// the sense that it should be vectorized and then expanded in |
761 | /// multiple scalar calls. This is represented in the |
762 | /// TLI via mappings that do not specify a vector name, as in the |
763 | /// following example: |
764 | /// |
765 | /// const VecDesc VecIntrinsics[] = { |
766 | /// {"llvm.phx.abs.i32", "", 4} |
767 | /// }; |
768 | static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) { |
769 | const StringRef ScalarName = CI.getCalledFunction()->getName(); |
770 | bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName); |
771 | // Check that all known VFs are not associated to a vector |
772 | // function, i.e. the vector name is emty. |
773 | if (Scalarize) { |
774 | ElementCount WidestFixedVF, WidestScalableVF; |
775 | TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF); |
776 | for (ElementCount VF = ElementCount::getFixed(MinVal: 2); |
777 | ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= 2) |
778 | Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF); |
779 | for (ElementCount VF = ElementCount::getScalable(MinVal: 1); |
780 | ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= 2) |
781 | Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF); |
782 | assert((WidestScalableVF.isZero() || !Scalarize) && |
783 | "Caller may decide to scalarize a variant using a scalable VF" ); |
784 | } |
785 | return Scalarize; |
786 | } |
787 | |
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for hazards.
    for (Instruction &I : *BB) {
      if (auto *Phi = dyn_cast<PHINode>(&I)) {
        Type *PhiTy = Phi->getType();
        // Check that this PHI type is allowed.
        if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
            !PhiTy->isPointerTy()) {
          reportVectorizationFailure(
              "Found a non-int non-pointer PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop);
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (BB != Header) {
          // Non-header phi nodes that have outside uses can be vectorized. Add
          // them to the list of allowed exits.
          // Unsafe cyclic dependencies with header phis are identified during
          // legalization for reduction, induction and fixed order
          // recurrences.
          AllowedExit.insert(&I);
          continue;
        }

        // We only allow if-converted PHIs with exactly two incoming values.
        if (Phi->getNumIncomingValues() != 2) {
          reportVectorizationFailure(
              "Found an invalid PHI",
              "loop control flow is not understood by vectorizer",
              "CFGNotUnderstood", ORE, TheLoop, Phi);
          return false;
        }

        RecurrenceDescriptor RedDes;
        if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
                                                 DT, PSE.getSE())) {
          Requirements->addExactFPMathInst(RedDes.getExactFPMathInst());
          AllowedExit.insert(RedDes.getLoopExitInstr());
          Reductions[Phi] = RedDes;
          continue;
        }

        // We prevent matching non-constant strided pointer IVs to preserve
        // historical vectorizer behavior after a generalization of the
        // IVDescriptor code. The intent is to remove this check, but we
        // have to fix issues around code quality for such loops first.
        auto isDisallowedStridedPointerInduction =
            [](const InductionDescriptor &ID) {
              if (AllowStridedPointerIVs)
                return false;
              return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
                     ID.getConstIntStepValue() == nullptr;
            };

        // TODO: Instead of recording the AllowedExit, it would be good to
        // record the complementary set: NotAllowedExit. These include (but may
        // not be limited to):
        // 1. Reduction phis as they represent the one-before-last value, which
        // is not available when vectorized
        // 2. Induction phis and increment when SCEV predicates cannot be used
        // outside the loop - see addInductionPhi
        // 3. Non-Phis with outside uses when SCEV predicates cannot be used
        // outside the loop - see call to hasOutsideLoopUser in the non-phi
        // handling below
        // 4. FixedOrderRecurrence phis that can possibly be handled by
        // extraction.
        // By recording these, we can then reason about ways to vectorize each
        // of these NotAllowedExit.
        InductionDescriptor ID;
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID) &&
            !isDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          Requirements->addExactFPMathInst(ID.getExactFPMathInst());
          continue;
        }

        if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
          AllowedExit.insert(Phi);
          FixedOrderRecurrences.insert(Phi);
          continue;
        }

        // As a last resort, coerce the PHI to an AddRec expression
        // and re-try classifying it as an induction PHI.
        if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true) &&
            !isDisallowedStridedPointerInduction(ID)) {
          addInductionPhi(Phi, ID, AllowedExit);
          continue;
        }

        reportVectorizationFailure(
            "Found an unidentified PHI",
            "value that could not be identified as "
            "reduction is used outside the loop",
            "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
        return false;
      } // end of PHI handling

      // We handle calls that:
      //   * Are debug info intrinsics.
      //   * Have a mapping to an IR intrinsic.
      //   * Have a vector version available.
      auto *CI = dyn_cast<CallInst>(&I);

      if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
          !isa<DbgInfoIntrinsic>(CI) &&
          !(CI->getCalledFunction() && TLI &&
            (!VFDatabase::getMappings(*CI).empty() ||
             isTLIScalarize(*TLI, *CI)))) {
        // If the call is a recognized math library call, it is likely that
        // we can vectorize it given loosened floating-point constraints.
        LibFunc Func;
        bool IsMathLibCall =
            TLI && CI->getCalledFunction() &&
            CI->getType()->isFloatingPointTy() &&
            TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
            TLI->hasOptimizedCodeGen(Func);

        if (IsMathLibCall) {
          // TODO: Ideally, we should not use clang-specific language here,
          // but it's hard to provide meaningful yet generic advice.
          // Also, should this be guarded by allowExtraAnalysis() and/or be
          // part of the returned info from isFunctionVectorizable()?
          reportVectorizationFailure(
              "Found a non-intrinsic callsite",
              "library call cannot be vectorized. "
              "Try compiling with -fno-math-errno, -ffast-math, "
              "or similar flags",
              "CantVectorizeLibcall", ORE, TheLoop, CI);
        } else {
          reportVectorizationFailure("Found a non-intrinsic callsite",
                                     "call instruction cannot be vectorized",
                                     "CantVectorizeLibcall", ORE, TheLoop, CI);
        }
        return false;
      }

      // Some intrinsics have scalar arguments, which must stay scalar (and
      // hence loop invariant) for the call to be vectorizable.
      if (CI) {
        auto *SE = PSE.getSE();
        Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
        for (unsigned i = 0, e = CI->arg_size(); i != e; ++i)
          if (isVectorIntrinsicWithScalarOpAtArg(IntrinID, i)) {
            if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)),
                                     TheLoop)) {
              reportVectorizationFailure(
                  "Found unvectorizable intrinsic",
                  "intrinsic instruction cannot be vectorized",
                  "CantVectorizeIntrinsic", ORE, TheLoop, CI);
              return false;
            }
          }
      }

      // If we found a vectorized variant of a function, note that so LV can
      // make better decisions about maximum VF.
      if (CI && !VFDatabase::getMappings(*CI).empty())
        VecCallVariantsFound = true;

      // Check that the instruction return type is vectorizable.
      // Also, we can't vectorize extractelement instructions.
      if ((!VectorType::isValidElementType(I.getType()) &&
           !I.getType()->isVoidTy()) ||
          isa<ExtractElementInst>(I)) {
        reportVectorizationFailure(
            "Found unvectorizable type",
            "instruction return type cannot be vectorized",
            "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
        return false;
      }

      // Check that the stored type is vectorizable.
      if (auto *ST = dyn_cast<StoreInst>(&I)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T)) {
          reportVectorizationFailure("Store instruction cannot be vectorized",
                                     "store instruction cannot be vectorized",
                                     "CantVectorizeStore", ORE, TheLoop, ST);
          return false;
        }

        // For nontemporal stores, check that a nontemporal vector version is
        // supported on the target.
        if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
          // Arbitrarily try a vector of 2 elements.
          auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of stored type");
          if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
            reportVectorizationFailure(
                "nontemporal store instruction cannot be vectorized",
                "nontemporal store instruction cannot be vectorized",
                "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
            return false;
          }
        }

      } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
        if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
          // For nontemporal loads, check that a nontemporal vector version is
          // supported on the target (arbitrarily try a vector of 2 elements).
          auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
          assert(VecTy && "did not find vectorized version of load type");
          if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
            reportVectorizationFailure(
                "nontemporal load instruction cannot be vectorized",
                "nontemporal load instruction cannot be vectorized",
                "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
            return false;
          }
        }

        // FP instructions can allow unsafe algebra, thus vectorizable by
        // non-IEEE-754 compliant SIMD units.
        // This applies to floating-point math operations and calls, not memory
        // operations, shuffles, or casts, as they don't change precision or
        // semantics.
      } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
                 !I.isFast()) {
        LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
        Hints->setPotentiallyUnsafe();
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
        // We can safely vectorize loops where instructions within the loop are
        // used outside the loop only if the SCEV predicates within the loop
        // are the same as outside the loop. Allowing the exit means reusing
        // the SCEV outside the loop.
        if (PSE.getPredicate().isAlwaysTrue()) {
          AllowedExit.insert(&I);
          continue;
        }
        reportVectorizationFailure("Value cannot be used outside the loop",
                                   "value cannot be used outside the loop",
                                   "ValueUsedOutsideLoop", ORE, TheLoop, &I);
        return false;
      }
    } // next instr.
  }

  if (!PrimaryInduction) {
    if (Inductions.empty()) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "loop induction variable could not be identified",
          "NoInductionVariable", ORE, TheLoop);
      return false;
    } else if (!WidestIndTy) {
      reportVectorizationFailure(
          "Did not find one integer induction var",
          "integer loop induction variable could not be identified",
          "NoIntegerInductionVariable", ORE, TheLoop);
      return false;
    } else {
      LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    }
  }

  // Now we know the widest induction type, check if our found induction
  // is the same size. If it's not, unset it here and InnerLoopVectorizer
  // will create another.
  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
    PrimaryInduction = nullptr;

  return true;
}

bool LoopVectorizationLegality::canVectorizeMemory() {
  LAI = &LAIs.getInfo(*TheLoop);
  const OptimizationRemarkAnalysis *LAR = LAI->getReport();
  if (LAR) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
                                        "loop not vectorized: ", *LAR);
    });
  }

  if (!LAI->canVectorizeMemory())
    return false;

  if (LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress()) {
    reportVectorizationFailure("We don't allow storing to uniform addresses",
                               "write to a loop invariant address could not "
                               "be vectorized",
                               "CantVectorizeStoreToLoopInvariantAddress", ORE,
                               TheLoop);
    return false;
  }

  // We can vectorize stores to an invariant address when the final reduction
  // value is guaranteed to be stored at the end of the loop. Also, if the
  // decision to vectorize the loop is made, runtime checks are added to make
  // sure that the invariant address won't alias with any other objects.
  if (!LAI->getStoresToInvariantAddresses().empty()) {
    // For each invariant address, check if the last stored value is
    // unconditional and the address is not calculated inside the loop.
    for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
      if (!isInvariantStoreOfReduction(SI))
        continue;

      if (blockNeedsPredication(SI->getParent())) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write of conditional recurring variant value to a loop "
            "invariant address could not be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }

      // An invariant address should be defined outside of the loop. The LICM
      // pass usually makes sure of that, but in rare cases it does not; we do
      // not want to overcomplicate vectorization to support this case.
      if (Instruction *Ptr = dyn_cast<Instruction>(SI->getPointerOperand())) {
        if (TheLoop->contains(Ptr)) {
          reportVectorizationFailure(
              "Invariant address is calculated inside the loop",
              "write to a loop invariant address could not "
              "be vectorized",
              "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
          return false;
        }
      }
    }

    if (LAI->hasStoreStoreDependenceInvolvingLoopInvariantAddress()) {
      // For each invariant address, check that its last stored value is the
      // result of one of our reductions.
      //
      // We do not check if dependence with loads exists because that is
      // already checked via
      // hasLoadStoreDependenceInvolvingLoopInvariantAddress.
      ScalarEvolution *SE = PSE.getSE();
      SmallVector<StoreInst *, 4> UnhandledStores;
      for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
        if (isInvariantStoreOfReduction(SI)) {
          // Earlier stores to this address are effectively dead code.
          // With opaque pointers it is possible for one pointer to be used
          // with different sizes of stored values:
          //    store i32 0, ptr %x
          //    store i8 0, ptr %x
          // The latest store doesn't completely overwrite the first one in
          // the example. That is why we have to make sure that the types of
          // the stored values are the same.
          // TODO: Check that the bitwidth of an unhandled store is smaller
          // than the one that overwrites it and add a test.
          erase_if(UnhandledStores, [SE, SI](StoreInst *I) {
            return storeToSameAddress(SE, SI, I) &&
                   I->getValueOperand()->getType() ==
                       SI->getValueOperand()->getType();
          });
          continue;
        }
        UnhandledStores.push_back(SI);
      }

      bool IsOK = UnhandledStores.empty();
      // TODO: we should also validate against InvariantMemSets.
      if (!IsOK) {
        reportVectorizationFailure(
            "We don't allow storing to uniform addresses",
            "write to a loop invariant address could not "
            "be vectorized",
            "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
        return false;
      }
    }
  }

  PSE.addPredicate(LAI->getPSE().getPredicate());
  return true;
}

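// For example (illustrative C), a floating-point sum such as
//   for (int i = 0; i < n; ++i) s += a[i];
// compiled without fast-math carries exact-FP reduction semantics: it can
// only be vectorized here if strict (in-loop, ordered) reductions are
// enabled, since reassociating the additions would change the result.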
bool LoopVectorizationLegality::canVectorizeFPMath(
    bool EnableStrictReductions) {

  // First check if there is any ExactFP math or if we allow reassociations.
  if (!Requirements->getExactFPInst() || Hints->allowReordering())
    return true;

  // If the above is false, we have ExactFPMath & do not allow reordering.
  // If the EnableStrictReductions flag is set, first check if we have any
  // Exact FP induction vars, which we cannot vectorize.
  if (!EnableStrictReductions ||
      any_of(getInductionVars(), [&](auto &Induction) -> bool {
        InductionDescriptor IndDesc = Induction.second;
        return IndDesc.getExactFPMathInst();
      }))
    return false;

  // We can now only vectorize if all reductions with Exact FP math also
  // have the isOrdered flag set, which indicates that we can move the
  // reduction operations in-loop.
  return (all_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return !RdxDesc.hasExactFPMath() || RdxDesc.isOrdered();
  }));
}

bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    return RdxDesc.IntermediateStore == SI;
  });
}

bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
  return any_of(getReductionVars(), [&](auto &Reduction) -> bool {
    const RecurrenceDescriptor &RdxDesc = Reduction.second;
    if (!RdxDesc.IntermediateStore)
      return false;

    ScalarEvolution *SE = PSE.getSE();
    Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
    return V == InvariantAddress ||
           SE->getSCEV(V) == SE->getSCEV(InvariantAddress);
  });
}

bool LoopVectorizationLegality::isInductionPhi(const Value *V) const {
  Value *In0 = const_cast<Value *>(V);
  PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  if (!PN)
    return false;

  return Inductions.count(PN);
}

const InductionDescriptor *
LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_IntInduction ||
      ID.getKind() == InductionDescriptor::IK_FpInduction)
    return &ID;
  return nullptr;
}

const InductionDescriptor *
LoopVectorizationLegality::getPointerInductionDescriptor(PHINode *Phi) const {
  if (!isInductionPhi(Phi))
    return nullptr;
  auto &ID = getInductionVars().find(Phi)->second;
  if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
    return &ID;
  return nullptr;
}

bool LoopVectorizationLegality::isCastedInductionVariable(
    const Value *V) const {
  auto *Inst = dyn_cast<Instruction>(V);
  return (Inst && InductionCastsToIgnore.count(Inst));
}

bool LoopVectorizationLegality::isInductionVariable(const Value *V) const {
  return isInductionPhi(V) || isCastedInductionVariable(V);
}

bool LoopVectorizationLegality::isFixedOrderRecurrence(
    const PHINode *Phi) const {
  return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
  return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
}

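// For example (illustrative C), in
//   for (int i = 0; i < n; ++i)
//     if (c[i]) a[i] = b[i];
// the conditional block can be predicated: the load of b[i] becomes a masked
// load (or is speculated if b[i] is known to be dereferenceable) and the
// store to a[i] becomes a masked store.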
bool LoopVectorizationLegality::blockCanBePredicated(
    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
    SmallPtrSetImpl<const Instruction *> &MaskedOp) const {
  for (Instruction &I : *BB) {
    // We can predicate blocks with calls to assume, as long as we drop them in
    // case we flatten the CFG via predication.
    if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
      MaskedOp.insert(&I);
      continue;
    }

    // Do not let llvm.experimental.noalias.scope.decl block the vectorization.
    // TODO: there might be cases where it should block the vectorization.
    // Let's ignore those for now.
    if (isa<NoAliasScopeDeclInst>(&I))
      continue;

    // We can allow masked calls if there's at least one vector variant, even
    // if we end up scalarizing due to the cost model calculations.
    // TODO: Allow other calls if they have appropriate attributes... readonly
    // and argmemonly?
    if (CallInst *CI = dyn_cast<CallInst>(&I))
      if (VFDatabase::hasMaskedVariant(*CI)) {
        MaskedOp.insert(CI);
        continue;
      }

    // Loads are handled via masking (or speculated, if it is safe to do so).
    if (auto *LI = dyn_cast<LoadInst>(&I)) {
      if (!SafePtrs.count(LI->getPointerOperand()))
        MaskedOp.insert(LI);
      continue;
    }

    // A predicated store requires some form of masking:
    // 1) a masked store HW instruction,
    // 2) emulation via load-blend-store (only if safe and legal to do so;
    //    beware of the race conditions), or
    // 3) an element-by-element predicate check and scalar store.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      MaskedOp.insert(SI);
      continue;
    }

    if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
      return false;
  }

  return true;
}

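// Returns true if the multi-block loop body can be flattened into a single
// basic block via if-conversion: every block must end in a branch, and every
// block whose execution is conditional must be predicable given the set of
// pointers proven safe to access unconditionally.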
bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  if (!EnableIfConversion) {
    reportVectorizationFailure("If-conversion is disabled",
                               "if-conversion is disabled",
                               "IfConversionDisabled",
                               ORE, TheLoop);
    return false;
  }

  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");

  // A set of pointers known to be dereferenceable within the scope of the
  // loop body on every iteration of the loop that executes. That is, the
  // memory pointed to can be dereferenced (with the access size implied by
  // the value's type) unconditionally within the loop header without
  // introducing a new fault.
  SmallPtrSet<Value *, 8> SafePointers;

  // Collect safe addresses.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredication(BB)) {
      for (Instruction &I : *BB)
        if (auto *Ptr = getLoadStorePointerOperand(&I))
          SafePointers.insert(Ptr);
      continue;
    }

    // For a block which requires predication, an address may be safe to
    // access in the loop w/o predication if we can prove dereferenceability
    // facts sufficient to ensure it'll never fault within the loop. For the
    // moment, we restrict this to loads; stores are more complicated due to
    // concurrency restrictions.
    ScalarEvolution &SE = *PSE.getSE();
    for (Instruction &I : *BB) {
      LoadInst *LI = dyn_cast<LoadInst>(&I);
      if (LI && !LI->getType()->isVectorTy() &&
          !mustSuppressSpeculation(*LI) &&
          isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT, AC))
        SafePointers.insert(LI->getPointerOperand());
    }
  }

  // Collect the blocks that need predication.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // We don't support switch statements inside loops.
    if (!isa<BranchInst>(BB->getTerminator())) {
      reportVectorizationFailure("Loop contains a switch statement",
                                 "loop contains a switch statement",
                                 "LoopContainsSwitch", ORE, TheLoop,
                                 BB->getTerminator());
      return false;
    }

    // We must be able to predicate all blocks that need to be predicated.
    if (blockNeedsPredication(BB) &&
        !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
      reportVectorizationFailure(
          "Control flow cannot be substituted for a select",
          "control flow cannot be substituted for a select", "NoCFGForSelect",
          ORE, TheLoop, BB->getTerminator());
      return false;
    }
  }

  // We can if-convert this loop.
  return true;
}

// Helper function to canVectorizeLoopNestCFG.
bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
                                                    bool UseVPlanNativePath) {
  assert((UseVPlanNativePath || Lp->isInnermost()) &&
         "VPlan-native path is not enabled.");

  // TODO: ORE should be improved to show more accurate information when an
  // outer loop can't be vectorized because a nested loop is not understood or
  // legal. Something like: "outer_loop_location: loop not vectorized:
  // (inner_loop_location) loop control flow is not understood by vectorizer".

  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);

  // We must have a loop in canonical form. Loops with indirectbr in them
  // cannot be canonicalized.
  if (!Lp->getLoopPreheader()) {
    reportVectorizationFailure(
        "Loop doesn't have a legal pre-header",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We must have a single backedge.
  if (Lp->getNumBackEdges() != 1) {
    reportVectorizationFailure(
        "The loop must have a single backedge",
        "loop control flow is not understood by vectorizer",
        "CFGNotUnderstood", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  return Result;
}

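// Checks the control flow of Lp itself and then recurses into every loop
// nested inside it.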
bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
    Loop *Lp, bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;
  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Recursively check whether the loop control flow of nested loops is
  // understood.
  for (Loop *SubLp : *Lp)
    if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
      if (DoExtraAnalysis)
        Result = false;
      else
        return false;
    }

  return Result;
}

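// Top-level legality query. The checks run in order: loop-nest CFG,
// if-conversion of multi-block bodies, the instructions themselves, memory
// dependences, a computable trip count, and finally the budget of runtime
// SCEV checks.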
bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
  // Store the result and return it at the end instead of exiting early, in
  // case allowExtraAnalysis is used to report multiple reasons for not
  // vectorizing.
  bool Result = true;

  bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
  // Check whether the loop-related control flow in the loop nest is
  // understood by the vectorizer.
  if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
                    << '\n');

  // Specific checks for outer loops. We skip the remaining legal checks at
  // this point because they don't support outer loops.
  if (!TheLoop->isInnermost()) {
    assert(UseVPlanNativePath && "VPlan-native path is not enabled.");

    if (!canVectorizeOuterLoop()) {
      reportVectorizationFailure("Unsupported outer loop",
                                 "unsupported outer loop",
                                 "UnsupportedOuterLoop",
                                 ORE, TheLoop);
      // TODO: Implement DoExtraAnalysis when subsequent legal checks support
      // outer loops.
      return false;
    }

    LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
    return Result;
  }

  assert(TheLoop->isInnermost() && "Inner loop expected.");
  // Check if we can if-convert non-single-bb loops.
  unsigned NumBlocks = TheLoop->getNumBlocks();
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    reportVectorizationFailure("could not determine number of loop iterations",
                               "could not determine number of loop iterations",
                               "CantComputeNumberOfIterations", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                    << (LAI->getRuntimePointerChecking()->Need
                            ? " (with a runtime bound check)"
                            : "")
                    << "!\n");

  unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
  if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
    SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;

  if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
    reportVectorizationFailure(
        "Too many SCEV checks needed",
        "Too many SCEV assumptions need to be made and checked at runtime",
        "TooManySCEVRunTimeChecks", ORE, TheLoop);
    if (DoExtraAnalysis)
      Result = false;
    else
      return false;
  }

  // Okay! We've done all the tests. If any have failed, return false.
  // Otherwise we can vectorize, and at this point we don't have any other
  // memory analysis which may limit our maximum vectorization factor, so
  // just return true with no restrictions.
  return Result;
}

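// Folding the tail by masking lets the vector loop execute the final,
// partial iterations under a mask instead of falling back to a scalar
// epilogue. This is only legal if the loop has no live-outs other than
// reduction results and every block can be predicated.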
bool LoopVectorizationLegality::canFoldTailByMasking() const {
  LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");

  SmallPtrSet<const Value *, 8> ReductionLiveOuts;

  for (const auto &Reduction : getReductionVars())
    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());

  // TODO: handle non-reduction outside users when tail is folded by masking.
  for (auto *AE : AllowedExit) {
    // Check that all users of allowed exit values are inside the loop or
    // are the live-out of a reduction.
    if (ReductionLiveOuts.count(AE))
      continue;
    for (User *U : AE->users()) {
      Instruction *UI = cast<Instruction>(U);
      if (TheLoop->contains(UI))
        continue;
      LLVM_DEBUG(
          dbgs()
          << "LV: Cannot fold tail by masking, loop has an outside user for "
          << *UI << "\n");
      return false;
    }
  }

  for (const auto &Entry : getInductionVars()) {
    PHINode *OrigPhi = Entry.first;
    for (User *U : OrigPhi->users()) {
      auto *UI = cast<Instruction>(U);
      if (!TheLoop->contains(UI)) {
        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
                             "outside user for "
                          << *UI << "\n");
        return false;
      }
    }
  }

  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Check all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

  return true;
}

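// Called once the decision to fold the tail has been made. Unlike
// canFoldTailByMasking, which uses a throwaway set, this records the masked
// operations in the member MaskedOp so later stages know what to predicate.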
void LoopVectorizationLegality::prepareToFoldTailByMasking() {
  // The list of pointers that we can safely read and write to remains empty.
  SmallPtrSet<Value *, 8> SafePointers;

  // Mark all blocks for predication, including those that ordinarily do not
  // need predication such as the header block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    [[maybe_unused]] bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
    assert(R && "Must be able to predicate block when tail-folding.");
  }
}

} // namespace llvm