1//===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements VFSelectionContext methods for loop vectorization
11/// VF selection, independent of cost-modeling decisions.
12///
13//===----------------------------------------------------------------------===//
14
15#include "LoopVectorizationPlanner.h"
16#include "llvm/Analysis/LoopInfo.h"
17#include "llvm/Analysis/OptimizationRemarkEmitter.h"
18#include "llvm/Analysis/ScalarEvolution.h"
19#include "llvm/IR/DiagnosticInfo.h"
20#include "llvm/Support/CommandLine.h"
21#include "llvm/Support/Debug.h"
22#include "llvm/Support/MathExtras.h"
23#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
24#include "llvm/Transforms/Vectorize/LoopVectorize.h"
25
26using namespace llvm;
27using namespace LoopVectorizationUtils;
28
29#define DEBUG_TYPE "loop-vectorize"
30
31extern cl::opt<bool> VPlanBuildOuterloopStressTest;
32
33static cl::opt<bool> MaximizeBandwidth(
34 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
35 cl::desc("Maximize bandwidth when selecting vectorization factor which "
36 "will be determined by the smallest type in loop."));
37
38static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
39 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
40 cl::Hidden,
41 cl::desc("Try wider VFs if they enable the use of vector variants"));
42
43static cl::opt<bool> ConsiderRegPressure(
44 "vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
45 cl::desc("Discard VFs if their register pressure is too high."));
46
47static cl::opt<bool> ForceTargetSupportsScalableVectors(
48 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
49 cl::desc(
50 "Pretend that scalable vectors are supported, even if the target does "
51 "not support them. This flag should only be used for testing."));
52
53cl::opt<bool> llvm::PreferInLoopReductions(
54 "prefer-inloop-reductions", cl::init(Val: false), cl::Hidden,
55 cl::desc("Prefer in-loop vector reductions, "
56 "overriding the targets preference."));
57
58/// Note: This currently only applies to `llvm.masked.load` and
59/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
60static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
61 "force-target-supports-masked-memory-ops", cl::init(Val: false), cl::Hidden,
62 cl::desc("Assume the target supports masked memory operations (used for "
63 "testing)."));
64
65static cl::opt<bool> ForceTargetSupportsGatherScatterOps(
66 "force-target-supports-gather-scatter-ops", cl::init(Val: false), cl::Hidden,
67 cl::desc("Assume the target supports gather/scatter operations (used for "
68 "testing)."));
69
70/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
71/// is passed, the message relates to that particular instruction.
72#ifndef NDEBUG
73static void debugVectorizationMessage(const StringRef Prefix,
74 const StringRef DebugMsg,
75 Instruction *I) {
76 dbgs() << "LV: " << Prefix << DebugMsg;
77 if (I != nullptr)
78 dbgs() << " " << *I;
79 else
80 dbgs() << '.';
81 dbgs() << '\n';
82}
83#endif
84
85/// Create an analysis remark that explains why vectorization failed
86/// \p RemarkName is the identifier for the remark. If \p I is passed it is an
87/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
88/// the location of the remark. If \p DL is passed, use it as debug location for
89/// the remark. \return the remark object that can be streamed to.
90static OptimizationRemarkAnalysis createLVAnalysis(StringRef RemarkName,
91 const Loop *TheLoop,
92 Instruction *I,
93 DebugLoc DL = {}) {
94 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
95 // If debug location is attached to the instruction, use it. Otherwise if DL
96 // was not provided, use the loop's.
97 if (I && I->getDebugLoc())
98 DL = I->getDebugLoc();
99 else if (!DL)
100 DL = TheLoop->getStartLoc();
101
102 return OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, DL, CodeRegion);
103}
104
105void LoopVectorizationUtils::reportVectorizationFailure(
106 const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag,
107 OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I) {
108 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
109 ORE->emit(OptDiag: createLVAnalysis(RemarkName: ORETag, TheLoop, I)
110 << "loop not vectorized: " << OREMsg);
111}
112
113void LoopVectorizationUtils::reportVectorizationInfo(
114 const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE,
115 const Loop *TheLoop, Instruction *I, DebugLoc DL) {
116 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
117 ORE->emit(OptDiag: createLVAnalysis(RemarkName: ORETag, TheLoop, I, DL) << Msg);
118}
119
120void LoopVectorizationUtils::reportVectorization(OptimizationRemarkEmitter *ORE,
121 Loop *TheLoop,
122 ElementCount VFWidth,
123 unsigned IC) {
124 LLVM_DEBUG(debugVectorizationMessage(
125 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
126 nullptr));
127 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
128 ORE->emit(RemarkBuilder: [&]() {
129 return OptimizationRemark(DEBUG_TYPE, "Vectorized", TheLoop->getStartLoc(),
130 TheLoop->getHeader())
131 << "vectorized " << LoopType << "loop (vectorization width: "
132 << ore::NV("VectorizationFactor", VFWidth)
133 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
134 });
135}
136
137bool VFSelectionContext::isLegalMaskedLoadOrStore(Instruction *I,
138 ElementCount VF) const {
139 assert(isa<LoadInst>(I) || isa<StoreInst>(I));
140 auto *Ty = getLoadStoreType(I);
141 const unsigned AS = getLoadStoreAddressSpace(I);
142 const Align Alignment = getLoadStoreAlignment(I);
143
144 return ForceTargetSupportsMaskedMemoryOps ||
145 (isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
146 : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS));
147}
148
149bool VFSelectionContext::isLegalGatherOrScatter(Value *V,
150 ElementCount VF) const {
151 bool LI = isa<LoadInst>(Val: V);
152 bool SI = isa<StoreInst>(Val: V);
153 if (!LI && !SI)
154 return false;
155 auto *Ty = getLoadStoreType(I: V);
156 Align Align = getLoadStoreAlignment(I: V);
157 if (VF.isVector())
158 Ty = VectorType::get(ElementType: Ty, EC: VF);
159 return ForceTargetSupportsGatherScatterOps ||
160 (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
161 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
162}
163
164bool VFSelectionContext::supportsScalableVectors() const {
165 return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors ||
166 VectorizerParams::VectorizationFactor.isScalable();
167}
168
169bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const {
170 TargetTransformInfo::RegisterKind RegKind =
171 IsScalable ? TargetTransformInfo::RGK_ScalableVector
172 : TargetTransformInfo::RGK_FixedWidthVector;
173 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
174 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
175 (UseWiderVFIfCallVariantsPresent &&
176 Legal->hasVectorCallVariants())));
177}
178
179bool VFSelectionContext::shouldConsiderRegPressureForVF(ElementCount VF) const {
180 if (ConsiderRegPressure.getNumOccurrences())
181 return ConsiderRegPressure;
182
183 // TODO: We should eventually consider register pressure for all targets. The
184 // TTI hook is temporary whilst target-specific issues are being fixed.
185 if (TTI.shouldConsiderVectorizationRegPressure())
186 return true;
187
188 if (!useMaxBandwidth(IsScalable: VF.isScalable()))
189 return false;
190 // Only calculate register pressure for VFs enabled by MaxBandwidth.
191 return ElementCount::isKnownGT(
192 LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
193 : MaxPermissibleVFWithoutMaxBW.FixedVF);
194}
195
196ElementCount VFSelectionContext::clampVFByMaxTripCount(
197 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
198 bool FoldTailByMasking, bool RequiresScalarEpilogue) const {
199 unsigned EstimatedVF = VF.getKnownMinValue();
200 if (VF.isScalable() && F.hasFnAttribute(Kind: Attribute::VScaleRange)) {
201 auto Attr = F.getFnAttribute(Kind: Attribute::VScaleRange);
202 auto Min = Attr.getVScaleRangeMin();
203 EstimatedVF *= Min;
204 }
205
206 // When a scalar epilogue is required, at least one iteration of the scalar
207 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
208 // max VF that results in a dead vector loop.
209 if (MaxTripCount > 0 && RequiresScalarEpilogue)
210 MaxTripCount -= 1;
211
212 // When the user specifies an interleave count, we need to ensure that
213 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
214 unsigned IC = UserIC > 0 ? UserIC : 1;
215 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
216
217 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
218 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
219 // If upper bound loop trip count (TC) is known at compile time there is no
220 // point in choosing VF greater than TC / IC (as done in the loop below).
221 // Select maximum power of two which doesn't exceed TC / IC. If VF is
222 // scalable, we only fall back on a fixed VF when the TC is less than or
223 // equal to the known number of lanes.
224 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
225 if (ClampedUpperTripCount == 0)
226 ClampedUpperTripCount = 1;
227 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
228 "exceeding the constant trip count"
229 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
230 << ClampedUpperTripCount << "\n");
231 return ElementCount::get(MinVal: ClampedUpperTripCount,
232 Scalable: FoldTailByMasking ? VF.isScalable() : false);
233 }
234 return VF;
235}
236
237ElementCount VFSelectionContext::getMaximizedVFForTarget(
238 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
239 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking,
240 bool RequiresScalarEpilogue) {
241 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
242 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
243 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
244 : TargetTransformInfo::RGK_FixedWidthVector);
245
246 // Convenience function to return the minimum of two ElementCounts.
247 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
248 assert((LHS.isScalable() == RHS.isScalable()) &&
249 "Scalable flags must match");
250 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
251 };
252
253 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
254 // Note that both WidestRegister and WidestType may not be a powers of 2.
255 auto MaxVectorElementCount = ElementCount::get(
256 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
257 Scalable: ComputeScalableMaxVF);
258 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
259 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
260 << (MaxVectorElementCount * WidestType) << " bits.\n");
261
262 if (!MaxVectorElementCount) {
263 LLVM_DEBUG(dbgs() << "LV: The target has no "
264 << (ComputeScalableMaxVF ? "scalable" : "fixed")
265 << " vector registers.\n");
266 return ElementCount::getFixed(MinVal: 1);
267 }
268
269 ElementCount MaxVF =
270 clampVFByMaxTripCount(VF: MaxVectorElementCount, MaxTripCount, UserIC,
271 FoldTailByMasking, RequiresScalarEpilogue);
272 // If the MaxVF was already clamped, there's no point in trying to pick a
273 // larger one.
274 if (MaxVF != MaxVectorElementCount)
275 return MaxVF;
276
277 if (MaxVF.isScalable())
278 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
279 else
280 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
281
282 if (useMaxBandwidth(IsScalable: ComputeScalableMaxVF)) {
283 auto MaxVectorElementCountMaxBW = ElementCount::get(
284 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
285 Scalable: ComputeScalableMaxVF);
286 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
287
288 if (ElementCount MinVF =
289 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
290 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
291 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
292 << ") with target's minimum: " << MinVF << '\n');
293 MaxVF = MinVF;
294 }
295 }
296
297 MaxVF = clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC,
298 FoldTailByMasking, RequiresScalarEpilogue);
299 }
300 return MaxVF;
301}
302
303std::optional<unsigned> llvm::getMaxVScale(const Function &F,
304 const TargetTransformInfo &TTI) {
305 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
306 return MaxVScale;
307
308 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
309 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
310
311 return std::nullopt;
312}
313
314bool VFSelectionContext::isScalableVectorizationAllowed() {
315 if (IsScalableVectorizationAllowed)
316 return *IsScalableVectorizationAllowed;
317
318 IsScalableVectorizationAllowed = false;
319 if (!supportsScalableVectors())
320 return false;
321
322 if (Hints->isScalableVectorizationDisabled()) {
323 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
324 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
325 return false;
326 }
327
328 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
329
330 auto MaxScalableVF = ElementCount::getScalable(
331 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
332
333 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
334 // FIXME: While for scalable vectors this is currently sufficient, this should
335 // be replaced by a more detailed mechanism that filters out specific VFs,
336 // instead of invalidating vectorization for a whole set of VFs based on the
337 // MaxVF.
338
339 // Disable scalable vectorization if the loop contains unsupported reductions.
340 if (!all_of(Range: Legal->getReductionVars(), P: [&](const auto &Reduction) -> bool {
341 return TTI.isLegalToVectorizeReduction(RdxDesc: Reduction.second, VF: MaxScalableVF);
342 })) {
343 reportVectorizationInfo(
344 Msg: "Scalable vectorization not supported for the reduction "
345 "operations found in this loop.",
346 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
347 return false;
348 }
349
350 // Disable scalable vectorization if the loop contains any instructions
351 // with element types not supported for scalable vectors.
352 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
353 return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty);
354 })) {
355 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
356 "for all element types found in this loop.",
357 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
358 return false;
359 }
360
361 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) {
362 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
363 "for safe distance analysis.",
364 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
365 return false;
366 }
367
368 IsScalableVectorizationAllowed = true;
369 return true;
370}
371
372ElementCount
373VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) {
374 if (!isScalableVectorizationAllowed())
375 return ElementCount::getScalable(MinVal: 0);
376
377 auto MaxScalableVF = ElementCount::getScalable(
378 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
379 if (Legal->isSafeForAnyVectorWidth())
380 return MaxScalableVF;
381
382 std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI);
383 // Limit MaxScalableVF by the maximum safe dependence distance.
384 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
385
386 if (!MaxScalableVF)
387 reportVectorizationInfo(
388 Msg: "Max legal vector width too small, scalable vectorization "
389 "unfeasible.",
390 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
391
392 return MaxScalableVF;
393}
394
395FixedScalableVFPair VFSelectionContext::computeFeasibleMaxVF(
396 unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
397 bool FoldTailByMasking, bool RequiresScalarEpilogue) {
398 auto [SmallestType, WidestType] = getSmallestAndWidestTypes();
399
400 // Get the maximum safe dependence distance in bits computed by LAA.
401 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
402 // the memory accesses that is most restrictive (involved in the smallest
403 // dependence distance).
404 unsigned MaxSafeElementsPowerOf2 =
405 llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
406 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
407 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
408 MaxSafeElementsPowerOf2 =
409 std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
410 }
411
412 auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
413 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);
414
415 if (!Legal->isSafeForAnyVectorWidth())
416 MaxSafeElements = MaxSafeElementsPowerOf2;
417
418 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
419 << ".\n");
420 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
421 << ".\n");
422
423 // First analyze the UserVF, fall back if the UserVF should be ignored.
424 if (UserVF) {
425 auto MaxSafeUserVF =
426 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
427
428 if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
429 // If `VF=vscale x N` is safe, then so is `VF=N`
430 if (UserVF.isScalable())
431 return FixedScalableVFPair(
432 ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
433
434 return UserVF;
435 }
436
437 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
438
439 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
440 // is better to ignore the hint and let the compiler choose a suitable VF.
441 if (!UserVF.isScalable()) {
442 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
443 << " is unsafe, clamping to max safe VF="
444 << MaxSafeFixedVF << ".\n");
445 ORE->emit(RemarkBuilder: [&]() {
446 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
447 TheLoop->getStartLoc(),
448 TheLoop->getHeader())
449 << "User-specified vectorization factor "
450 << ore::NV("UserVectorizationFactor", UserVF)
451 << " is unsafe, clamping to maximum safe vectorization factor "
452 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
453 });
454 return MaxSafeFixedVF;
455 }
456
457 if (!supportsScalableVectors()) {
458 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
459 << " is ignored because scalable vectors are not "
460 "available.\n");
461 ORE->emit(RemarkBuilder: [&]() {
462 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
463 TheLoop->getStartLoc(),
464 TheLoop->getHeader())
465 << "User-specified vectorization factor "
466 << ore::NV("UserVectorizationFactor", UserVF)
467 << " is ignored because the target does not support scalable "
468 "vectors. The compiler will pick a more suitable value.";
469 });
470 } else {
471 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
472 << " is unsafe. Ignoring scalable UserVF.\n");
473 ORE->emit(RemarkBuilder: [&]() {
474 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
475 TheLoop->getStartLoc(),
476 TheLoop->getHeader())
477 << "User-specified vectorization factor "
478 << ore::NV("UserVectorizationFactor", UserVF)
479 << " is unsafe. Ignoring the hint to let the compiler pick a "
480 "more suitable value.";
481 });
482 }
483 }
484
485 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
486 << " / " << WidestType << " bits.\n");
487
488 FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
489 ElementCount::getScalable(MinVal: 0));
490 if (auto MaxVF = getMaximizedVFForTarget(
491 MaxTripCount, SmallestType, WidestType, MaxSafeVF: MaxSafeFixedVF, UserIC,
492 FoldTailByMasking, RequiresScalarEpilogue))
493 Result.FixedVF = MaxVF;
494
495 if (auto MaxVF = getMaximizedVFForTarget(
496 MaxTripCount, SmallestType, WidestType, MaxSafeVF: MaxSafeScalableVF, UserIC,
497 FoldTailByMasking, RequiresScalarEpilogue))
498 if (MaxVF.isScalable()) {
499 Result.ScalableVF = MaxVF;
500 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
501 << "\n");
502 }
503
504 return Result;
505}
506
507std::pair<unsigned, unsigned>
508VFSelectionContext::getSmallestAndWidestTypes() const {
509 unsigned MinWidth = -1U;
510 unsigned MaxWidth = 8;
511 const DataLayout &DL = F.getDataLayout();
512 // For in-loop reductions, no element types are added to ElementTypesInLoop
513 // if there are no loads/stores in the loop. In this case, check through the
514 // reduction variables to determine the maximum width.
515 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
516 for (const auto &[_, RdxDesc] : Legal->getReductionVars()) {
517 // When finding the min width used by the recurrence we need to account
518 // for casts on the input operands of the recurrence.
519 MinWidth = std::min(
520 a: MinWidth,
521 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
522 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
523 MaxWidth = std::max(a: MaxWidth,
524 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
525 }
526 } else {
527 for (Type *T : ElementTypesInLoop) {
528 MinWidth = std::min<unsigned>(
529 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
530 MaxWidth = std::max<unsigned>(
531 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
532 }
533 }
534 return {MinWidth, MaxWidth};
535}
536
537void VFSelectionContext::collectElementTypesForWidening(
538 const SmallPtrSetImpl<const Value *> *ValuesToIgnore) {
539 ElementTypesInLoop.clear();
540 // For each block.
541 for (BasicBlock *BB : TheLoop->blocks()) {
542 // For each instruction in the loop.
543 for (Instruction &I : *BB) {
544 Type *T = I.getType();
545
546 // Skip ignored values.
547 if (ValuesToIgnore && ValuesToIgnore->contains(Ptr: &I))
548 continue;
549
550 // Only examine Loads, Stores and PHINodes.
551 if (!isa<LoadInst, StoreInst, PHINode>(Val: I))
552 continue;
553
554 // Examine PHI nodes that are reduction variables. Update the type to
555 // account for the recurrence type.
556 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
557 if (!Legal->isReductionVariable(PN))
558 continue;
559 const RecurrenceDescriptor &RdxDesc =
560 Legal->getRecurrenceDescriptor(PN);
561 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
562 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
563 Ty: RdxDesc.getRecurrenceType()))
564 continue;
565 T = RdxDesc.getRecurrenceType();
566 }
567
568 // Examine the stored values.
569 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
570 T = ST->getValueOperand()->getType();
571
572 assert(T->isSized() &&
573 "Expected the load/store/recurrence type to be sized");
574
575 ElementTypesInLoop.insert(Ptr: T);
576 }
577 }
578}
579
580void VFSelectionContext::initializeVScaleForTuning() {
581 if (!supportsScalableVectors())
582 return;
583
584 if (F.hasFnAttribute(Kind: Attribute::VScaleRange)) {
585 auto Attr = F.getFnAttribute(Kind: Attribute::VScaleRange);
586 auto Min = Attr.getVScaleRangeMin();
587 auto Max = Attr.getVScaleRangeMax();
588 if (Max && Min == Max) {
589 VScaleForTuning = Max;
590 return;
591 }
592 }
593
594 VScaleForTuning = TTI.getVScaleForTuning();
595}
596
597bool VFSelectionContext::useOrderedReductions(
598 const RecurrenceDescriptor &RdxDesc) const {
599 return !Hints->allowReordering() && RdxDesc.isOrdered();
600}
601
602bool VFSelectionContext::runtimeChecksRequired() {
603 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
604
605 Loop *L = const_cast<Loop *>(TheLoop);
606 if (Legal->getRuntimePointerChecking()->Need) {
607 reportVectorizationFailure(
608 DebugMsg: "Runtime ptr check is required with -Os/-Oz",
609 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
610 "loop with '#pragma clang loop vectorize(enable)' when "
611 "compiling with -Os/-Oz",
612 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop: L);
613 return true;
614 }
615
616 if (!PSE.getPredicate().isAlwaysTrue()) {
617 reportVectorizationFailure(
618 DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
619 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
620 "loop with '#pragma clang loop vectorize(enable)' when "
621 "compiling with -Os/-Oz",
622 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop: L);
623 return true;
624 }
625
626 // FIXME: Avoid specializing for stride==1 instead of bailing out.
627 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
628 reportVectorizationFailure(
629 DebugMsg: "Runtime stride check for small trip count",
630 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
631 "this loop without such check by compiling with -Os/-Oz",
632 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop: L);
633 return true;
634 }
635
636 return false;
637}
638
639void VFSelectionContext::computeMinimalBitwidths() {
640 MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
641}
642
643void VFSelectionContext::collectInLoopReductions() {
644 // Avoid duplicating work finding in-loop reductions.
645 if (!InLoopReductions.empty())
646 return;
647
648 for (const auto &Reduction : Legal->getReductionVars()) {
649 PHINode *Phi = Reduction.first;
650 const RecurrenceDescriptor &RdxDesc = Reduction.second;
651
652 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
653 // separately and should not be considered for in-loop reductions.
654 if (RdxDesc.hasUsesOutsideReductionChain())
655 continue;
656
657 // We don't collect reductions that are type promoted (yet).
658 if (RdxDesc.getRecurrenceType() != Phi->getType())
659 continue;
660
661 // In-loop AnyOf and FindIV reductions are not yet supported.
662 RecurKind Kind = RdxDesc.getRecurrenceKind();
663 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
664 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
665 RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
666 continue;
667
668 // If the target would prefer this reduction to happen "in-loop", then we
669 // want to record it as such.
670 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
671 !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
672 continue;
673
674 // Check that we can correctly put the reductions into the loop, by
675 // finding the chain of operations that leads from the phi to the loop
676 // exit value.
677 SmallVector<Instruction *, 4> ReductionOperations =
678 RdxDesc.getReductionOpChain(Phi, L: const_cast<Loop *>(TheLoop));
679 bool InLoop = !ReductionOperations.empty();
680
681 if (InLoop) {
682 InLoopReductions.insert(Ptr: Phi);
683 // Add the elements to InLoopReductionImmediateChains for cost modelling.
684 Instruction *LastChain = Phi;
685 for (auto *I : ReductionOperations) {
686 InLoopReductionImmediateChains[I] = LastChain;
687 LastChain = I;
688 }
689 }
690 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
691 << " reduction for phi: " << *Phi << "\n");
692 }
693}
694
695bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
696 const VectorizationFactor &B,
697 const unsigned MaxTripCount,
698 bool HasTail,
699 bool IsEpilogue) const {
700 InstructionCost CostA = A.Cost;
701 InstructionCost CostB = B.Cost;
702
703 // When there is a hint to always prefer scalable vectors, honour that hint.
704 if (Hints.isScalableVectorizationAlwaysPreferred())
705 if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() &&
706 !B.Width.isScalar())
707 return true;
708
709 // Improve estimate for the vector width if it is scalable.
710 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
711 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
712 if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) {
713 if (A.Width.isScalable())
714 EstimatedWidthA *= *VScale;
715 if (B.Width.isScalable())
716 EstimatedWidthB *= *VScale;
717 }
718
719 // When optimizing for size choose whichever is smallest, which will be the
720 // one with the smallest cost for the whole loop. On a tie pick the larger
721 // vector width, on the assumption that throughput will be greater.
722 if (Config.CostKind == TTI::TCK_CodeSize)
723 return CostA < CostB ||
724 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
725
726 // Assume vscale may be larger than 1 (or the value being tuned for),
727 // so that scalable vectorization is slightly favorable over fixed-width
728 // vectorization.
729 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
730 A.Width.isScalable() && !B.Width.isScalable();
731
732 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
733 const InstructionCost &RHS) {
734 return PreferScalable ? LHS <= RHS : LHS < RHS;
735 };
736
737 // To avoid the need for FP division:
738 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
739 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
740 bool LowerCostWithoutTC =
741 CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
742 if (!MaxTripCount)
743 return LowerCostWithoutTC;
744
745 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
746 InstructionCost VectorCost,
747 InstructionCost ScalarCost) {
748 // If the trip count is a known (possibly small) constant, the trip count
749 // will be rounded up to an integer number of iterations under
750 // FoldTailByMasking. The total cost in that case will be
751 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
752 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
753 // some extra overheads, but for the purpose of comparing the costs of
754 // different VFs we can use this to compare the total loop-body cost
755 // expected after vectorization.
756 if (HasTail)
757 return VectorCost * (MaxTripCount / VF) +
758 ScalarCost * (MaxTripCount % VF);
759 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
760 };
761
762 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
763 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
764 bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
765 LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
766 dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
767 << " has lower cost than VF "
768 << (LowerCostWithTC ? B.Width : A.Width)
769 << " when taking the cost of the remaining scalar loop iterations "
770 "into consideration for a maximum trip count of "
771 << MaxTripCount << ".\n";
772 });
773 return LowerCostWithTC;
774}
775
776bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
777 const VectorizationFactor &B,
778 bool HasTail,
779 bool IsEpilogue) const {
780 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
781 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
782 IsEpilogue);
783}
784
785// TODO: we could return a pair of values that specify the max VF and
786// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
787// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
788// doesn't have a cost model that can choose which plan to execute if
789// more than one is generated.
790FixedScalableVFPair
791VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) {
792 if (UserVF.isScalable() && !supportsScalableVectors()) {
793 reportVectorizationFailure(
794 DebugMsg: "Scalable vectorization requested but not supported by the target",
795 OREMsg: "the scalable user-specified vectorization width for outer-loop "
796 "vectorization cannot be used because the target does not support "
797 "scalable vectors.",
798 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
799 return FixedScalableVFPair::getNone();
800 }
801
802 ElementCount VF = UserVF;
803 if (VF.isZero()) {
804 auto [_, WidestType] = getSmallestAndWidestTypes();
805
806 auto RegKind = TTI.enableScalableVectorization()
807 ? TargetTransformInfo::RGK_ScalableVector
808 : TargetTransformInfo::RGK_FixedWidthVector;
809
810 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
811 // The widest type may be wider than the register width and WidestType may
812 // not be a power of two; round the element count down to a power of two.
813 unsigned N = std::max<uint64_t>(
814 a: 1, b: llvm::bit_floor(Value: RegSize.getKnownMinValue() / WidestType));
815 VF = ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
816 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
817
818 // Make sure we have a VF > 1 for stress testing.
819 if (VPlanBuildOuterloopStressTest && VF.isScalar()) {
820 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
821 << "overriding computed VF.\n");
822 VF = ElementCount::getFixed(MinVal: 4);
823 }
824 }
825 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
826 "VF needs to be a power of two");
827 if (VF.isScalar())
828 return FixedScalableVFPair::getNone();
829 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
830 << "VF " << VF << " to build VPlans.\n");
831 return FixedScalableVFPair(VF);
832}
833