| 1 | //===- LoopVectorizationPlanner.cpp - VF selection and planning -----------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file implements VFSelectionContext methods for loop vectorization |
| 11 | /// VF selection, independent of cost-modeling decisions. |
| 12 | /// |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "LoopVectorizationPlanner.h" |
| 16 | #include "llvm/Analysis/LoopInfo.h" |
| 17 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 18 | #include "llvm/Analysis/ScalarEvolution.h" |
| 19 | #include "llvm/IR/DiagnosticInfo.h" |
| 20 | #include "llvm/Support/CommandLine.h" |
| 21 | #include "llvm/Support/Debug.h" |
| 22 | #include "llvm/Support/MathExtras.h" |
| 23 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
| 24 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
| 25 | |
| 26 | using namespace llvm; |
| 27 | using namespace LoopVectorizationUtils; |
| 28 | |
| 29 | #define DEBUG_TYPE "loop-vectorize" |
| 30 | |
| 31 | extern cl::opt<bool> VPlanBuildOuterloopStressTest; |
| 32 | |
| 33 | static cl::opt<bool> MaximizeBandwidth( |
| 34 | "vectorizer-maximize-bandwidth" , cl::init(Val: false), cl::Hidden, |
| 35 | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
| 36 | "will be determined by the smallest type in loop." )); |
| 37 | |
| 38 | static cl::opt<bool> UseWiderVFIfCallVariantsPresent( |
| 39 | "vectorizer-maximize-bandwidth-for-vector-calls" , cl::init(Val: true), |
| 40 | cl::Hidden, |
| 41 | cl::desc("Try wider VFs if they enable the use of vector variants" )); |
| 42 | |
| 43 | static cl::opt<bool> ConsiderRegPressure( |
| 44 | "vectorizer-consider-reg-pressure" , cl::init(Val: false), cl::Hidden, |
| 45 | cl::desc("Discard VFs if their register pressure is too high." )); |
| 46 | |
| 47 | static cl::opt<bool> ForceTargetSupportsScalableVectors( |
| 48 | "force-target-supports-scalable-vectors" , cl::init(Val: false), cl::Hidden, |
| 49 | cl::desc( |
| 50 | "Pretend that scalable vectors are supported, even if the target does " |
| 51 | "not support them. This flag should only be used for testing." )); |
| 52 | |
| 53 | cl::opt<bool> llvm::PreferInLoopReductions( |
| 54 | "prefer-inloop-reductions" , cl::init(Val: false), cl::Hidden, |
| 55 | cl::desc("Prefer in-loop vector reductions, " |
| 56 | "overriding the targets preference." )); |
| 57 | |
| 58 | /// Note: This currently only applies to `llvm.masked.load` and |
| 59 | /// `llvm.masked.store`. TODO: Extend this to cover other operations as needed. |
| 60 | static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps( |
| 61 | "force-target-supports-masked-memory-ops" , cl::init(Val: false), cl::Hidden, |
| 62 | cl::desc("Assume the target supports masked memory operations (used for " |
| 63 | "testing)." )); |
| 64 | |
| 65 | static cl::opt<bool> ForceTargetSupportsGatherScatterOps( |
| 66 | "force-target-supports-gather-scatter-ops" , cl::init(Val: false), cl::Hidden, |
| 67 | cl::desc("Assume the target supports gather/scatter operations (used for " |
| 68 | "testing)." )); |
| 69 | |
| 70 | /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
| 71 | /// is passed, the message relates to that particular instruction. |
| 72 | #ifndef NDEBUG |
| 73 | static void debugVectorizationMessage(const StringRef Prefix, |
| 74 | const StringRef DebugMsg, |
| 75 | Instruction *I) { |
| 76 | dbgs() << "LV: " << Prefix << DebugMsg; |
| 77 | if (I != nullptr) |
| 78 | dbgs() << " " << *I; |
| 79 | else |
| 80 | dbgs() << '.'; |
| 81 | dbgs() << '\n'; |
| 82 | } |
| 83 | #endif |
| 84 | |
| 85 | /// Create an analysis remark that explains why vectorization failed |
| 86 | /// \p RemarkName is the identifier for the remark. If \p I is passed it is an |
| 87 | /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
| 88 | /// the location of the remark. If \p DL is passed, use it as debug location for |
| 89 | /// the remark. \return the remark object that can be streamed to. |
| 90 | static OptimizationRemarkAnalysis createLVAnalysis(StringRef , |
| 91 | const Loop *TheLoop, |
| 92 | Instruction *I, |
| 93 | DebugLoc DL = {}) { |
| 94 | BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader(); |
| 95 | // If debug location is attached to the instruction, use it. Otherwise if DL |
| 96 | // was not provided, use the loop's. |
| 97 | if (I && I->getDebugLoc()) |
| 98 | DL = I->getDebugLoc(); |
| 99 | else if (!DL) |
| 100 | DL = TheLoop->getStartLoc(); |
| 101 | |
| 102 | return OptimizationRemarkAnalysis(DEBUG_TYPE, RemarkName, DL, CodeRegion); |
| 103 | } |
| 104 | |
| 105 | void LoopVectorizationUtils::( |
| 106 | const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, |
| 107 | OptimizationRemarkEmitter *ORE, const Loop *TheLoop, Instruction *I) { |
| 108 | LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: " , DebugMsg, I)); |
| 109 | ORE->emit(OptDiag: createLVAnalysis(RemarkName: ORETag, TheLoop, I) |
| 110 | << "loop not vectorized: " << OREMsg); |
| 111 | } |
| 112 | |
| 113 | void LoopVectorizationUtils::( |
| 114 | const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, |
| 115 | const Loop *TheLoop, Instruction *I, DebugLoc DL) { |
| 116 | LLVM_DEBUG(debugVectorizationMessage("" , Msg, I)); |
| 117 | ORE->emit(OptDiag: createLVAnalysis(RemarkName: ORETag, TheLoop, I, DL) << Msg); |
| 118 | } |
| 119 | |
| 120 | void LoopVectorizationUtils::(OptimizationRemarkEmitter *ORE, |
| 121 | Loop *TheLoop, |
| 122 | ElementCount VFWidth, |
| 123 | unsigned IC) { |
| 124 | LLVM_DEBUG(debugVectorizationMessage( |
| 125 | "Vectorizing: " , TheLoop->isInnermost() ? "innermost loop" : "outer loop" , |
| 126 | nullptr)); |
| 127 | StringRef LoopType = TheLoop->isInnermost() ? "" : "outer " ; |
| 128 | ORE->emit(RemarkBuilder: [&]() { |
| 129 | return OptimizationRemark(DEBUG_TYPE, "Vectorized" , TheLoop->getStartLoc(), |
| 130 | TheLoop->getHeader()) |
| 131 | << "vectorized " << LoopType << "loop (vectorization width: " |
| 132 | << ore::NV("VectorizationFactor" , VFWidth) |
| 133 | << ", interleaved count: " << ore::NV("InterleaveCount" , IC) << ")" ; |
| 134 | }); |
| 135 | } |
| 136 | |
| 137 | bool VFSelectionContext::isLegalMaskedLoadOrStore(Instruction *I, |
| 138 | ElementCount VF) const { |
| 139 | assert(isa<LoadInst>(I) || isa<StoreInst>(I)); |
| 140 | auto *Ty = getLoadStoreType(I); |
| 141 | const unsigned AS = getLoadStoreAddressSpace(I); |
| 142 | const Align Alignment = getLoadStoreAlignment(I); |
| 143 | |
| 144 | return ForceTargetSupportsMaskedMemoryOps || |
| 145 | (isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS) |
| 146 | : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS)); |
| 147 | } |
| 148 | |
| 149 | bool VFSelectionContext::isLegalGatherOrScatter(Value *V, |
| 150 | ElementCount VF) const { |
| 151 | bool LI = isa<LoadInst>(Val: V); |
| 152 | bool SI = isa<StoreInst>(Val: V); |
| 153 | if (!LI && !SI) |
| 154 | return false; |
| 155 | auto *Ty = getLoadStoreType(I: V); |
| 156 | Align Align = getLoadStoreAlignment(I: V); |
| 157 | if (VF.isVector()) |
| 158 | Ty = VectorType::get(ElementType: Ty, EC: VF); |
| 159 | return ForceTargetSupportsGatherScatterOps || |
| 160 | (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) || |
| 161 | (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align)); |
| 162 | } |
| 163 | |
| 164 | bool VFSelectionContext::supportsScalableVectors() const { |
| 165 | return TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors || |
| 166 | VectorizerParams::VectorizationFactor.isScalable(); |
| 167 | } |
| 168 | |
| 169 | bool VFSelectionContext::useMaxBandwidth(bool IsScalable) const { |
| 170 | TargetTransformInfo::RegisterKind RegKind = |
| 171 | IsScalable ? TargetTransformInfo::RGK_ScalableVector |
| 172 | : TargetTransformInfo::RGK_FixedWidthVector; |
| 173 | return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 && |
| 174 | (TTI.shouldMaximizeVectorBandwidth(K: RegKind) || |
| 175 | (UseWiderVFIfCallVariantsPresent && |
| 176 | Legal->hasVectorCallVariants()))); |
| 177 | } |
| 178 | |
| 179 | bool VFSelectionContext::shouldConsiderRegPressureForVF(ElementCount VF) const { |
| 180 | if (ConsiderRegPressure.getNumOccurrences()) |
| 181 | return ConsiderRegPressure; |
| 182 | |
| 183 | // TODO: We should eventually consider register pressure for all targets. The |
| 184 | // TTI hook is temporary whilst target-specific issues are being fixed. |
| 185 | if (TTI.shouldConsiderVectorizationRegPressure()) |
| 186 | return true; |
| 187 | |
| 188 | if (!useMaxBandwidth(IsScalable: VF.isScalable())) |
| 189 | return false; |
| 190 | // Only calculate register pressure for VFs enabled by MaxBandwidth. |
| 191 | return ElementCount::isKnownGT( |
| 192 | LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF |
| 193 | : MaxPermissibleVFWithoutMaxBW.FixedVF); |
| 194 | } |
| 195 | |
| 196 | ElementCount VFSelectionContext::clampVFByMaxTripCount( |
| 197 | ElementCount VF, unsigned MaxTripCount, unsigned UserIC, |
| 198 | bool FoldTailByMasking, bool RequiresScalarEpilogue) const { |
| 199 | unsigned EstimatedVF = VF.getKnownMinValue(); |
| 200 | if (VF.isScalable() && F.hasFnAttribute(Kind: Attribute::VScaleRange)) { |
| 201 | auto Attr = F.getFnAttribute(Kind: Attribute::VScaleRange); |
| 202 | auto Min = Attr.getVScaleRangeMin(); |
| 203 | EstimatedVF *= Min; |
| 204 | } |
| 205 | |
| 206 | // When a scalar epilogue is required, at least one iteration of the scalar |
| 207 | // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a |
| 208 | // max VF that results in a dead vector loop. |
| 209 | if (MaxTripCount > 0 && RequiresScalarEpilogue) |
| 210 | MaxTripCount -= 1; |
| 211 | |
| 212 | // When the user specifies an interleave count, we need to ensure that |
| 213 | // VF * UserIC <= MaxTripCount to avoid a dead vector loop. |
| 214 | unsigned IC = UserIC > 0 ? UserIC : 1; |
| 215 | unsigned EstimatedVFTimesIC = EstimatedVF * IC; |
| 216 | |
| 217 | if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC && |
| 218 | (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) { |
| 219 | // If upper bound loop trip count (TC) is known at compile time there is no |
| 220 | // point in choosing VF greater than TC / IC (as done in the loop below). |
| 221 | // Select maximum power of two which doesn't exceed TC / IC. If VF is |
| 222 | // scalable, we only fall back on a fixed VF when the TC is less than or |
| 223 | // equal to the known number of lanes. |
| 224 | auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC); |
| 225 | if (ClampedUpperTripCount == 0) |
| 226 | ClampedUpperTripCount = 1; |
| 227 | LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " |
| 228 | "exceeding the constant trip count" |
| 229 | << (UserIC > 0 ? " divided by UserIC" : "" ) << ": " |
| 230 | << ClampedUpperTripCount << "\n" ); |
| 231 | return ElementCount::get(MinVal: ClampedUpperTripCount, |
| 232 | Scalable: FoldTailByMasking ? VF.isScalable() : false); |
| 233 | } |
| 234 | return VF; |
| 235 | } |
| 236 | |
| 237 | ElementCount VFSelectionContext::getMaximizedVFForTarget( |
| 238 | unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType, |
| 239 | ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking, |
| 240 | bool RequiresScalarEpilogue) { |
| 241 | bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); |
| 242 | const TypeSize WidestRegister = TTI.getRegisterBitWidth( |
| 243 | K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector |
| 244 | : TargetTransformInfo::RGK_FixedWidthVector); |
| 245 | |
| 246 | // Convenience function to return the minimum of two ElementCounts. |
| 247 | auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) { |
| 248 | assert((LHS.isScalable() == RHS.isScalable()) && |
| 249 | "Scalable flags must match" ); |
| 250 | return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS; |
| 251 | }; |
| 252 | |
| 253 | // Ensure MaxVF is a power of 2; the dependence distance bound may not be. |
| 254 | // Note that both WidestRegister and WidestType may not be a powers of 2. |
| 255 | auto MaxVectorElementCount = ElementCount::get( |
| 256 | MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType), |
| 257 | Scalable: ComputeScalableMaxVF); |
| 258 | MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF); |
| 259 | LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: " |
| 260 | << (MaxVectorElementCount * WidestType) << " bits.\n" ); |
| 261 | |
| 262 | if (!MaxVectorElementCount) { |
| 263 | LLVM_DEBUG(dbgs() << "LV: The target has no " |
| 264 | << (ComputeScalableMaxVF ? "scalable" : "fixed" ) |
| 265 | << " vector registers.\n" ); |
| 266 | return ElementCount::getFixed(MinVal: 1); |
| 267 | } |
| 268 | |
| 269 | ElementCount MaxVF = |
| 270 | clampVFByMaxTripCount(VF: MaxVectorElementCount, MaxTripCount, UserIC, |
| 271 | FoldTailByMasking, RequiresScalarEpilogue); |
| 272 | // If the MaxVF was already clamped, there's no point in trying to pick a |
| 273 | // larger one. |
| 274 | if (MaxVF != MaxVectorElementCount) |
| 275 | return MaxVF; |
| 276 | |
| 277 | if (MaxVF.isScalable()) |
| 278 | MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF; |
| 279 | else |
| 280 | MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF; |
| 281 | |
| 282 | if (useMaxBandwidth(IsScalable: ComputeScalableMaxVF)) { |
| 283 | auto MaxVectorElementCountMaxBW = ElementCount::get( |
| 284 | MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType), |
| 285 | Scalable: ComputeScalableMaxVF); |
| 286 | MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF); |
| 287 | |
| 288 | if (ElementCount MinVF = |
| 289 | TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) { |
| 290 | if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) { |
| 291 | LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF |
| 292 | << ") with target's minimum: " << MinVF << '\n'); |
| 293 | MaxVF = MinVF; |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | MaxVF = clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC, |
| 298 | FoldTailByMasking, RequiresScalarEpilogue); |
| 299 | } |
| 300 | return MaxVF; |
| 301 | } |
| 302 | |
| 303 | std::optional<unsigned> llvm::getMaxVScale(const Function &F, |
| 304 | const TargetTransformInfo &TTI) { |
| 305 | if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) |
| 306 | return MaxVScale; |
| 307 | |
| 308 | if (F.hasFnAttribute(Kind: Attribute::VScaleRange)) |
| 309 | return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax(); |
| 310 | |
| 311 | return std::nullopt; |
| 312 | } |
| 313 | |
| 314 | bool VFSelectionContext::isScalableVectorizationAllowed() { |
| 315 | if (IsScalableVectorizationAllowed) |
| 316 | return *IsScalableVectorizationAllowed; |
| 317 | |
| 318 | IsScalableVectorizationAllowed = false; |
| 319 | if (!supportsScalableVectors()) |
| 320 | return false; |
| 321 | |
| 322 | if (Hints->isScalableVectorizationDisabled()) { |
| 323 | reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled" , |
| 324 | ORETag: "ScalableVectorizationDisabled" , ORE, TheLoop); |
| 325 | return false; |
| 326 | } |
| 327 | |
| 328 | LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n" ); |
| 329 | |
| 330 | auto MaxScalableVF = ElementCount::getScalable( |
| 331 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
| 332 | |
| 333 | // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
| 334 | // FIXME: While for scalable vectors this is currently sufficient, this should |
| 335 | // be replaced by a more detailed mechanism that filters out specific VFs, |
| 336 | // instead of invalidating vectorization for a whole set of VFs based on the |
| 337 | // MaxVF. |
| 338 | |
| 339 | // Disable scalable vectorization if the loop contains unsupported reductions. |
| 340 | if (!all_of(Range: Legal->getReductionVars(), P: [&](const auto &Reduction) -> bool { |
| 341 | return TTI.isLegalToVectorizeReduction(RdxDesc: Reduction.second, VF: MaxScalableVF); |
| 342 | })) { |
| 343 | reportVectorizationInfo( |
| 344 | Msg: "Scalable vectorization not supported for the reduction " |
| 345 | "operations found in this loop." , |
| 346 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
| 347 | return false; |
| 348 | } |
| 349 | |
| 350 | // Disable scalable vectorization if the loop contains any instructions |
| 351 | // with element types not supported for scalable vectors. |
| 352 | if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) { |
| 353 | return !Ty->isVoidTy() && !TTI.isElementTypeLegalForScalableVector(Ty); |
| 354 | })) { |
| 355 | reportVectorizationInfo(Msg: "Scalable vectorization is not supported " |
| 356 | "for all element types found in this loop." , |
| 357 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
| 358 | return false; |
| 359 | } |
| 360 | |
| 361 | if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F, TTI)) { |
| 362 | reportVectorizationInfo(Msg: "The target does not provide maximum vscale value " |
| 363 | "for safe distance analysis." , |
| 364 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
| 365 | return false; |
| 366 | } |
| 367 | |
| 368 | IsScalableVectorizationAllowed = true; |
| 369 | return true; |
| 370 | } |
| 371 | |
| 372 | ElementCount |
| 373 | VFSelectionContext::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
| 374 | if (!isScalableVectorizationAllowed()) |
| 375 | return ElementCount::getScalable(MinVal: 0); |
| 376 | |
| 377 | auto MaxScalableVF = ElementCount::getScalable( |
| 378 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
| 379 | if (Legal->isSafeForAnyVectorWidth()) |
| 380 | return MaxScalableVF; |
| 381 | |
| 382 | std::optional<unsigned> MaxVScale = getMaxVScale(F, TTI); |
| 383 | // Limit MaxScalableVF by the maximum safe dependence distance. |
| 384 | MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale); |
| 385 | |
| 386 | if (!MaxScalableVF) |
| 387 | reportVectorizationInfo( |
| 388 | Msg: "Max legal vector width too small, scalable vectorization " |
| 389 | "unfeasible." , |
| 390 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
| 391 | |
| 392 | return MaxScalableVF; |
| 393 | } |
| 394 | |
| 395 | FixedScalableVFPair VFSelectionContext::computeFeasibleMaxVF( |
| 396 | unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC, |
| 397 | bool FoldTailByMasking, bool RequiresScalarEpilogue) { |
| 398 | auto [SmallestType, WidestType] = getSmallestAndWidestTypes(); |
| 399 | |
| 400 | // Get the maximum safe dependence distance in bits computed by LAA. |
| 401 | // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from |
| 402 | // the memory accesses that is most restrictive (involved in the smallest |
| 403 | // dependence distance). |
| 404 | unsigned MaxSafeElementsPowerOf2 = |
| 405 | llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType); |
| 406 | if (!Legal->isSafeForAnyStoreLoadForwardDistances()) { |
| 407 | unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits(); |
| 408 | MaxSafeElementsPowerOf2 = |
| 409 | std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType); |
| 410 | } |
| 411 | |
| 412 | auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2); |
| 413 | auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2); |
| 414 | |
| 415 | if (!Legal->isSafeForAnyVectorWidth()) |
| 416 | MaxSafeElements = MaxSafeElementsPowerOf2; |
| 417 | |
| 418 | LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF |
| 419 | << ".\n" ); |
| 420 | LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF |
| 421 | << ".\n" ); |
| 422 | |
| 423 | // First analyze the UserVF, fall back if the UserVF should be ignored. |
| 424 | if (UserVF) { |
| 425 | auto MaxSafeUserVF = |
| 426 | UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; |
| 427 | |
| 428 | if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) { |
| 429 | // If `VF=vscale x N` is safe, then so is `VF=N` |
| 430 | if (UserVF.isScalable()) |
| 431 | return FixedScalableVFPair( |
| 432 | ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF); |
| 433 | |
| 434 | return UserVF; |
| 435 | } |
| 436 | |
| 437 | assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); |
| 438 | |
| 439 | // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it |
| 440 | // is better to ignore the hint and let the compiler choose a suitable VF. |
| 441 | if (!UserVF.isScalable()) { |
| 442 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
| 443 | << " is unsafe, clamping to max safe VF=" |
| 444 | << MaxSafeFixedVF << ".\n" ); |
| 445 | ORE->emit(RemarkBuilder: [&]() { |
| 446 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
| 447 | TheLoop->getStartLoc(), |
| 448 | TheLoop->getHeader()) |
| 449 | << "User-specified vectorization factor " |
| 450 | << ore::NV("UserVectorizationFactor" , UserVF) |
| 451 | << " is unsafe, clamping to maximum safe vectorization factor " |
| 452 | << ore::NV("VectorizationFactor" , MaxSafeFixedVF); |
| 453 | }); |
| 454 | return MaxSafeFixedVF; |
| 455 | } |
| 456 | |
| 457 | if (!supportsScalableVectors()) { |
| 458 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
| 459 | << " is ignored because scalable vectors are not " |
| 460 | "available.\n" ); |
| 461 | ORE->emit(RemarkBuilder: [&]() { |
| 462 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
| 463 | TheLoop->getStartLoc(), |
| 464 | TheLoop->getHeader()) |
| 465 | << "User-specified vectorization factor " |
| 466 | << ore::NV("UserVectorizationFactor" , UserVF) |
| 467 | << " is ignored because the target does not support scalable " |
| 468 | "vectors. The compiler will pick a more suitable value." ; |
| 469 | }); |
| 470 | } else { |
| 471 | LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF |
| 472 | << " is unsafe. Ignoring scalable UserVF.\n" ); |
| 473 | ORE->emit(RemarkBuilder: [&]() { |
| 474 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor" , |
| 475 | TheLoop->getStartLoc(), |
| 476 | TheLoop->getHeader()) |
| 477 | << "User-specified vectorization factor " |
| 478 | << ore::NV("UserVectorizationFactor" , UserVF) |
| 479 | << " is unsafe. Ignoring the hint to let the compiler pick a " |
| 480 | "more suitable value." ; |
| 481 | }); |
| 482 | } |
| 483 | } |
| 484 | |
| 485 | LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType |
| 486 | << " / " << WidestType << " bits.\n" ); |
| 487 | |
| 488 | FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1), |
| 489 | ElementCount::getScalable(MinVal: 0)); |
| 490 | if (auto MaxVF = getMaximizedVFForTarget( |
| 491 | MaxTripCount, SmallestType, WidestType, MaxSafeVF: MaxSafeFixedVF, UserIC, |
| 492 | FoldTailByMasking, RequiresScalarEpilogue)) |
| 493 | Result.FixedVF = MaxVF; |
| 494 | |
| 495 | if (auto MaxVF = getMaximizedVFForTarget( |
| 496 | MaxTripCount, SmallestType, WidestType, MaxSafeVF: MaxSafeScalableVF, UserIC, |
| 497 | FoldTailByMasking, RequiresScalarEpilogue)) |
| 498 | if (MaxVF.isScalable()) { |
| 499 | Result.ScalableVF = MaxVF; |
| 500 | LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF |
| 501 | << "\n" ); |
| 502 | } |
| 503 | |
| 504 | return Result; |
| 505 | } |
| 506 | |
| 507 | std::pair<unsigned, unsigned> |
| 508 | VFSelectionContext::getSmallestAndWidestTypes() const { |
| 509 | unsigned MinWidth = -1U; |
| 510 | unsigned MaxWidth = 8; |
| 511 | const DataLayout &DL = F.getDataLayout(); |
| 512 | // For in-loop reductions, no element types are added to ElementTypesInLoop |
| 513 | // if there are no loads/stores in the loop. In this case, check through the |
| 514 | // reduction variables to determine the maximum width. |
| 515 | if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) { |
| 516 | for (const auto &[_, RdxDesc] : Legal->getReductionVars()) { |
| 517 | // When finding the min width used by the recurrence we need to account |
| 518 | // for casts on the input operands of the recurrence. |
| 519 | MinWidth = std::min( |
| 520 | a: MinWidth, |
| 521 | b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(), |
| 522 | b: RdxDesc.getRecurrenceType()->getScalarSizeInBits())); |
| 523 | MaxWidth = std::max(a: MaxWidth, |
| 524 | b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()); |
| 525 | } |
| 526 | } else { |
| 527 | for (Type *T : ElementTypesInLoop) { |
| 528 | MinWidth = std::min<unsigned>( |
| 529 | a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue()); |
| 530 | MaxWidth = std::max<unsigned>( |
| 531 | a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue()); |
| 532 | } |
| 533 | } |
| 534 | return {MinWidth, MaxWidth}; |
| 535 | } |
| 536 | |
| 537 | void VFSelectionContext::collectElementTypesForWidening( |
| 538 | const SmallPtrSetImpl<const Value *> *ValuesToIgnore) { |
| 539 | ElementTypesInLoop.clear(); |
| 540 | // For each block. |
| 541 | for (BasicBlock *BB : TheLoop->blocks()) { |
| 542 | // For each instruction in the loop. |
| 543 | for (Instruction &I : *BB) { |
| 544 | Type *T = I.getType(); |
| 545 | |
| 546 | // Skip ignored values. |
| 547 | if (ValuesToIgnore && ValuesToIgnore->contains(Ptr: &I)) |
| 548 | continue; |
| 549 | |
| 550 | // Only examine Loads, Stores and PHINodes. |
| 551 | if (!isa<LoadInst, StoreInst, PHINode>(Val: I)) |
| 552 | continue; |
| 553 | |
| 554 | // Examine PHI nodes that are reduction variables. Update the type to |
| 555 | // account for the recurrence type. |
| 556 | if (auto *PN = dyn_cast<PHINode>(Val: &I)) { |
| 557 | if (!Legal->isReductionVariable(PN)) |
| 558 | continue; |
| 559 | const RecurrenceDescriptor &RdxDesc = |
| 560 | Legal->getRecurrenceDescriptor(PN); |
| 561 | if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || |
| 562 | TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(), |
| 563 | Ty: RdxDesc.getRecurrenceType())) |
| 564 | continue; |
| 565 | T = RdxDesc.getRecurrenceType(); |
| 566 | } |
| 567 | |
| 568 | // Examine the stored values. |
| 569 | if (auto *ST = dyn_cast<StoreInst>(Val: &I)) |
| 570 | T = ST->getValueOperand()->getType(); |
| 571 | |
| 572 | assert(T->isSized() && |
| 573 | "Expected the load/store/recurrence type to be sized" ); |
| 574 | |
| 575 | ElementTypesInLoop.insert(Ptr: T); |
| 576 | } |
| 577 | } |
| 578 | } |
| 579 | |
| 580 | void VFSelectionContext::initializeVScaleForTuning() { |
| 581 | if (!supportsScalableVectors()) |
| 582 | return; |
| 583 | |
| 584 | if (F.hasFnAttribute(Kind: Attribute::VScaleRange)) { |
| 585 | auto Attr = F.getFnAttribute(Kind: Attribute::VScaleRange); |
| 586 | auto Min = Attr.getVScaleRangeMin(); |
| 587 | auto Max = Attr.getVScaleRangeMax(); |
| 588 | if (Max && Min == Max) { |
| 589 | VScaleForTuning = Max; |
| 590 | return; |
| 591 | } |
| 592 | } |
| 593 | |
| 594 | VScaleForTuning = TTI.getVScaleForTuning(); |
| 595 | } |
| 596 | |
| 597 | bool VFSelectionContext::useOrderedReductions( |
| 598 | const RecurrenceDescriptor &RdxDesc) const { |
| 599 | return !Hints->allowReordering() && RdxDesc.isOrdered(); |
| 600 | } |
| 601 | |
| 602 | bool VFSelectionContext::runtimeChecksRequired() { |
| 603 | LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n" ); |
| 604 | |
| 605 | Loop *L = const_cast<Loop *>(TheLoop); |
| 606 | if (Legal->getRuntimePointerChecking()->Need) { |
| 607 | reportVectorizationFailure( |
| 608 | DebugMsg: "Runtime ptr check is required with -Os/-Oz" , |
| 609 | OREMsg: "runtime pointer checks needed. Enable vectorization of this " |
| 610 | "loop with '#pragma clang loop vectorize(enable)' when " |
| 611 | "compiling with -Os/-Oz" , |
| 612 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop: L); |
| 613 | return true; |
| 614 | } |
| 615 | |
| 616 | if (!PSE.getPredicate().isAlwaysTrue()) { |
| 617 | reportVectorizationFailure( |
| 618 | DebugMsg: "Runtime SCEV check is required with -Os/-Oz" , |
| 619 | OREMsg: "runtime SCEV checks needed. Enable vectorization of this " |
| 620 | "loop with '#pragma clang loop vectorize(enable)' when " |
| 621 | "compiling with -Os/-Oz" , |
| 622 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop: L); |
| 623 | return true; |
| 624 | } |
| 625 | |
| 626 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
| 627 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
| 628 | reportVectorizationFailure( |
| 629 | DebugMsg: "Runtime stride check for small trip count" , |
| 630 | OREMsg: "runtime stride == 1 checks needed. Enable vectorization of " |
| 631 | "this loop without such check by compiling with -Os/-Oz" , |
| 632 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop: L); |
| 633 | return true; |
| 634 | } |
| 635 | |
| 636 | return false; |
| 637 | } |
| 638 | |
| 639 | void VFSelectionContext::computeMinimalBitwidths() { |
| 640 | MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI); |
| 641 | } |
| 642 | |
| 643 | void VFSelectionContext::collectInLoopReductions() { |
| 644 | // Avoid duplicating work finding in-loop reductions. |
| 645 | if (!InLoopReductions.empty()) |
| 646 | return; |
| 647 | |
| 648 | for (const auto &Reduction : Legal->getReductionVars()) { |
| 649 | PHINode *Phi = Reduction.first; |
| 650 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
| 651 | |
| 652 | // Multi-use reductions (e.g., used in FindLastIV patterns) are handled |
| 653 | // separately and should not be considered for in-loop reductions. |
| 654 | if (RdxDesc.hasUsesOutsideReductionChain()) |
| 655 | continue; |
| 656 | |
| 657 | // We don't collect reductions that are type promoted (yet). |
| 658 | if (RdxDesc.getRecurrenceType() != Phi->getType()) |
| 659 | continue; |
| 660 | |
| 661 | // In-loop AnyOf and FindIV reductions are not yet supported. |
| 662 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
| 663 | if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) || |
| 664 | RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) || |
| 665 | RecurrenceDescriptor::isFindLastRecurrenceKind(Kind)) |
| 666 | continue; |
| 667 | |
| 668 | // If the target would prefer this reduction to happen "in-loop", then we |
| 669 | // want to record it as such. |
| 670 | if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
| 671 | !TTI.preferInLoopReduction(Kind, Ty: Phi->getType())) |
| 672 | continue; |
| 673 | |
| 674 | // Check that we can correctly put the reductions into the loop, by |
| 675 | // finding the chain of operations that leads from the phi to the loop |
| 676 | // exit value. |
| 677 | SmallVector<Instruction *, 4> ReductionOperations = |
| 678 | RdxDesc.getReductionOpChain(Phi, L: const_cast<Loop *>(TheLoop)); |
| 679 | bool InLoop = !ReductionOperations.empty(); |
| 680 | |
| 681 | if (InLoop) { |
| 682 | InLoopReductions.insert(Ptr: Phi); |
| 683 | // Add the elements to InLoopReductionImmediateChains for cost modelling. |
| 684 | Instruction *LastChain = Phi; |
| 685 | for (auto *I : ReductionOperations) { |
| 686 | InLoopReductionImmediateChains[I] = LastChain; |
| 687 | LastChain = I; |
| 688 | } |
| 689 | } |
| 690 | LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop" ) |
| 691 | << " reduction for phi: " << *Phi << "\n" ); |
| 692 | } |
| 693 | } |
| 694 | |
| 695 | bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, |
| 696 | const VectorizationFactor &B, |
| 697 | const unsigned MaxTripCount, |
| 698 | bool HasTail, |
| 699 | bool IsEpilogue) const { |
| 700 | InstructionCost CostA = A.Cost; |
| 701 | InstructionCost CostB = B.Cost; |
| 702 | |
| 703 | // When there is a hint to always prefer scalable vectors, honour that hint. |
| 704 | if (Hints.isScalableVectorizationAlwaysPreferred()) |
| 705 | if (A.Width.isScalable() && CostA.isValid() && !B.Width.isScalable() && |
| 706 | !B.Width.isScalar()) |
| 707 | return true; |
| 708 | |
| 709 | // Improve estimate for the vector width if it is scalable. |
| 710 | unsigned EstimatedWidthA = A.Width.getKnownMinValue(); |
| 711 | unsigned EstimatedWidthB = B.Width.getKnownMinValue(); |
| 712 | if (std::optional<unsigned> VScale = Config.getVScaleForTuning()) { |
| 713 | if (A.Width.isScalable()) |
| 714 | EstimatedWidthA *= *VScale; |
| 715 | if (B.Width.isScalable()) |
| 716 | EstimatedWidthB *= *VScale; |
| 717 | } |
| 718 | |
| 719 | // When optimizing for size choose whichever is smallest, which will be the |
| 720 | // one with the smallest cost for the whole loop. On a tie pick the larger |
| 721 | // vector width, on the assumption that throughput will be greater. |
| 722 | if (Config.CostKind == TTI::TCK_CodeSize) |
| 723 | return CostA < CostB || |
| 724 | (CostA == CostB && EstimatedWidthA > EstimatedWidthB); |
| 725 | |
| 726 | // Assume vscale may be larger than 1 (or the value being tuned for), |
| 727 | // so that scalable vectorization is slightly favorable over fixed-width |
| 728 | // vectorization. |
| 729 | bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) && |
| 730 | A.Width.isScalable() && !B.Width.isScalable(); |
| 731 | |
| 732 | auto CmpFn = [PreferScalable](const InstructionCost &LHS, |
| 733 | const InstructionCost &RHS) { |
| 734 | return PreferScalable ? LHS <= RHS : LHS < RHS; |
| 735 | }; |
| 736 | |
| 737 | // To avoid the need for FP division: |
| 738 | // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB) |
| 739 | // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA) |
| 740 | bool LowerCostWithoutTC = |
| 741 | CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA); |
| 742 | if (!MaxTripCount) |
| 743 | return LowerCostWithoutTC; |
| 744 | |
| 745 | auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF, |
| 746 | InstructionCost VectorCost, |
| 747 | InstructionCost ScalarCost) { |
| 748 | // If the trip count is a known (possibly small) constant, the trip count |
| 749 | // will be rounded up to an integer number of iterations under |
| 750 | // FoldTailByMasking. The total cost in that case will be |
| 751 | // VecCost*ceil(TripCount/VF). When not folding the tail, the total |
| 752 | // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be |
| 753 | // some extra overheads, but for the purpose of comparing the costs of |
| 754 | // different VFs we can use this to compare the total loop-body cost |
| 755 | // expected after vectorization. |
| 756 | if (HasTail) |
| 757 | return VectorCost * (MaxTripCount / VF) + |
| 758 | ScalarCost * (MaxTripCount % VF); |
| 759 | return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF); |
| 760 | }; |
| 761 | |
| 762 | auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost); |
| 763 | auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost); |
| 764 | bool LowerCostWithTC = CmpFn(RTCostA, RTCostB); |
| 765 | LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) { |
| 766 | dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width) |
| 767 | << " has lower cost than VF " |
| 768 | << (LowerCostWithTC ? B.Width : A.Width) |
| 769 | << " when taking the cost of the remaining scalar loop iterations " |
| 770 | "into consideration for a maximum trip count of " |
| 771 | << MaxTripCount << ".\n" ; |
| 772 | }); |
| 773 | return LowerCostWithTC; |
| 774 | } |
| 775 | |
| 776 | bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A, |
| 777 | const VectorizationFactor &B, |
| 778 | bool HasTail, |
| 779 | bool IsEpilogue) const { |
| 780 | const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); |
| 781 | return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail, |
| 782 | IsEpilogue); |
| 783 | } |
| 784 | |
| 785 | // TODO: we could return a pair of values that specify the max VF and |
| 786 | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
| 787 | // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment |
| 788 | // doesn't have a cost model that can choose which plan to execute if |
| 789 | // more than one is generated. |
| 790 | FixedScalableVFPair |
| 791 | VFSelectionContext::computeVPlanOuterloopVF(ElementCount UserVF) { |
| 792 | if (UserVF.isScalable() && !supportsScalableVectors()) { |
| 793 | reportVectorizationFailure( |
| 794 | DebugMsg: "Scalable vectorization requested but not supported by the target" , |
| 795 | OREMsg: "the scalable user-specified vectorization width for outer-loop " |
| 796 | "vectorization cannot be used because the target does not support " |
| 797 | "scalable vectors." , |
| 798 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
| 799 | return FixedScalableVFPair::getNone(); |
| 800 | } |
| 801 | |
| 802 | ElementCount VF = UserVF; |
| 803 | if (VF.isZero()) { |
| 804 | auto [_, WidestType] = getSmallestAndWidestTypes(); |
| 805 | |
| 806 | auto RegKind = TTI.enableScalableVectorization() |
| 807 | ? TargetTransformInfo::RGK_ScalableVector |
| 808 | : TargetTransformInfo::RGK_FixedWidthVector; |
| 809 | |
| 810 | TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind); |
| 811 | // The widest type may be wider than the register width and WidestType may |
| 812 | // not be a power of two; round the element count down to a power of two. |
| 813 | unsigned N = std::max<uint64_t>( |
| 814 | a: 1, b: llvm::bit_floor(Value: RegSize.getKnownMinValue() / WidestType)); |
| 815 | VF = ElementCount::get(MinVal: N, Scalable: RegSize.isScalable()); |
| 816 | LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n" ); |
| 817 | |
| 818 | // Make sure we have a VF > 1 for stress testing. |
| 819 | if (VPlanBuildOuterloopStressTest && VF.isScalar()) { |
| 820 | LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: " |
| 821 | << "overriding computed VF.\n" ); |
| 822 | VF = ElementCount::getFixed(MinVal: 4); |
| 823 | } |
| 824 | } |
| 825 | assert(isPowerOf2_32(VF.getKnownMinValue()) && |
| 826 | "VF needs to be a power of two" ); |
| 827 | if (VF.isScalar()) |
| 828 | return FixedScalableVFPair::getNone(); |
| 829 | LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "" ) |
| 830 | << "VF " << VF << " to build VPlans.\n" ); |
| 831 | return FixedScalableVFPair(VF); |
| 832 | } |
| 833 | |