| 1 | //===- VPlanAnalysis.cpp - Various Analyses working on VPlan ----*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "VPlanAnalysis.h" |
| 10 | #include "VPlan.h" |
| 11 | #include "VPlanCFG.h" |
| 12 | #include "VPlanDominatorTree.h" |
| 13 | #include "VPlanHelpers.h" |
| 14 | #include "VPlanPatternMatch.h" |
| 15 | #include "llvm/ADT/PostOrderIterator.h" |
| 16 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 17 | |
| 18 | using namespace llvm; |
| 19 | using namespace VPlanPatternMatch; |
| 20 | |
| 21 | #define DEBUG_TYPE "vplan" |
| 22 | |
| 23 | void llvm::collectEphemeralRecipesForVPlan( |
| 24 | VPlan &Plan, DenseSet<VPRecipeBase *> &EphRecipes) { |
| 25 | // First, collect seed recipes which are operands of assumes. |
| 26 | SmallVector<VPRecipeBase *> Worklist; |
| 27 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>( |
| 28 | Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) { |
| 29 | for (VPRecipeBase &R : *VPBB) { |
| 30 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); |
| 31 | if (!RepR || !match(V: RepR, P: m_Intrinsic<Intrinsic::assume>())) |
| 32 | continue; |
| 33 | Worklist.push_back(Elt: RepR); |
| 34 | EphRecipes.insert(V: RepR); |
| 35 | } |
| 36 | } |
| 37 | |
| 38 | // Process operands of candidates in worklist and add them to the set of |
| 39 | // ephemeral recipes, if they don't have side-effects and are only used by |
| 40 | // other ephemeral recipes. |
| 41 | while (!Worklist.empty()) { |
| 42 | VPRecipeBase *Cur = Worklist.pop_back_val(); |
| 43 | for (VPValue *Op : Cur->operands()) { |
| 44 | auto *OpR = Op->getDefiningRecipe(); |
| 45 | if (!OpR || OpR->mayHaveSideEffects() || EphRecipes.contains(V: OpR)) |
| 46 | continue; |
| 47 | if (any_of(Range: Op->users(), P: [EphRecipes](VPUser *U) { |
| 48 | auto *UR = dyn_cast<VPRecipeBase>(Val: U); |
| 49 | return !UR || !EphRecipes.contains(V: UR); |
| 50 | })) |
| 51 | continue; |
| 52 | EphRecipes.insert(V: OpR); |
| 53 | Worklist.push_back(Elt: OpR); |
| 54 | } |
| 55 | } |
| 56 | } |
| 57 | |
| 58 | template void DomTreeBuilder::Calculate<DominatorTreeBase<VPBlockBase, false>>( |
| 59 | DominatorTreeBase<VPBlockBase, false> &DT); |
| 60 | |
| 61 | bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, |
| 62 | const VPRecipeBase *B) const { |
| 63 | if (A == B) |
| 64 | return false; |
| 65 | |
| 66 | auto LocalComesBefore = [](const VPRecipeBase *A, const VPRecipeBase *B) { |
| 67 | for (auto &R : *A->getParent()) { |
| 68 | if (&R == A) |
| 69 | return true; |
| 70 | if (&R == B) |
| 71 | return false; |
| 72 | } |
| 73 | llvm_unreachable("recipe not found" ); |
| 74 | }; |
| 75 | const VPBlockBase *ParentA = A->getParent(); |
| 76 | const VPBlockBase *ParentB = B->getParent(); |
| 77 | if (ParentA == ParentB) |
| 78 | return LocalComesBefore(A, B); |
| 79 | |
| 80 | return Base::properlyDominates(A: ParentA, B: ParentB); |
| 81 | } |
| 82 | |
| 83 | InstructionCost |
| 84 | VPRegisterUsage::spillCost(const TargetTransformInfo &TTI, |
| 85 | TargetTransformInfo::TargetCostKind CostKind, |
| 86 | unsigned OverrideMaxNumRegs) const { |
| 87 | InstructionCost Cost; |
| 88 | for (const auto &[RegClass, MaxUsers] : MaxLocalUsers) { |
| 89 | unsigned AvailableRegs = OverrideMaxNumRegs > 0 |
| 90 | ? OverrideMaxNumRegs |
| 91 | : TTI.getNumberOfRegisters(ClassID: RegClass); |
| 92 | if (MaxUsers > AvailableRegs) { |
| 93 | // Assume that for each register used past what's available we get one |
| 94 | // spill and reload. |
| 95 | unsigned Spills = MaxUsers - AvailableRegs; |
| 96 | InstructionCost SpillCost = |
| 97 | TTI.getRegisterClassSpillCost(ClassID: RegClass, CostKind) + |
| 98 | TTI.getRegisterClassReloadCost(ClassID: RegClass, CostKind); |
| 99 | InstructionCost TotalCost = Spills * SpillCost; |
| 100 | LLVM_DEBUG(dbgs() << "LV(REG): Cost of " << TotalCost << " from " |
| 101 | << Spills << " spills of " |
| 102 | << TTI.getRegisterClassName(RegClass) << "\n" ); |
| 103 | Cost += TotalCost; |
| 104 | } |
| 105 | } |
| 106 | return Cost; |
| 107 | } |
| 108 | |
| 109 | SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan( |
| 110 | VPlan &Plan, ArrayRef<ElementCount> VFs, const TargetTransformInfo &TTI, |
| 111 | const SmallPtrSetImpl<const Value *> &ValuesToIgnore) { |
| 112 | // Each 'key' in the map opens a new interval. The values |
| 113 | // of the map are the index of the 'last seen' usage of the |
| 114 | // VPValue that is the key. |
| 115 | using IntervalMap = SmallDenseMap<VPValue *, unsigned, 16>; |
| 116 | |
| 117 | // Maps indices to recipes. |
| 118 | SmallVector<VPRecipeBase *, 64> Idx2Recipe; |
| 119 | // Marks the end of each interval. |
| 120 | IntervalMap EndPoint; |
| 121 | // Saves the list of VPValues that are used in the loop. |
| 122 | SmallPtrSet<VPValue *, 8> Ends; |
| 123 | // Saves the list of values that are used in the loop but are defined outside |
| 124 | // the loop (not including non-recipe values such as arguments and |
| 125 | // constants). |
| 126 | SmallSetVector<VPValue *, 8> LoopInvariants; |
| 127 | if (!Plan.getVectorTripCount().user_empty()) |
| 128 | LoopInvariants.insert(X: &Plan.getVectorTripCount()); |
| 129 | |
| 130 | // We scan the loop in a topological order in order and assign a number to |
| 131 | // each recipe. We use RPO to ensure that defs are met before their users. We |
| 132 | // assume that each recipe that has in-loop users starts an interval. We |
| 133 | // record every time that an in-loop value is used, so we have a list of the |
| 134 | // first occurences of each recipe and last occurrence of each VPValue. |
| 135 | VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); |
| 136 | ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT( |
| 137 | LoopRegion); |
| 138 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: RPOT)) { |
| 139 | if (!VPBB->getParent()) |
| 140 | break; |
| 141 | for (VPRecipeBase &R : *VPBB) { |
| 142 | Idx2Recipe.push_back(Elt: &R); |
| 143 | |
| 144 | // Save the end location of each USE. |
| 145 | for (VPValue *U : R.operands()) { |
| 146 | if (isa<VPRecipeValue>(Val: U)) { |
| 147 | // Overwrite previous end points. |
| 148 | EndPoint[U] = Idx2Recipe.size(); |
| 149 | Ends.insert(Ptr: U); |
| 150 | } else if (auto *IRV = dyn_cast<VPIRValue>(Val: U)) { |
| 151 | // Ignore non-recipe values such as arguments, constants, etc. |
| 152 | // FIXME: Might need some motivation why these values are ignored. If |
| 153 | // for example an argument is used inside the loop it will increase |
| 154 | // the register pressure (so shouldn't we add it to LoopInvariants). |
| 155 | if (!isa<Instruction>(Val: IRV->getValue())) |
| 156 | continue; |
| 157 | // This recipe is outside the loop, record it and continue. |
| 158 | LoopInvariants.insert(X: U); |
| 159 | } |
| 160 | // Other types of VPValue are currently not tracked. |
| 161 | } |
| 162 | } |
| 163 | if (VPBB == LoopRegion->getExiting()) { |
| 164 | // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the |
| 165 | // exiting block, where their increment will get materialized eventually. |
| 166 | for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { |
| 167 | if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R)) { |
| 168 | EndPoint[WideIV] = Idx2Recipe.size(); |
| 169 | Ends.insert(Ptr: WideIV); |
| 170 | } |
| 171 | } |
| 172 | } |
| 173 | } |
| 174 | |
| 175 | // Saves the list of intervals that end with the index in 'key'. |
| 176 | using VPValueList = SmallVector<VPValue *, 2>; |
| 177 | SmallDenseMap<unsigned, VPValueList, 16> TransposeEnds; |
| 178 | |
| 179 | // Next, we transpose the EndPoints into a multi map that holds the list of |
| 180 | // intervals that *end* at a specific location. |
| 181 | for (auto &Interval : EndPoint) |
| 182 | TransposeEnds[Interval.second].push_back(Elt: Interval.first); |
| 183 | |
| 184 | SmallPtrSet<VPValue *, 8> OpenIntervals; |
| 185 | SmallVector<VPRegisterUsage, 8> RUs(VFs.size()); |
| 186 | SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); |
| 187 | |
| 188 | LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n" ); |
| 189 | |
| 190 | const auto &TTICapture = TTI; |
| 191 | auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { |
| 192 | if (Ty->isTokenTy() || !VectorType::isValidElementType(ElemTy: Ty) || |
| 193 | (VF.isScalable() && |
| 194 | !TTICapture.isElementTypeLegalForScalableVector(Ty))) |
| 195 | return 0; |
| 196 | return TTICapture.getRegUsageForType(Ty: VectorType::get(ElementType: Ty, EC: VF)); |
| 197 | }; |
| 198 | |
| 199 | VPValue *CanIV = LoopRegion->getCanonicalIV(); |
| 200 | // Note: canonical IVs are retained even if they have no users. |
| 201 | if (!CanIV->user_empty()) |
| 202 | OpenIntervals.insert(Ptr: CanIV); |
| 203 | |
| 204 | // We scan the instructions linearly and record each time that a new interval |
| 205 | // starts, by placing it in a set. If we find this value in TransposEnds then |
| 206 | // we remove it from the set. The max register usage is the maximum register |
| 207 | // usage of the recipes of the set. |
| 208 | for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) { |
| 209 | VPRecipeBase *R = Idx2Recipe[Idx]; |
| 210 | |
| 211 | // Remove all of the VPValues that end at this location. |
| 212 | VPValueList &List = TransposeEnds[Idx]; |
| 213 | for (VPValue *ToRemove : List) |
| 214 | OpenIntervals.erase(Ptr: ToRemove); |
| 215 | |
| 216 | // Ignore recipes that are never used within the loop and do not have side |
| 217 | // effects. |
| 218 | if (none_of(Range: R->definedValues(), |
| 219 | P: [&Ends](VPValue *Def) { return Ends.count(Ptr: Def); }) && |
| 220 | !R->mayHaveSideEffects()) |
| 221 | continue; |
| 222 | |
| 223 | // Skip recipes for ignored values. |
| 224 | // TODO: Should mark recipes for ephemeral values that cannot be removed |
| 225 | // explictly in VPlan. |
| 226 | if (isa<VPSingleDefRecipe>(Val: R) && |
| 227 | ValuesToIgnore.contains( |
| 228 | Ptr: cast<VPSingleDefRecipe>(Val: R)->getUnderlyingValue())) |
| 229 | continue; |
| 230 | |
| 231 | // For each VF find the maximum usage of registers. |
| 232 | for (unsigned J = 0, E = VFs.size(); J < E; ++J) { |
| 233 | // Count the number of registers used, per register class, given all open |
| 234 | // intervals. |
| 235 | // Note that elements in this SmallMapVector will be default constructed |
| 236 | // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if |
| 237 | // there is no previous entry for ClassID. |
| 238 | SmallMapVector<unsigned, unsigned, 4> RegUsage; |
| 239 | |
| 240 | for (auto *VPV : OpenIntervals) { |
| 241 | // Skip artificial values or values that weren't present in the original |
| 242 | // loop. |
| 243 | // TODO: Remove skipping values that weren't present in the original |
| 244 | // loop after removing the legacy |
| 245 | // LoopVectorizationCostModel::calculateRegisterUsage |
| 246 | if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe, |
| 247 | VPBranchOnMaskRecipe>(Val: VPV) || |
| 248 | match(V: VPV, P: m_ExtractLastPart(Op0: m_VPValue()))) |
| 249 | continue; |
| 250 | |
| 251 | if (VFs[J].isScalar() || |
| 252 | isa<VPRegionValue, VPReplicateRecipe, VPDerivedIVRecipe, |
| 253 | VPCurrentIterationPHIRecipe, VPScalarIVStepsRecipe>(Val: VPV) || |
| 254 | (isa<VPInstruction>(Val: VPV) && vputils::onlyScalarValuesUsed(Def: VPV)) || |
| 255 | (isa<VPReductionPHIRecipe>(Val: VPV) && |
| 256 | (cast<VPReductionPHIRecipe>(Val: VPV))->isInLoop())) { |
| 257 | unsigned ClassID = |
| 258 | TTI.getRegisterClassForType(Vector: false, Ty: VPV->getScalarType()); |
| 259 | // FIXME: The target might use more than one register for the type |
| 260 | // even in the scalar case. |
| 261 | RegUsage[ClassID] += 1; |
| 262 | } else { |
| 263 | // The output from scaled phis and scaled reductions actually has |
| 264 | // fewer lanes than the VF. |
| 265 | unsigned ScaleFactor = |
| 266 | vputils::getVFScaleFactor(R: VPV->getDefiningRecipe()); |
| 267 | ElementCount VF = VFs[J]; |
| 268 | if (ScaleFactor > 1) { |
| 269 | VF = VFs[J].divideCoefficientBy(RHS: ScaleFactor); |
| 270 | LLVM_DEBUG(dbgs() << "LV(REG): Scaled down VF from " << VFs[J] |
| 271 | << " to " << VF << " for " << *R << "\n" ;); |
| 272 | } |
| 273 | |
| 274 | Type *ScalarTy = VPV->getScalarType(); |
| 275 | unsigned ClassID = TTI.getRegisterClassForType(Vector: true, Ty: ScalarTy); |
| 276 | RegUsage[ClassID] += GetRegUsage(ScalarTy, VF); |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | for (const auto &Pair : RegUsage) { |
| 281 | auto &Entry = MaxUsages[J][Pair.first]; |
| 282 | Entry = std::max(a: Entry, b: Pair.second); |
| 283 | } |
| 284 | } |
| 285 | |
| 286 | LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " |
| 287 | << OpenIntervals.size() << '\n'); |
| 288 | |
| 289 | // Add used VPValues defined by the current recipe to the list of open |
| 290 | // intervals. |
| 291 | for (VPValue *DefV : R->definedValues()) |
| 292 | if (Ends.contains(Ptr: DefV)) |
| 293 | OpenIntervals.insert(Ptr: DefV); |
| 294 | } |
| 295 | |
| 296 | // We also search for instructions that are defined outside the loop, but are |
| 297 | // used inside the loop. We need this number separately from the max-interval |
| 298 | // usage number because when we unroll, loop-invariant values do not take |
| 299 | // more register. |
| 300 | VPRegisterUsage RU; |
| 301 | for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { |
| 302 | // Note that elements in this SmallMapVector will be default constructed |
| 303 | // as 0. So we can use "Invariant[ClassID] += n" in the code below even if |
| 304 | // there is no previous entry for ClassID. |
| 305 | SmallMapVector<unsigned, unsigned, 4> Invariant; |
| 306 | |
| 307 | for (auto *In : LoopInvariants) { |
| 308 | // FIXME: The target might use more than one register for the type |
| 309 | // even in the scalar case. |
| 310 | bool IsScalar = vputils::onlyScalarValuesUsed(Def: In); |
| 311 | |
| 312 | ElementCount VF = IsScalar ? ElementCount::getFixed(MinVal: 1) : VFs[Idx]; |
| 313 | unsigned ClassID = |
| 314 | TTI.getRegisterClassForType(Vector: VF.isVector(), Ty: In->getScalarType()); |
| 315 | Invariant[ClassID] += GetRegUsage(In->getScalarType(), VF); |
| 316 | } |
| 317 | |
| 318 | LLVM_DEBUG({ |
| 319 | dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; |
| 320 | dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() |
| 321 | << " item\n" ; |
| 322 | for (const auto &pair : MaxUsages[Idx]) { |
| 323 | dbgs() << "LV(REG): RegisterClass: " |
| 324 | << TTI.getRegisterClassName(pair.first) << ", " << pair.second |
| 325 | << " registers\n" ; |
| 326 | } |
| 327 | dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() |
| 328 | << " item\n" ; |
| 329 | for (const auto &pair : Invariant) { |
| 330 | dbgs() << "LV(REG): RegisterClass: " |
| 331 | << TTI.getRegisterClassName(pair.first) << ", " << pair.second |
| 332 | << " registers\n" ; |
| 333 | } |
| 334 | }); |
| 335 | |
| 336 | RU.LoopInvariantRegs = Invariant; |
| 337 | RU.MaxLocalUsers = MaxUsages[Idx]; |
| 338 | RUs[Idx] = RU; |
| 339 | } |
| 340 | |
| 341 | return RUs; |
| 342 | } |
| 343 | |