| 1 | //===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// |
| 9 | /// \file |
| 10 | /// This file implements explicit unrolling for VPlans. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "VPRecipeBuilder.h" |
| 15 | #include "VPlan.h" |
| 16 | #include "VPlanAnalysis.h" |
| 17 | #include "VPlanCFG.h" |
| 18 | #include "VPlanHelpers.h" |
| 19 | #include "VPlanPatternMatch.h" |
| 20 | #include "VPlanTransforms.h" |
| 21 | #include "VPlanUtils.h" |
| 22 | #include "llvm/ADT/PostOrderIterator.h" |
| 23 | #include "llvm/ADT/STLExtras.h" |
| 24 | #include "llvm/ADT/ScopeExit.h" |
| 25 | #include "llvm/Analysis/IVDescriptors.h" |
| 26 | #include "llvm/IR/Constants.h" |
| 27 | #include "llvm/IR/Intrinsics.h" |
| 28 | |
| 29 | using namespace llvm; |
| 30 | using namespace llvm::VPlanPatternMatch; |
| 31 | |
| 32 | namespace { |
| 33 | |
| 34 | /// Helper to hold state needed for unrolling. It holds the Plan to unroll by |
| 35 | /// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate |
| 36 | /// the unrolling transformation, where the original VPValues are retained for |
| 37 | /// part zero. |
| 38 | class UnrollState { |
| 39 | /// Plan to unroll. |
| 40 | VPlan &Plan; |
| 41 | /// Unroll factor to unroll by. |
| 42 | const unsigned UF; |
| 43 | |
| 44 | /// Unrolling may create recipes that should not be unrolled themselves. |
| 45 | /// Those are tracked in ToSkip. |
| 46 | SmallPtrSet<VPRecipeBase *, 8> ToSkip; |
| 47 | |
| 48 | // Associate with each VPValue of part 0 its unrolled instances of parts 1, |
| 49 | // ..., UF-1. |
| 50 | DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts; |
| 51 | |
| 52 | /// Unroll replicate region \p VPR by cloning the region UF - 1 times. |
| 53 | void unrollReplicateRegionByUF(VPRegionBlock *VPR); |
| 54 | |
| 55 | /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across |
| 56 | /// all parts. |
| 57 | void unrollRecipeByUF(VPRecipeBase &R); |
| 58 | |
| 59 | /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled |
| 60 | /// depends on the concrete header phi. Inserts newly created recipes at \p |
| 61 | /// InsertPtForPhi. |
| 62 | void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, |
| 63 | VPBasicBlock::iterator InsertPtForPhi); |
| 64 | |
| 65 | /// Unroll a widen induction recipe \p IV. This introduces recipes to compute |
| 66 | /// the induction steps for each part. |
| 67 | void unrollWidenInductionByUF(VPWidenInductionRecipe *IV, |
| 68 | VPBasicBlock::iterator InsertPtForPhi); |
| 69 | |
| 70 | VPValue *getConstantInt(unsigned Part) { |
| 71 | Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType(); |
| 72 | return Plan.getConstantInt(Ty: CanIVIntTy, Val: Part); |
| 73 | } |
| 74 | |
| 75 | public: |
| 76 | UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF) {} |
| 77 | |
| 78 | void unrollBlock(VPBlockBase *VPB); |
| 79 | |
| 80 | VPValue *getValueForPart(VPValue *V, unsigned Part) { |
| 81 | if (Part == 0 || isa<VPIRValue, VPSymbolicValue, VPRegionValue>(Val: V)) |
| 82 | return V; |
| 83 | assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) && |
| 84 | "accessed value does not exist" ); |
| 85 | return VPV2Parts[V][Part - 1]; |
| 86 | } |
| 87 | |
| 88 | /// Given a single original recipe \p OrigR (of part zero), and its copy \p |
| 89 | /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its |
| 90 | /// corresponding VPValue defined by \p CopyR. |
| 91 | void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, |
| 92 | unsigned Part) { |
| 93 | for (const auto &[Idx, VPV] : enumerate(First: OrigR->definedValues())) { |
| 94 | const auto &[V, _] = VPV2Parts.try_emplace(Key: VPV); |
| 95 | assert(V->second.size() == Part - 1 && "earlier parts not set" ); |
| 96 | V->second.push_back(Elt: CopyR->getVPValue(I: Idx)); |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | /// Given a uniform recipe \p R, add it for all parts. |
| 101 | void addUniformForAllParts(VPSingleDefRecipe *R) { |
| 102 | const auto &[V, Inserted] = VPV2Parts.try_emplace(Key: R); |
| 103 | assert(Inserted && "uniform value already added" ); |
| 104 | for (unsigned Part = 0; Part != UF; ++Part) |
| 105 | V->second.push_back(Elt: R); |
| 106 | } |
| 107 | |
| 108 | bool contains(VPValue *VPV) const { return VPV2Parts.contains(Val: VPV); } |
| 109 | |
| 110 | /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part |
| 111 | /// \p P. |
| 112 | void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) { |
| 113 | auto *Op = R->getOperand(N: OpIdx); |
| 114 | R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part)); |
| 115 | } |
| 116 | |
| 117 | /// Update \p R's operands with their corresponding VPValues for part \p P. |
| 118 | void remapOperands(VPRecipeBase *R, unsigned Part) { |
| 119 | for (const auto &[OpIdx, Op] : enumerate(First: R->operands())) |
| 120 | R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part)); |
| 121 | } |
| 122 | }; |
| 123 | } // namespace |
| 124 | |
| 125 | static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps, |
| 126 | unsigned Part, VPlan &Plan) { |
| 127 | if (Part == 0) |
| 128 | return; |
| 129 | |
| 130 | VPBuilder Builder(Steps); |
| 131 | Type *BaseIVTy = Steps->getOperand(N: 0)->getScalarType(); |
| 132 | Type *IntStepTy = |
| 133 | IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits()); |
| 134 | VPValue *StartIndex = Steps->getVFValue(); |
| 135 | if (Part > 1) { |
| 136 | StartIndex = Builder.createOverflowingOp( |
| 137 | Opcode: Instruction::Mul, |
| 138 | Operands: {StartIndex, Plan.getConstantInt(Ty: StartIndex->getScalarType(), Val: Part)}); |
| 139 | } |
| 140 | StartIndex = Builder.createScalarSExtOrTrunc( |
| 141 | Op: StartIndex, ResultTy: IntStepTy, SrcTy: StartIndex->getScalarType(), DL: Steps->getDebugLoc()); |
| 142 | |
| 143 | if (BaseIVTy->isFloatingPointTy()) |
| 144 | StartIndex = Builder.createScalarCast(Opcode: Instruction::SIToFP, Op: StartIndex, |
| 145 | ResultTy: BaseIVTy, DL: Steps->getDebugLoc()); |
| 146 | |
| 147 | Steps->setStartIndex(StartIndex); |
| 148 | } |
| 149 | |
| 150 | void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { |
| 151 | VPBlockBase *InsertPt = VPR->getSingleSuccessor(); |
| 152 | for (unsigned Part = 1; Part != UF; ++Part) { |
| 153 | auto *Copy = VPR->clone(); |
| 154 | VPBlockUtils::insertBlockBefore(NewBlock: Copy, BlockPtr: InsertPt); |
| 155 | |
| 156 | auto PartI = vp_depth_first_shallow(G: Copy->getEntry()); |
| 157 | auto Part0 = vp_depth_first_shallow(G: VPR->getEntry()); |
| 158 | for (const auto &[PartIVPBB, Part0VPBB] : |
| 159 | zip(t: VPBlockUtils::blocksAs<VPBasicBlock>(Range&: PartI), |
| 160 | u: VPBlockUtils::blocksAs<VPBasicBlock>(Range&: Part0))) { |
| 161 | for (const auto &[PartIR, Part0R] : zip(t&: *PartIVPBB, u&: *Part0VPBB)) { |
| 162 | remapOperands(R: &PartIR, Part); |
| 163 | if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &PartIR)) |
| 164 | addStartIndexForScalarSteps(Steps, Part, Plan); |
| 165 | |
| 166 | addRecipeForPart(OrigR: &Part0R, CopyR: &PartIR, Part); |
| 167 | } |
| 168 | } |
| 169 | } |
| 170 | } |
| 171 | |
| 172 | void UnrollState::unrollWidenInductionByUF( |
| 173 | VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) { |
| 174 | VPBasicBlock *PH = cast<VPBasicBlock>( |
| 175 | Val: IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor()); |
| 176 | Type *IVTy = IV->getScalarType(); |
| 177 | auto &ID = IV->getInductionDescriptor(); |
| 178 | FastMathFlags FMF; |
| 179 | VPIRFlags::WrapFlagsTy WrapFlags(false, false); |
| 180 | if (auto *IntOrFPInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: IV)) { |
| 181 | FMF = IntOrFPInd->getFastMathFlagsOrNone(); |
| 182 | if (IntOrFPInd->hasNoWrapFlags()) |
| 183 | WrapFlags = IntOrFPInd->getNoWrapFlags(); |
| 184 | } |
| 185 | |
| 186 | VPValue *ScalarStep = IV->getStepValue(); |
| 187 | VPBuilder Builder(PH); |
| 188 | Type *VectorStepTy = IVTy->isPointerTy() ? ScalarStep->getScalarType() : IVTy; |
| 189 | VPInstruction *VectorStep = Builder.createNaryOp( |
| 190 | Opcode: VPInstruction::WideIVStep, Operands: {&Plan.getVF(), ScalarStep}, ResultTy: VectorStepTy, Flags: FMF, |
| 191 | DL: IV->getDebugLoc()); |
| 192 | |
| 193 | ToSkip.insert(Ptr: VectorStep); |
| 194 | |
| 195 | // Now create recipes to compute the induction steps for part 1 .. UF. Part 0 |
| 196 | // remains the header phi. Parts > 0 are computed by adding Step to the |
| 197 | // previous part. The header phi recipe will get 2 new operands: the step |
| 198 | // value for a single part and the last part, used to compute the backedge |
| 199 | // value during VPWidenInductionRecipe::execute. |
| 200 | // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3 |
| 201 | // %Part.1 = %Part.0 + %VectorStep |
| 202 | // %Part.2 = %Part.1 + %VectorStep |
| 203 | // %Part.3 = %Part.2 + %VectorStep |
| 204 | // |
| 205 | // The newly added recipes are added to ToSkip to avoid interleaving them |
| 206 | // again. |
| 207 | VPValue *Prev = IV; |
| 208 | Builder.setInsertPoint(TheBB: IV->getParent(), IP: InsertPtForPhi); |
| 209 | unsigned AddOpc; |
| 210 | VPIRFlags AddFlags; |
| 211 | if (IVTy->isPointerTy()) { |
| 212 | AddOpc = VPInstruction::WidePtrAdd; |
| 213 | AddFlags = GEPNoWrapFlags::none(); |
| 214 | } else if (IVTy->isFloatingPointTy()) { |
| 215 | AddOpc = ID.getInductionOpcode(); |
| 216 | AddFlags = FMF; |
| 217 | } else { |
| 218 | AddOpc = Instruction::Add; |
| 219 | AddFlags = WrapFlags; |
| 220 | if (cast<VPWidenIntOrFpInductionRecipe>(Val: IV)->isCanonical()) |
| 221 | AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false); |
| 222 | } |
| 223 | for (unsigned Part = 1; Part != UF; ++Part) { |
| 224 | std::string Name = |
| 225 | Part > 1 ? "step.add." + std::to_string(val: Part) : "step.add" ; |
| 226 | |
| 227 | VPInstruction *Add = |
| 228 | Builder.createNaryOp(Opcode: AddOpc, |
| 229 | Operands: { |
| 230 | Prev, |
| 231 | VectorStep, |
| 232 | }, |
| 233 | Flags: AddFlags, DL: IV->getDebugLoc(), Name); |
| 234 | ToSkip.insert(Ptr: Add); |
| 235 | addRecipeForPart(OrigR: IV, CopyR: Add, Part); |
| 236 | Prev = Add; |
| 237 | } |
| 238 | IV->addUnrolledPartOperands(SplatVFStep: VectorStep, LastPart: Prev); |
| 239 | } |
| 240 | |
| 241 | void UnrollState::(VPHeaderPHIRecipe *R, |
| 242 | VPBasicBlock::iterator InsertPtForPhi) { |
| 243 | // First-order recurrences pass a single vector or scalar through their header |
| 244 | // phis, irrespective of interleaving. |
| 245 | if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: R)) |
| 246 | return; |
| 247 | |
| 248 | // Generate step vectors for each unrolled part. |
| 249 | if (auto *IV = dyn_cast<VPWidenInductionRecipe>(Val: R)) { |
| 250 | unrollWidenInductionByUF(IV, InsertPtForPhi); |
| 251 | return; |
| 252 | } |
| 253 | |
| 254 | auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: R); |
| 255 | if (RdxPhi && RdxPhi->isOrdered()) |
| 256 | return; |
| 257 | |
| 258 | auto InsertPt = std::next(x: R->getIterator()); |
| 259 | for (unsigned Part = 1; Part != UF; ++Part) { |
| 260 | VPRecipeBase *Copy = R->clone(); |
| 261 | Copy->insertBefore(BB&: *R->getParent(), IP: InsertPt); |
| 262 | addRecipeForPart(OrigR: R, CopyR: Copy, Part); |
| 263 | if (RdxPhi) { |
| 264 | // If the start value is a ReductionStartVector, use the identity value |
| 265 | // (second operand) for unrolled parts. If the scaling factor is > 1, |
| 266 | // create a new ReductionStartVector with the scale factor and both |
| 267 | // operands set to the identity value. |
| 268 | if (auto *VPI = dyn_cast<VPInstruction>(Val: RdxPhi->getStartValue())) { |
| 269 | assert(VPI->getOpcode() == VPInstruction::ReductionStartVector && |
| 270 | "unexpected start VPInstruction" ); |
| 271 | if (Part != 1) |
| 272 | continue; |
| 273 | VPValue *StartV; |
| 274 | if (match(V: VPI->getOperand(N: 2), P: m_One())) { |
| 275 | StartV = VPI->getOperand(N: 1); |
| 276 | } else { |
| 277 | auto *C = VPI->clone(); |
| 278 | C->setOperand(I: 0, New: C->getOperand(N: 1)); |
| 279 | C->insertAfter(InsertPos: VPI); |
| 280 | StartV = C; |
| 281 | } |
| 282 | for (unsigned Part = 1; Part != UF; ++Part) |
| 283 | VPV2Parts[VPI][Part - 1] = StartV; |
| 284 | } |
| 285 | } else { |
| 286 | assert(isa<VPActiveLaneMaskPHIRecipe>(R) && |
| 287 | "unexpected header phi recipe not needing unrolled part" ); |
| 288 | } |
| 289 | } |
| 290 | } |
| 291 | |
| 292 | /// Handle non-header-phi recipes. |
| 293 | void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { |
| 294 | if (match(V: &R, P: m_CombineOr(Ps: m_BranchOnCond(), Ps: m_BranchOnCount()))) |
| 295 | return; |
| 296 | |
| 297 | if (auto *VPI = dyn_cast<VPInstruction>(Val: &R)) { |
| 298 | if (vputils::onlyFirstPartUsed(Def: VPI)) { |
| 299 | addUniformForAllParts(R: VPI); |
| 300 | return; |
| 301 | } |
| 302 | } |
| 303 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) { |
| 304 | if (isa<StoreInst>(Val: RepR->getUnderlyingValue()) && |
| 305 | RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions()) { |
| 306 | // Stores to an invariant address only need to store the last part. |
| 307 | remapOperands(R: &R, Part: UF - 1); |
| 308 | return; |
| 309 | } |
| 310 | if (match(V: RepR, |
| 311 | P: m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) { |
| 312 | addUniformForAllParts(R: RepR); |
| 313 | return; |
| 314 | } |
| 315 | } |
| 316 | |
| 317 | // Unroll non-uniform recipes. |
| 318 | auto InsertPt = std::next(x: R.getIterator()); |
| 319 | VPBasicBlock &VPBB = *R.getParent(); |
| 320 | for (unsigned Part = 1; Part != UF; ++Part) { |
| 321 | VPRecipeBase *Copy = R.clone(); |
| 322 | Copy->insertBefore(BB&: VPBB, IP: InsertPt); |
| 323 | addRecipeForPart(OrigR: &R, CopyR: Copy, Part); |
| 324 | |
| 325 | // Phi operands are updated once all other recipes have been unrolled. |
| 326 | if (isa<VPWidenPHIRecipe>(Val: Copy)) |
| 327 | continue; |
| 328 | |
| 329 | VPValue *Op; |
| 330 | if (match(V: &R, P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>( |
| 331 | Ops: m_VPValue(), Ops: m_VPValue(V&: Op)))) { |
| 332 | Copy->setOperand(I: 0, New: getValueForPart(V: Op, Part: Part - 1)); |
| 333 | Copy->setOperand(I: 1, New: getValueForPart(V: Op, Part)); |
| 334 | continue; |
| 335 | } |
| 336 | if (isa<VPVectorPointerRecipe, VPWidenCanonicalIVRecipe>(Val: R)) { |
| 337 | VPBuilder Builder(&R); |
| 338 | const DataLayout &DL = Plan.getDataLayout(); |
| 339 | Type *IndexTy = |
| 340 | isa<VPWidenCanonicalIVRecipe>(Val: R) |
| 341 | ? Plan.getVectorLoopRegion()->getCanonicalIVType() |
| 342 | : DL.getIndexType(PtrTy: R.getVPSingleValue()->getScalarType()); |
| 343 | Type *VFTy = Plan.getVF().getScalarType(); |
| 344 | VPValue *VF = Builder.createScalarZExtOrTrunc( |
| 345 | Op: &Plan.getVF(), ResultTy: IndexTy, SrcTy: VFTy, DL: DebugLoc::getUnknown()); |
| 346 | // VFxUF does not wrap, so VF * Part also cannot wrap. |
| 347 | VPValue *VFxPart = Builder.createOverflowingOp( |
| 348 | Opcode: Instruction::Mul, Operands: {VF, Plan.getConstantInt(Ty: IndexTy, Val: Part)}, |
| 349 | WrapFlags: {true, true}); |
| 350 | if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: Copy)) |
| 351 | VecPtr->addPerPartOffset(VFxPart); |
| 352 | else |
| 353 | cast<VPWidenCanonicalIVRecipe>(Val: Copy)->addPerPartStep(Step: VFxPart); |
| 354 | continue; |
| 355 | } |
| 356 | if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) { |
| 357 | auto *Phi = dyn_cast<VPReductionPHIRecipe>(Val: R.getOperand(N: 0)); |
| 358 | if (Phi && Phi->isOrdered()) { |
| 359 | auto &Parts = VPV2Parts[Phi]; |
| 360 | if (Part == 1) { |
| 361 | Parts.clear(); |
| 362 | Parts.push_back(Elt: Red); |
| 363 | } |
| 364 | Parts.push_back(Elt: Copy->getVPSingleValue()); |
| 365 | Phi->setOperand(I: 1, New: Copy->getVPSingleValue()); |
| 366 | } |
| 367 | } |
| 368 | if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: Copy)) { |
| 369 | // Materialize PartN offset for VectorEndPointer. |
| 370 | VEPR->setOperand(I: 0, New: R.getOperand(N: 0)); |
| 371 | VEPR->setOperand(I: 1, New: R.getOperand(N: 1)); |
| 372 | VEPR->materializeOffset(Part); |
| 373 | continue; |
| 374 | } |
| 375 | |
| 376 | remapOperands(R: Copy, Part); |
| 377 | |
| 378 | if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: Copy)) |
| 379 | addStartIndexForScalarSteps(Steps: ScalarIVSteps, Part, Plan); |
| 380 | |
| 381 | if (match(V: Copy, |
| 382 | P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) { |
| 383 | VPBuilder Builder(Copy); |
| 384 | VPValue *ScaledByPart = Builder.createOverflowingOp( |
| 385 | Opcode: Instruction::Mul, Operands: {Copy->getOperand(N: 1), getConstantInt(Part)}); |
| 386 | Copy->setOperand(I: 1, New: ScaledByPart); |
| 387 | } |
| 388 | } |
| 389 | if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) { |
| 390 | // Materialize Part0 offset for VectorEndPointer. |
| 391 | VEPR->materializeOffset(); |
| 392 | } |
| 393 | if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: &R)) { |
| 394 | // Set Part0 step for WidenCanonicalIV. |
| 395 | WideCanIV->addPerPartStep(Step: getConstantInt(Part: 0)); |
| 396 | } |
| 397 | } |
| 398 | |
| 399 | void UnrollState::unrollBlock(VPBlockBase *VPB) { |
| 400 | auto *VPR = dyn_cast<VPRegionBlock>(Val: VPB); |
| 401 | if (VPR) { |
| 402 | if (VPR->isReplicator()) |
| 403 | return unrollReplicateRegionByUF(VPR); |
| 404 | |
| 405 | // Traverse blocks in region in RPO to ensure defs are visited before uses |
| 406 | // across blocks. |
| 407 | ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> |
| 408 | RPOT(VPR->getEntry()); |
| 409 | for (VPBlockBase *VPB : RPOT) |
| 410 | unrollBlock(VPB); |
| 411 | return; |
| 412 | } |
| 413 | |
| 414 | // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes. |
| 415 | auto *VPBB = cast<VPBasicBlock>(Val: VPB); |
| 416 | auto InsertPtForPhi = VPBB->getFirstNonPhi(); |
| 417 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
| 418 | if (ToSkip.contains(Ptr: &R) || isa<VPIRInstruction>(Val: &R)) |
| 419 | continue; |
| 420 | |
| 421 | // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and |
| 422 | // ComputeReductionResult which combine all parts to compute the final |
| 423 | // value. |
| 424 | VPValue *Op1; |
| 425 | if (match(V: &R, P: m_VPInstruction<VPInstruction::AnyOf>(Ops: m_VPValue(V&: Op1))) || |
| 426 | match(V: &R, P: m_FirstActiveLane(Op0: m_VPValue(V&: Op1))) || |
| 427 | match(V: &R, P: m_LastActiveLane(Op0: m_VPValue(V&: Op1))) || |
| 428 | match(V: &R, P: m_ComputeReductionResult(Op0: m_VPValue(V&: Op1)))) { |
| 429 | auto *VPI = cast<VPInstruction>(Val: &R); |
| 430 | addUniformForAllParts(R: VPI); |
| 431 | for (unsigned Part = 1; Part != UF; ++Part) |
| 432 | VPI->addOperand(Op: getValueForPart(V: Op1, Part)); |
| 433 | continue; |
| 434 | } |
| 435 | VPValue *Op0; |
| 436 | if (match(V: &R, P: m_ExtractLane(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)))) { |
| 437 | auto *VPI = cast<VPInstruction>(Val: &R); |
| 438 | addUniformForAllParts(R: VPI); |
| 439 | for (unsigned Part = 1; Part != UF; ++Part) |
| 440 | VPI->addOperand(Op: getValueForPart(V: Op1, Part)); |
| 441 | continue; |
| 442 | } |
| 443 | |
| 444 | VPValue *Op2; |
| 445 | if (match(V: &R, P: m_ExtractLastActive(Op0: m_VPValue(), Op1: m_VPValue(V&: Op1), |
| 446 | Op2: m_VPValue(V&: Op2)))) { |
| 447 | auto *VPI = cast<VPInstruction>(Val: &R); |
| 448 | addUniformForAllParts(R: VPI); |
| 449 | for (unsigned Part = 1; Part != UF; ++Part) { |
| 450 | VPI->addOperand(Op: getValueForPart(V: Op1, Part)); |
| 451 | VPI->addOperand(Op: getValueForPart(V: Op2, Part)); |
| 452 | } |
| 453 | continue; |
| 454 | } |
| 455 | |
| 456 | if (Plan.hasScalarVFOnly()) { |
| 457 | if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V&: Op0))) || |
| 458 | match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) { |
| 459 | auto *I = cast<VPInstruction>(Val: &R); |
| 460 | bool IsPenultimatePart = |
| 461 | I->getOpcode() == VPInstruction::ExtractPenultimateElement; |
| 462 | unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1; |
| 463 | // For scalar VF, directly use the scalar part value. |
| 464 | I->replaceAllUsesWith(New: getValueForPart(V: Op0, Part: PartIdx)); |
| 465 | continue; |
| 466 | } |
| 467 | } |
| 468 | // For vector VF, the penultimate element is always extracted from the last part. |
| 469 | if (match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op0))) || |
| 470 | match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) { |
| 471 | addUniformForAllParts(R: cast<VPSingleDefRecipe>(Val: &R)); |
| 472 | R.setOperand(I: 0, New: getValueForPart(V: Op0, Part: UF - 1)); |
| 473 | continue; |
| 474 | } |
| 475 | |
| 476 | auto *SingleDef = dyn_cast<VPSingleDefRecipe>(Val: &R); |
| 477 | if (SingleDef && vputils::isUniformAcrossVFsAndUFs(V: SingleDef)) { |
| 478 | addUniformForAllParts(R: SingleDef); |
| 479 | continue; |
| 480 | } |
| 481 | |
| 482 | if (auto *H = dyn_cast<VPHeaderPHIRecipe>(Val: &R)) { |
| 483 | unrollHeaderPHIByUF(R: H, InsertPtForPhi); |
| 484 | continue; |
| 485 | } |
| 486 | |
| 487 | unrollRecipeByUF(R); |
| 488 | } |
| 489 | } |
| 490 | |
| 491 | void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { |
| 492 | assert(UF > 0 && "Unroll factor must be positive" ); |
| 493 | Plan.setUF(UF); |
| 494 | llvm::scope_exit Cleanup([&Plan, UF]() { |
| 495 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); |
| 496 | // Remove recipes that are redundant after unrolling. |
| 497 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: Iter)) { |
| 498 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
| 499 | auto *VPI = dyn_cast<VPInstruction>(Val: &R); |
| 500 | if (VPI && |
| 501 | VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart && |
| 502 | VPI->getOperand(N: 1) == &Plan.getVF()) { |
| 503 | VPI->replaceAllUsesWith(New: VPI->getOperand(N: 0)); |
| 504 | VPI->eraseFromParent(); |
| 505 | } |
| 506 | } |
| 507 | } |
| 508 | |
| 509 | Type *TCTy = Plan.getTripCount()->getScalarType(); |
| 510 | Plan.getUF().replaceAllUsesWith(New: Plan.getConstantInt(Ty: TCTy, Val: UF)); |
| 511 | }); |
| 512 | if (UF == 1) { |
| 513 | return; |
| 514 | } |
| 515 | |
| 516 | UnrollState Unroller(Plan, UF); |
| 517 | |
| 518 | // Iterate over all blocks in the plan starting from Entry, and unroll |
| 519 | // recipes inside them. This includes the vector preheader and middle blocks, |
| 520 | // which may set up or post-process per-part values. |
| 521 | ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( |
| 522 | Plan.getEntry()); |
| 523 | for (VPBlockBase *VPB : RPOT) |
| 524 | Unroller.unrollBlock(VPB); |
| 525 | |
| 526 | unsigned Part = 1; |
| 527 | // Remap operands of cloned header phis to update backedge values. The header |
| 528 | // phis cloned during unrolling are just after the header phi for part 0. |
| 529 | // Reset Part to 1 when reaching the first (part 0) recipe of a block. |
| 530 | for (VPRecipeBase &H : |
| 531 | Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
| 532 | // The second operand of Fixed Order Recurrence phi's, feeding the spliced |
| 533 | // value across the backedge, needs to remap to the last part of the spliced |
| 534 | // value. |
| 535 | if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &H)) { |
| 536 | Unroller.remapOperand(R: &H, OpIdx: 1, Part: UF - 1); |
| 537 | continue; |
| 538 | } |
| 539 | if (Unroller.contains(VPV: H.getVPSingleValue())) { |
| 540 | Part = 1; |
| 541 | continue; |
| 542 | } |
| 543 | Unroller.remapOperands(R: &H, Part); |
| 544 | Part++; |
| 545 | } |
| 546 | |
| 547 | VPlanTransforms::removeDeadRecipes(Plan); |
| 548 | } |
| 549 | |
| 550 | /// Add a lane offset to the start index of \p Steps. |
| 551 | static void addLaneToStartIndex(VPScalarIVStepsRecipe *Steps, unsigned Lane, |
| 552 | VPlan &Plan, VPRecipeBase *InsertPt) { |
| 553 | assert(Lane > 0 && "Zero lane adds no offset to start index" ); |
| 554 | Type *BaseIVTy = Steps->getOperand(N: 0)->getScalarType(); |
| 555 | |
| 556 | VPValue *OldStartIndex = Steps->getStartIndex(); |
| 557 | VPValue *LaneOffset; |
| 558 | unsigned AddOpcode; |
| 559 | // TODO: Retrieve the flags from Steps unconditionally. |
| 560 | VPIRFlags Flags; |
| 561 | if (BaseIVTy->isFloatingPointTy()) { |
| 562 | int SignedLane = static_cast<int>(Lane); |
| 563 | if (!OldStartIndex && Steps->getInductionOpcode() == Instruction::FSub) |
| 564 | SignedLane = -SignedLane; |
| 565 | LaneOffset = Plan.getOrAddLiveIn(V: ConstantFP::get(Ty: BaseIVTy, V: SignedLane)); |
| 566 | AddOpcode = Steps->getInductionOpcode(); |
| 567 | Flags = VPIRFlags(FastMathFlags()); |
| 568 | } else { |
| 569 | unsigned BaseIVBits = BaseIVTy->getScalarSizeInBits(); |
| 570 | LaneOffset = Plan.getConstantInt( |
| 571 | Val: APInt(BaseIVBits, Lane, /*isSigned*/ false, /*implicitTrunc*/ true)); |
| 572 | AddOpcode = Instruction::Add; |
| 573 | Flags = VPIRFlags(VPIRFlags::WrapFlagsTy(false, false)); |
| 574 | } |
| 575 | |
| 576 | VPValue *NewStartIndex = LaneOffset; |
| 577 | if (OldStartIndex) { |
| 578 | VPBuilder Builder(InsertPt); |
| 579 | NewStartIndex = |
| 580 | Builder.createNaryOp(Opcode: AddOpcode, Operands: {OldStartIndex, LaneOffset}, Flags); |
| 581 | } |
| 582 | Steps->setStartIndex(NewStartIndex); |
| 583 | } |
| 584 | |
| 585 | /// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe, |
| 586 | /// VPInstruction or VPScalarIVStepsRecipe) for lane \p Lane. Use \p |
| 587 | /// Def2LaneDefs to look up scalar definitions for operands of \DefR. |
| 588 | static VPValue * |
| 589 | cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy, |
| 590 | VPSingleDefRecipe *DefR, VPLane Lane, |
| 591 | const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) { |
| 592 | assert((isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(DefR)) && |
| 593 | "DefR must be a VPReplicateRecipe, VPInstruction or " |
| 594 | "VPScalarIVStepsRecipe" ); |
| 595 | VPValue *Op; |
| 596 | if (match(R: DefR, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)))) { |
| 597 | auto LaneDefs = Def2LaneDefs.find(Val: Op); |
| 598 | if (LaneDefs != Def2LaneDefs.end()) |
| 599 | return LaneDefs->second[Lane.getKnownLane()]; |
| 600 | |
| 601 | VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane()); |
| 602 | return Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx}); |
| 603 | } |
| 604 | |
| 605 | // Collect the operands at Lane, creating extracts as needed. |
| 606 | SmallVector<VPValue *> NewOps; |
| 607 | for (VPValue *Op : DefR->operands()) { |
| 608 | // If Op is a definition that has been unrolled, directly use the clone for |
| 609 | // the corresponding lane. |
| 610 | auto LaneDefs = Def2LaneDefs.find(Val: Op); |
| 611 | if (LaneDefs != Def2LaneDefs.end()) { |
| 612 | NewOps.push_back(Elt: LaneDefs->second[Lane.getKnownLane()]); |
| 613 | continue; |
| 614 | } |
| 615 | if (Lane.getKind() == VPLane::Kind::ScalableLast) { |
| 616 | // Look through mandatory Unpack. |
| 617 | [[maybe_unused]] bool Matched = |
| 618 | match(V: Op, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op))); |
| 619 | assert(Matched && "original op must have been Unpack" ); |
| 620 | auto * = |
| 621 | Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: {Op}); |
| 622 | NewOps.push_back( |
| 623 | Elt: Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: {ExtractPart})); |
| 624 | continue; |
| 625 | } |
| 626 | if (vputils::isSingleScalar(VPV: Op)) { |
| 627 | NewOps.push_back(Elt: Op); |
| 628 | continue; |
| 629 | } |
| 630 | |
| 631 | // Look through buildvector to avoid unnecessary extracts. |
| 632 | if (match(V: Op, P: m_BuildVector())) { |
| 633 | NewOps.push_back( |
| 634 | Elt: cast<VPInstruction>(Val: Op)->getOperand(N: Lane.getKnownLane())); |
| 635 | continue; |
| 636 | } |
| 637 | VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane()); |
| 638 | VPValue *Ext = Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx}); |
| 639 | NewOps.push_back(Elt: Ext); |
| 640 | } |
| 641 | |
| 642 | VPSingleDefRecipe *New; |
| 643 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: DefR)) { |
| 644 | // TODO: have cloning of replicate recipes also provide the desired result |
| 645 | // coupled with setting its operands to NewOps (deriving IsSingleScalar and |
| 646 | // Mask from the operands?) |
| 647 | New = VPBuilder::createSingleScalarOp( |
| 648 | Opcode: RepR->getOpcode(), Operands: NewOps, /*Mask=*/nullptr, Flags: *RepR, Metadata: *RepR, |
| 649 | DL: RepR->getDebugLoc(), UV: RepR->getUnderlyingInstr()); |
| 650 | } else { |
| 651 | New = DefR->clone(); |
| 652 | for (const auto &[Idx, Op] : enumerate(First&: NewOps)) { |
| 653 | New->setOperand(I: Idx, New: Op); |
| 654 | } |
| 655 | if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: New)) { |
| 656 | // Skip lane 0: an absent start index is implicitly zero. |
| 657 | unsigned KnownLane = Lane.getKnownLane(); |
| 658 | if (KnownLane != 0) |
| 659 | addLaneToStartIndex(Steps, Lane: KnownLane, Plan, InsertPt: DefR); |
| 660 | } |
| 661 | } |
| 662 | New->insertBefore(InsertPos: DefR); |
| 663 | return New; |
| 664 | } |
| 665 | |
| 666 | /// Convert recipes in region blocks to operate on a single lane 0. |
| 667 | /// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is |
| 668 | /// converted into BranchOnCond, PredInstPhi recipes are replaced by scalar phi |
| 669 | /// recipes with an additional poison operand, and extracts are created as |
| 670 | /// needed. |
| 671 | static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy, |
| 672 | VPBlockBase *Entry, |
| 673 | ElementCount VF) { |
| 674 | VPValue *Idx0 = Plan.getZero(Ty: IdxTy); |
| 675 | for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry)) { |
| 676 | for (VPRecipeBase &OldR : make_early_inc_range(Range&: cast<VPBasicBlock>(Val&: *VPB))) { |
| 677 | assert( |
| 678 | !isa<VPWidenPHIRecipe>(&OldR) && |
| 679 | !match(&OldR, |
| 680 | m_CombineOr( |
| 681 | m_InsertElement(m_VPValue(), m_VPValue(), m_VPValue()), |
| 682 | m_ExtractElement(m_VPValue(), m_VPValue()))) && |
| 683 | "must not contain wide phis, inserts or extracts before conversion" ); |
| 684 | |
| 685 | VPBuilder Builder(&OldR); |
| 686 | DebugLoc OldDL = OldR.getDebugLoc(); |
| 687 | // For scalar VF, operands are already scalar; no extraction needed. |
| 688 | if (!VF.isScalar()) { |
| 689 | for (const auto &[I, Op] : enumerate(First: OldR.operands())) { |
| 690 | // Skip operands that don't need extraction: values defined in the |
| 691 | // same block (already scalar), or values that are already single |
| 692 | // scalars. |
| 693 | // TODO: Support isSingleScalar for VPScalarIVStepsRecipe. |
| 694 | auto *DefR = Op->getDefiningRecipe(); |
| 695 | if ((isa_and_present<VPScalarIVStepsRecipe>(Val: DefR) && |
| 696 | DefR->getParent() == VPB) || |
| 697 | vputils::isSingleScalar(VPV: Op)) |
| 698 | continue; |
| 699 | |
| 700 | // Extract lane zero from values defined outside the region. |
| 701 | VPValue * = Builder.createNaryOp(Opcode: Instruction::ExtractElement, |
| 702 | Operands: {Op, Idx0}, DL: OldDL); |
| 703 | OldR.setOperand(I, New: Extract); |
| 704 | } |
| 705 | } |
| 706 | |
| 707 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &OldR)) { |
| 708 | auto *NewR = VPBuilder::createSingleScalarOp( |
| 709 | Opcode: RepR->getOpcode(), Operands: to_vector(Range: RepR->operands()), /*Mask=*/nullptr, |
| 710 | Flags: *RepR, Metadata: *RepR, DL: OldDL, UV: RepR->getUnderlyingInstr()); |
| 711 | NewR->insertBefore(InsertPos: RepR); |
| 712 | RepR->replaceAllUsesWith(New: NewR); |
| 713 | RepR->eraseFromParent(); |
| 714 | } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(Val: &OldR)) { |
| 715 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond, |
| 716 | Operands: {BranchOnMask->getOperand(N: 0)}, DL: OldDL); |
| 717 | BranchOnMask->eraseFromParent(); |
| 718 | } else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(Val: &OldR)) { |
| 719 | VPValue *PredOp = PredPhi->getOperand(N: 0); |
| 720 | Type *PredTy = PredOp->getScalarType(); |
| 721 | VPValue *Poison = Plan.getPoison(Ty: PredTy); |
| 722 | VPPhi *NewPhi = Builder.createScalarPhi(IncomingValues: {Poison, PredOp}, DL: OldDL); |
| 723 | PredPhi->replaceAllUsesWith(New: NewPhi); |
| 724 | PredPhi->eraseFromParent(); |
| 725 | } else { |
| 726 | // TODO: Support isSingleScalar for VPScalarIVStepsRecipe. |
| 727 | assert((isa<VPScalarIVStepsRecipe>(OldR) || |
| 728 | (isa<VPInstruction>(OldR) && |
| 729 | vputils::isSingleScalar(OldR.getVPSingleValue()))) && |
| 730 | "unexpected unhandled recipe" ); |
| 731 | } |
| 732 | } |
| 733 | } |
| 734 | } |
| 735 | |
| 736 | /// Update recipes in the cloned blocks rooted at \p NewEntry to match \p Lane, |
| 737 | /// using the original blocks rooted at \p OldEntry as reference. |
| 738 | static void processLaneForReplicateRegion(VPlan &Plan, Type *IdxTy, |
| 739 | unsigned Lane, VPBasicBlock *OldEntry, |
| 740 | VPBasicBlock *NewEntry) { |
| 741 | DenseMap<VPValue *, VPValue *> Old2NewVPValues; |
| 742 | VPValue *IdxLane = Plan.getConstantInt(Ty: IdxTy, Val: Lane); |
| 743 | for (const auto &[OldBB, NewBB] : |
| 744 | zip_equal(t: vp_depth_first_shallow(G: OldEntry), |
| 745 | u: vp_depth_first_shallow(G: NewEntry))) { |
| 746 | for (auto &&[OldR, NewR] : |
| 747 | zip_equal(t&: *cast<VPBasicBlock>(Val: OldBB), u&: *cast<VPBasicBlock>(Val: NewBB))) { |
| 748 | for (const auto &[OldV, NewV] : |
| 749 | zip_equal(t: OldR.definedValues(), u: NewR.definedValues())) |
| 750 | Old2NewVPValues[OldV] = NewV; |
| 751 | |
| 752 | // Remap operands to use lane-specific values. |
| 753 | for (const auto &[I, OldOp] : enumerate(First: NewR.operands())) { |
| 754 | // Use cloned value if operand was defined in the region. |
| 755 | if (auto *NewOp = Old2NewVPValues.lookup(Val: OldOp)) |
| 756 | NewR.setOperand(I, New: NewOp); |
| 757 | } |
| 758 | |
| 759 | if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &NewR)) { |
| 760 | addLaneToStartIndex(Steps, Lane, Plan, InsertPt: Steps); |
| 761 | } else if (match(V: &NewR, P: m_ExtractElement(Op0: m_VPValue(), Op1: m_VPValue()))) { |
| 762 | assert(match(NewR.getOperand(1), m_ZeroInt()) && |
| 763 | "extract indices must be zero" ); |
| 764 | NewR.setOperand(I: 1, New: IdxLane); |
| 765 | } else if (auto *NewPhi = dyn_cast<VPPhi>(Val: &NewR)) { |
| 766 | auto *OldPhi = cast<VPPhi>(Val: &OldR); |
| 767 | assert(vputils::onlyFirstLaneUsed(OldPhi) && |
| 768 | "VPPhis expected to have only first lane used" ); |
| 769 | auto *BVUser = dyn_cast_or_null<VPInstruction>(Val: OldPhi->getSingleUser()); |
| 770 | if (BVUser && match(V: BVUser, P: m_CombineOr(Ps: m_BuildVector(), |
| 771 | Ps: m_BuildStructVector()))) { |
| 772 | assert(BVUser->getOperand(0) == OldPhi && |
| 773 | "Unexpected first operand of build vector user" ); |
| 774 | BVUser->setOperand(I: Lane, New: NewPhi); |
| 775 | } |
| 776 | } |
| 777 | } |
| 778 | } |
| 779 | } |
| 780 | |
| 781 | /// Dissolve a single replicate region by replicating its blocks for each lane |
| 782 | /// of \p VF. The region is disconnected, its blocks are reparented, cloned for |
| 783 | /// each lane, and reconnected in sequence. |
| 784 | static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF, |
| 785 | VPlan &Plan, Type *IdxTy) { |
| 786 | auto *FirstLaneEntry = cast<VPBasicBlock>(Val: Region->getEntry()); |
| 787 | auto *FirstLaneExiting = cast<VPBasicBlock>(Val: Region->getExiting()); |
| 788 | |
| 789 | // Disconnect and dissolve the region. |
| 790 | VPBlockBase *Predecessor = Region->getSinglePredecessor(); |
| 791 | assert(Predecessor && "Replicate region must have a single predecessor" ); |
| 792 | auto *Successor = cast<VPBasicBlock>(Val: Region->getSingleSuccessor()); |
| 793 | VPBlockUtils::disconnectBlocks(From: Predecessor, To: Region); |
| 794 | VPBlockUtils::disconnectBlocks(From: Region, To: Successor); |
| 795 | |
| 796 | VPRegionBlock *ParentRegion = Region->getParent(); |
| 797 | for (VPBlockBase *VPB : vp_depth_first_shallow(G: FirstLaneEntry)) |
| 798 | VPB->setParent(ParentRegion); |
| 799 | |
| 800 | // Process the original blocks for lane 0: converting their recipes to |
| 801 | // single-scalar. |
| 802 | convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, Entry: FirstLaneEntry, VF); |
| 803 | |
| 804 | // For scalar VF, just wire the blocks and return; no cloning or packing |
| 805 | // needed. |
| 806 | if (VF.isScalar()) { |
| 807 | VPBlockUtils::connectBlocks(From: Predecessor, To: FirstLaneEntry); |
| 808 | VPBlockUtils::connectBlocks(From: FirstLaneExiting, To: Successor); |
| 809 | return; |
| 810 | } |
| 811 | |
| 812 | // Create a BuildVector or BuildStructVector in successor block for every |
| 813 | // VPPhi in (first lane's) exiting block having vector uses. All their |
| 814 | // operands are initialized to poison and will be replaced when processing |
| 815 | // each clone, except for the operand of the first lane which set here. |
| 816 | // BuildVectors are recorded to be replaced later by chains of insert-element |
| 817 | // and widen phi's. |
| 818 | unsigned NumLanes = VF.getFixedValue(); |
| 819 | SmallVector<VPInstruction *> BuildVectors; |
| 820 | for (auto &R : FirstLaneExiting->phis()) { |
| 821 | auto *Phi = cast<VPPhi>(Val: &R); |
| 822 | if (vputils::onlyFirstLaneUsed(Def: Phi)) |
| 823 | continue; |
| 824 | |
| 825 | Type *ScalarTy = Phi->getScalarType(); |
| 826 | bool IsStruct = isa<StructType>(Val: ScalarTy); |
| 827 | VPValue *Poison = Plan.getPoison(Ty: ScalarTy); |
| 828 | SmallVector<VPValue *> BVOps(NumLanes, Poison); |
| 829 | auto *BV = new VPInstruction(IsStruct ? VPInstruction::BuildStructVector |
| 830 | : VPInstruction::BuildVector, |
| 831 | BVOps); |
| 832 | if (!IsStruct) |
| 833 | BuildVectors.push_back(Elt: BV); |
| 834 | Phi->replaceAllUsesWith(New: BV); |
| 835 | BV->setOperand(I: 0, New: Phi); |
| 836 | BV->insertBefore(BB&: *Successor, IP: Successor->getFirstNonPhi()); |
| 837 | } |
| 838 | |
| 839 | // Clone converted blocks for remaining lanes and process each in reverse |
| 840 | // order, connecting each lane's Exiting block to the subsequent lane's entry. |
| 841 | VPBlockBase *NextLaneEntry = Successor; |
| 842 | for (int Lane = NumLanes - 1; Lane > 0; --Lane) { |
| 843 | const auto &[CurrentLaneEntry, CurrentLaneExiting] = |
| 844 | VPBlockUtils::cloneFrom(Entry: FirstLaneEntry); |
| 845 | for (VPBlockBase *VPB : vp_depth_first_shallow(G: CurrentLaneEntry)) |
| 846 | VPB->setParent(ParentRegion); |
| 847 | processLaneForReplicateRegion(Plan, IdxTy, Lane, |
| 848 | OldEntry: cast<VPBasicBlock>(Val: FirstLaneEntry), |
| 849 | NewEntry: cast<VPBasicBlock>(Val: CurrentLaneEntry)); |
| 850 | VPBlockUtils::connectBlocks(From: CurrentLaneExiting, To: NextLaneEntry); |
| 851 | NextLaneEntry = CurrentLaneEntry; |
| 852 | } |
| 853 | |
| 854 | // Connect Predecessor to FirstLaneEntry, and FirstLaneRegionExit to |
| 855 | // NextLaneEntry which is the second lane region entry. The latter is |
| 856 | // done last so that earlier clonings from FirstLaneEntry stop at |
| 857 | // FirstLaneExiting. |
| 858 | VPBlockUtils::connectBlocks(From: Predecessor, To: FirstLaneEntry); |
| 859 | VPBlockUtils::connectBlocks(From: FirstLaneExiting, To: NextLaneEntry); |
| 860 | |
| 861 | // Fold BuildVector fed by scalar phis into VPWidenPHIRecipes with |
| 862 | // InsertElement per lane. |
| 863 | // TODO: check if this folding should be dropped. |
| 864 | for (VPInstruction *BV : BuildVectors) { |
| 865 | assert(BV->getNumOperands() == NumLanes && |
| 866 | "BuildVector must have one operand per lane" ); |
| 867 | for (const auto &[Idx, Op] : enumerate(First: BV->operands())) { |
| 868 | auto *ScalarPhi = cast<VPPhi>(Val: Op); |
| 869 | auto DL = ScalarPhi->getDebugLoc(); |
| 870 | auto *PredOp = cast<VPSingleDefRecipe>(Val: ScalarPhi->getOperand(N: 1)); |
| 871 | VPValue *Poison = ScalarPhi->getOperand(N: 0); |
| 872 | VPValue *PrevVal = Idx == 0 ? Poison : BV->getOperand(N: Idx - 1); |
| 873 | auto Builder = VPBuilder::getToInsertAfter(R: PredOp->getDefiningRecipe()); |
| 874 | auto *Insert = Builder.createNaryOp( |
| 875 | Opcode: Instruction::InsertElement, |
| 876 | Operands: {PrevVal, PredOp, Plan.getConstantInt(BitWidth: 64, Val: Idx)}, DL); |
| 877 | Builder.setInsertPoint(ScalarPhi); |
| 878 | auto *NewPhi = Builder.createWidenPhi(IncomingValues: {PrevVal, Insert}, DL); |
| 879 | ScalarPhi->replaceAllUsesWith(New: NewPhi); |
| 880 | ScalarPhi->eraseFromParent(); |
| 881 | } |
| 882 | BV->replaceAllUsesWith(New: BV->getOperand(N: NumLanes - 1)); |
| 883 | BV->eraseFromParent(); |
| 884 | } |
| 885 | } |
| 886 | |
| 887 | /// Collect and dissolve all replicate regions in the vector loop, replicating |
| 888 | /// their blocks and recipes for each lane of \p VF. |
| 889 | static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF, |
| 890 | Type *IdxTy) { |
| 891 | // Collect all replicate regions before modifying the CFG. |
| 892 | SmallVector<VPRegionBlock *> ReplicateRegions; |
| 893 | for (VPRegionBlock *Region : VPBlockUtils::blocksOnly<VPRegionBlock>( |
| 894 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) { |
| 895 | if (Region->isReplicator()) |
| 896 | ReplicateRegions.push_back(Elt: Region); |
| 897 | } |
| 898 | |
| 899 | assert((ReplicateRegions.empty() || !VF.isScalable()) && |
| 900 | "cannot replicate across scalable VFs" ); |
| 901 | |
| 902 | // Dissolve replicate regions by replicating their blocks for each lane. |
| 903 | // Traversing regions in reverse ensures that the successor of every region |
| 904 | // being processed is a basic-block, rather than another region. |
| 905 | for (VPRegionBlock *Region : reverse(C&: ReplicateRegions)) |
| 906 | dissolveReplicateRegion(Region, VF, Plan, IdxTy); |
| 907 | |
| 908 | VPlanTransforms::mergeBlocksIntoPredecessors(Plan); |
| 909 | } |
| 910 | |
| 911 | void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { |
| 912 | Type *IdxTy = IntegerType::get( |
| 913 | C&: Plan.getScalarHeader()->getIRBasicBlock()->getContext(), NumBits: 32); |
| 914 | |
| 915 | if (Plan.hasScalarVFOnly()) { |
| 916 | // When Plan is only unrolled by UF, replicating by VF amounts to dissolving |
| 917 | // replicate regions. |
| 918 | replicateReplicateRegionsByVF(Plan, VF, IdxTy); |
| 919 | return; |
| 920 | } |
| 921 | |
| 922 | // Visit all VPBBs outside the loop region and directly inside the top-level |
| 923 | // loop region. |
| 924 | auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( |
| 925 | Range: vp_depth_first_shallow(G: Plan.getEntry())); |
| 926 | auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( |
| 927 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry())); |
| 928 | auto VPBBsToUnroll = |
| 929 | concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion); |
| 930 | // A mapping of current VPValue definitions to collections of new VPValues |
| 931 | // defined per lane. Serves to hook-up potential users of current VPValue |
| 932 | // definition that are replicated-per-VF later. |
| 933 | DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs; |
| 934 | // The removal of current recipes being replaced by new ones needs to be |
| 935 | // delayed after Def2LaneDefs is no longer in use. |
| 936 | SmallVector<VPRecipeBase *> ToRemove; |
| 937 | for (VPBasicBlock *VPBB : VPBBsToUnroll) { |
| 938 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { |
| 939 | if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: &R) || |
| 940 | (isa<VPReplicateRecipe>(Val: &R) && |
| 941 | cast<VPReplicateRecipe>(Val: &R)->isSingleScalar()) || |
| 942 | (isa<VPInstruction>(Val: &R) && |
| 943 | !cast<VPInstruction>(Val: &R)->doesGeneratePerAllLanes() && |
| 944 | cast<VPInstruction>(Val: &R)->getOpcode() != VPInstruction::Unpack)) |
| 945 | continue; |
| 946 | |
| 947 | auto *DefR = cast<VPSingleDefRecipe>(Val: &R); |
| 948 | VPBuilder Builder(DefR); |
| 949 | if (DefR->user_empty()) { |
| 950 | // Create single-scalar version of DefR for all lanes. |
| 951 | for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) |
| 952 | cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs); |
| 953 | DefR->eraseFromParent(); |
| 954 | continue; |
| 955 | } |
| 956 | /// Create single-scalar version of DefR for all lanes. |
| 957 | SmallVector<VPValue *> LaneDefs; |
| 958 | for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) |
| 959 | LaneDefs.push_back( |
| 960 | Elt: cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs)); |
| 961 | |
| 962 | Def2LaneDefs[DefR] = LaneDefs; |
| 963 | /// Users that only demand the first lane can use the definition for lane |
| 964 | /// 0. |
| 965 | DefR->replaceUsesWithIf(New: LaneDefs[0], ShouldReplace: [DefR](VPUser &U, unsigned) { |
| 966 | if (U.usesFirstLaneOnly(Op: DefR)) |
| 967 | return true; |
| 968 | auto *VPI = dyn_cast<VPInstructionWithType>(Val: &U); |
| 969 | return VPI && Instruction::isCast(Opcode: VPI->getOpcode()); |
| 970 | }); |
| 971 | |
| 972 | // Update each build vector user that currently has DefR as its only |
| 973 | // operand, to have all LaneDefs as its operands. |
| 974 | for (VPUser *U : to_vector(Range: DefR->users())) { |
| 975 | auto *VPI = dyn_cast<VPInstruction>(Val: U); |
| 976 | if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector && |
| 977 | VPI->getOpcode() != VPInstruction::BuildStructVector)) |
| 978 | continue; |
| 979 | assert(VPI->getNumOperands() == 1 && |
| 980 | "Build(Struct)Vector must have a single operand before " |
| 981 | "replicating by VF" ); |
| 982 | VPI->setOperand(I: 0, New: LaneDefs[0]); |
| 983 | for (VPValue *LaneDef : drop_begin(RangeOrContainer&: LaneDefs)) |
| 984 | VPI->addOperand(Op: LaneDef); |
| 985 | } |
| 986 | ToRemove.push_back(Elt: DefR); |
| 987 | } |
| 988 | } |
| 989 | for (auto *R : reverse(C&: ToRemove)) |
| 990 | R->eraseFromParent(); |
| 991 | |
| 992 | replicateReplicateRegionsByVF(Plan, VF, IdxTy); |
| 993 | } |
| 994 | |