//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Add a start index operand to \p Steps for \p Part.
  void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
                                   unsigned Part);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

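  /// Return a constant of the canonical IV's integer type holding \p Part.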
  VPValue *getConstantInt(unsigned Part) {
    Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
    return Plan.getConstantInt(CanIVIntTy, Part);
  }

public:
  UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}

  void unrollBlock(VPBlockBase *VPB);

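  /// Return the VPValue to use for \p V in part \p Part. Part 0 and values
  /// that are not unrolled per part (IR values and symbolic values) are
  /// returned unchanged.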
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(V))
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      const auto &[V, _] = VPV2Parts.try_emplace(VPV);
      assert(V->second.size() == Part - 1 && "earlier parts not set");
      V->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
    assert(Inserted && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      V->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
                                              unsigned Part) {
  if (Part == 0)
    return;

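  // The scalar steps of part P start at index VF * P (just VF for part 1).
  // Compute that start index, convert it to the step type (and to the base IV
  // type for floating-point inductions), and add it as an extra operand to
  // the recipe.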
  VPBuilder Builder(Steps);
  Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  VPValue *StartIndex = Steps->getVFValue();
  if (Part > 1) {
    StartIndex = Builder.createOverflowingOp(
        Instruction::Mul,
        {StartIndex,
         Plan.getConstantInt(TypeInfo.inferScalarType(StartIndex), Part)});
  }
  StartIndex = Builder.createScalarSExtOrTrunc(
      StartIndex, IntStepTy, TypeInfo.inferScalarType(StartIndex),
      Steps->getDebugLoc());

  if (BaseIVTy->isFloatingPointTy())
    StartIndex = Builder.createScalarCast(Instruction::SIToFP, StartIndex,
                                          BaseIVTy, Steps->getDebugLoc());

  Steps->addOperand(StartIndex);
}

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
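  // Clone the whole region once per remaining part, inserting each copy just
  // before the region's single successor and remapping the operands of its
  // recipes to the values of the corresponding part.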
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR))
          addStartIndexForScalarSteps(Steps, Part);

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

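  // Create a single wide step value in the preheader, scaling the scalar step
  // by VF; it is reused below to advance the induction from one part to the
  // next.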
  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
  Type *VectorStepTy =
      IVTy->isPointerTy() ? TypeInfo.inferScalarType(ScalarStep) : IVTy;
  VPInstruction *VectorStep = Builder.createNaryOp(
      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, VectorStepTy,
      Flags, IV->getDebugLoc());

  ToSkip.insert(VectorStep);

  // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
  // remains the header phi. Parts > 0 are computed by adding Step to the
  // previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenInductionRecipe::execute.
  // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
  // %Part.1 = %Part.0 + %VectorStep
  // %Part.2 = %Part.1 + %VectorStep
  // %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc;
  VPIRFlags AddFlags;
  if (IVTy->isPointerTy()) {
    AddOpc = VPInstruction::WidePtrAdd;
    AddFlags = GEPNoWrapFlags::none();
  } else if (IVTy->isFloatingPointTy()) {
    AddOpc = ID.getInductionOpcode();
    AddFlags = Flags; // FMF flags.
  } else {
    AddOpc = Instruction::Add;
    AddFlags = VPIRFlags::getDefaultFlags(AddOpc);
  }
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add =
        Builder.createNaryOp(AddOpc,
                             {
                                 Prev,
                                 VectorStep,
                             },
                             AddFlags, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
      if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(VPI->getOperand(2), m_One())) {
          StartV = VPI->getOperand(1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(0, C->getOperand(1));
          C->insertAfter(VPI);
          StartV = C;
        }
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
      Copy->addOperand(getConstantInt(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())))
    return;

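  // If only the first part of a VPInstruction's result is used, there is no
  // need to clone it; the part-0 value can serve all parts.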
  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (match(RepR,
              m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
      addUniformForAllParts(RepR);
      return;
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

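    // A FirstOrderRecurrenceSplice for part P splices the recurrence value of
    // part P-1 with the value of part P.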
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
      VPBuilder Builder(VPR);
      const DataLayout &DL =
          Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
      Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
      Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
      VPValue *VF = Builder.createScalarZExtOrTrunc(
          &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
      // VFxUF does not wrap, so VF * Part also cannot wrap.
      VPValue *VFxPart = Builder.createOverflowingOp(
          Instruction::Mul, {VF, Plan.getConstantInt(IndexTy, Part)},
          {true, true});
      Copy->setOperand(0, VPR->getOperand(0));
      Copy->addOperand(VFxPart);
      continue;
    }
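    // For ordered reductions the parts form a chain: record this part's
    // result and keep the reduction phi's backedge operand pointing at the
    // latest part.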
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi && Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Copy))
      addStartIndexForScalarSteps(ScalarIVSteps, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPWidenCanonicalIVRecipe, VPVectorEndPointerRecipe>(Copy) ||
        match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
      Copy->addOperand(getConstantInt(Part));

    if (isa<VPVectorEndPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to AnyOf, First/LastActiveLane and
    // Compute*Result, which combine all parts to compute the final value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
        match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
        match(&R, m_LastActiveLane(m_VPValue(Op1))) ||
        match(&R,
              m_ComputeAnyOfResult(m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_ComputeReductionResult(m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }

    if (Plan.hasScalarVFOnly()) {
      if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
          match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
        auto *I = cast<VPInstruction>(&R);
        bool IsPenultimatePart =
            I->getOpcode() == VPInstruction::ExtractPenultimateElement;
        unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
        // For scalar VFs, directly use the scalar value of the requested part.
        I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
        continue;
      }
    }
    // For vector VFs, the last and penultimate elements are always extracted
    // from the last part.
    if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
        match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      R.setOperand(0, getValueForPart(Op0, UF - 1));
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  llvm::scope_exit Cleanup([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1)
    return;

  UnrollState Unroller(Plan, UF);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of Fixed Order Recurrence phi's, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue())) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \p DefR.
static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
             VPSingleDefRecipe *DefR, VPLane Lane,
             const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
  VPValue *Op;
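  // An Unpack of a wide value extracts the requested lane, reusing an
  // already-created per-lane definition from Def2LaneDefs if one exists.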
  if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end())
      return LaneDefs->second[Lane.getKnownLane()];

    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
    return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
  }

  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : DefR->operands()) {
    // If Op is a definition that has been unrolled, directly use the clone for
    // the corresponding lane.
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end()) {
      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
      continue;
    }
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      // Look through mandatory Unpack.
      [[maybe_unused]] bool Matched =
          match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
      assert(Matched && "original op must have been Unpack");
      auto *ExtractPart =
          Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
      NewOps.push_back(
          Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
      continue;
    }
    if (vputils::isSingleScalar(Op)) {
      NewOps.push_back(Op);
      continue;
    }

    // Look through buildvector to avoid unnecessary extracts.
    if (match(Op, m_BuildVector())) {
      NewOps.push_back(
          cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
    VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
    NewOps.push_back(Ext);
  }

  VPSingleDefRecipe *New;
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
    // TODO: have cloning of replicate recipes also provide the desired result
    // coupled with setting its operands to NewOps (deriving IsSingleScalar and
    // Mask from the operands?)
    New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                                /*IsSingleScalar=*/true, /*Mask=*/nullptr,
                                *RepR, *RepR, RepR->getDebugLoc());
  } else {
    assert(isa<VPInstruction>(DefR) &&
           "DefR must be a VPReplicateRecipe or VPInstruction");
    New = DefR->clone();
    for (const auto &[Idx, Op] : enumerate(NewOps)) {
      New->setOperand(Idx, Op);
    }
  }
  New->insertBefore(DefR);
  return New;
}

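// For each replicating recipe and per-lane VPInstruction, create VF
// single-scalar clones (one per lane) and rewire users to the per-lane
// definitions.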
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
  // A mapping of current VPValue definitions to collections of new VPValues
  // defined per lane. Serves to hook-up potential users of current VPValue
  // definitions that are replicated-per-VF later.
  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
  // The removal of current recipes being replaced by new ones needs to be
  // delayed until Def2LaneDefs is no longer in use.
  SmallVector<VPRecipeBase *> ToRemove;
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
          (isa<VPReplicateRecipe>(&R) &&
           cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
          (isa<VPInstruction>(&R) &&
           !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
           cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
        continue;

      auto *DefR = cast<VPSingleDefRecipe>(&R);
      VPBuilder Builder(DefR);
      if (DefR->getNumUsers() == 0) {
        // Create single-scalar versions of DefR for all lanes.
        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
          cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
        DefR->eraseFromParent();
        continue;
      }
      // Create single-scalar versions of DefR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(
            cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));

      Def2LaneDefs[DefR] = LaneDefs;
      // Users that only demand the first lane can use the definition for lane
      // 0.
      DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
        return U.usesFirstLaneOnly(DefR);
      });

      // Update each build vector user that currently has DefR as its only
      // operand, to have all LaneDefs as its operands.
      for (VPUser *U : to_vector(DefR->users())) {
        auto *VPI = dyn_cast<VPInstruction>(U);
        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
                     VPI->getOpcode() != VPInstruction::BuildStructVector))
          continue;
        assert(VPI->getNumOperands() == 1 &&
               "Build(Struct)Vector must have a single operand before "
               "replicating by VF");
        VPI->setOperand(0, LaneDefs[0]);
        for (VPValue *LaneDef : drop_begin(LaneDefs))
          VPI->addOperand(LaneDef);
      }
      ToRemove.push_back(DefR);
    }
  }
  for (auto *R : reverse(ToRemove))
    R->eraseFromParent();
}