//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

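  /// Return a live-in VPValue wrapping a constant with value \p Part, using
  /// the scalar type of the canonical IV. Used to tag recipes with the part
  /// they generate code for.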
  VPValue *getConstantVPV(unsigned Part) {
    Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType();
    return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
  }

public:
  UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
      : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {}

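  /// Unroll \p VPB by UF: replicate regions are cloned as a whole, while the
  /// recipes of plain basic blocks (and of blocks nested in non-replicate
  /// regions) are unrolled individually.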
  void unrollBlock(VPBlockBase *VPB);

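  /// Return the unrolled instance of \p V for \p Part. Live-ins and part 0
  /// return \p V itself; other parts must have been recorded earlier via
  /// addRecipeForPart or addUniformForAllParts.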
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || V->isLiveIn())
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      auto Ins = VPV2Parts.insert({VPV, {}});
      assert(Ins.first->second.size() == Part - 1 && "earlier parts not set");
      Ins.first->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    auto Ins = VPV2Parts.insert({R, {}});
    assert(Ins.second && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      Ins.first->second.push_back(R);
  }

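  /// Return true if unrolled parts have already been recorded for \p VPV.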
  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part \p
  /// Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
          ScalarIVSteps->addOperand(getConstantVPV(Part));
        }

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

void UnrollState::unrollWidenInductionByUF(
    VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
  VPInstruction *VectorStep = Builder.createNaryOp(
      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, IVTy, Flags,
      IV->getDebugLoc());

  ToSkip.insert(VectorStep);

  // Now create recipes to compute the induction steps for parts 1 .. UF - 1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
  // the previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenIntOrFpInductionRecipe::execute:
  //   %Part.0 = VPWidenIntOrFpInductionRecipe %Start, %ScalarStep,
  //             %VectorStep, %Part.3
  //   %Part.1 = %Part.0 + %VectorStep
  //   %Part.2 = %Part.1 + %VectorStep
  //   %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc =
      IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add;
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add = Builder.createNaryOp(AddOpc,
                                              {
                                                  Prev,
                                                  VectorStep,
                                              },
                                              Flags, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (isa<VPWidenPointerInductionRecipe>(R)) {
      Copy->addOperand(R);
      Copy->addOperand(getConstantVPV(Part));
    } else if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
      if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(VPI->getOperand(2), m_SpecificInt(1))) {
          StartV = VPI->getOperand(1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(0, C->getOperand(1));
          C->insertAfter(VPI);
          StartV = C;
        }
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
      Copy->addOperand(getConstantVPV(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
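  // Latch branches (BranchOnCond/BranchOnCount) control the single vector
  // loop shared by all parts and are therefore not unrolled.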
  if (match(&R, m_BranchOnCond(m_VPValue())) ||
      match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (auto *II = dyn_cast<IntrinsicInst>(RepR->getUnderlyingValue())) {
      if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) {
        addUniformForAllParts(RepR);
        return;
      }
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

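    // For parts > 0, a FirstOrderRecurrenceSplice combines the previous and
    // current parts of the recurrence value instead of using the header phi.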
    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi && Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe,
            VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
        match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
                  m_VPValue())))
      Copy->addOperand(getConstantVPV(Part));

    if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add all VPValues for all parts to AnyOf, FirstActiveLane and
    // Compute*Result, which combine all parts to compute the final value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::FirstActiveLane>(
                      m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeAnyOfResult>(
                      m_VPValue(), m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeReductionResult>(
                      m_VPValue(), m_VPValue(Op1))) ||
        match(&R, m_VPInstruction<VPInstruction::ComputeFindIVResult>(
                      m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_VPInstruction<VPInstruction::ExtractLastElement>(
                      m_VPValue(Op0))) ||
        match(&R, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
                      m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      if (Plan.hasScalarVFOnly()) {
        auto *I = cast<VPInstruction>(&R);
        // Extracting from the end with VF = 1 implies retrieving the last or
        // penultimate scalar part (UF-1 or UF-2).
        unsigned Offset =
            I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
        I->replaceAllUsesWith(getValueForPart(Op0, UF - Offset));
        R.eraseFromParent();
      } else {
        // Otherwise we extract from the last part.
        remapOperands(&R, UF - 1);
      }
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
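  // The cleanup below also runs when UF == 1: CanonicalIVIncrementForPart
  // recipes that never received an extra part operand compute part 0, equal
  // to their single operand, and can be folded away.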
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF, Ctx);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue()) ||
        isa<VPWidenPointerInductionRecipe>(&H)) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p RepR for lane \p Lane.
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
                                       Type *IdxTy, VPReplicateRecipe *RepR,
                                       VPLane Lane) {
  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : RepR->operands()) {
    if (vputils::isSingleScalar(Op)) {
      NewOps.push_back(Op);
      continue;
    }
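    // The last lane of a scalable vector has no compile-time index; extract
    // the last element explicitly instead of using a constant lane index.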
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      NewOps.push_back(
          Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
      continue;
    }
    // Look through buildvector to avoid unnecessary extracts.
    if (match(Op, m_BuildVector())) {
      NewOps.push_back(
          cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx =
        Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
    VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
    NewOps.push_back(Ext);
  }

  auto *New =
      new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                            /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR);
  New->insertBefore(RepR);
  return New;
}

void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->isSingleScalar())
        continue;

      VPBuilder Builder(RepR);
      if (RepR->getNumUsers() == 0) {
        if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
            vputils::isSingleScalar(RepR->getOperand(1))) {
          // Stores to invariant addresses need to store the last lane only.
          cloneForLane(Plan, Builder, IdxTy, RepR,
                       VPLane::getLastLaneForVF(VF));
        } else {
          // Create single-scalar versions of RepR for all lanes.
          for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
            cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
        }
        RepR->eraseFromParent();
        continue;
      }
      // Create single-scalar versions of RepR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));

      // Users that only demand the first lane can use the definition for lane
      // 0.
      RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
        return U.onlyFirstLaneUsed(RepR);
      });

      // If needed, create a Build(Struct)Vector recipe to insert the scalar
      // lane values into a vector.
      Type *ResTy = RepR->getUnderlyingInstr()->getType();
      VPValue *VecRes = Builder.createNaryOp(
          ResTy->isStructTy() ? VPInstruction::BuildStructVector
                              : VPInstruction::BuildVector,
          LaneDefs);
      RepR->replaceAllUsesWith(VecRes);
      RepR->eraseFromParent();
    }
  }
}
| 549 | |