| 1 | //===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===// | 
|---|
| 2 | // | 
|---|
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | 
|---|
| 4 | // See https://llvm.org/LICENSE.txt for license information. | 
|---|
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|---|
| 6 | // | 
|---|
| 7 | //===----------------------------------------------------------------------===// | 
|---|
| 8 | /// | 
|---|
| 9 | /// \file | 
|---|
| 10 | /// This file implements explicit unrolling for VPlans. | 
|---|
| 11 | /// | 
|---|
| 12 | //===----------------------------------------------------------------------===// | 
|---|
| 13 |  | 
|---|
| 14 | #include "VPRecipeBuilder.h" | 
|---|
| 15 | #include "VPlan.h" | 
|---|
| 16 | #include "VPlanAnalysis.h" | 
|---|
| 17 | #include "VPlanCFG.h" | 
|---|
| 18 | #include "VPlanHelpers.h" | 
|---|
| 19 | #include "VPlanPatternMatch.h" | 
|---|
| 20 | #include "VPlanTransforms.h" | 
|---|
| 21 | #include "VPlanUtils.h" | 
|---|
| 22 | #include "llvm/ADT/PostOrderIterator.h" | 
|---|
| 23 | #include "llvm/ADT/STLExtras.h" | 
|---|
| 24 | #include "llvm/ADT/ScopeExit.h" | 
|---|
| 25 | #include "llvm/Analysis/IVDescriptors.h" | 
|---|
| 26 | #include "llvm/IR/Intrinsics.h" | 
|---|
| 27 |  | 
|---|
| 28 | using namespace llvm; | 
|---|
| 29 | using namespace llvm::VPlanPatternMatch; | 
|---|
| 30 |  | 
|---|
| 31 | namespace { | 
|---|
| 32 |  | 
|---|
| 33 | /// Helper to hold state needed for unrolling. It holds the Plan to unroll by | 
|---|
| 34 | /// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate | 
|---|
| 35 | /// the unrolling transformation, where the original VPValues are retained for | 
|---|
| 36 | /// part zero. | 
|---|
| 37 | class UnrollState { | 
|---|
| 38 | /// Plan to unroll. | 
|---|
| 39 | VPlan &Plan; | 
|---|
| 40 | /// Unroll factor to unroll by. | 
|---|
| 41 | const unsigned UF; | 
|---|
| 42 | /// Analysis for types. | 
|---|
| 43 | VPTypeAnalysis TypeInfo; | 
|---|
| 44 |  | 
|---|
| 45 | /// Unrolling may create recipes that should not be unrolled themselves. | 
|---|
| 46 | /// Those are tracked in ToSkip. | 
|---|
| 47 | SmallPtrSet<VPRecipeBase *, 8> ToSkip; | 
|---|
| 48 |  | 
|---|
| 49 | // Associate with each VPValue of part 0 its unrolled instances of parts 1, | 
|---|
| 50 | // ..., UF-1. | 
|---|
| 51 | DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts; | 
|---|
| 52 |  | 
|---|
| 53 | /// Unroll replicate region \p VPR by cloning the region UF - 1 times. | 
|---|
| 54 | void unrollReplicateRegionByUF(VPRegionBlock *VPR); | 
|---|
| 55 |  | 
|---|
| 56 | /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across | 
|---|
| 57 | /// all parts. | 
|---|
| 58 | void unrollRecipeByUF(VPRecipeBase &R); | 
|---|
| 59 |  | 
|---|
| 60 | /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled | 
|---|
| 61 | /// depends on the concrete header phi. Inserts newly created recipes at \p | 
|---|
| 62 | /// InsertPtForPhi. | 
|---|
| 63 | void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R, | 
|---|
| 64 | VPBasicBlock::iterator InsertPtForPhi); | 
|---|
| 65 |  | 
|---|
| 66 | /// Unroll a widen induction recipe \p IV. This introduces recipes to compute | 
|---|
| 67 | /// the induction steps for each part. | 
|---|
| 68 | void unrollWidenInductionByUF(VPWidenIntOrFpInductionRecipe *IV, | 
|---|
| 69 | VPBasicBlock::iterator InsertPtForPhi); | 
|---|
| 70 |  | 
|---|
| 71 | VPValue *getConstantVPV(unsigned Part) { | 
|---|
| 72 | Type *CanIVIntTy = Plan.getCanonicalIV()->getScalarType(); | 
|---|
| 73 | return Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: CanIVIntTy, V: Part)); | 
|---|
| 74 | } | 
|---|
| 75 |  | 
|---|
| 76 | public: | 
|---|
| 77 | UnrollState(VPlan &Plan, unsigned UF, LLVMContext &Ctx) | 
|---|
| 78 | : Plan(Plan), UF(UF), TypeInfo(Plan.getCanonicalIV()->getScalarType()) {} | 
|---|
| 79 |  | 
|---|
| 80 | void unrollBlock(VPBlockBase *VPB); | 
|---|
| 81 |  | 
|---|
| 82 | VPValue *getValueForPart(VPValue *V, unsigned Part) { | 
|---|
| 83 | if (Part == 0 || V->isLiveIn()) | 
|---|
| 84 | return V; | 
|---|
| 85 | assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) && | 
|---|
| 86 | "accessed value does not exist"); | 
|---|
| 87 | return VPV2Parts[V][Part - 1]; | 
|---|
| 88 | } | 
|---|
| 89 |  | 
|---|
| 90 | /// Given a single original recipe \p OrigR (of part zero), and its copy \p | 
|---|
| 91 | /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its | 
|---|
| 92 | /// corresponding VPValue defined by \p CopyR. | 
|---|
| 93 | void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR, | 
|---|
| 94 | unsigned Part) { | 
|---|
| 95 | for (const auto &[Idx, VPV] : enumerate(First: OrigR->definedValues())) { | 
|---|
| 96 | auto Ins = VPV2Parts.insert(KV: {VPV, {}}); | 
|---|
| 97 | assert(Ins.first->second.size() == Part - 1 && "earlier parts not set"); | 
|---|
| 98 | Ins.first->second.push_back(Elt: CopyR->getVPValue(I: Idx)); | 
|---|
| 99 | } | 
|---|
| 100 | } | 
|---|
| 101 |  | 
|---|
| 102 | /// Given a uniform recipe \p R, add it for all parts. | 
|---|
| 103 | void addUniformForAllParts(VPSingleDefRecipe *R) { | 
|---|
| 104 | auto Ins = VPV2Parts.insert(KV: {R, {}}); | 
|---|
| 105 | assert(Ins.second && "uniform value already added"); | 
|---|
| 106 | for (unsigned Part = 0; Part != UF; ++Part) | 
|---|
| 107 | Ins.first->second.push_back(Elt: R); | 
|---|
| 108 | } | 
|---|
| 109 |  | 
|---|
| 110 | bool contains(VPValue *VPV) const { return VPV2Parts.contains(Val: VPV); } | 
|---|
| 111 |  | 
|---|
| 112 | /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part | 
|---|
| 113 | /// \p P. | 
|---|
| 114 | void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) { | 
|---|
| 115 | auto *Op = R->getOperand(N: OpIdx); | 
|---|
| 116 | R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part)); | 
|---|
| 117 | } | 
|---|
| 118 |  | 
|---|
| 119 | /// Update \p R's operands with their corresponding VPValues for part \p P. | 
|---|
| 120 | void remapOperands(VPRecipeBase *R, unsigned Part) { | 
|---|
| 121 | for (const auto &[OpIdx, Op] : enumerate(First: R->operands())) | 
|---|
| 122 | R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part)); | 
|---|
| 123 | } | 
|---|
| 124 | }; | 
|---|
| 125 | } // namespace | 
|---|
| 126 |  | 
|---|
| 127 | void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) { | 
|---|
| 128 | VPBlockBase *InsertPt = VPR->getSingleSuccessor(); | 
|---|
| 129 | for (unsigned Part = 1; Part != UF; ++Part) { | 
|---|
| 130 | auto *Copy = VPR->clone(); | 
|---|
| 131 | VPBlockUtils::insertBlockBefore(NewBlock: Copy, BlockPtr: InsertPt); | 
|---|
| 132 |  | 
|---|
| 133 | auto PartI = vp_depth_first_shallow(G: Copy->getEntry()); | 
|---|
| 134 | auto Part0 = vp_depth_first_shallow(G: VPR->getEntry()); | 
|---|
| 135 | for (const auto &[PartIVPBB, Part0VPBB] : | 
|---|
| 136 | zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: PartI), | 
|---|
| 137 | u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Part0))) { | 
|---|
| 138 | for (const auto &[PartIR, Part0R] : zip(t&: *PartIVPBB, u&: *Part0VPBB)) { | 
|---|
| 139 | remapOperands(R: &PartIR, Part); | 
|---|
| 140 | if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: &PartIR)) { | 
|---|
| 141 | ScalarIVSteps->addOperand(Operand: getConstantVPV(Part)); | 
|---|
| 142 | } | 
|---|
| 143 |  | 
|---|
| 144 | addRecipeForPart(OrigR: &Part0R, CopyR: &PartIR, Part); | 
|---|
| 145 | } | 
|---|
| 146 | } | 
|---|
| 147 | } | 
|---|
| 148 | } | 
|---|
| 149 |  | 
|---|
| 150 | void UnrollState::unrollWidenInductionByUF( | 
|---|
| 151 | VPWidenIntOrFpInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) { | 
|---|
| 152 | VPBasicBlock *PH = cast<VPBasicBlock>( | 
|---|
| 153 | Val: IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor()); | 
|---|
| 154 | Type *IVTy = TypeInfo.inferScalarType(V: IV); | 
|---|
| 155 | auto &ID = IV->getInductionDescriptor(); | 
|---|
| 156 | VPIRFlags Flags; | 
|---|
| 157 | if (isa_and_present<FPMathOperator>(Val: ID.getInductionBinOp())) | 
|---|
| 158 | Flags = ID.getInductionBinOp()->getFastMathFlags(); | 
|---|
| 159 |  | 
|---|
| 160 | VPValue *ScalarStep = IV->getStepValue(); | 
|---|
| 161 | VPBuilder Builder(PH); | 
|---|
| 162 | VPInstruction *VectorStep = Builder.createNaryOp( | 
|---|
| 163 | Opcode: VPInstruction::WideIVStep, Operands: {&Plan.getVF(), ScalarStep}, ResultTy: IVTy, Flags, | 
|---|
| 164 | DL: IV->getDebugLoc()); | 
|---|
| 165 |  | 
|---|
| 166 | ToSkip.insert(Ptr: VectorStep); | 
|---|
| 167 |  | 
|---|
| 168 | // Now create recipes to compute the induction steps for part 1 .. UF. Part 0 | 
|---|
| 169 | // remains the header phi. Parts > 0 are computed by adding Step to the | 
|---|
| 170 | // previous part. The header phi recipe will get 2 new operands: the step | 
|---|
| 171 | // value for a single part and the last part, used to compute the backedge | 
|---|
| 172 | // value during VPWidenIntOrFpInductionRecipe::execute. %Part.0 = | 
|---|
| 173 | // VPWidenIntOrFpInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3 | 
|---|
| 174 | // %Part.1 = %Part.0 + %VectorStep | 
|---|
| 175 | // %Part.2 = %Part.1 + %VectorStep | 
|---|
| 176 | // %Part.3 = %Part.2 + %VectorStep | 
|---|
| 177 | // | 
|---|
| 178 | // The newly added recipes are added to ToSkip to avoid interleaving them | 
|---|
| 179 | // again. | 
|---|
| 180 | VPValue *Prev = IV; | 
|---|
| 181 | Builder.setInsertPoint(TheBB: IV->getParent(), IP: InsertPtForPhi); | 
|---|
| 182 | unsigned AddOpc = | 
|---|
| 183 | IVTy->isFloatingPointTy() ? ID.getInductionOpcode() : Instruction::Add; | 
|---|
| 184 | for (unsigned Part = 1; Part != UF; ++Part) { | 
|---|
| 185 | std::string Name = | 
|---|
| 186 | Part > 1 ? "step.add."+ std::to_string(val: Part) : "step.add"; | 
|---|
| 187 |  | 
|---|
| 188 | VPInstruction *Add = Builder.createNaryOp(Opcode: AddOpc, | 
|---|
| 189 | Operands: { | 
|---|
| 190 | Prev, | 
|---|
| 191 | VectorStep, | 
|---|
| 192 | }, | 
|---|
| 193 | Flags, DL: IV->getDebugLoc(), Name); | 
|---|
| 194 | ToSkip.insert(Ptr: Add); | 
|---|
| 195 | addRecipeForPart(OrigR: IV, CopyR: Add, Part); | 
|---|
| 196 | Prev = Add; | 
|---|
| 197 | } | 
|---|
| 198 | IV->addOperand(Operand: VectorStep); | 
|---|
| 199 | IV->addOperand(Operand: Prev); | 
|---|
| 200 | } | 
|---|
| 201 |  | 
|---|
| 202 | void UnrollState::(VPHeaderPHIRecipe *R, | 
|---|
| 203 | VPBasicBlock::iterator InsertPtForPhi) { | 
|---|
| 204 | // First-order recurrences pass a single vector or scalar through their header | 
|---|
| 205 | // phis, irrespective of interleaving. | 
|---|
| 206 | if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: R)) | 
|---|
| 207 | return; | 
|---|
| 208 |  | 
|---|
| 209 | // Generate step vectors for each unrolled part. | 
|---|
| 210 | if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: R)) { | 
|---|
| 211 | unrollWidenInductionByUF(IV, InsertPtForPhi); | 
|---|
| 212 | return; | 
|---|
| 213 | } | 
|---|
| 214 |  | 
|---|
| 215 | auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: R); | 
|---|
| 216 | if (RdxPhi && RdxPhi->isOrdered()) | 
|---|
| 217 | return; | 
|---|
| 218 |  | 
|---|
| 219 | auto InsertPt = std::next(x: R->getIterator()); | 
|---|
| 220 | for (unsigned Part = 1; Part != UF; ++Part) { | 
|---|
| 221 | VPRecipeBase *Copy = R->clone(); | 
|---|
| 222 | Copy->insertBefore(BB&: *R->getParent(), IP: InsertPt); | 
|---|
| 223 | addRecipeForPart(OrigR: R, CopyR: Copy, Part); | 
|---|
| 224 | if (isa<VPWidenPointerInductionRecipe>(Val: R)) { | 
|---|
| 225 | Copy->addOperand(Operand: R); | 
|---|
| 226 | Copy->addOperand(Operand: getConstantVPV(Part)); | 
|---|
| 227 | } else if (RdxPhi) { | 
|---|
| 228 | // If the start value is a ReductionStartVector, use the identity value | 
|---|
| 229 | // (second operand) for unrolled parts. If the scaling factor is > 1, | 
|---|
| 230 | // create a new ReductionStartVector with the scale factor and both | 
|---|
| 231 | // operands set to the identity value. | 
|---|
| 232 | if (auto *VPI = dyn_cast<VPInstruction>(Val: RdxPhi->getStartValue())) { | 
|---|
| 233 | assert(VPI->getOpcode() == VPInstruction::ReductionStartVector && | 
|---|
| 234 | "unexpected start VPInstruction"); | 
|---|
| 235 | if (Part != 1) | 
|---|
| 236 | continue; | 
|---|
| 237 | VPValue *StartV; | 
|---|
| 238 | if (match(V: VPI->getOperand(N: 2), P: m_SpecificInt(V: 1))) { | 
|---|
| 239 | StartV = VPI->getOperand(N: 1); | 
|---|
| 240 | } else { | 
|---|
| 241 | auto *C = VPI->clone(); | 
|---|
| 242 | C->setOperand(I: 0, New: C->getOperand(N: 1)); | 
|---|
| 243 | C->insertAfter(InsertPos: VPI); | 
|---|
| 244 | StartV = C; | 
|---|
| 245 | } | 
|---|
| 246 | for (unsigned Part = 1; Part != UF; ++Part) | 
|---|
| 247 | VPV2Parts[VPI][Part - 1] = StartV; | 
|---|
| 248 | } | 
|---|
| 249 | Copy->addOperand(Operand: getConstantVPV(Part)); | 
|---|
| 250 | } else { | 
|---|
| 251 | assert(isa<VPActiveLaneMaskPHIRecipe>(R) && | 
|---|
| 252 | "unexpected header phi recipe not needing unrolled part"); | 
|---|
| 253 | } | 
|---|
| 254 | } | 
|---|
| 255 | } | 
|---|
| 256 |  | 
|---|
| 257 | /// Handle non-header-phi recipes. | 
|---|
| 258 | void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { | 
|---|
| 259 | if (match(V: &R, P: m_BranchOnCond(Op0: m_VPValue())) || | 
|---|
| 260 | match(V: &R, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue()))) | 
|---|
| 261 | return; | 
|---|
| 262 |  | 
|---|
| 263 | if (auto *VPI = dyn_cast<VPInstruction>(Val: &R)) { | 
|---|
| 264 | if (vputils::onlyFirstPartUsed(Def: VPI)) { | 
|---|
| 265 | addUniformForAllParts(R: VPI); | 
|---|
| 266 | return; | 
|---|
| 267 | } | 
|---|
| 268 | } | 
|---|
| 269 | if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) { | 
|---|
| 270 | if (isa<StoreInst>(Val: RepR->getUnderlyingValue()) && | 
|---|
| 271 | RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions()) { | 
|---|
| 272 | // Stores to an invariant address only need to store the last part. | 
|---|
| 273 | remapOperands(R: &R, Part: UF - 1); | 
|---|
| 274 | return; | 
|---|
| 275 | } | 
|---|
| 276 | if (auto *II = dyn_cast<IntrinsicInst>(Val: RepR->getUnderlyingValue())) { | 
|---|
| 277 | if (II->getIntrinsicID() == Intrinsic::experimental_noalias_scope_decl) { | 
|---|
| 278 | addUniformForAllParts(R: RepR); | 
|---|
| 279 | return; | 
|---|
| 280 | } | 
|---|
| 281 | } | 
|---|
| 282 | } | 
|---|
| 283 |  | 
|---|
| 284 | // Unroll non-uniform recipes. | 
|---|
| 285 | auto InsertPt = std::next(x: R.getIterator()); | 
|---|
| 286 | VPBasicBlock &VPBB = *R.getParent(); | 
|---|
| 287 | for (unsigned Part = 1; Part != UF; ++Part) { | 
|---|
| 288 | VPRecipeBase *Copy = R.clone(); | 
|---|
| 289 | Copy->insertBefore(BB&: VPBB, IP: InsertPt); | 
|---|
| 290 | addRecipeForPart(OrigR: &R, CopyR: Copy, Part); | 
|---|
| 291 |  | 
|---|
| 292 | VPValue *Op; | 
|---|
| 293 | if (match(V: &R, P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>( | 
|---|
| 294 | Op0: m_VPValue(), Op1: m_VPValue(V&: Op)))) { | 
|---|
| 295 | Copy->setOperand(I: 0, New: getValueForPart(V: Op, Part: Part - 1)); | 
|---|
| 296 | Copy->setOperand(I: 1, New: getValueForPart(V: Op, Part)); | 
|---|
| 297 | continue; | 
|---|
| 298 | } | 
|---|
| 299 | if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) { | 
|---|
| 300 | auto *Phi = dyn_cast<VPReductionPHIRecipe>(Val: R.getOperand(N: 0)); | 
|---|
| 301 | if (Phi && Phi->isOrdered()) { | 
|---|
| 302 | auto &Parts = VPV2Parts[Phi]; | 
|---|
| 303 | if (Part == 1) { | 
|---|
| 304 | Parts.clear(); | 
|---|
| 305 | Parts.push_back(Elt: Red); | 
|---|
| 306 | } | 
|---|
| 307 | Parts.push_back(Elt: Copy->getVPSingleValue()); | 
|---|
| 308 | Phi->setOperand(I: 1, New: Copy->getVPSingleValue()); | 
|---|
| 309 | } | 
|---|
| 310 | } | 
|---|
| 311 | remapOperands(R: Copy, Part); | 
|---|
| 312 |  | 
|---|
| 313 | // Add operand indicating the part to generate code for, to recipes still | 
|---|
| 314 | // requiring it. | 
|---|
| 315 | if (isa<VPScalarIVStepsRecipe, VPWidenCanonicalIVRecipe, | 
|---|
| 316 | VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Val: Copy) || | 
|---|
| 317 | match(V: Copy, P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>( | 
|---|
| 318 | Op0: m_VPValue()))) | 
|---|
| 319 | Copy->addOperand(Operand: getConstantVPV(Part)); | 
|---|
| 320 |  | 
|---|
| 321 | if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Val: R)) | 
|---|
| 322 | Copy->setOperand(I: 0, New: R.getOperand(N: 0)); | 
|---|
| 323 | } | 
|---|
| 324 | } | 
|---|
| 325 |  | 
|---|
| 326 | void UnrollState::unrollBlock(VPBlockBase *VPB) { | 
|---|
| 327 | auto *VPR = dyn_cast<VPRegionBlock>(Val: VPB); | 
|---|
| 328 | if (VPR) { | 
|---|
| 329 | if (VPR->isReplicator()) | 
|---|
| 330 | return unrollReplicateRegionByUF(VPR); | 
|---|
| 331 |  | 
|---|
| 332 | // Traverse blocks in region in RPO to ensure defs are visited before uses | 
|---|
| 333 | // across blocks. | 
|---|
| 334 | ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> | 
|---|
| 335 | RPOT(VPR->getEntry()); | 
|---|
| 336 | for (VPBlockBase *VPB : RPOT) | 
|---|
| 337 | unrollBlock(VPB); | 
|---|
| 338 | return; | 
|---|
| 339 | } | 
|---|
| 340 |  | 
|---|
| 341 | // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes. | 
|---|
| 342 | auto *VPBB = cast<VPBasicBlock>(Val: VPB); | 
|---|
| 343 | auto InsertPtForPhi = VPBB->getFirstNonPhi(); | 
|---|
| 344 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { | 
|---|
| 345 | if (ToSkip.contains(Ptr: &R) || isa<VPIRInstruction>(Val: &R)) | 
|---|
| 346 | continue; | 
|---|
| 347 |  | 
|---|
| 348 | // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and | 
|---|
| 349 | // Compute*Result which combine all parts to compute the final value. | 
|---|
| 350 | VPValue *Op1; | 
|---|
| 351 | if (match(V: &R, P: m_VPInstruction<VPInstruction::AnyOf>(Op0: m_VPValue(V&: Op1))) || | 
|---|
| 352 | match(V: &R, P: m_VPInstruction<VPInstruction::FirstActiveLane>( | 
|---|
| 353 | Op0: m_VPValue(V&: Op1))) || | 
|---|
| 354 | match(V: &R, P: m_VPInstruction<VPInstruction::ComputeAnyOfResult>( | 
|---|
| 355 | Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue(V&: Op1))) || | 
|---|
| 356 | match(V: &R, P: m_VPInstruction<VPInstruction::ComputeReductionResult>( | 
|---|
| 357 | Op0: m_VPValue(), Op1: m_VPValue(V&: Op1))) || | 
|---|
| 358 | match(V: &R, P: m_VPInstruction<VPInstruction::ComputeFindIVResult>( | 
|---|
| 359 | Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue(), Op3: m_VPValue(V&: Op1)))) { | 
|---|
| 360 | addUniformForAllParts(R: cast<VPInstruction>(Val: &R)); | 
|---|
| 361 | for (unsigned Part = 1; Part != UF; ++Part) | 
|---|
| 362 | R.addOperand(Operand: getValueForPart(V: Op1, Part)); | 
|---|
| 363 | continue; | 
|---|
| 364 | } | 
|---|
| 365 | VPValue *Op0; | 
|---|
| 366 | if (match(V: &R, P: m_VPInstruction<VPInstruction::ExtractLastElement>( | 
|---|
| 367 | Op0: m_VPValue(V&: Op0))) || | 
|---|
| 368 | match(V: &R, P: m_VPInstruction<VPInstruction::ExtractPenultimateElement>( | 
|---|
| 369 | Op0: m_VPValue(V&: Op0)))) { | 
|---|
| 370 | addUniformForAllParts(R: cast<VPSingleDefRecipe>(Val: &R)); | 
|---|
| 371 | if (Plan.hasScalarVFOnly()) { | 
|---|
| 372 | auto *I = cast<VPInstruction>(Val: &R); | 
|---|
| 373 | // Extracting from end with VF = 1 implies retrieving the last or | 
|---|
| 374 | // penultimate scalar part (UF-1 or UF-2). | 
|---|
| 375 | unsigned Offset = | 
|---|
| 376 | I->getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2; | 
|---|
| 377 | I->replaceAllUsesWith(New: getValueForPart(V: Op0, Part: UF - Offset)); | 
|---|
| 378 | R.eraseFromParent(); | 
|---|
| 379 | } else { | 
|---|
| 380 | // Otherwise we extract from the last part. | 
|---|
| 381 | remapOperands(R: &R, Part: UF - 1); | 
|---|
| 382 | } | 
|---|
| 383 | continue; | 
|---|
| 384 | } | 
|---|
| 385 |  | 
|---|
| 386 | auto *SingleDef = dyn_cast<VPSingleDefRecipe>(Val: &R); | 
|---|
| 387 | if (SingleDef && vputils::isUniformAcrossVFsAndUFs(V: SingleDef)) { | 
|---|
| 388 | addUniformForAllParts(R: SingleDef); | 
|---|
| 389 | continue; | 
|---|
| 390 | } | 
|---|
| 391 |  | 
|---|
| 392 | if (auto *H = dyn_cast<VPHeaderPHIRecipe>(Val: &R)) { | 
|---|
| 393 | unrollHeaderPHIByUF(R: H, InsertPtForPhi); | 
|---|
| 394 | continue; | 
|---|
| 395 | } | 
|---|
| 396 |  | 
|---|
| 397 | unrollRecipeByUF(R); | 
|---|
| 398 | } | 
|---|
| 399 | } | 
|---|
| 400 |  | 
|---|
| 401 | void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx) { | 
|---|
| 402 | assert(UF > 0 && "Unroll factor must be positive"); | 
|---|
| 403 | Plan.setUF(UF); | 
|---|
| 404 | auto Cleanup = make_scope_exit(F: [&Plan]() { | 
|---|
| 405 | auto Iter = vp_depth_first_deep(G: Plan.getEntry()); | 
|---|
| 406 | // Remove recipes that are redundant after unrolling. | 
|---|
| 407 | for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) { | 
|---|
| 408 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { | 
|---|
| 409 | auto *VPI = dyn_cast<VPInstruction>(Val: &R); | 
|---|
| 410 | if (VPI && | 
|---|
| 411 | VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart && | 
|---|
| 412 | VPI->getNumOperands() == 1) { | 
|---|
| 413 | VPI->replaceAllUsesWith(New: VPI->getOperand(N: 0)); | 
|---|
| 414 | VPI->eraseFromParent(); | 
|---|
| 415 | } | 
|---|
| 416 | } | 
|---|
| 417 | } | 
|---|
| 418 | }); | 
|---|
| 419 | if (UF == 1) { | 
|---|
| 420 | return; | 
|---|
| 421 | } | 
|---|
| 422 |  | 
|---|
| 423 | UnrollState Unroller(Plan, UF, Ctx); | 
|---|
| 424 |  | 
|---|
| 425 | // Iterate over all blocks in the plan starting from Entry, and unroll | 
|---|
| 426 | // recipes inside them. This includes the vector preheader and middle blocks, | 
|---|
| 427 | // which may set up or post-process per-part values. | 
|---|
| 428 | ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( | 
|---|
| 429 | Plan.getEntry()); | 
|---|
| 430 | for (VPBlockBase *VPB : RPOT) | 
|---|
| 431 | Unroller.unrollBlock(VPB); | 
|---|
| 432 |  | 
|---|
| 433 | unsigned Part = 1; | 
|---|
| 434 | // Remap operands of cloned header phis to update backedge values. The header | 
|---|
| 435 | // phis cloned during unrolling are just after the header phi for part 0. | 
|---|
| 436 | // Reset Part to 1 when reaching the first (part 0) recipe of a block. | 
|---|
| 437 | for (VPRecipeBase &H : | 
|---|
| 438 | Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { | 
|---|
| 439 | // The second operand of Fixed Order Recurrence phi's, feeding the spliced | 
|---|
| 440 | // value across the backedge, needs to remap to the last part of the spliced | 
|---|
| 441 | // value. | 
|---|
| 442 | if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &H)) { | 
|---|
| 443 | Unroller.remapOperand(R: &H, OpIdx: 1, Part: UF - 1); | 
|---|
| 444 | continue; | 
|---|
| 445 | } | 
|---|
| 446 | if (Unroller.contains(VPV: H.getVPSingleValue()) || | 
|---|
| 447 | isa<VPWidenPointerInductionRecipe>(Val: &H)) { | 
|---|
| 448 | Part = 1; | 
|---|
| 449 | continue; | 
|---|
| 450 | } | 
|---|
| 451 | Unroller.remapOperands(R: &H, Part); | 
|---|
| 452 | Part++; | 
|---|
| 453 | } | 
|---|
| 454 |  | 
|---|
| 455 | VPlanTransforms::removeDeadRecipes(Plan); | 
|---|
| 456 | } | 
|---|
| 457 |  | 
|---|
| 458 | /// Create a single-scalar clone of \p RepR for lane \p Lane. | 
|---|
| 459 | static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder, | 
|---|
| 460 | Type *IdxTy, VPReplicateRecipe *RepR, | 
|---|
| 461 | VPLane Lane) { | 
|---|
| 462 | // Collect the operands at Lane, creating extracts as needed. | 
|---|
| 463 | SmallVector<VPValue *> NewOps; | 
|---|
| 464 | for (VPValue *Op : RepR->operands()) { | 
|---|
| 465 | if (vputils::isSingleScalar(VPV: Op)) { | 
|---|
| 466 | NewOps.push_back(Elt: Op); | 
|---|
| 467 | continue; | 
|---|
| 468 | } | 
|---|
| 469 | if (Lane.getKind() == VPLane::Kind::ScalableLast) { | 
|---|
| 470 | NewOps.push_back( | 
|---|
| 471 | Elt: Builder.createNaryOp(Opcode: VPInstruction::ExtractLastElement, Operands: {Op})); | 
|---|
| 472 | continue; | 
|---|
| 473 | } | 
|---|
| 474 | // Look through buildvector to avoid unnecessary extracts. | 
|---|
| 475 | if (match(V: Op, P: m_BuildVector())) { | 
|---|
| 476 | NewOps.push_back( | 
|---|
| 477 | Elt: cast<VPInstruction>(Val: Op)->getOperand(N: Lane.getKnownLane())); | 
|---|
| 478 | continue; | 
|---|
| 479 | } | 
|---|
| 480 | VPValue *Idx = | 
|---|
| 481 | Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: IdxTy, V: Lane.getKnownLane())); | 
|---|
| 482 | VPValue *Ext = Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx}); | 
|---|
| 483 | NewOps.push_back(Elt: Ext); | 
|---|
| 484 | } | 
|---|
| 485 |  | 
|---|
| 486 | auto *New = | 
|---|
| 487 | new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps, | 
|---|
| 488 | /*IsSingleScalar=*/true, /*Mask=*/nullptr, *RepR); | 
|---|
| 489 | New->insertBefore(InsertPos: RepR); | 
|---|
| 490 | return New; | 
|---|
| 491 | } | 
|---|
| 492 |  | 
|---|
| 493 | void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) { | 
|---|
| 494 | Type *IdxTy = IntegerType::get( | 
|---|
| 495 | C&: Plan.getScalarHeader()->getIRBasicBlock()->getContext(), NumBits: 32); | 
|---|
| 496 |  | 
|---|
| 497 | // Visit all VPBBs outside the loop region and directly inside the top-level | 
|---|
| 498 | // loop region. | 
|---|
| 499 | auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( | 
|---|
| 500 | Range: vp_depth_first_shallow(G: Plan.getEntry())); | 
|---|
| 501 | auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>( | 
|---|
| 502 | Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry())); | 
|---|
| 503 | auto VPBBsToUnroll = | 
|---|
| 504 | concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion); | 
|---|
| 505 | for (VPBasicBlock *VPBB : VPBBsToUnroll) { | 
|---|
| 506 | for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) { | 
|---|
| 507 | auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R); | 
|---|
| 508 | if (!RepR || RepR->isSingleScalar()) | 
|---|
| 509 | continue; | 
|---|
| 510 |  | 
|---|
| 511 | VPBuilder Builder(RepR); | 
|---|
| 512 | if (RepR->getNumUsers() == 0) { | 
|---|
| 513 | if (isa<StoreInst>(Val: RepR->getUnderlyingInstr()) && | 
|---|
| 514 | vputils::isSingleScalar(VPV: RepR->getOperand(N: 1))) { | 
|---|
| 515 | // Stores to invariant addresses need to store the last lane only. | 
|---|
| 516 | cloneForLane(Plan, Builder, IdxTy, RepR, | 
|---|
| 517 | Lane: VPLane::getLastLaneForVF(VF)); | 
|---|
| 518 | } else { | 
|---|
| 519 | // Create single-scalar version of RepR for all lanes. | 
|---|
| 520 | for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | 
|---|
| 521 | cloneForLane(Plan, Builder, IdxTy, RepR, Lane: VPLane(I)); | 
|---|
| 522 | } | 
|---|
| 523 | RepR->eraseFromParent(); | 
|---|
| 524 | continue; | 
|---|
| 525 | } | 
|---|
| 526 | /// Create single-scalar version of RepR for all lanes. | 
|---|
| 527 | SmallVector<VPValue *> LaneDefs; | 
|---|
| 528 | for (unsigned I = 0; I != VF.getKnownMinValue(); ++I) | 
|---|
| 529 | LaneDefs.push_back(Elt: cloneForLane(Plan, Builder, IdxTy, RepR, Lane: VPLane(I))); | 
|---|
| 530 |  | 
|---|
| 531 | /// Users that only demand the first lane can use the definition for lane | 
|---|
| 532 | /// 0. | 
|---|
| 533 | RepR->replaceUsesWithIf(New: LaneDefs[0], ShouldReplace: [RepR](VPUser &U, unsigned) { | 
|---|
| 534 | return U.onlyFirstLaneUsed(Op: RepR); | 
|---|
| 535 | }); | 
|---|
| 536 |  | 
|---|
| 537 | // If needed, create a Build(Struct)Vector recipe to insert the scalar | 
|---|
| 538 | // lane values into a vector. | 
|---|
| 539 | Type *ResTy = RepR->getUnderlyingInstr()->getType(); | 
|---|
| 540 | VPValue *VecRes = Builder.createNaryOp( | 
|---|
| 541 | Opcode: ResTy->isStructTy() ? VPInstruction::BuildStructVector | 
|---|
| 542 | : VPInstruction::BuildVector, | 
|---|
| 543 | Operands: LaneDefs); | 
|---|
| 544 | RepR->replaceAllUsesWith(New: VecRes); | 
|---|
| 545 | RepR->eraseFromParent(); | 
|---|
| 546 | } | 
|---|
| 547 | } | 
|---|
| 548 | } | 
|---|
| 549 |  | 
|---|