1//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements explicit unrolling for VPlans.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPRecipeBuilder.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanCFG.h"
18#include "VPlanHelpers.h"
19#include "VPlanPatternMatch.h"
20#include "VPlanTransforms.h"
21#include "VPlanUtils.h"
22#include "llvm/ADT/PostOrderIterator.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/Analysis/IVDescriptors.h"
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/Intrinsics.h"
28
29using namespace llvm;
30using namespace llvm::VPlanPatternMatch;
31
32namespace {
33
34/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
35/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
36/// the unrolling transformation, where the original VPValues are retained for
37/// part zero.
38class UnrollState {
39 /// Plan to unroll.
40 VPlan &Plan;
41 /// Unroll factor to unroll by.
42 const unsigned UF;
43
44 /// Unrolling may create recipes that should not be unrolled themselves.
45 /// Those are tracked in ToSkip.
46 SmallPtrSet<VPRecipeBase *, 8> ToSkip;
47
48 // Associate with each VPValue of part 0 its unrolled instances of parts 1,
49 // ..., UF-1.
50 DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
51
52 /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
53 void unrollReplicateRegionByUF(VPRegionBlock *VPR);
54
55 /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
56 /// all parts.
57 void unrollRecipeByUF(VPRecipeBase &R);
58
59 /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
60 /// depends on the concrete header phi. Inserts newly created recipes at \p
61 /// InsertPtForPhi.
62 void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
63 VPBasicBlock::iterator InsertPtForPhi);
64
65 /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
66 /// the induction steps for each part.
67 void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
68 VPBasicBlock::iterator InsertPtForPhi);
69
70 VPValue *getConstantInt(unsigned Part) {
71 Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
72 return Plan.getConstantInt(Ty: CanIVIntTy, Val: Part);
73 }
74
75public:
76 UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF) {}
77
78 void unrollBlock(VPBlockBase *VPB);
79
80 VPValue *getValueForPart(VPValue *V, unsigned Part) {
81 if (Part == 0 || isa<VPIRValue, VPSymbolicValue, VPRegionValue>(Val: V))
82 return V;
83 assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
84 "accessed value does not exist");
85 return VPV2Parts[V][Part - 1];
86 }
87
88 /// Given a single original recipe \p OrigR (of part zero), and its copy \p
89 /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
90 /// corresponding VPValue defined by \p CopyR.
91 void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
92 unsigned Part) {
93 for (const auto &[Idx, VPV] : enumerate(First: OrigR->definedValues())) {
94 const auto &[V, _] = VPV2Parts.try_emplace(Key: VPV);
95 assert(V->second.size() == Part - 1 && "earlier parts not set");
96 V->second.push_back(Elt: CopyR->getVPValue(I: Idx));
97 }
98 }
99
100 /// Given a uniform recipe \p R, add it for all parts.
101 void addUniformForAllParts(VPSingleDefRecipe *R) {
102 const auto &[V, Inserted] = VPV2Parts.try_emplace(Key: R);
103 assert(Inserted && "uniform value already added");
104 for (unsigned Part = 0; Part != UF; ++Part)
105 V->second.push_back(Elt: R);
106 }
107
108 bool contains(VPValue *VPV) const { return VPV2Parts.contains(Val: VPV); }
109
110 /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
111 /// \p P.
112 void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
113 auto *Op = R->getOperand(N: OpIdx);
114 R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
115 }
116
117 /// Update \p R's operands with their corresponding VPValues for part \p P.
118 void remapOperands(VPRecipeBase *R, unsigned Part) {
119 for (const auto &[OpIdx, Op] : enumerate(First: R->operands()))
120 R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
121 }
122};
123} // namespace
124
125static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
126 unsigned Part, VPlan &Plan) {
127 if (Part == 0)
128 return;
129
130 VPBuilder Builder(Steps);
131 Type *BaseIVTy = Steps->getOperand(N: 0)->getScalarType();
132 Type *IntStepTy =
133 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
134 VPValue *StartIndex = Steps->getVFValue();
135 if (Part > 1) {
136 StartIndex = Builder.createOverflowingOp(
137 Opcode: Instruction::Mul,
138 Operands: {StartIndex, Plan.getConstantInt(Ty: StartIndex->getScalarType(), Val: Part)});
139 }
140 StartIndex = Builder.createScalarSExtOrTrunc(
141 Op: StartIndex, ResultTy: IntStepTy, SrcTy: StartIndex->getScalarType(), DL: Steps->getDebugLoc());
142
143 if (BaseIVTy->isFloatingPointTy())
144 StartIndex = Builder.createScalarCast(Opcode: Instruction::SIToFP, Op: StartIndex,
145 ResultTy: BaseIVTy, DL: Steps->getDebugLoc());
146
147 Steps->setStartIndex(StartIndex);
148}
149
150void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
151 VPBlockBase *InsertPt = VPR->getSingleSuccessor();
152 for (unsigned Part = 1; Part != UF; ++Part) {
153 auto *Copy = VPR->clone();
154 VPBlockUtils::insertBlockBefore(NewBlock: Copy, BlockPtr: InsertPt);
155
156 auto PartI = vp_depth_first_shallow(G: Copy->getEntry());
157 auto Part0 = vp_depth_first_shallow(G: VPR->getEntry());
158 for (const auto &[PartIVPBB, Part0VPBB] :
159 zip(t: VPBlockUtils::blocksAs<VPBasicBlock>(Range&: PartI),
160 u: VPBlockUtils::blocksAs<VPBasicBlock>(Range&: Part0))) {
161 for (const auto &[PartIR, Part0R] : zip(t&: *PartIVPBB, u&: *Part0VPBB)) {
162 remapOperands(R: &PartIR, Part);
163 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &PartIR))
164 addStartIndexForScalarSteps(Steps, Part, Plan);
165
166 addRecipeForPart(OrigR: &Part0R, CopyR: &PartIR, Part);
167 }
168 }
169 }
170}
171
172void UnrollState::unrollWidenInductionByUF(
173 VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
174 VPBasicBlock *PH = cast<VPBasicBlock>(
175 Val: IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
176 Type *IVTy = IV->getScalarType();
177 auto &ID = IV->getInductionDescriptor();
178 FastMathFlags FMF;
179 VPIRFlags::WrapFlagsTy WrapFlags(false, false);
180 if (auto *IntOrFPInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: IV)) {
181 FMF = IntOrFPInd->getFastMathFlagsOrNone();
182 if (IntOrFPInd->hasNoWrapFlags())
183 WrapFlags = IntOrFPInd->getNoWrapFlags();
184 }
185
186 VPValue *ScalarStep = IV->getStepValue();
187 VPBuilder Builder(PH);
188 Type *VectorStepTy = IVTy->isPointerTy() ? ScalarStep->getScalarType() : IVTy;
189 VPInstruction *VectorStep = Builder.createNaryOp(
190 Opcode: VPInstruction::WideIVStep, Operands: {&Plan.getVF(), ScalarStep}, ResultTy: VectorStepTy, Flags: FMF,
191 DL: IV->getDebugLoc());
192
193 ToSkip.insert(Ptr: VectorStep);
194
195 // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
196 // remains the header phi. Parts > 0 are computed by adding Step to the
197 // previous part. The header phi recipe will get 2 new operands: the step
198 // value for a single part and the last part, used to compute the backedge
199 // value during VPWidenInductionRecipe::execute.
200 // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
201 // %Part.1 = %Part.0 + %VectorStep
202 // %Part.2 = %Part.1 + %VectorStep
203 // %Part.3 = %Part.2 + %VectorStep
204 //
205 // The newly added recipes are added to ToSkip to avoid interleaving them
206 // again.
207 VPValue *Prev = IV;
208 Builder.setInsertPoint(TheBB: IV->getParent(), IP: InsertPtForPhi);
209 unsigned AddOpc;
210 VPIRFlags AddFlags;
211 if (IVTy->isPointerTy()) {
212 AddOpc = VPInstruction::WidePtrAdd;
213 AddFlags = GEPNoWrapFlags::none();
214 } else if (IVTy->isFloatingPointTy()) {
215 AddOpc = ID.getInductionOpcode();
216 AddFlags = FMF;
217 } else {
218 AddOpc = Instruction::Add;
219 AddFlags = WrapFlags;
220 if (cast<VPWidenIntOrFpInductionRecipe>(Val: IV)->isCanonical())
221 AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false);
222 }
223 for (unsigned Part = 1; Part != UF; ++Part) {
224 std::string Name =
225 Part > 1 ? "step.add." + std::to_string(val: Part) : "step.add";
226
227 VPInstruction *Add =
228 Builder.createNaryOp(Opcode: AddOpc,
229 Operands: {
230 Prev,
231 VectorStep,
232 },
233 Flags: AddFlags, DL: IV->getDebugLoc(), Name);
234 ToSkip.insert(Ptr: Add);
235 addRecipeForPart(OrigR: IV, CopyR: Add, Part);
236 Prev = Add;
237 }
238 IV->addUnrolledPartOperands(SplatVFStep: VectorStep, LastPart: Prev);
239}
240
241void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
242 VPBasicBlock::iterator InsertPtForPhi) {
243 // First-order recurrences pass a single vector or scalar through their header
244 // phis, irrespective of interleaving.
245 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: R))
246 return;
247
248 // Generate step vectors for each unrolled part.
249 if (auto *IV = dyn_cast<VPWidenInductionRecipe>(Val: R)) {
250 unrollWidenInductionByUF(IV, InsertPtForPhi);
251 return;
252 }
253
254 auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: R);
255 if (RdxPhi && RdxPhi->isOrdered())
256 return;
257
258 auto InsertPt = std::next(x: R->getIterator());
259 for (unsigned Part = 1; Part != UF; ++Part) {
260 VPRecipeBase *Copy = R->clone();
261 Copy->insertBefore(BB&: *R->getParent(), IP: InsertPt);
262 addRecipeForPart(OrigR: R, CopyR: Copy, Part);
263 if (RdxPhi) {
264 // If the start value is a ReductionStartVector, use the identity value
265 // (second operand) for unrolled parts. If the scaling factor is > 1,
266 // create a new ReductionStartVector with the scale factor and both
267 // operands set to the identity value.
268 if (auto *VPI = dyn_cast<VPInstruction>(Val: RdxPhi->getStartValue())) {
269 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
270 "unexpected start VPInstruction");
271 if (Part != 1)
272 continue;
273 VPValue *StartV;
274 if (match(V: VPI->getOperand(N: 2), P: m_One())) {
275 StartV = VPI->getOperand(N: 1);
276 } else {
277 auto *C = VPI->clone();
278 C->setOperand(I: 0, New: C->getOperand(N: 1));
279 C->insertAfter(InsertPos: VPI);
280 StartV = C;
281 }
282 for (unsigned Part = 1; Part != UF; ++Part)
283 VPV2Parts[VPI][Part - 1] = StartV;
284 }
285 } else {
286 assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
287 "unexpected header phi recipe not needing unrolled part");
288 }
289 }
290}
291
292/// Handle non-header-phi recipes.
293void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
294 if (match(V: &R, P: m_CombineOr(Ps: m_BranchOnCond(), Ps: m_BranchOnCount())))
295 return;
296
297 if (auto *VPI = dyn_cast<VPInstruction>(Val: &R)) {
298 if (vputils::onlyFirstPartUsed(Def: VPI)) {
299 addUniformForAllParts(R: VPI);
300 return;
301 }
302 }
303 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
304 if (isa<StoreInst>(Val: RepR->getUnderlyingValue()) &&
305 RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions()) {
306 // Stores to an invariant address only need to store the last part.
307 remapOperands(R: &R, Part: UF - 1);
308 return;
309 }
310 if (match(V: RepR,
311 P: m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
312 addUniformForAllParts(R: RepR);
313 return;
314 }
315 }
316
317 // Unroll non-uniform recipes.
318 auto InsertPt = std::next(x: R.getIterator());
319 VPBasicBlock &VPBB = *R.getParent();
320 for (unsigned Part = 1; Part != UF; ++Part) {
321 VPRecipeBase *Copy = R.clone();
322 Copy->insertBefore(BB&: VPBB, IP: InsertPt);
323 addRecipeForPart(OrigR: &R, CopyR: Copy, Part);
324
325 // Phi operands are updated once all other recipes have been unrolled.
326 if (isa<VPWidenPHIRecipe>(Val: Copy))
327 continue;
328
329 VPValue *Op;
330 if (match(V: &R, P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
331 Ops: m_VPValue(), Ops: m_VPValue(V&: Op)))) {
332 Copy->setOperand(I: 0, New: getValueForPart(V: Op, Part: Part - 1));
333 Copy->setOperand(I: 1, New: getValueForPart(V: Op, Part));
334 continue;
335 }
336 if (isa<VPVectorPointerRecipe, VPWidenCanonicalIVRecipe>(Val: R)) {
337 VPBuilder Builder(&R);
338 const DataLayout &DL = Plan.getDataLayout();
339 Type *IndexTy =
340 isa<VPWidenCanonicalIVRecipe>(Val: R)
341 ? Plan.getVectorLoopRegion()->getCanonicalIVType()
342 : DL.getIndexType(PtrTy: R.getVPSingleValue()->getScalarType());
343 Type *VFTy = Plan.getVF().getScalarType();
344 VPValue *VF = Builder.createScalarZExtOrTrunc(
345 Op: &Plan.getVF(), ResultTy: IndexTy, SrcTy: VFTy, DL: DebugLoc::getUnknown());
346 // VFxUF does not wrap, so VF * Part also cannot wrap.
347 VPValue *VFxPart = Builder.createOverflowingOp(
348 Opcode: Instruction::Mul, Operands: {VF, Plan.getConstantInt(Ty: IndexTy, Val: Part)},
349 WrapFlags: {true, true});
350 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(Val: Copy))
351 VecPtr->addPerPartOffset(VFxPart);
352 else
353 cast<VPWidenCanonicalIVRecipe>(Val: Copy)->addPerPartStep(Step: VFxPart);
354 continue;
355 }
356 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) {
357 auto *Phi = dyn_cast<VPReductionPHIRecipe>(Val: R.getOperand(N: 0));
358 if (Phi && Phi->isOrdered()) {
359 auto &Parts = VPV2Parts[Phi];
360 if (Part == 1) {
361 Parts.clear();
362 Parts.push_back(Elt: Red);
363 }
364 Parts.push_back(Elt: Copy->getVPSingleValue());
365 Phi->setOperand(I: 1, New: Copy->getVPSingleValue());
366 }
367 }
368 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: Copy)) {
369 // Materialize PartN offset for VectorEndPointer.
370 VEPR->setOperand(I: 0, New: R.getOperand(N: 0));
371 VEPR->setOperand(I: 1, New: R.getOperand(N: 1));
372 VEPR->materializeOffset(Part);
373 continue;
374 }
375
376 remapOperands(R: Copy, Part);
377
378 if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: Copy))
379 addStartIndexForScalarSteps(Steps: ScalarIVSteps, Part, Plan);
380
381 if (match(V: Copy,
382 P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
383 VPBuilder Builder(Copy);
384 VPValue *ScaledByPart = Builder.createOverflowingOp(
385 Opcode: Instruction::Mul, Operands: {Copy->getOperand(N: 1), getConstantInt(Part)});
386 Copy->setOperand(I: 1, New: ScaledByPart);
387 }
388 }
389 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
390 // Materialize Part0 offset for VectorEndPointer.
391 VEPR->materializeOffset();
392 }
393 if (auto *WideCanIV = dyn_cast<VPWidenCanonicalIVRecipe>(Val: &R)) {
394 // Set Part0 step for WidenCanonicalIV.
395 WideCanIV->addPerPartStep(Step: getConstantInt(Part: 0));
396 }
397}
398
399void UnrollState::unrollBlock(VPBlockBase *VPB) {
400 auto *VPR = dyn_cast<VPRegionBlock>(Val: VPB);
401 if (VPR) {
402 if (VPR->isReplicator())
403 return unrollReplicateRegionByUF(VPR);
404
405 // Traverse blocks in region in RPO to ensure defs are visited before uses
406 // across blocks.
407 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
408 RPOT(VPR->getEntry());
409 for (VPBlockBase *VPB : RPOT)
410 unrollBlock(VPB);
411 return;
412 }
413
414 // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
415 auto *VPBB = cast<VPBasicBlock>(Val: VPB);
416 auto InsertPtForPhi = VPBB->getFirstNonPhi();
417 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
418 if (ToSkip.contains(Ptr: &R) || isa<VPIRInstruction>(Val: &R))
419 continue;
420
421 // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
422 // ComputeReductionResult which combine all parts to compute the final
423 // value.
424 VPValue *Op1;
425 if (match(V: &R, P: m_VPInstruction<VPInstruction::AnyOf>(Ops: m_VPValue(V&: Op1))) ||
426 match(V: &R, P: m_FirstActiveLane(Op0: m_VPValue(V&: Op1))) ||
427 match(V: &R, P: m_LastActiveLane(Op0: m_VPValue(V&: Op1))) ||
428 match(V: &R, P: m_ComputeReductionResult(Op0: m_VPValue(V&: Op1)))) {
429 auto *VPI = cast<VPInstruction>(Val: &R);
430 addUniformForAllParts(R: VPI);
431 for (unsigned Part = 1; Part != UF; ++Part)
432 VPI->addOperand(Op: getValueForPart(V: Op1, Part));
433 continue;
434 }
435 VPValue *Op0;
436 if (match(V: &R, P: m_ExtractLane(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)))) {
437 auto *VPI = cast<VPInstruction>(Val: &R);
438 addUniformForAllParts(R: VPI);
439 for (unsigned Part = 1; Part != UF; ++Part)
440 VPI->addOperand(Op: getValueForPart(V: Op1, Part));
441 continue;
442 }
443
444 VPValue *Op2;
445 if (match(V: &R, P: m_ExtractLastActive(Op0: m_VPValue(), Op1: m_VPValue(V&: Op1),
446 Op2: m_VPValue(V&: Op2)))) {
447 auto *VPI = cast<VPInstruction>(Val: &R);
448 addUniformForAllParts(R: VPI);
449 for (unsigned Part = 1; Part != UF; ++Part) {
450 VPI->addOperand(Op: getValueForPart(V: Op1, Part));
451 VPI->addOperand(Op: getValueForPart(V: Op2, Part));
452 }
453 continue;
454 }
455
456 if (Plan.hasScalarVFOnly()) {
457 if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V&: Op0))) ||
458 match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
459 auto *I = cast<VPInstruction>(Val: &R);
460 bool IsPenultimatePart =
461 I->getOpcode() == VPInstruction::ExtractPenultimateElement;
462 unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
463 // For scalar VF, directly use the scalar part value.
464 I->replaceAllUsesWith(New: getValueForPart(V: Op0, Part: PartIdx));
465 continue;
466 }
467 }
468 // For vector VF, the penultimate element is always extracted from the last part.
469 if (match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op0))) ||
470 match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
471 addUniformForAllParts(R: cast<VPSingleDefRecipe>(Val: &R));
472 R.setOperand(I: 0, New: getValueForPart(V: Op0, Part: UF - 1));
473 continue;
474 }
475
476 auto *SingleDef = dyn_cast<VPSingleDefRecipe>(Val: &R);
477 if (SingleDef && vputils::isUniformAcrossVFsAndUFs(V: SingleDef)) {
478 addUniformForAllParts(R: SingleDef);
479 continue;
480 }
481
482 if (auto *H = dyn_cast<VPHeaderPHIRecipe>(Val: &R)) {
483 unrollHeaderPHIByUF(R: H, InsertPtForPhi);
484 continue;
485 }
486
487 unrollRecipeByUF(R);
488 }
489}
490
491void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
492 assert(UF > 0 && "Unroll factor must be positive");
493 Plan.setUF(UF);
494 llvm::scope_exit Cleanup([&Plan, UF]() {
495 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
496 // Remove recipes that are redundant after unrolling.
497 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: Iter)) {
498 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
499 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
500 if (VPI &&
501 VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
502 VPI->getOperand(N: 1) == &Plan.getVF()) {
503 VPI->replaceAllUsesWith(New: VPI->getOperand(N: 0));
504 VPI->eraseFromParent();
505 }
506 }
507 }
508
509 Type *TCTy = Plan.getTripCount()->getScalarType();
510 Plan.getUF().replaceAllUsesWith(New: Plan.getConstantInt(Ty: TCTy, Val: UF));
511 });
512 if (UF == 1) {
513 return;
514 }
515
516 UnrollState Unroller(Plan, UF);
517
518 // Iterate over all blocks in the plan starting from Entry, and unroll
519 // recipes inside them. This includes the vector preheader and middle blocks,
520 // which may set up or post-process per-part values.
521 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
522 Plan.getEntry());
523 for (VPBlockBase *VPB : RPOT)
524 Unroller.unrollBlock(VPB);
525
526 unsigned Part = 1;
527 // Remap operands of cloned header phis to update backedge values. The header
528 // phis cloned during unrolling are just after the header phi for part 0.
529 // Reset Part to 1 when reaching the first (part 0) recipe of a block.
530 for (VPRecipeBase &H :
531 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
532 // The second operand of Fixed Order Recurrence phi's, feeding the spliced
533 // value across the backedge, needs to remap to the last part of the spliced
534 // value.
535 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &H)) {
536 Unroller.remapOperand(R: &H, OpIdx: 1, Part: UF - 1);
537 continue;
538 }
539 if (Unroller.contains(VPV: H.getVPSingleValue())) {
540 Part = 1;
541 continue;
542 }
543 Unroller.remapOperands(R: &H, Part);
544 Part++;
545 }
546
547 VPlanTransforms::removeDeadRecipes(Plan);
548}
549
550/// Add a lane offset to the start index of \p Steps.
551static void addLaneToStartIndex(VPScalarIVStepsRecipe *Steps, unsigned Lane,
552 VPlan &Plan, VPRecipeBase *InsertPt) {
553 assert(Lane > 0 && "Zero lane adds no offset to start index");
554 Type *BaseIVTy = Steps->getOperand(N: 0)->getScalarType();
555
556 VPValue *OldStartIndex = Steps->getStartIndex();
557 VPValue *LaneOffset;
558 unsigned AddOpcode;
559 // TODO: Retrieve the flags from Steps unconditionally.
560 VPIRFlags Flags;
561 if (BaseIVTy->isFloatingPointTy()) {
562 int SignedLane = static_cast<int>(Lane);
563 if (!OldStartIndex && Steps->getInductionOpcode() == Instruction::FSub)
564 SignedLane = -SignedLane;
565 LaneOffset = Plan.getOrAddLiveIn(V: ConstantFP::get(Ty: BaseIVTy, V: SignedLane));
566 AddOpcode = Steps->getInductionOpcode();
567 Flags = VPIRFlags(FastMathFlags());
568 } else {
569 unsigned BaseIVBits = BaseIVTy->getScalarSizeInBits();
570 LaneOffset = Plan.getConstantInt(
571 Val: APInt(BaseIVBits, Lane, /*isSigned*/ false, /*implicitTrunc*/ true));
572 AddOpcode = Instruction::Add;
573 Flags = VPIRFlags(VPIRFlags::WrapFlagsTy(false, false));
574 }
575
576 VPValue *NewStartIndex = LaneOffset;
577 if (OldStartIndex) {
578 VPBuilder Builder(InsertPt);
579 NewStartIndex =
580 Builder.createNaryOp(Opcode: AddOpcode, Operands: {OldStartIndex, LaneOffset}, Flags);
581 }
582 Steps->setStartIndex(NewStartIndex);
583}
584
585/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe,
586/// VPInstruction or VPScalarIVStepsRecipe) for lane \p Lane. Use \p
587/// Def2LaneDefs to look up scalar definitions for operands of \DefR.
588static VPValue *
589cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
590 VPSingleDefRecipe *DefR, VPLane Lane,
591 const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
592 assert((isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(DefR)) &&
593 "DefR must be a VPReplicateRecipe, VPInstruction or "
594 "VPScalarIVStepsRecipe");
595 VPValue *Op;
596 if (match(R: DefR, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)))) {
597 auto LaneDefs = Def2LaneDefs.find(Val: Op);
598 if (LaneDefs != Def2LaneDefs.end())
599 return LaneDefs->second[Lane.getKnownLane()];
600
601 VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
602 return Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
603 }
604
605 // Collect the operands at Lane, creating extracts as needed.
606 SmallVector<VPValue *> NewOps;
607 for (VPValue *Op : DefR->operands()) {
608 // If Op is a definition that has been unrolled, directly use the clone for
609 // the corresponding lane.
610 auto LaneDefs = Def2LaneDefs.find(Val: Op);
611 if (LaneDefs != Def2LaneDefs.end()) {
612 NewOps.push_back(Elt: LaneDefs->second[Lane.getKnownLane()]);
613 continue;
614 }
615 if (Lane.getKind() == VPLane::Kind::ScalableLast) {
616 // Look through mandatory Unpack.
617 [[maybe_unused]] bool Matched =
618 match(V: Op, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)));
619 assert(Matched && "original op must have been Unpack");
620 auto *ExtractPart =
621 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: {Op});
622 NewOps.push_back(
623 Elt: Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: {ExtractPart}));
624 continue;
625 }
626 if (vputils::isSingleScalar(VPV: Op)) {
627 NewOps.push_back(Elt: Op);
628 continue;
629 }
630
631 // Look through buildvector to avoid unnecessary extracts.
632 if (match(V: Op, P: m_BuildVector())) {
633 NewOps.push_back(
634 Elt: cast<VPInstruction>(Val: Op)->getOperand(N: Lane.getKnownLane()));
635 continue;
636 }
637 VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
638 VPValue *Ext = Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
639 NewOps.push_back(Elt: Ext);
640 }
641
642 VPSingleDefRecipe *New;
643 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: DefR)) {
644 // TODO: have cloning of replicate recipes also provide the desired result
645 // coupled with setting its operands to NewOps (deriving IsSingleScalar and
646 // Mask from the operands?)
647 New = VPBuilder::createSingleScalarOp(
648 Opcode: RepR->getOpcode(), Operands: NewOps, /*Mask=*/nullptr, Flags: *RepR, Metadata: *RepR,
649 DL: RepR->getDebugLoc(), UV: RepR->getUnderlyingInstr());
650 } else {
651 New = DefR->clone();
652 for (const auto &[Idx, Op] : enumerate(First&: NewOps)) {
653 New->setOperand(I: Idx, New: Op);
654 }
655 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: New)) {
656 // Skip lane 0: an absent start index is implicitly zero.
657 unsigned KnownLane = Lane.getKnownLane();
658 if (KnownLane != 0)
659 addLaneToStartIndex(Steps, Lane: KnownLane, Plan, InsertPt: DefR);
660 }
661 }
662 New->insertBefore(InsertPos: DefR);
663 return New;
664}
665
666/// Convert recipes in region blocks to operate on a single lane 0.
667/// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
668/// converted into BranchOnCond, PredInstPhi recipes are replaced by scalar phi
669/// recipes with an additional poison operand, and extracts are created as
670/// needed.
671static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
672 VPBlockBase *Entry,
673 ElementCount VF) {
674 VPValue *Idx0 = Plan.getZero(Ty: IdxTy);
675 for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry)) {
676 for (VPRecipeBase &OldR : make_early_inc_range(Range&: cast<VPBasicBlock>(Val&: *VPB))) {
677 assert(
678 !isa<VPWidenPHIRecipe>(&OldR) &&
679 !match(&OldR,
680 m_CombineOr(
681 m_InsertElement(m_VPValue(), m_VPValue(), m_VPValue()),
682 m_ExtractElement(m_VPValue(), m_VPValue()))) &&
683 "must not contain wide phis, inserts or extracts before conversion");
684
685 VPBuilder Builder(&OldR);
686 DebugLoc OldDL = OldR.getDebugLoc();
687 // For scalar VF, operands are already scalar; no extraction needed.
688 if (!VF.isScalar()) {
689 for (const auto &[I, Op] : enumerate(First: OldR.operands())) {
690 // Skip operands that don't need extraction: values defined in the
691 // same block (already scalar), or values that are already single
692 // scalars.
693 // TODO: Support isSingleScalar for VPScalarIVStepsRecipe.
694 auto *DefR = Op->getDefiningRecipe();
695 if ((isa_and_present<VPScalarIVStepsRecipe>(Val: DefR) &&
696 DefR->getParent() == VPB) ||
697 vputils::isSingleScalar(VPV: Op))
698 continue;
699
700 // Extract lane zero from values defined outside the region.
701 VPValue *Extract = Builder.createNaryOp(Opcode: Instruction::ExtractElement,
702 Operands: {Op, Idx0}, DL: OldDL);
703 OldR.setOperand(I, New: Extract);
704 }
705 }
706
707 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &OldR)) {
708 auto *NewR = VPBuilder::createSingleScalarOp(
709 Opcode: RepR->getOpcode(), Operands: to_vector(Range: RepR->operands()), /*Mask=*/nullptr,
710 Flags: *RepR, Metadata: *RepR, DL: OldDL, UV: RepR->getUnderlyingInstr());
711 NewR->insertBefore(InsertPos: RepR);
712 RepR->replaceAllUsesWith(New: NewR);
713 RepR->eraseFromParent();
714 } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(Val: &OldR)) {
715 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond,
716 Operands: {BranchOnMask->getOperand(N: 0)}, DL: OldDL);
717 BranchOnMask->eraseFromParent();
718 } else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(Val: &OldR)) {
719 VPValue *PredOp = PredPhi->getOperand(N: 0);
720 Type *PredTy = PredOp->getScalarType();
721 VPValue *Poison = Plan.getPoison(Ty: PredTy);
722 VPPhi *NewPhi = Builder.createScalarPhi(IncomingValues: {Poison, PredOp}, DL: OldDL);
723 PredPhi->replaceAllUsesWith(New: NewPhi);
724 PredPhi->eraseFromParent();
725 } else {
726 // TODO: Support isSingleScalar for VPScalarIVStepsRecipe.
727 assert((isa<VPScalarIVStepsRecipe>(OldR) ||
728 (isa<VPInstruction>(OldR) &&
729 vputils::isSingleScalar(OldR.getVPSingleValue()))) &&
730 "unexpected unhandled recipe");
731 }
732 }
733 }
734}
735
736/// Update recipes in the cloned blocks rooted at \p NewEntry to match \p Lane,
737/// using the original blocks rooted at \p OldEntry as reference.
738static void processLaneForReplicateRegion(VPlan &Plan, Type *IdxTy,
739 unsigned Lane, VPBasicBlock *OldEntry,
740 VPBasicBlock *NewEntry) {
741 DenseMap<VPValue *, VPValue *> Old2NewVPValues;
742 VPValue *IdxLane = Plan.getConstantInt(Ty: IdxTy, Val: Lane);
743 for (const auto &[OldBB, NewBB] :
744 zip_equal(t: vp_depth_first_shallow(G: OldEntry),
745 u: vp_depth_first_shallow(G: NewEntry))) {
746 for (auto &&[OldR, NewR] :
747 zip_equal(t&: *cast<VPBasicBlock>(Val: OldBB), u&: *cast<VPBasicBlock>(Val: NewBB))) {
748 for (const auto &[OldV, NewV] :
749 zip_equal(t: OldR.definedValues(), u: NewR.definedValues()))
750 Old2NewVPValues[OldV] = NewV;
751
752 // Remap operands to use lane-specific values.
753 for (const auto &[I, OldOp] : enumerate(First: NewR.operands())) {
754 // Use cloned value if operand was defined in the region.
755 if (auto *NewOp = Old2NewVPValues.lookup(Val: OldOp))
756 NewR.setOperand(I, New: NewOp);
757 }
758
759 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &NewR)) {
760 addLaneToStartIndex(Steps, Lane, Plan, InsertPt: Steps);
761 } else if (match(V: &NewR, P: m_ExtractElement(Op0: m_VPValue(), Op1: m_VPValue()))) {
762 assert(match(NewR.getOperand(1), m_ZeroInt()) &&
763 "extract indices must be zero");
764 NewR.setOperand(I: 1, New: IdxLane);
765 } else if (auto *NewPhi = dyn_cast<VPPhi>(Val: &NewR)) {
766 auto *OldPhi = cast<VPPhi>(Val: &OldR);
767 assert(vputils::onlyFirstLaneUsed(OldPhi) &&
768 "VPPhis expected to have only first lane used");
769 auto *BVUser = dyn_cast_or_null<VPInstruction>(Val: OldPhi->getSingleUser());
770 if (BVUser && match(V: BVUser, P: m_CombineOr(Ps: m_BuildVector(),
771 Ps: m_BuildStructVector()))) {
772 assert(BVUser->getOperand(0) == OldPhi &&
773 "Unexpected first operand of build vector user");
774 BVUser->setOperand(I: Lane, New: NewPhi);
775 }
776 }
777 }
778 }
779}
780
781/// Dissolve a single replicate region by replicating its blocks for each lane
782/// of \p VF. The region is disconnected, its blocks are reparented, cloned for
783/// each lane, and reconnected in sequence.
784static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF,
785 VPlan &Plan, Type *IdxTy) {
786 auto *FirstLaneEntry = cast<VPBasicBlock>(Val: Region->getEntry());
787 auto *FirstLaneExiting = cast<VPBasicBlock>(Val: Region->getExiting());
788
789 // Disconnect and dissolve the region.
790 VPBlockBase *Predecessor = Region->getSinglePredecessor();
791 assert(Predecessor && "Replicate region must have a single predecessor");
792 auto *Successor = cast<VPBasicBlock>(Val: Region->getSingleSuccessor());
793 VPBlockUtils::disconnectBlocks(From: Predecessor, To: Region);
794 VPBlockUtils::disconnectBlocks(From: Region, To: Successor);
795
796 VPRegionBlock *ParentRegion = Region->getParent();
797 for (VPBlockBase *VPB : vp_depth_first_shallow(G: FirstLaneEntry))
798 VPB->setParent(ParentRegion);
799
800 // Process the original blocks for lane 0: converting their recipes to
801 // single-scalar.
802 convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, Entry: FirstLaneEntry, VF);
803
804 // For scalar VF, just wire the blocks and return; no cloning or packing
805 // needed.
806 if (VF.isScalar()) {
807 VPBlockUtils::connectBlocks(From: Predecessor, To: FirstLaneEntry);
808 VPBlockUtils::connectBlocks(From: FirstLaneExiting, To: Successor);
809 return;
810 }
811
812 // Create a BuildVector or BuildStructVector in successor block for every
813 // VPPhi in (first lane's) exiting block having vector uses. All their
814 // operands are initialized to poison and will be replaced when processing
815 // each clone, except for the operand of the first lane which set here.
816 // BuildVectors are recorded to be replaced later by chains of insert-element
817 // and widen phi's.
818 unsigned NumLanes = VF.getFixedValue();
819 SmallVector<VPInstruction *> BuildVectors;
820 for (auto &R : FirstLaneExiting->phis()) {
821 auto *Phi = cast<VPPhi>(Val: &R);
822 if (vputils::onlyFirstLaneUsed(Def: Phi))
823 continue;
824
825 Type *ScalarTy = Phi->getScalarType();
826 bool IsStruct = isa<StructType>(Val: ScalarTy);
827 VPValue *Poison = Plan.getPoison(Ty: ScalarTy);
828 SmallVector<VPValue *> BVOps(NumLanes, Poison);
829 auto *BV = new VPInstruction(IsStruct ? VPInstruction::BuildStructVector
830 : VPInstruction::BuildVector,
831 BVOps);
832 if (!IsStruct)
833 BuildVectors.push_back(Elt: BV);
834 Phi->replaceAllUsesWith(New: BV);
835 BV->setOperand(I: 0, New: Phi);
836 BV->insertBefore(BB&: *Successor, IP: Successor->getFirstNonPhi());
837 }
838
839 // Clone converted blocks for remaining lanes and process each in reverse
840 // order, connecting each lane's Exiting block to the subsequent lane's entry.
841 VPBlockBase *NextLaneEntry = Successor;
842 for (int Lane = NumLanes - 1; Lane > 0; --Lane) {
843 const auto &[CurrentLaneEntry, CurrentLaneExiting] =
844 VPBlockUtils::cloneFrom(Entry: FirstLaneEntry);
845 for (VPBlockBase *VPB : vp_depth_first_shallow(G: CurrentLaneEntry))
846 VPB->setParent(ParentRegion);
847 processLaneForReplicateRegion(Plan, IdxTy, Lane,
848 OldEntry: cast<VPBasicBlock>(Val: FirstLaneEntry),
849 NewEntry: cast<VPBasicBlock>(Val: CurrentLaneEntry));
850 VPBlockUtils::connectBlocks(From: CurrentLaneExiting, To: NextLaneEntry);
851 NextLaneEntry = CurrentLaneEntry;
852 }
853
854 // Connect Predecessor to FirstLaneEntry, and FirstLaneRegionExit to
855 // NextLaneEntry which is the second lane region entry. The latter is
856 // done last so that earlier clonings from FirstLaneEntry stop at
857 // FirstLaneExiting.
858 VPBlockUtils::connectBlocks(From: Predecessor, To: FirstLaneEntry);
859 VPBlockUtils::connectBlocks(From: FirstLaneExiting, To: NextLaneEntry);
860
861 // Fold BuildVector fed by scalar phis into VPWidenPHIRecipes with
862 // InsertElement per lane.
863 // TODO: check if this folding should be dropped.
864 for (VPInstruction *BV : BuildVectors) {
865 assert(BV->getNumOperands() == NumLanes &&
866 "BuildVector must have one operand per lane");
867 for (const auto &[Idx, Op] : enumerate(First: BV->operands())) {
868 auto *ScalarPhi = cast<VPPhi>(Val: Op);
869 auto DL = ScalarPhi->getDebugLoc();
870 auto *PredOp = cast<VPSingleDefRecipe>(Val: ScalarPhi->getOperand(N: 1));
871 VPValue *Poison = ScalarPhi->getOperand(N: 0);
872 VPValue *PrevVal = Idx == 0 ? Poison : BV->getOperand(N: Idx - 1);
873 auto Builder = VPBuilder::getToInsertAfter(R: PredOp->getDefiningRecipe());
874 auto *Insert = Builder.createNaryOp(
875 Opcode: Instruction::InsertElement,
876 Operands: {PrevVal, PredOp, Plan.getConstantInt(BitWidth: 64, Val: Idx)}, DL);
877 Builder.setInsertPoint(ScalarPhi);
878 auto *NewPhi = Builder.createWidenPhi(IncomingValues: {PrevVal, Insert}, DL);
879 ScalarPhi->replaceAllUsesWith(New: NewPhi);
880 ScalarPhi->eraseFromParent();
881 }
882 BV->replaceAllUsesWith(New: BV->getOperand(N: NumLanes - 1));
883 BV->eraseFromParent();
884 }
885}
886
887/// Collect and dissolve all replicate regions in the vector loop, replicating
888/// their blocks and recipes for each lane of \p VF.
889static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF,
890 Type *IdxTy) {
891 // Collect all replicate regions before modifying the CFG.
892 SmallVector<VPRegionBlock *> ReplicateRegions;
893 for (VPRegionBlock *Region : VPBlockUtils::blocksOnly<VPRegionBlock>(
894 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
895 if (Region->isReplicator())
896 ReplicateRegions.push_back(Elt: Region);
897 }
898
899 assert((ReplicateRegions.empty() || !VF.isScalable()) &&
900 "cannot replicate across scalable VFs");
901
902 // Dissolve replicate regions by replicating their blocks for each lane.
903 // Traversing regions in reverse ensures that the successor of every region
904 // being processed is a basic-block, rather than another region.
905 for (VPRegionBlock *Region : reverse(C&: ReplicateRegions))
906 dissolveReplicateRegion(Region, VF, Plan, IdxTy);
907
908 VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
909}
910
911void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
912 Type *IdxTy = IntegerType::get(
913 C&: Plan.getScalarHeader()->getIRBasicBlock()->getContext(), NumBits: 32);
914
915 if (Plan.hasScalarVFOnly()) {
916 // When Plan is only unrolled by UF, replicating by VF amounts to dissolving
917 // replicate regions.
918 replicateReplicateRegionsByVF(Plan, VF, IdxTy);
919 return;
920 }
921
922 // Visit all VPBBs outside the loop region and directly inside the top-level
923 // loop region.
924 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
925 Range: vp_depth_first_shallow(G: Plan.getEntry()));
926 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
927 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()));
928 auto VPBBsToUnroll =
929 concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion);
930 // A mapping of current VPValue definitions to collections of new VPValues
931 // defined per lane. Serves to hook-up potential users of current VPValue
932 // definition that are replicated-per-VF later.
933 DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
934 // The removal of current recipes being replaced by new ones needs to be
935 // delayed after Def2LaneDefs is no longer in use.
936 SmallVector<VPRecipeBase *> ToRemove;
937 for (VPBasicBlock *VPBB : VPBBsToUnroll) {
938 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
939 if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: &R) ||
940 (isa<VPReplicateRecipe>(Val: &R) &&
941 cast<VPReplicateRecipe>(Val: &R)->isSingleScalar()) ||
942 (isa<VPInstruction>(Val: &R) &&
943 !cast<VPInstruction>(Val: &R)->doesGeneratePerAllLanes() &&
944 cast<VPInstruction>(Val: &R)->getOpcode() != VPInstruction::Unpack))
945 continue;
946
947 auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
948 VPBuilder Builder(DefR);
949 if (DefR->user_empty()) {
950 // Create single-scalar version of DefR for all lanes.
951 for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
952 cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs);
953 DefR->eraseFromParent();
954 continue;
955 }
956 /// Create single-scalar version of DefR for all lanes.
957 SmallVector<VPValue *> LaneDefs;
958 for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
959 LaneDefs.push_back(
960 Elt: cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs));
961
962 Def2LaneDefs[DefR] = LaneDefs;
963 /// Users that only demand the first lane can use the definition for lane
964 /// 0.
965 DefR->replaceUsesWithIf(New: LaneDefs[0], ShouldReplace: [DefR](VPUser &U, unsigned) {
966 if (U.usesFirstLaneOnly(Op: DefR))
967 return true;
968 auto *VPI = dyn_cast<VPInstructionWithType>(Val: &U);
969 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
970 });
971
972 // Update each build vector user that currently has DefR as its only
973 // operand, to have all LaneDefs as its operands.
974 for (VPUser *U : to_vector(Range: DefR->users())) {
975 auto *VPI = dyn_cast<VPInstruction>(Val: U);
976 if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
977 VPI->getOpcode() != VPInstruction::BuildStructVector))
978 continue;
979 assert(VPI->getNumOperands() == 1 &&
980 "Build(Struct)Vector must have a single operand before "
981 "replicating by VF");
982 VPI->setOperand(I: 0, New: LaneDefs[0]);
983 for (VPValue *LaneDef : drop_begin(RangeOrContainer&: LaneDefs))
984 VPI->addOperand(Op: LaneDef);
985 }
986 ToRemove.push_back(Elt: DefR);
987 }
988 }
989 for (auto *R : reverse(C&: ToRemove))
990 R->eraseFromParent();
991
992 replicateReplicateRegionsByVF(Plan, VF, IdxTy);
993}
994