1//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements explicit unrolling for VPlans.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPRecipeBuilder.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanCFG.h"
18#include "VPlanHelpers.h"
19#include "VPlanPatternMatch.h"
20#include "VPlanTransforms.h"
21#include "VPlanUtils.h"
22#include "llvm/ADT/PostOrderIterator.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/Analysis/IVDescriptors.h"
26#include "llvm/IR/Intrinsics.h"
27
28using namespace llvm;
29using namespace llvm::VPlanPatternMatch;
30
31namespace {
32
33/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
34/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
35/// the unrolling transformation, where the original VPValues are retained for
36/// part zero.
37class UnrollState {
38 /// Plan to unroll.
39 VPlan &Plan;
40 /// Unroll factor to unroll by.
41 const unsigned UF;
42 /// Analysis for types.
43 VPTypeAnalysis TypeInfo;
44
45 /// Unrolling may create recipes that should not be unrolled themselves.
46 /// Those are tracked in ToSkip.
47 SmallPtrSet<VPRecipeBase *, 8> ToSkip;
48
49 // Associate with each VPValue of part 0 its unrolled instances of parts 1,
50 // ..., UF-1.
51 DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
52
53 /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
54 void unrollReplicateRegionByUF(VPRegionBlock *VPR);
55
56 /// Add a start index operand to \p Steps for \p Part.
57 void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps, unsigned Part);
58
59 /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
60 /// all parts.
61 void unrollRecipeByUF(VPRecipeBase &R);
62
63 /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
64 /// depends on the concrete header phi. Inserts newly created recipes at \p
65 /// InsertPtForPhi.
66 void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
67 VPBasicBlock::iterator InsertPtForPhi);
68
69 /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
70 /// the induction steps for each part.
71 void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
72 VPBasicBlock::iterator InsertPtForPhi);
73
74 VPValue *getConstantInt(unsigned Part) {
75 Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
76 return Plan.getConstantInt(Ty: CanIVIntTy, Val: Part);
77 }
78
79public:
80 UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}
81
82 void unrollBlock(VPBlockBase *VPB);
83
84 VPValue *getValueForPart(VPValue *V, unsigned Part) {
85 if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(Val: V))
86 return V;
87 assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
88 "accessed value does not exist");
89 return VPV2Parts[V][Part - 1];
90 }
91
92 /// Given a single original recipe \p OrigR (of part zero), and its copy \p
93 /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
94 /// corresponding VPValue defined by \p CopyR.
95 void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
96 unsigned Part) {
97 for (const auto &[Idx, VPV] : enumerate(First: OrigR->definedValues())) {
98 const auto &[V, _] = VPV2Parts.try_emplace(Key: VPV);
99 assert(V->second.size() == Part - 1 && "earlier parts not set");
100 V->second.push_back(Elt: CopyR->getVPValue(I: Idx));
101 }
102 }
103
104 /// Given a uniform recipe \p R, add it for all parts.
105 void addUniformForAllParts(VPSingleDefRecipe *R) {
106 const auto &[V, Inserted] = VPV2Parts.try_emplace(Key: R);
107 assert(Inserted && "uniform value already added");
108 for (unsigned Part = 0; Part != UF; ++Part)
109 V->second.push_back(Elt: R);
110 }
111
112 bool contains(VPValue *VPV) const { return VPV2Parts.contains(Val: VPV); }
113
114 /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
115 /// \p P.
116 void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
117 auto *Op = R->getOperand(N: OpIdx);
118 R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
119 }
120
121 /// Update \p R's operands with their corresponding VPValues for part \p P.
122 void remapOperands(VPRecipeBase *R, unsigned Part) {
123 for (const auto &[OpIdx, Op] : enumerate(First: R->operands()))
124 R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
125 }
126};
127} // namespace
128
129void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
130 unsigned Part) {
131 if (Part == 0)
132 return;
133
134 VPBuilder Builder(Steps);
135 Type *BaseIVTy = TypeInfo.inferScalarType(V: Steps->getOperand(N: 0));
136 Type *IntStepTy =
137 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
138 VPValue *StartIndex = Steps->getVFValue();
139 if (Part > 1) {
140 StartIndex = Builder.createOverflowingOp(
141 Opcode: Instruction::Mul,
142 Operands: {StartIndex,
143 Plan.getConstantInt(Ty: TypeInfo.inferScalarType(V: StartIndex), Val: Part)});
144 }
145 StartIndex = Builder.createScalarSExtOrTrunc(
146 Op: StartIndex, ResultTy: IntStepTy, SrcTy: TypeInfo.inferScalarType(V: StartIndex),
147 DL: Steps->getDebugLoc());
148
149 if (BaseIVTy->isFloatingPointTy())
150 StartIndex = Builder.createScalarCast(Opcode: Instruction::SIToFP, Op: StartIndex,
151 ResultTy: BaseIVTy, DL: Steps->getDebugLoc());
152
153 Steps->addOperand(Operand: StartIndex);
154}
155
156void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
157 VPBlockBase *InsertPt = VPR->getSingleSuccessor();
158 for (unsigned Part = 1; Part != UF; ++Part) {
159 auto *Copy = VPR->clone();
160 VPBlockUtils::insertBlockBefore(NewBlock: Copy, BlockPtr: InsertPt);
161
162 auto PartI = vp_depth_first_shallow(G: Copy->getEntry());
163 auto Part0 = vp_depth_first_shallow(G: VPR->getEntry());
164 for (const auto &[PartIVPBB, Part0VPBB] :
165 zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: PartI),
166 u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Part0))) {
167 for (const auto &[PartIR, Part0R] : zip(t&: *PartIVPBB, u&: *Part0VPBB)) {
168 remapOperands(R: &PartIR, Part);
169 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &PartIR))
170 addStartIndexForScalarSteps(Steps, Part);
171
172 addRecipeForPart(OrigR: &Part0R, CopyR: &PartIR, Part);
173 }
174 }
175 }
176}
177
178void UnrollState::unrollWidenInductionByUF(
179 VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
180 VPBasicBlock *PH = cast<VPBasicBlock>(
181 Val: IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
182 Type *IVTy = TypeInfo.inferScalarType(V: IV);
183 auto &ID = IV->getInductionDescriptor();
184 VPIRFlags Flags;
185 if (isa_and_present<FPMathOperator>(Val: ID.getInductionBinOp()))
186 Flags = ID.getInductionBinOp()->getFastMathFlags();
187
188 VPValue *ScalarStep = IV->getStepValue();
189 VPBuilder Builder(PH);
190 Type *VectorStepTy =
191 IVTy->isPointerTy() ? TypeInfo.inferScalarType(V: ScalarStep) : IVTy;
192 VPInstruction *VectorStep = Builder.createNaryOp(
193 Opcode: VPInstruction::WideIVStep, Operands: {&Plan.getVF(), ScalarStep}, ResultTy: VectorStepTy,
194 Flags, DL: IV->getDebugLoc());
195
196 ToSkip.insert(Ptr: VectorStep);
197
198 // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
199 // remains the header phi. Parts > 0 are computed by adding Step to the
200 // previous part. The header phi recipe will get 2 new operands: the step
201 // value for a single part and the last part, used to compute the backedge
202 // value during VPWidenInductionRecipe::execute.
203 // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
204 // %Part.1 = %Part.0 + %VectorStep
205 // %Part.2 = %Part.1 + %VectorStep
206 // %Part.3 = %Part.2 + %VectorStep
207 //
208 // The newly added recipes are added to ToSkip to avoid interleaving them
209 // again.
210 VPValue *Prev = IV;
211 Builder.setInsertPoint(TheBB: IV->getParent(), IP: InsertPtForPhi);
212 unsigned AddOpc;
213 VPIRFlags AddFlags;
214 if (IVTy->isPointerTy()) {
215 AddOpc = VPInstruction::WidePtrAdd;
216 AddFlags = GEPNoWrapFlags::none();
217 } else if (IVTy->isFloatingPointTy()) {
218 AddOpc = ID.getInductionOpcode();
219 AddFlags = Flags; // FMF flags
220 } else {
221 AddOpc = Instruction::Add;
222 AddFlags = VPIRFlags::getDefaultFlags(Opcode: AddOpc);
223 }
224 for (unsigned Part = 1; Part != UF; ++Part) {
225 std::string Name =
226 Part > 1 ? "step.add." + std::to_string(val: Part) : "step.add";
227
228 VPInstruction *Add =
229 Builder.createNaryOp(Opcode: AddOpc,
230 Operands: {
231 Prev,
232 VectorStep,
233 },
234 Flags: AddFlags, DL: IV->getDebugLoc(), Name);
235 ToSkip.insert(Ptr: Add);
236 addRecipeForPart(OrigR: IV, CopyR: Add, Part);
237 Prev = Add;
238 }
239 IV->addOperand(Operand: VectorStep);
240 IV->addOperand(Operand: Prev);
241}
242
243void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
244 VPBasicBlock::iterator InsertPtForPhi) {
245 // First-order recurrences pass a single vector or scalar through their header
246 // phis, irrespective of interleaving.
247 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: R))
248 return;
249
250 // Generate step vectors for each unrolled part.
251 if (auto *IV = dyn_cast<VPWidenInductionRecipe>(Val: R)) {
252 unrollWidenInductionByUF(IV, InsertPtForPhi);
253 return;
254 }
255
256 auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: R);
257 if (RdxPhi && RdxPhi->isOrdered())
258 return;
259
260 auto InsertPt = std::next(x: R->getIterator());
261 for (unsigned Part = 1; Part != UF; ++Part) {
262 VPRecipeBase *Copy = R->clone();
263 Copy->insertBefore(BB&: *R->getParent(), IP: InsertPt);
264 addRecipeForPart(OrigR: R, CopyR: Copy, Part);
265 if (RdxPhi) {
266 // If the start value is a ReductionStartVector, use the identity value
267 // (second operand) for unrolled parts. If the scaling factor is > 1,
268 // create a new ReductionStartVector with the scale factor and both
269 // operands set to the identity value.
270 if (auto *VPI = dyn_cast<VPInstruction>(Val: RdxPhi->getStartValue())) {
271 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
272 "unexpected start VPInstruction");
273 if (Part != 1)
274 continue;
275 VPValue *StartV;
276 if (match(V: VPI->getOperand(N: 2), P: m_One())) {
277 StartV = VPI->getOperand(N: 1);
278 } else {
279 auto *C = VPI->clone();
280 C->setOperand(I: 0, New: C->getOperand(N: 1));
281 C->insertAfter(InsertPos: VPI);
282 StartV = C;
283 }
284 for (unsigned Part = 1; Part != UF; ++Part)
285 VPV2Parts[VPI][Part - 1] = StartV;
286 }
287 } else {
288 assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
289 "unexpected header phi recipe not needing unrolled part");
290 }
291 }
292}
293
294/// Handle non-header-phi recipes.
295void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
296 if (match(V: &R, P: m_CombineOr(L: m_BranchOnCond(), R: m_BranchOnCount())))
297 return;
298
299 if (auto *VPI = dyn_cast<VPInstruction>(Val: &R)) {
300 if (vputils::onlyFirstPartUsed(Def: VPI)) {
301 addUniformForAllParts(R: VPI);
302 return;
303 }
304 }
305 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
306 if (isa<StoreInst>(Val: RepR->getUnderlyingValue()) &&
307 RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions()) {
308 // Stores to an invariant address only need to store the last part.
309 remapOperands(R: &R, Part: UF - 1);
310 return;
311 }
312 if (match(V: RepR,
313 P: m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
314 addUniformForAllParts(R: RepR);
315 return;
316 }
317 }
318
319 // Unroll non-uniform recipes.
320 auto InsertPt = std::next(x: R.getIterator());
321 VPBasicBlock &VPBB = *R.getParent();
322 for (unsigned Part = 1; Part != UF; ++Part) {
323 VPRecipeBase *Copy = R.clone();
324 Copy->insertBefore(BB&: VPBB, IP: InsertPt);
325 addRecipeForPart(OrigR: &R, CopyR: Copy, Part);
326
327 VPValue *Op;
328 if (match(V: &R, P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
329 Ops: m_VPValue(), Ops: m_VPValue(V&: Op)))) {
330 Copy->setOperand(I: 0, New: getValueForPart(V: Op, Part: Part - 1));
331 Copy->setOperand(I: 1, New: getValueForPart(V: Op, Part));
332 continue;
333 }
334 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: &R)) {
335 VPBuilder Builder(VPR);
336 const DataLayout &DL =
337 Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
338 Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: VPR));
339 Type *VFTy = TypeInfo.inferScalarType(V: &Plan.getVF());
340 VPValue *VF = Builder.createScalarZExtOrTrunc(
341 Op: &Plan.getVF(), ResultTy: IndexTy, SrcTy: VFTy, DL: DebugLoc::getUnknown());
342 // VFxUF does not wrap, so VF * Part also cannot wrap.
343 VPValue *VFxPart = Builder.createOverflowingOp(
344 Opcode: Instruction::Mul, Operands: {VF, Plan.getConstantInt(Ty: IndexTy, Val: Part)},
345 WrapFlags: {true, true});
346 Copy->setOperand(I: 0, New: VPR->getOperand(N: 0));
347 Copy->addOperand(Operand: VFxPart);
348 continue;
349 }
350 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) {
351 auto *Phi = dyn_cast<VPReductionPHIRecipe>(Val: R.getOperand(N: 0));
352 if (Phi && Phi->isOrdered()) {
353 auto &Parts = VPV2Parts[Phi];
354 if (Part == 1) {
355 Parts.clear();
356 Parts.push_back(Elt: Red);
357 }
358 Parts.push_back(Elt: Copy->getVPSingleValue());
359 Phi->setOperand(I: 1, New: Copy->getVPSingleValue());
360 }
361 }
362 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: Copy)) {
363 // Materialize PartN offset for VectorEndPointer.
364 VEPR->setOperand(I: 0, New: R.getOperand(N: 0));
365 VEPR->setOperand(I: 1, New: R.getOperand(N: 1));
366 VEPR->materializeOffset(Part);
367 continue;
368 }
369
370 remapOperands(R: Copy, Part);
371
372 if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: Copy))
373 addStartIndexForScalarSteps(Steps: ScalarIVSteps, Part);
374
375 // Add operand indicating the part to generate code for, to recipes still
376 // requiring it.
377 if (isa<VPWidenCanonicalIVRecipe>(Val: Copy))
378 Copy->addOperand(Operand: getConstantInt(Part));
379
380 if (match(V: Copy,
381 P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
382 VPBuilder Builder(Copy);
383 VPValue *ScaledByPart = Builder.createOverflowingOp(
384 Opcode: Instruction::Mul, Operands: {Copy->getOperand(N: 1), getConstantInt(Part)});
385 Copy->setOperand(I: 1, New: ScaledByPart);
386 }
387 }
388 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
389 // Materialize Part0 offset for VectorEndPointer.
390 VEPR->materializeOffset();
391 }
392}
393
394void UnrollState::unrollBlock(VPBlockBase *VPB) {
395 auto *VPR = dyn_cast<VPRegionBlock>(Val: VPB);
396 if (VPR) {
397 if (VPR->isReplicator())
398 return unrollReplicateRegionByUF(VPR);
399
400 // Traverse blocks in region in RPO to ensure defs are visited before uses
401 // across blocks.
402 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
403 RPOT(VPR->getEntry());
404 for (VPBlockBase *VPB : RPOT)
405 unrollBlock(VPB);
406 return;
407 }
408
409 // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
410 auto *VPBB = cast<VPBasicBlock>(Val: VPB);
411 auto InsertPtForPhi = VPBB->getFirstNonPhi();
412 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
413 if (ToSkip.contains(Ptr: &R) || isa<VPIRInstruction>(Val: &R))
414 continue;
415
416 // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
417 // Compute*Result which combine all parts to compute the final value.
418 VPValue *Op1;
419 if (match(V: &R, P: m_VPInstruction<VPInstruction::AnyOf>(Ops: m_VPValue(V&: Op1))) ||
420 match(V: &R, P: m_FirstActiveLane(Op0: m_VPValue(V&: Op1))) ||
421 match(V: &R, P: m_LastActiveLane(Op0: m_VPValue(V&: Op1))) ||
422 match(V: &R,
423 P: m_ComputeAnyOfResult(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue(V&: Op1))) ||
424 match(V: &R, P: m_ComputeReductionResult(Op0: m_VPValue(V&: Op1)))) {
425 addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
426 for (unsigned Part = 1; Part != UF; ++Part)
427 R.addOperand(Operand: getValueForPart(V: Op1, Part));
428 continue;
429 }
430 VPValue *Op0;
431 if (match(V: &R, P: m_ExtractLane(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)))) {
432 addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
433 for (unsigned Part = 1; Part != UF; ++Part)
434 R.addOperand(Operand: getValueForPart(V: Op1, Part));
435 continue;
436 }
437
438 if (Plan.hasScalarVFOnly()) {
439 if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V&: Op0))) ||
440 match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
441 auto *I = cast<VPInstruction>(Val: &R);
442 bool IsPenultimatePart =
443 I->getOpcode() == VPInstruction::ExtractPenultimateElement;
444 unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
445 // For scalar VF, directly use the scalar part value.
446 I->replaceAllUsesWith(New: getValueForPart(V: Op0, Part: PartIdx));
447 continue;
448 }
449 }
450 // For vector VF, the penultimate element is always extracted from the last part.
451 if (match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op0))) ||
452 match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
453 addUniformForAllParts(R: cast<VPSingleDefRecipe>(Val: &R));
454 R.setOperand(I: 0, New: getValueForPart(V: Op0, Part: UF - 1));
455 continue;
456 }
457
458 auto *SingleDef = dyn_cast<VPSingleDefRecipe>(Val: &R);
459 if (SingleDef && vputils::isUniformAcrossVFsAndUFs(V: SingleDef)) {
460 addUniformForAllParts(R: SingleDef);
461 continue;
462 }
463
464 if (auto *H = dyn_cast<VPHeaderPHIRecipe>(Val: &R)) {
465 unrollHeaderPHIByUF(R: H, InsertPtForPhi);
466 continue;
467 }
468
469 unrollRecipeByUF(R);
470 }
471}
472
473void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
474 assert(UF > 0 && "Unroll factor must be positive");
475 Plan.setUF(UF);
476 llvm::scope_exit Cleanup([&Plan]() {
477 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
478 // Remove recipes that are redundant after unrolling.
479 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
480 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
481 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
482 if (VPI &&
483 VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
484 VPI->getOperand(N: 1) == &Plan.getVF()) {
485 VPI->replaceAllUsesWith(New: VPI->getOperand(N: 0));
486 VPI->eraseFromParent();
487 }
488 }
489 }
490 });
491 if (UF == 1) {
492 return;
493 }
494
495 UnrollState Unroller(Plan, UF);
496
497 // Iterate over all blocks in the plan starting from Entry, and unroll
498 // recipes inside them. This includes the vector preheader and middle blocks,
499 // which may set up or post-process per-part values.
500 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
501 Plan.getEntry());
502 for (VPBlockBase *VPB : RPOT)
503 Unroller.unrollBlock(VPB);
504
505 unsigned Part = 1;
506 // Remap operands of cloned header phis to update backedge values. The header
507 // phis cloned during unrolling are just after the header phi for part 0.
508 // Reset Part to 1 when reaching the first (part 0) recipe of a block.
509 for (VPRecipeBase &H :
510 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
511 // The second operand of Fixed Order Recurrence phi's, feeding the spliced
512 // value across the backedge, needs to remap to the last part of the spliced
513 // value.
514 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &H)) {
515 Unroller.remapOperand(R: &H, OpIdx: 1, Part: UF - 1);
516 continue;
517 }
518 if (Unroller.contains(VPV: H.getVPSingleValue())) {
519 Part = 1;
520 continue;
521 }
522 Unroller.remapOperands(R: &H, Part);
523 Part++;
524 }
525
526 VPlanTransforms::removeDeadRecipes(Plan);
527}
528
529/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
530/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
531/// definitions for operands of \DefR.
532static VPValue *
533cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
534 VPSingleDefRecipe *DefR, VPLane Lane,
535 const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
536 VPValue *Op;
537 if (match(R: DefR, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)))) {
538 auto LaneDefs = Def2LaneDefs.find(Val: Op);
539 if (LaneDefs != Def2LaneDefs.end())
540 return LaneDefs->second[Lane.getKnownLane()];
541
542 VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
543 return Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
544 }
545
546 // Collect the operands at Lane, creating extracts as needed.
547 SmallVector<VPValue *> NewOps;
548 for (VPValue *Op : DefR->operands()) {
549 // If Op is a definition that has been unrolled, directly use the clone for
550 // the corresponding lane.
551 auto LaneDefs = Def2LaneDefs.find(Val: Op);
552 if (LaneDefs != Def2LaneDefs.end()) {
553 NewOps.push_back(Elt: LaneDefs->second[Lane.getKnownLane()]);
554 continue;
555 }
556 if (Lane.getKind() == VPLane::Kind::ScalableLast) {
557 // Look through mandatory Unpack.
558 [[maybe_unused]] bool Matched =
559 match(V: Op, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)));
560 assert(Matched && "original op must have been Unpack");
561 auto *ExtractPart =
562 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: {Op});
563 NewOps.push_back(
564 Elt: Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: {ExtractPart}));
565 continue;
566 }
567 if (vputils::isSingleScalar(VPV: Op)) {
568 NewOps.push_back(Elt: Op);
569 continue;
570 }
571
572 // Look through buildvector to avoid unnecessary extracts.
573 if (match(V: Op, P: m_BuildVector())) {
574 NewOps.push_back(
575 Elt: cast<VPInstruction>(Val: Op)->getOperand(N: Lane.getKnownLane()));
576 continue;
577 }
578 VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
579 VPValue *Ext = Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
580 NewOps.push_back(Elt: Ext);
581 }
582
583 VPSingleDefRecipe *New;
584 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: DefR)) {
585 // TODO: have cloning of replicate recipes also provide the desired result
586 // coupled with setting its operands to NewOps (deriving IsSingleScalar and
587 // Mask from the operands?)
588 New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
589 /*IsSingleScalar=*/true, /*Mask=*/nullptr,
590 *RepR, *RepR, RepR->getDebugLoc());
591 } else {
592 assert(isa<VPInstruction>(DefR) &&
593 "DefR must be a VPReplicateRecipe or VPInstruction");
594 New = DefR->clone();
595 for (const auto &[Idx, Op] : enumerate(First&: NewOps)) {
596 New->setOperand(I: Idx, New: Op);
597 }
598 }
599 New->insertBefore(InsertPos: DefR);
600 return New;
601}
602
603void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
604 if (Plan.hasScalarVFOnly())
605 return;
606
607 Type *IdxTy = IntegerType::get(
608 C&: Plan.getScalarHeader()->getIRBasicBlock()->getContext(), NumBits: 32);
609
610 // Visit all VPBBs outside the loop region and directly inside the top-level
611 // loop region.
612 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
613 Range: vp_depth_first_shallow(G: Plan.getEntry()));
614 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
615 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()));
616 auto VPBBsToUnroll =
617 concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion);
618 // A mapping of current VPValue definitions to collections of new VPValues
619 // defined per lane. Serves to hook-up potential users of current VPValue
620 // definition that are replicated-per-VF later.
621 DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
622 // The removal of current recipes being replaced by new ones needs to be
623 // delayed after Def2LaneDefs is no longer in use.
624 SmallVector<VPRecipeBase *> ToRemove;
625 for (VPBasicBlock *VPBB : VPBBsToUnroll) {
626 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
627 if (!isa<VPInstruction, VPReplicateRecipe>(Val: &R) ||
628 (isa<VPReplicateRecipe>(Val: &R) &&
629 cast<VPReplicateRecipe>(Val: &R)->isSingleScalar()) ||
630 (isa<VPInstruction>(Val: &R) &&
631 !cast<VPInstruction>(Val: &R)->doesGeneratePerAllLanes() &&
632 cast<VPInstruction>(Val: &R)->getOpcode() != VPInstruction::Unpack))
633 continue;
634
635 auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
636 VPBuilder Builder(DefR);
637 if (DefR->getNumUsers() == 0) {
638 // Create single-scalar version of DefR for all lanes.
639 for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
640 cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs);
641 DefR->eraseFromParent();
642 continue;
643 }
644 /// Create single-scalar version of DefR for all lanes.
645 SmallVector<VPValue *> LaneDefs;
646 for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
647 LaneDefs.push_back(
648 Elt: cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs));
649
650 Def2LaneDefs[DefR] = LaneDefs;
651 /// Users that only demand the first lane can use the definition for lane
652 /// 0.
653 DefR->replaceUsesWithIf(New: LaneDefs[0], ShouldReplace: [DefR](VPUser &U, unsigned) {
654 return U.usesFirstLaneOnly(Op: DefR);
655 });
656
657 // Update each build vector user that currently has DefR as its only
658 // operand, to have all LaneDefs as its operands.
659 for (VPUser *U : to_vector(Range: DefR->users())) {
660 auto *VPI = dyn_cast<VPInstruction>(Val: U);
661 if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
662 VPI->getOpcode() != VPInstruction::BuildStructVector))
663 continue;
664 assert(VPI->getNumOperands() == 1 &&
665 "Build(Struct)Vector must have a single operand before "
666 "replicating by VF");
667 VPI->setOperand(I: 0, New: LaneDefs[0]);
668 for (VPValue *LaneDef : drop_begin(RangeOrContainer&: LaneDefs))
669 VPI->addOperand(Operand: LaneDef);
670 }
671 ToRemove.push_back(Elt: DefR);
672 }
673 }
674 for (auto *R : reverse(C&: ToRemove))
675 R->eraseFromParent();
676}
677