//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//

#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

namespace {

/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  // Associate with each VPValue of part 0 its unrolled instances of parts 1,
  // ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Add a start index operand to \p Steps for \p Part.
  void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
                                   unsigned Part);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform
  /// across all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  VPValue *getConstantInt(unsigned Part) {
    Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
    return Plan.getConstantInt(CanIVIntTy, Part);
  }

public:
  UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}

  void unrollBlock(VPBlockBase *VPB);

  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(V))
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(OrigR->definedValues())) {
      const auto &[V, _] = VPV2Parts.try_emplace(VPV);
      assert(V->second.size() == Part - 1 && "earlier parts not set");
      V->second.push_back(CopyR->getVPValue(Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    const auto &[V, Inserted] = VPV2Parts.try_emplace(R);
    assert(Inserted && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      V->second.push_back(R);
  }

  bool contains(VPValue *VPV) const { return VPV2Parts.contains(VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(OpIdx);
    R->setOperand(OpIdx, getValueForPart(Op, Part));
  }

  /// Update \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(R->operands()))
      R->setOperand(OpIdx, getValueForPart(Op, Part));
  }
};
} // namespace

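// For each part P > 0, the scalar IV steps of part P must start where part
// P-1's lanes end: the added start-index operand is VF for part 1 and VF * P
// for later parts, cast to the step's integer type (and converted via SIToFP
// for floating-point inductions).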
void UnrollState::addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
                                              unsigned Part) {
  if (Part == 0)
    return;

  VPBuilder Builder(Steps);
  Type *BaseIVTy = TypeInfo.inferScalarType(Steps->getOperand(0));
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  VPValue *StartIndex = Steps->getVFValue();
  if (Part > 1) {
    StartIndex = Builder.createOverflowingOp(
        Instruction::Mul,
        {StartIndex,
         Plan.getConstantInt(TypeInfo.inferScalarType(StartIndex), Part)});
  }
  StartIndex = Builder.createScalarSExtOrTrunc(
      StartIndex, IntStepTy, TypeInfo.inferScalarType(StartIndex),
      Steps->getDebugLoc());

  if (BaseIVTy->isFloatingPointTy())
    StartIndex = Builder.createScalarCast(Instruction::SIToFP, StartIndex,
                                          BaseIVTy, Steps->getDebugLoc());

  Steps->addOperand(StartIndex);
}

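// Each of the UF - 1 copies of the replicate region is inserted directly
// before the original region's single successor, so the copies execute
// back-to-back after part 0, with the recipes in each copy remapped to use
// the values of the corresponding part.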
void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
  VPBlockBase *InsertPt = VPR->getSingleSuccessor();
  for (unsigned Part = 1; Part != UF; ++Part) {
    auto *Copy = VPR->clone();
    VPBlockUtils::insertBlockBefore(Copy, InsertPt);

    auto PartI = vp_depth_first_shallow(Copy->getEntry());
    auto Part0 = vp_depth_first_shallow(VPR->getEntry());
    for (const auto &[PartIVPBB, Part0VPBB] :
         zip(VPBlockUtils::blocksOnly<VPBasicBlock>(PartI),
             VPBlockUtils::blocksOnly<VPBasicBlock>(Part0))) {
      for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
        remapOperands(&PartIR, Part);
        if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR))
          addStartIndexForScalarSteps(Steps, Part);

        addRecipeForPart(&Part0R, &PartIR, Part);
      }
    }
  }
}

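// The per-part increment of a widened induction is a single WideIVStep
// (VF * ScalarStep) created in the loop's preheader and reused by every
// part's step.add recipe below.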
void UnrollState::unrollWidenInductionByUF(
    VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  VPBasicBlock *PH = cast<VPBasicBlock>(
      IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(IV);
  auto &ID = IV->getInductionDescriptor();
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
  Type *VectorStepTy =
      IVTy->isPointerTy() ? TypeInfo.inferScalarType(ScalarStep) : IVTy;
  VPInstruction *VectorStep = Builder.createNaryOp(
      VPInstruction::WideIVStep, {&Plan.getVF(), ScalarStep}, VectorStepTy,
      Flags, IV->getDebugLoc());

  ToSkip.insert(VectorStep);

  // Now create recipes to compute the induction steps for parts 1 .. UF-1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
  // the previous part. The header phi recipe will get 2 new operands: the
  // step value for a single part and the last part, used to compute the
  // backedge value during VPWidenInductionRecipe::execute.
  // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
  // %Part.1 = %Part.0 + %VectorStep
  // %Part.2 = %Part.1 + %VectorStep
  // %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(IV->getParent(), InsertPtForPhi);
  unsigned AddOpc;
  VPIRFlags AddFlags;
  if (IVTy->isPointerTy()) {
    AddOpc = VPInstruction::WidePtrAdd;
    AddFlags = GEPNoWrapFlags::none();
  } else if (IVTy->isFloatingPointTy()) {
    AddOpc = ID.getInductionOpcode();
    AddFlags = Flags; // FMF flags
  } else {
    AddOpc = Instruction::Add;
    AddFlags = VPIRFlags::getDefaultFlags(AddOpc);
  }
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(Part) : "step.add";

    VPInstruction *Add =
        Builder.createNaryOp(AddOpc,
                             {
                                 Prev,
                                 VectorStep,
                             },
                             AddFlags, IV->getDebugLoc(), Name);
    ToSkip.insert(Add);
    addRecipeForPart(IV, Add, Part);
    Prev = Add;
  }
  IV->addOperand(VectorStep);
  IV->addOperand(Prev);
}

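// Reduction phis get one clone per part, each tagged with the constant part
// number as an extra operand; ordered reductions and first-order recurrence
// phis keep a single recipe across all parts.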
void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their
  // header phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenInductionRecipe>(R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  auto InsertPt = std::next(R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(*R->getParent(), InsertPt);
    addRecipeForPart(R, Copy, Part);
    if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
      if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(VPI->getOperand(2), m_One())) {
          StartV = VPI->getOperand(1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(0, C->getOperand(1));
          C->insertAfter(VPI);
          StartV = C;
        }
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
      Copy->addOperand(getConstantInt(Part));
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}

/// Handle non-header-phi recipes.
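/// A non-uniform recipe is cloned UF - 1 times, with each clone's operands
/// remapped to the matching part; special cases (first-order recurrence
/// splices, vector pointers, ordered reductions) additionally adjust operands
/// to chain the parts together.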
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
    if (vputils::onlyFirstPartUsed(VPI)) {
      addUniformForAllParts(VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
    if (isa<StoreInst>(RepR->getUnderlyingValue()) &&
        RepR->getOperand(1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(&R, UF - 1);
      return;
    }
    if (match(RepR,
              m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
      addUniformForAllParts(RepR);
      return;
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(VPBB, InsertPt);
    addRecipeForPart(&R, Copy, Part);

    VPValue *Op;
    if (match(&R, m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                      m_VPValue(), m_VPValue(Op)))) {
      Copy->setOperand(0, getValueForPart(Op, Part - 1));
      Copy->setOperand(1, getValueForPart(Op, Part));
      continue;
    }
    if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
      VPBuilder Builder(VPR);
      const DataLayout &DL =
          Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
      Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
      Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
      VPValue *VF = Builder.createScalarZExtOrTrunc(
          &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
      // VFxUF does not wrap, so VF * Part also cannot wrap.
      VPValue *VFxPart = Builder.createOverflowingOp(
          Instruction::Mul, {VF, Plan.getConstantInt(IndexTy, Part)},
          {true, true});
      Copy->setOperand(0, VPR->getOperand(0));
      Copy->addOperand(VFxPart);
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
      if (Phi && Phi->isOrdered()) {
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Red);
        }
        Parts.push_back(Copy->getVPSingleValue());
        Phi->setOperand(1, Copy->getVPSingleValue());
      }
    }
    remapOperands(Copy, Part);

    if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Copy))
      addStartIndexForScalarSteps(ScalarIVSteps, Part);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPWidenCanonicalIVRecipe, VPVectorEndPointerRecipe>(Copy) ||
        match(Copy,
              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
      Copy->addOperand(getConstantInt(Part));

    if (isa<VPVectorEndPointerRecipe>(R))
      Copy->setOperand(0, R.getOperand(0));
  }
}

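// Recipes that combine all parts into a single result (AnyOf, the
// active-lane queries and the Compute*Result instructions) stay uniform;
// instead of being cloned they receive one extra operand per additional part
// below.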
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(VPB);
  if (VPR) {
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
    if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
      continue;

    // Add the VPValues for all parts to AnyOf, First/LastActiveLane and
    // Compute*Result, which combine all parts to compute the final value.
    VPValue *Op1;
    if (match(&R, m_VPInstruction<VPInstruction::AnyOf>(m_VPValue(Op1))) ||
        match(&R, m_FirstActiveLane(m_VPValue(Op1))) ||
        match(&R, m_LastActiveLane(m_VPValue(Op1))) ||
        match(&R, m_ComputeAnyOfResult(m_VPValue(), m_VPValue(),
                                       m_VPValue(Op1))) ||
        match(&R, m_ComputeReductionResult(m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }
    VPValue *Op0;
    if (match(&R, m_ExtractLane(m_VPValue(Op0), m_VPValue(Op1)))) {
      addUniformForAllParts(cast<VPInstruction>(&R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(getValueForPart(Op1, Part));
      continue;
    }

    if (Plan.hasScalarVFOnly()) {
      if (match(&R, m_ExtractLastPart(m_VPValue(Op0))) ||
          match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
        auto *I = cast<VPInstruction>(&R);
        bool IsPenultimatePart =
            I->getOpcode() == VPInstruction::ExtractPenultimateElement;
        unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
        // For scalar VF, directly use the scalar part value.
        I->replaceAllUsesWith(getValueForPart(Op0, PartIdx));
        continue;
      }
    }
    // With a vector VF, both the last lane and the penultimate element are
    // extracted from the last part; remap the operand accordingly.
    if (match(&R, m_ExtractLastLaneOfLastPart(m_VPValue(Op0))) ||
        match(&R, m_ExtractPenultimateElement(m_VPValue(Op0)))) {
      addUniformForAllParts(cast<VPSingleDefRecipe>(&R));
      R.setOperand(0, getValueForPart(Op0, UF - 1));
      continue;
    }

    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(SingleDef)) {
      addUniformForAllParts(SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(&R)) {
      unrollHeaderPHIByUF(H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}

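// For example (schematic VPlan excerpts, not tied to a specific test),
// unrolling by UF = 2 turns
//   WIDEN ir<%l> = load vp<%addr>
//   WIDEN ir<%add> = add ir<%l>, ir<%x>
// into two copies of each widened recipe, one per part, while the canonical
// IV, its increment and the exit branch remain shared across both parts.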
void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  auto Cleanup = make_scope_exit([&Plan]() {
    auto Iter = vp_depth_first_deep(Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(&R);
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getNumOperands() == 1) {
          VPI->replaceAllUsesWith(VPI->getOperand(0));
          VPI->eraseFromParent();
        }
      }
    }
  });
  if (UF == 1)
    return;

  UnrollState Unroller(Plan, UF);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of fixed-order recurrence phis, feeding the spliced
    // value across the backedge, needs to remap to the last part of the
    // spliced value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(&H)) {
      Unroller.remapOperand(&H, 1, UF - 1);
      continue;
    }
    if (Unroller.contains(H.getVPSingleValue())) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(&H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}

/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe or
/// VPInstruction) for lane \p Lane. Use \p Def2LaneDefs to look up scalar
/// definitions for operands of \p DefR.
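/// Operands are taken per-lane where such definitions already exist, from a
/// BuildVector operand list, or via a newly created extractelement of the
/// requested lane; single-scalar operands are reused directly.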
static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
             VPSingleDefRecipe *DefR, VPLane Lane,
             const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
  VPValue *Op;
  if (match(DefR, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)))) {
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end())
      return LaneDefs->second[Lane.getKnownLane()];

    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
    return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
  }

  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : DefR->operands()) {
    // If Op is a definition that has been unrolled, directly use the clone for
    // the corresponding lane.
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end()) {
      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
      continue;
    }
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      // Look through mandatory Unpack.
      [[maybe_unused]] bool Matched =
          match(Op, m_VPInstruction<VPInstruction::Unpack>(m_VPValue(Op)));
      assert(Matched && "original op must have been Unpack");
      auto *ExtractPart =
          Builder.createNaryOp(VPInstruction::ExtractLastPart, {Op});
      NewOps.push_back(
          Builder.createNaryOp(VPInstruction::ExtractLastLane, {ExtractPart}));
      continue;
    }
    if (vputils::isSingleScalar(Op)) {
      NewOps.push_back(Op);
      continue;
    }

    // Look through buildvector to avoid unnecessary extracts.
    if (match(Op, m_BuildVector())) {
      NewOps.push_back(
          cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
    VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
    NewOps.push_back(Ext);
  }

  VPSingleDefRecipe *New;
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(DefR)) {
    // TODO: have cloning of replicate recipes also provide the desired result
    // coupled with setting its operands to NewOps (deriving IsSingleScalar and
    // Mask from the operands?)
    New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                                /*IsSingleScalar=*/true, /*Mask=*/nullptr,
                                *RepR, *RepR, RepR->getDebugLoc());
  } else {
    assert(isa<VPInstruction>(DefR) &&
           "DefR must be a VPReplicateRecipe or VPInstruction");
    New = DefR->clone();
    for (const auto &[Idx, Op] : enumerate(NewOps))
      New->setOperand(Idx, Op);
  }
  New->insertBefore(DefR);
  return New;
}

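// Replicating by VF turns each recipe that produces a value per lane into VF
// single-scalar clones, e.g. a replicated call becomes VF scalar calls whose
// results feed the BuildVector users that previously consumed the original
// definition.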
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  Type *IdxTy = IntegerType::get(
      Plan.getScalarHeader()->getIRBasicBlock()->getContext(), 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
  // A mapping of current VPValue definitions to collections of new VPValues
  // defined per lane. Serves to hook up potential users of current VPValue
  // definitions that are replicated per VF later.
  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
  // The removal of current recipes being replaced by new ones needs to be
  // delayed until Def2LaneDefs is no longer in use.
  SmallVector<VPRecipeBase *> ToRemove;
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
      if (!isa<VPInstruction, VPReplicateRecipe>(&R) ||
          (isa<VPReplicateRecipe>(&R) &&
           cast<VPReplicateRecipe>(&R)->isSingleScalar()) ||
          (isa<VPInstruction>(&R) &&
           !cast<VPInstruction>(&R)->doesGeneratePerAllLanes() &&
           cast<VPInstruction>(&R)->getOpcode() != VPInstruction::Unpack))
        continue;

      auto *DefR = cast<VPSingleDefRecipe>(&R);
      VPBuilder Builder(DefR);
      if (DefR->getNumUsers() == 0) {
        // Create single-scalar versions of DefR for all lanes.
        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
          cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs);
        DefR->eraseFromParent();
        continue;
      }
      // Create single-scalar versions of DefR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(
            cloneForLane(Plan, Builder, IdxTy, DefR, VPLane(I), Def2LaneDefs));

      Def2LaneDefs[DefR] = LaneDefs;
      // Users that only demand the first lane can use the definition for lane
      // 0.
      DefR->replaceUsesWithIf(LaneDefs[0], [DefR](VPUser &U, unsigned) {
        return U.usesFirstLaneOnly(DefR);
      });

      // Update each build vector user that currently has DefR as its only
      // operand, to have all LaneDefs as its operands.
      for (VPUser *U : to_vector(DefR->users())) {
        auto *VPI = dyn_cast<VPInstruction>(U);
        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
                     VPI->getOpcode() != VPInstruction::BuildStructVector))
          continue;
        assert(VPI->getNumOperands() == 1 &&
               "Build(Struct)Vector must have a single operand before "
               "replicating by VF");
        VPI->setOperand(0, LaneDefs[0]);
        for (VPValue *LaneDef : drop_begin(LaneDefs))
          VPI->addOperand(LaneDef);
      }
      ToRemove.push_back(DefR);
    }
  }
  for (auto *R : reverse(ToRemove))
    R->eraseFromParent();
}