1//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements explicit unrolling for VPlans.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPRecipeBuilder.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanCFG.h"
18#include "VPlanHelpers.h"
19#include "VPlanPatternMatch.h"
20#include "VPlanTransforms.h"
21#include "VPlanUtils.h"
22#include "llvm/ADT/PostOrderIterator.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/Analysis/IVDescriptors.h"
26#include "llvm/IR/Intrinsics.h"
27
28using namespace llvm;
29using namespace llvm::VPlanPatternMatch;
30
31namespace {
32
/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
/// the unrolling transformation, where the original VPValues are retained for
/// part zero.
class UnrollState {
  /// Plan to unroll.
  VPlan &Plan;
  /// Unroll factor to unroll by.
  const unsigned UF;
  /// Analysis for types.
  VPTypeAnalysis TypeInfo;

  /// Unrolling may create recipes that should not be unrolled themselves.
  /// Those are tracked in ToSkip.
  SmallPtrSet<VPRecipeBase *, 8> ToSkip;

  /// Associate with each VPValue of part 0 its unrolled instances of parts 1,
  /// ..., UF-1.
  DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;

  /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
  void unrollReplicateRegionByUF(VPRegionBlock *VPR);

  /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
  /// all parts.
  void unrollRecipeByUF(VPRecipeBase &R);

  /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
  /// depends on the concrete header phi. Inserts newly created recipes at \p
  /// InsertPtForPhi.
  void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                           VPBasicBlock::iterator InsertPtForPhi);

  /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
  /// the induction steps for each part.
  void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
                                VPBasicBlock::iterator InsertPtForPhi);

  /// Return \p Part as a constant of the canonical IV's integer type.
  VPValue *getConstantInt(unsigned Part) {
    Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
    return Plan.getConstantInt(Ty: CanIVIntTy, Val: Part);
  }

public:
  UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}

  /// Unroll all recipes in \p VPB; for region blocks, recurses into the
  /// contained blocks.
  void unrollBlock(VPBlockBase *VPB);

  /// Return the VPValue to use for \p V in unroll part \p Part. Part 0, IR
  /// values and symbolic values are shared by all parts and returned as-is.
  VPValue *getValueForPart(VPValue *V, unsigned Part) {
    if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(Val: V))
      return V;
    assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
           "accessed value does not exist");
    return VPV2Parts[V][Part - 1];
  }

  /// Given a single original recipe \p OrigR (of part zero), and its copy \p
  /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
  /// corresponding VPValue defined by \p CopyR.
  void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
                        unsigned Part) {
    for (const auto &[Idx, VPV] : enumerate(First: OrigR->definedValues())) {
      const auto &[V, _] = VPV2Parts.try_emplace(Key: VPV);
      // Parts must be registered in increasing order, each appending one
      // entry to the per-value list.
      assert(V->second.size() == Part - 1 && "earlier parts not set");
      V->second.push_back(Elt: CopyR->getVPValue(I: Idx));
    }
  }

  /// Given a uniform recipe \p R, add it for all parts.
  void addUniformForAllParts(VPSingleDefRecipe *R) {
    const auto &[V, Inserted] = VPV2Parts.try_emplace(Key: R);
    assert(Inserted && "uniform value already added");
    for (unsigned Part = 0; Part != UF; ++Part)
      V->second.push_back(Elt: R);
  }

  /// Return true if per-part values have been recorded for \p VPV.
  bool contains(VPValue *VPV) const { return VPV2Parts.contains(Val: VPV); }

  /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
  /// \p Part.
  void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
    auto *Op = R->getOperand(N: OpIdx);
    R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
  }

  /// Update all of \p R's operands with their corresponding VPValues for part
  /// \p Part.
  void remapOperands(VPRecipeBase *R, unsigned Part) {
    for (const auto &[OpIdx, Op] : enumerate(First: R->operands()))
      R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
  }
};
124} // namespace
125
126static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
127 unsigned Part, VPlan &Plan,
128 VPTypeAnalysis &TypeInfo) {
129 if (Part == 0)
130 return;
131
132 VPBuilder Builder(Steps);
133 Type *BaseIVTy = TypeInfo.inferScalarType(V: Steps->getOperand(N: 0));
134 Type *IntStepTy =
135 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
136 VPValue *StartIndex = Steps->getVFValue();
137 if (Part > 1) {
138 StartIndex = Builder.createOverflowingOp(
139 Opcode: Instruction::Mul,
140 Operands: {StartIndex,
141 Plan.getConstantInt(Ty: TypeInfo.inferScalarType(V: StartIndex), Val: Part)});
142 }
143 StartIndex = Builder.createScalarSExtOrTrunc(
144 Op: StartIndex, ResultTy: IntStepTy, SrcTy: TypeInfo.inferScalarType(V: StartIndex),
145 DL: Steps->getDebugLoc());
146
147 if (BaseIVTy->isFloatingPointTy())
148 StartIndex = Builder.createScalarCast(Opcode: Instruction::SIToFP, Op: StartIndex,
149 ResultTy: BaseIVTy, DL: Steps->getDebugLoc());
150
151 Steps->setStartIndex(StartIndex);
152}
153
154void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
155 VPBlockBase *InsertPt = VPR->getSingleSuccessor();
156 for (unsigned Part = 1; Part != UF; ++Part) {
157 auto *Copy = VPR->clone();
158 VPBlockUtils::insertBlockBefore(NewBlock: Copy, BlockPtr: InsertPt);
159
160 auto PartI = vp_depth_first_shallow(G: Copy->getEntry());
161 auto Part0 = vp_depth_first_shallow(G: VPR->getEntry());
162 for (const auto &[PartIVPBB, Part0VPBB] :
163 zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: PartI),
164 u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Part0))) {
165 for (const auto &[PartIR, Part0R] : zip(t&: *PartIVPBB, u&: *Part0VPBB)) {
166 remapOperands(R: &PartIR, Part);
167 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &PartIR))
168 addStartIndexForScalarSteps(Steps, Part, Plan, TypeInfo);
169
170 addRecipeForPart(OrigR: &Part0R, CopyR: &PartIR, Part);
171 }
172 }
173 }
174}
175
void UnrollState::unrollWidenInductionByUF(
    VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
  // The per-part step is computed once, in the preheader of the enclosing
  // loop region.
  VPBasicBlock *PH = cast<VPBasicBlock>(
      Val: IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
  Type *IVTy = TypeInfo.inferScalarType(V: IV);
  auto &ID = IV->getInductionDescriptor();
  // Carry over fast-math flags from the underlying induction binop, if any.
  VPIRFlags Flags;
  if (isa_and_present<FPMathOperator>(Val: ID.getInductionBinOp()))
    Flags = ID.getInductionBinOp()->getFastMathFlags();

  VPValue *ScalarStep = IV->getStepValue();
  VPBuilder Builder(PH);
  // For pointer inductions the wide step is computed in the scalar step's
  // type; otherwise it matches the induction's type.
  Type *VectorStepTy =
      IVTy->isPointerTy() ? TypeInfo.inferScalarType(V: ScalarStep) : IVTy;
  VPInstruction *VectorStep = Builder.createNaryOp(
      Opcode: VPInstruction::WideIVStep, Operands: {&Plan.getVF(), ScalarStep}, ResultTy: VectorStepTy,
      Flags, DL: IV->getDebugLoc());

  ToSkip.insert(Ptr: VectorStep);

  // Now create recipes to compute the induction steps for parts 1 .. UF-1.
  // Part 0 remains the header phi. Parts > 0 are computed by adding Step to
  // the previous part. The header phi recipe will get 2 new operands: the step
  // value for a single part and the last part, used to compute the backedge
  // value during VPWidenInductionRecipe::execute.
  // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
  // %Part.1 = %Part.0 + %VectorStep
  // %Part.2 = %Part.1 + %VectorStep
  // %Part.3 = %Part.2 + %VectorStep
  //
  // The newly added recipes are added to ToSkip to avoid interleaving them
  // again.
  VPValue *Prev = IV;
  Builder.setInsertPoint(TheBB: IV->getParent(), IP: InsertPtForPhi);
  // Select the opcode and flags of the per-part add based on whether the
  // induction is over pointers, floats or integers.
  unsigned AddOpc;
  VPIRFlags AddFlags;
  if (IVTy->isPointerTy()) {
    AddOpc = VPInstruction::WidePtrAdd;
    AddFlags = GEPNoWrapFlags::none();
  } else if (IVTy->isFloatingPointTy()) {
    AddOpc = ID.getInductionOpcode();
    AddFlags = Flags; // FMF flags
  } else {
    AddOpc = Instruction::Add;
    AddFlags = VPIRFlags::getDefaultFlags(Opcode: AddOpc);
    // The canonical IV does not overflow, so its per-part adds are NUW.
    if (cast<VPWidenIntOrFpInductionRecipe>(Val: IV)->isCanonical())
      AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false);
  }
  for (unsigned Part = 1; Part != UF; ++Part) {
    std::string Name =
        Part > 1 ? "step.add." + std::to_string(val: Part) : "step.add";

    VPInstruction *Add =
        Builder.createNaryOp(Opcode: AddOpc,
                             Operands: {
                                 Prev,
                                 VectorStep,
                             },
                             Flags: AddFlags, DL: IV->getDebugLoc(), Name);
    ToSkip.insert(Ptr: Add);
    addRecipeForPart(OrigR: IV, CopyR: Add, Part);
    Prev = Add;
  }
  // Attach the single-part step and the last part to the phi; both are used
  // when computing the backedge value during execute.
  IV->addOperand(Operand: VectorStep);
  IV->addOperand(Operand: Prev);
}
242
void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
                                      VPBasicBlock::iterator InsertPtForPhi) {
  // First-order recurrences pass a single vector or scalar through their header
  // phis, irrespective of interleaving.
  if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: R))
    return;

  // Generate step vectors for each unrolled part.
  if (auto *IV = dyn_cast<VPWidenInductionRecipe>(Val: R)) {
    unrollWidenInductionByUF(IV, InsertPtForPhi);
    return;
  }

  // Ordered reductions keep a single phi for all parts; their per-part chain
  // is set up in unrollRecipeByUF when the reduction itself is visited.
  auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: R);
  if (RdxPhi && RdxPhi->isOrdered())
    return;

  // Clone R for parts 1 .. UF-1, inserting the clones directly after R.
  auto InsertPt = std::next(x: R->getIterator());
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R->clone();
    Copy->insertBefore(BB&: *R->getParent(), IP: InsertPt);
    addRecipeForPart(OrigR: R, CopyR: Copy, Part);
    if (RdxPhi) {
      // If the start value is a ReductionStartVector, use the identity value
      // (second operand) for unrolled parts. If the scaling factor is > 1,
      // create a new ReductionStartVector with the scale factor and both
      // operands set to the identity value.
      if (auto *VPI = dyn_cast<VPInstruction>(Val: RdxPhi->getStartValue())) {
        assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
               "unexpected start VPInstruction");
        // The start value is the same for all unrolled parts; register it
        // once when handling part 1 and skip the remaining iterations.
        if (Part != 1)
          continue;
        VPValue *StartV;
        if (match(V: VPI->getOperand(N: 2), P: m_One())) {
          StartV = VPI->getOperand(N: 1);
        } else {
          auto *C = VPI->clone();
          C->setOperand(I: 0, New: C->getOperand(N: 1));
          C->insertAfter(InsertPos: VPI);
          StartV = C;
        }
        // NOTE: this inner loop intentionally shadows the outer Part; it
        // records the shared start value for all parts > 0 in one go.
        for (unsigned Part = 1; Part != UF; ++Part)
          VPV2Parts[VPI][Part - 1] = StartV;
      }
    } else {
      assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
             "unexpected header phi recipe not needing unrolled part");
    }
  }
}
293
/// Handle non-header-phi recipes: clone \p R for parts 1 .. UF-1, with
/// special handling for recipes that are uniform, combine parts, or need
/// per-part offsets.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
  // Loop branches are per-loop, not per-part; leave them alone.
  if (match(V: &R, P: m_CombineOr(L: m_BranchOnCond(), R: m_BranchOnCount())))
    return;

  if (auto *VPI = dyn_cast<VPInstruction>(Val: &R)) {
    // If only the first part of VPI is demanded, reuse the same recipe for
    // all parts instead of cloning it.
    if (vputils::onlyFirstPartUsed(Def: VPI)) {
      addUniformForAllParts(R: VPI);
      return;
    }
  }
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
    if (isa<StoreInst>(Val: RepR->getUnderlyingValue()) &&
        RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions()) {
      // Stores to an invariant address only need to store the last part.
      remapOperands(R: &R, Part: UF - 1);
      return;
    }
    // noalias scope declarations only need to be emitted once, not per part.
    if (match(V: RepR,
              P: m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
      addUniformForAllParts(R: RepR);
      return;
    }
  }

  // Unroll non-uniform recipes.
  auto InsertPt = std::next(x: R.getIterator());
  VPBasicBlock &VPBB = *R.getParent();
  for (unsigned Part = 1; Part != UF; ++Part) {
    VPRecipeBase *Copy = R.clone();
    Copy->insertBefore(BB&: VPBB, IP: InsertPt);
    addRecipeForPart(OrigR: &R, CopyR: Copy, Part);

    // Phi operands are updated once all other recipes have been unrolled.
    if (isa<VPWidenPHIRecipe>(Val: Copy))
      continue;

    // A recurrence splice for part P combines parts P-1 and P of the spliced
    // value.
    VPValue *Op;
    if (match(V: &R, P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
                   Ops: m_VPValue(), Ops: m_VPValue(V&: Op)))) {
      Copy->setOperand(I: 0, New: getValueForPart(V: Op, Part: Part - 1));
      Copy->setOperand(I: 1, New: getValueForPart(V: Op, Part));
      continue;
    }
    if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: &R)) {
      // Vector pointers for part P keep the part-0 base pointer and add an
      // extra VF * Part offset operand (in the pointer's index type).
      VPBuilder Builder(VPR);
      const DataLayout &DL = Plan.getDataLayout();
      Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: VPR));
      Type *VFTy = TypeInfo.inferScalarType(V: &Plan.getVF());
      VPValue *VF = Builder.createScalarZExtOrTrunc(
          Op: &Plan.getVF(), ResultTy: IndexTy, SrcTy: VFTy, DL: DebugLoc::getUnknown());
      // VFxUF does not wrap, so VF * Part also cannot wrap.
      VPValue *VFxPart = Builder.createOverflowingOp(
          Opcode: Instruction::Mul, Operands: {VF, Plan.getConstantInt(Ty: IndexTy, Val: Part)},
          WrapFlags: {true, true});
      Copy->setOperand(I: 0, New: VPR->getOperand(N: 0));
      Copy->addOperand(Operand: VFxPart);
      continue;
    }
    if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) {
      auto *Phi = dyn_cast<VPReductionPHIRecipe>(Val: R.getOperand(N: 0));
      if (Phi && Phi->isOrdered()) {
        // Ordered reductions chain the per-part reductions through a single
        // phi: record the chain and keep the phi's backedge operand pointing
        // at the latest part.
        auto &Parts = VPV2Parts[Phi];
        if (Part == 1) {
          Parts.clear();
          Parts.push_back(Elt: Red);
        }
        Parts.push_back(Elt: Copy->getVPSingleValue());
        Phi->setOperand(I: 1, New: Copy->getVPSingleValue());
      }
    }
    if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: Copy)) {
      // Materialize PartN offset for VectorEndPointer.
      VEPR->setOperand(I: 0, New: R.getOperand(N: 0));
      VEPR->setOperand(I: 1, New: R.getOperand(N: 1));
      VEPR->materializeOffset(Part);
      continue;
    }

    remapOperands(R: Copy, Part);

    if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: Copy))
      addStartIndexForScalarSteps(Steps: ScalarIVSteps, Part, Plan, TypeInfo);

    // Add operand indicating the part to generate code for, to recipes still
    // requiring it.
    if (isa<VPWidenCanonicalIVRecipe>(Val: Copy))
      Copy->addOperand(Operand: getConstantInt(Part));

    if (match(V: Copy,
              P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
      // Scale the increment (operand 1) by the part number.
      VPBuilder Builder(Copy);
      VPValue *ScaledByPart = Builder.createOverflowingOp(
          Opcode: Instruction::Mul, Operands: {Copy->getOperand(N: 1), getConstantInt(Part)});
      Copy->setOperand(I: 1, New: ScaledByPart);
    }
  }
  if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
    // Materialize Part0 offset for VectorEndPointer.
    VEPR->materializeOffset();
  }
}
396
void UnrollState::unrollBlock(VPBlockBase *VPB) {
  auto *VPR = dyn_cast<VPRegionBlock>(Val: VPB);
  if (VPR) {
    // Replicate regions are unrolled by cloning the whole region, not its
    // individual recipes.
    if (VPR->isReplicator())
      return unrollReplicateRegionByUF(VPR);

    // Traverse blocks in region in RPO to ensure defs are visited before uses
    // across blocks.
    ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
        RPOT(VPR->getEntry());
    for (VPBlockBase *VPB : RPOT)
      unrollBlock(VPB);
    return;
  }

  // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
  auto *VPBB = cast<VPBasicBlock>(Val: VPB);
  auto InsertPtForPhi = VPBB->getFirstNonPhi();
  for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
    // Skip recipes created while unrolling earlier recipes, as well as
    // wrapped IR instructions.
    if (ToSkip.contains(Ptr: &R) || isa<VPIRInstruction>(Val: &R))
      continue;

    // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
    // Compute*Result which combine all parts to compute the final value.
    VPValue *Op1;
    if (match(V: &R, P: m_VPInstruction<VPInstruction::AnyOf>(Ops: m_VPValue(V&: Op1))) ||
        match(V: &R, P: m_FirstActiveLane(Op0: m_VPValue(V&: Op1))) ||
        match(V: &R, P: m_LastActiveLane(Op0: m_VPValue(V&: Op1))) ||
        match(V: &R,
              P: m_ComputeAnyOfResult(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue(V&: Op1))) ||
        match(V: &R, P: m_ComputeReductionResult(Op0: m_VPValue(V&: Op1)))) {
      addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(Operand: getValueForPart(V: Op1, Part));
      continue;
    }
    // ExtractLane takes the lane as operand 0; append the remaining parts of
    // the extracted value (operand 1).
    VPValue *Op0;
    if (match(V: &R, P: m_ExtractLane(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)))) {
      addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
      for (unsigned Part = 1; Part != UF; ++Part)
        R.addOperand(Operand: getValueForPart(V: Op1, Part));
      continue;
    }

    // ExtractLastActive combines all parts of both its mask and value
    // operands.
    VPValue *Op2;
    if (match(V: &R, P: m_ExtractLastActive(Op0: m_VPValue(), Op1: m_VPValue(V&: Op1),
                                        Op2: m_VPValue(V&: Op2)))) {
      addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
      for (unsigned Part = 1; Part != UF; ++Part) {
        R.addOperand(Operand: getValueForPart(V: Op1, Part));
        R.addOperand(Operand: getValueForPart(V: Op2, Part));
      }
      continue;
    }

    if (Plan.hasScalarVFOnly()) {
      if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V&: Op0))) ||
          match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
        auto *I = cast<VPInstruction>(Val: &R);
        bool IsPenultimatePart =
            I->getOpcode() == VPInstruction::ExtractPenultimateElement;
        unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
        // For scalar VF, directly use the scalar part value.
        I->replaceAllUsesWith(New: getValueForPart(V: Op0, Part: PartIdx));
        continue;
      }
    }
    // For vector VF, the penultimate element is always extracted from the
    // last part.
    if (match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op0))) ||
        match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
      addUniformForAllParts(R: cast<VPSingleDefRecipe>(Val: &R));
      R.setOperand(I: 0, New: getValueForPart(V: Op0, Part: UF - 1));
      continue;
    }

    // Values uniform across VFs and UFs are shared by all parts.
    auto *SingleDef = dyn_cast<VPSingleDefRecipe>(Val: &R);
    if (SingleDef && vputils::isUniformAcrossVFsAndUFs(V: SingleDef)) {
      addUniformForAllParts(R: SingleDef);
      continue;
    }

    if (auto *H = dyn_cast<VPHeaderPHIRecipe>(Val: &R)) {
      unrollHeaderPHIByUF(R: H, InsertPtForPhi);
      continue;
    }

    unrollRecipeByUF(R);
  }
}
486
void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
  assert(UF > 0 && "Unroll factor must be positive");
  Plan.setUF(UF);
  // Clean-up that must run both when UF == 1 (early return below) and after
  // unrolling.
  llvm::scope_exit Cleanup([&Plan, UF]() {
    auto Iter = vp_depth_first_deep(G: Plan.getEntry());
    // Remove recipes that are redundant after unrolling.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
      for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
        auto *VPI = dyn_cast<VPInstruction>(Val: &R);
        // CanonicalIVIncrementForPart recipes whose step is still the plain
        // VF (i.e. unscaled by any part) fold to their start operand.
        if (VPI &&
            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
            VPI->getOperand(N: 1) == &Plan.getVF()) {
          VPI->replaceAllUsesWith(New: VPI->getOperand(N: 0));
          VPI->eraseFromParent();
        }
      }
    }

    // Replace the symbolic UF with its now-known constant value.
    Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
    Plan.getUF().replaceAllUsesWith(New: Plan.getConstantInt(Ty: TCTy, Val: UF));
  });
  if (UF == 1) {
    return;
  }

  UnrollState Unroller(Plan, UF);

  // Iterate over all blocks in the plan starting from Entry, and unroll
  // recipes inside them. This includes the vector preheader and middle blocks,
  // which may set up or post-process per-part values.
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      Plan.getEntry());
  for (VPBlockBase *VPB : RPOT)
    Unroller.unrollBlock(VPB);

  unsigned Part = 1;
  // Remap operands of cloned header phis to update backedge values. The header
  // phis cloned during unrolling are just after the header phi for part 0.
  // Reset Part to 1 when reaching the first (part 0) recipe of a block.
  for (VPRecipeBase &H :
       Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The second operand of Fixed Order Recurrence phi's, feeding the spliced
    // value across the backedge, needs to remap to the last part of the spliced
    // value.
    if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &H)) {
      Unroller.remapOperand(R: &H, OpIdx: 1, Part: UF - 1);
      continue;
    }
    // A phi with recorded per-part values is a part-0 phi; its clones follow
    // it, so restart the part counter.
    if (Unroller.contains(VPV: H.getVPSingleValue())) {
      Part = 1;
      continue;
    }
    Unroller.remapOperands(R: &H, Part);
    Part++;
  }

  VPlanTransforms::removeDeadRecipes(Plan);
}
545
/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe,
/// VPInstruction or VPScalarIVStepsRecipe) for lane \p Lane. Use \p
/// Def2LaneDefs to look up scalar definitions for operands of \p DefR.
/// Returns the VPValue producing the lane's scalar result.
static VPValue *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
             VPSingleDefRecipe *DefR, VPLane Lane,
             const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
  assert((isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(DefR)) &&
         "DefR must be a VPReplicateRecipe, VPInstruction or "
         "VPScalarIVStepsRecipe");
  // An Unpack reduces to its operand's per-lane definition (if already
  // replicated), or to an extract of the requested lane.
  VPValue *Op;
  if (match(R: DefR, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)))) {
    auto LaneDefs = Def2LaneDefs.find(Val: Op);
    if (LaneDefs != Def2LaneDefs.end())
      return LaneDefs->second[Lane.getKnownLane()];

    VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
    return Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
  }

  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : DefR->operands()) {
    // If Op is a definition that has been unrolled, directly use the clone for
    // the corresponding lane.
    auto LaneDefs = Def2LaneDefs.find(Val: Op);
    if (LaneDefs != Def2LaneDefs.end()) {
      NewOps.push_back(Elt: LaneDefs->second[Lane.getKnownLane()]);
      continue;
    }
    // For the last lane of a scalable vector, the lane index is not known at
    // compile time; extract the last lane of the last part instead.
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
      // Look through mandatory Unpack.
      [[maybe_unused]] bool Matched =
          match(V: Op, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)));
      assert(Matched && "original op must have been Unpack");
      auto *ExtractPart =
          Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: {Op});
      NewOps.push_back(
          Elt: Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: {ExtractPart}));
      continue;
    }
    // Single-scalar operands are shared across all lanes as-is.
    if (vputils::isSingleScalar(VPV: Op)) {
      NewOps.push_back(Elt: Op);
      continue;
    }

    // Look through buildvector to avoid unnecessary extracts.
    if (match(V: Op, P: m_BuildVector())) {
      NewOps.push_back(
          Elt: cast<VPInstruction>(Val: Op)->getOperand(N: Lane.getKnownLane()));
      continue;
    }
    VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
    VPValue *Ext = Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
    NewOps.push_back(Elt: Ext);
  }

  VPSingleDefRecipe *New;
  if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: DefR)) {
    // TODO: have cloning of replicate recipes also provide the desired result
    // coupled with setting its operands to NewOps (deriving IsSingleScalar and
    // Mask from the operands?)
    New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
                                /*IsSingleScalar=*/true, /*Mask=*/nullptr,
                                *RepR, *RepR, RepR->getDebugLoc());
  } else {
    New = DefR->clone();
    for (const auto &[Idx, Op] : enumerate(First&: NewOps)) {
      New->setOperand(I: Idx, New: Op);
    }
    if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: New)) {
      // Skip lane 0: an absent start index is implicitly zero.
      unsigned KnownLane = Lane.getKnownLane();
      if (KnownLane != 0) {
        VPTypeAnalysis TypeInfo(Plan);
        Type *BaseIVTy = TypeInfo.inferScalarType(V: DefR->getOperand(N: 0));

        VPValue *StartIndex = Steps->getStartIndex();
        VPValue *LaneOffset;
        unsigned AddOp;
        VPIRFlags Flags;
        if (BaseIVTy->isFloatingPointTy()) {
          // FP inductions stepping via FSub (with no explicit start index)
          // advance in the negative direction, so negate the lane offset.
          int SignedLane = static_cast<int>(KnownLane);
          if (!StartIndex && Steps->getInductionOpcode() == Instruction::FSub)
            SignedLane = -SignedLane;
          LaneOffset =
              Plan.getOrAddLiveIn(V: ConstantFP::get(Ty: BaseIVTy, V: SignedLane));
          AddOp = Steps->getInductionOpcode();
          Flags = VPIRFlags(FastMathFlags());
        } else {
          unsigned BaseIVBits = BaseIVTy->getScalarSizeInBits();
          LaneOffset = Plan.getConstantInt(Val: APInt(BaseIVBits, KnownLane,
                                                 /*isSigned*/ false,
                                                 /*implicitTrunc*/ true));
          AddOp = Instruction::Add;
          Flags = VPIRFlags(VPIRFlags::WrapFlagsTy(false, false));
        }

        // Combine an existing start index (if any) with the lane offset.
        if (StartIndex) {
          VPBuilder LaneBuilder(DefR);
          LaneOffset =
              LaneBuilder.createNaryOp(Opcode: AddOp, Operands: {StartIndex, LaneOffset}, Flags);
        }
        Steps->setStartIndex(LaneOffset);
      }
    }
  }
  New->insertBefore(InsertPos: DefR);
  return New;
}
656
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  // Nothing to replicate if only scalar VFs are used.
  if (Plan.hasScalarVFOnly())
    return;

  // i32 index type used for lane extracts.
  Type *IdxTy = IntegerType::get(
      C&: Plan.getScalarHeader()->getIRBasicBlock()->getContext(), NumBits: 32);

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion);
  // A mapping of current VPValue definitions to collections of new VPValues
  // defined per lane. Serves to hook-up potential users of current VPValue
  // definition that are replicated-per-VF later.
  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
  // The removal of current recipes being replaced by new ones needs to be
  // delayed after Def2LaneDefs is no longer in use.
  SmallVector<VPRecipeBase *> ToRemove;
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      // Only replicate VPInstructions, replicate recipes and scalar IV steps.
      // Skip single-scalar replicate recipes, and VPInstructions that neither
      // generate a value per lane nor are Unpacks.
      if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: &R) ||
          (isa<VPReplicateRecipe>(Val: &R) &&
           cast<VPReplicateRecipe>(Val: &R)->isSingleScalar()) ||
          (isa<VPInstruction>(Val: &R) &&
           !cast<VPInstruction>(Val: &R)->doesGeneratePerAllLanes() &&
           cast<VPInstruction>(Val: &R)->getOpcode() != VPInstruction::Unpack))
        continue;

      auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
      VPBuilder Builder(DefR);
      if (DefR->getNumUsers() == 0) {
        // Create single-scalar version of DefR for all lanes; with no users
        // there is nothing to hook up, so DefR can be erased immediately.
        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
          cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs);
        DefR->eraseFromParent();
        continue;
      }
      // Create single-scalar version of DefR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(
            Elt: cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs));

      Def2LaneDefs[DefR] = LaneDefs;
      // Users that only demand the first lane can use the definition for lane
      // 0.
      DefR->replaceUsesWithIf(New: LaneDefs[0], ShouldReplace: [DefR](VPUser &U, unsigned) {
        return U.usesFirstLaneOnly(Op: DefR);
      });

      // Update each build vector user that currently has DefR as its only
      // operand, to have all LaneDefs as its operands.
      for (VPUser *U : to_vector(Range: DefR->users())) {
        auto *VPI = dyn_cast<VPInstruction>(Val: U);
        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
                     VPI->getOpcode() != VPInstruction::BuildStructVector))
          continue;
        assert(VPI->getNumOperands() == 1 &&
               "Build(Struct)Vector must have a single operand before "
               "replicating by VF");
        VPI->setOperand(I: 0, New: LaneDefs[0]);
        for (VPValue *LaneDef : drop_begin(RangeOrContainer&: LaneDefs))
          VPI->addOperand(Operand: LaneDef);
      }
      ToRemove.push_back(Elt: DefR);
    }
  }
  // Erase the replaced recipes now that Def2LaneDefs no longer needs them;
  // reverse order erases later recipes (potential users) first.
  for (auto *R : reverse(C&: ToRemove))
    R->eraseFromParent();
}
731