//===-- VPlanUnroll.cpp - VPlan unroller ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements explicit unrolling for VPlans.
///
//===----------------------------------------------------------------------===//
13
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanCFG.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;
31
32namespace {
33
34/// Helper to hold state needed for unrolling. It holds the Plan to unroll by
35/// UF. It also holds copies of VPValues across UF-1 unroll parts to facilitate
36/// the unrolling transformation, where the original VPValues are retained for
37/// part zero.
38class UnrollState {
39 /// Plan to unroll.
40 VPlan &Plan;
41 /// Unroll factor to unroll by.
42 const unsigned UF;
43 /// Analysis for types.
44 VPTypeAnalysis TypeInfo;
45
46 /// Unrolling may create recipes that should not be unrolled themselves.
47 /// Those are tracked in ToSkip.
48 SmallPtrSet<VPRecipeBase *, 8> ToSkip;
49
50 // Associate with each VPValue of part 0 its unrolled instances of parts 1,
51 // ..., UF-1.
52 DenseMap<VPValue *, SmallVector<VPValue *>> VPV2Parts;
53
54 /// Unroll replicate region \p VPR by cloning the region UF - 1 times.
55 void unrollReplicateRegionByUF(VPRegionBlock *VPR);
56
57 /// Unroll recipe \p R by cloning it UF - 1 times, unless it is uniform across
58 /// all parts.
59 void unrollRecipeByUF(VPRecipeBase &R);
60
61 /// Unroll header phi recipe \p R. How exactly the recipe gets unrolled
62 /// depends on the concrete header phi. Inserts newly created recipes at \p
63 /// InsertPtForPhi.
64 void unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
65 VPBasicBlock::iterator InsertPtForPhi);
66
67 /// Unroll a widen induction recipe \p IV. This introduces recipes to compute
68 /// the induction steps for each part.
69 void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
70 VPBasicBlock::iterator InsertPtForPhi);
71
72 VPValue *getConstantInt(unsigned Part) {
73 Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
74 return Plan.getConstantInt(Ty: CanIVIntTy, Val: Part);
75 }
76
77public:
78 UnrollState(VPlan &Plan, unsigned UF) : Plan(Plan), UF(UF), TypeInfo(Plan) {}
79
80 void unrollBlock(VPBlockBase *VPB);
81
82 VPValue *getValueForPart(VPValue *V, unsigned Part) {
83 if (Part == 0 || isa<VPIRValue, VPSymbolicValue>(Val: V))
84 return V;
85 assert((VPV2Parts.contains(V) && VPV2Parts[V].size() >= Part) &&
86 "accessed value does not exist");
87 return VPV2Parts[V][Part - 1];
88 }
89
90 /// Given a single original recipe \p OrigR (of part zero), and its copy \p
91 /// CopyR for part \p Part, map every VPValue defined by \p OrigR to its
92 /// corresponding VPValue defined by \p CopyR.
93 void addRecipeForPart(VPRecipeBase *OrigR, VPRecipeBase *CopyR,
94 unsigned Part) {
95 for (const auto &[Idx, VPV] : enumerate(First: OrigR->definedValues())) {
96 const auto &[V, _] = VPV2Parts.try_emplace(Key: VPV);
97 assert(V->second.size() == Part - 1 && "earlier parts not set");
98 V->second.push_back(Elt: CopyR->getVPValue(I: Idx));
99 }
100 }
101
102 /// Given a uniform recipe \p R, add it for all parts.
103 void addUniformForAllParts(VPSingleDefRecipe *R) {
104 const auto &[V, Inserted] = VPV2Parts.try_emplace(Key: R);
105 assert(Inserted && "uniform value already added");
106 for (unsigned Part = 0; Part != UF; ++Part)
107 V->second.push_back(Elt: R);
108 }
109
110 bool contains(VPValue *VPV) const { return VPV2Parts.contains(Val: VPV); }
111
112 /// Update \p R's operand at \p OpIdx with its corresponding VPValue for part
113 /// \p P.
114 void remapOperand(VPRecipeBase *R, unsigned OpIdx, unsigned Part) {
115 auto *Op = R->getOperand(N: OpIdx);
116 R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
117 }
118
119 /// Update \p R's operands with their corresponding VPValues for part \p P.
120 void remapOperands(VPRecipeBase *R, unsigned Part) {
121 for (const auto &[OpIdx, Op] : enumerate(First: R->operands()))
122 R->setOperand(I: OpIdx, New: getValueForPart(V: Op, Part));
123 }
124};
125} // namespace
126
127static void addStartIndexForScalarSteps(VPScalarIVStepsRecipe *Steps,
128 unsigned Part, VPlan &Plan,
129 VPTypeAnalysis &TypeInfo) {
130 if (Part == 0)
131 return;
132
133 VPBuilder Builder(Steps);
134 Type *BaseIVTy = TypeInfo.inferScalarType(V: Steps->getOperand(N: 0));
135 Type *IntStepTy =
136 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
137 VPValue *StartIndex = Steps->getVFValue();
138 if (Part > 1) {
139 StartIndex = Builder.createOverflowingOp(
140 Opcode: Instruction::Mul,
141 Operands: {StartIndex,
142 Plan.getConstantInt(Ty: TypeInfo.inferScalarType(V: StartIndex), Val: Part)});
143 }
144 StartIndex = Builder.createScalarSExtOrTrunc(
145 Op: StartIndex, ResultTy: IntStepTy, SrcTy: TypeInfo.inferScalarType(V: StartIndex),
146 DL: Steps->getDebugLoc());
147
148 if (BaseIVTy->isFloatingPointTy())
149 StartIndex = Builder.createScalarCast(Opcode: Instruction::SIToFP, Op: StartIndex,
150 ResultTy: BaseIVTy, DL: Steps->getDebugLoc());
151
152 Steps->setStartIndex(StartIndex);
153}
154
155void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
156 VPBlockBase *InsertPt = VPR->getSingleSuccessor();
157 for (unsigned Part = 1; Part != UF; ++Part) {
158 auto *Copy = VPR->clone();
159 VPBlockUtils::insertBlockBefore(NewBlock: Copy, BlockPtr: InsertPt);
160
161 auto PartI = vp_depth_first_shallow(G: Copy->getEntry());
162 auto Part0 = vp_depth_first_shallow(G: VPR->getEntry());
163 for (const auto &[PartIVPBB, Part0VPBB] :
164 zip(t: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: PartI),
165 u: VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Part0))) {
166 for (const auto &[PartIR, Part0R] : zip(t&: *PartIVPBB, u&: *Part0VPBB)) {
167 remapOperands(R: &PartIR, Part);
168 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &PartIR))
169 addStartIndexForScalarSteps(Steps, Part, Plan, TypeInfo);
170
171 addRecipeForPart(OrigR: &Part0R, CopyR: &PartIR, Part);
172 }
173 }
174 }
175}
176
177void UnrollState::unrollWidenInductionByUF(
178 VPWidenInductionRecipe *IV, VPBasicBlock::iterator InsertPtForPhi) {
179 VPBasicBlock *PH = cast<VPBasicBlock>(
180 Val: IV->getParent()->getEnclosingLoopRegion()->getSinglePredecessor());
181 Type *IVTy = TypeInfo.inferScalarType(V: IV);
182 auto &ID = IV->getInductionDescriptor();
183 FastMathFlags FMF;
184 VPIRFlags::WrapFlagsTy WrapFlags(false, false);
185 if (auto *IntOrFPInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: IV)) {
186 if (IntOrFPInd->hasFastMathFlags())
187 FMF = IntOrFPInd->getFastMathFlags();
188 if (IntOrFPInd->hasNoWrapFlags())
189 WrapFlags = IntOrFPInd->getNoWrapFlags();
190 }
191
192 VPValue *ScalarStep = IV->getStepValue();
193 VPBuilder Builder(PH);
194 Type *VectorStepTy =
195 IVTy->isPointerTy() ? TypeInfo.inferScalarType(V: ScalarStep) : IVTy;
196 VPInstruction *VectorStep = Builder.createNaryOp(
197 Opcode: VPInstruction::WideIVStep, Operands: {&Plan.getVF(), ScalarStep}, ResultTy: VectorStepTy, Flags: FMF,
198 DL: IV->getDebugLoc());
199
200 ToSkip.insert(Ptr: VectorStep);
201
202 // Now create recipes to compute the induction steps for part 1 .. UF. Part 0
203 // remains the header phi. Parts > 0 are computed by adding Step to the
204 // previous part. The header phi recipe will get 2 new operands: the step
205 // value for a single part and the last part, used to compute the backedge
206 // value during VPWidenInductionRecipe::execute.
207 // %Part.0 = VPWidenInductionRecipe %Start, %ScalarStep, %VectorStep, %Part.3
208 // %Part.1 = %Part.0 + %VectorStep
209 // %Part.2 = %Part.1 + %VectorStep
210 // %Part.3 = %Part.2 + %VectorStep
211 //
212 // The newly added recipes are added to ToSkip to avoid interleaving them
213 // again.
214 VPValue *Prev = IV;
215 Builder.setInsertPoint(TheBB: IV->getParent(), IP: InsertPtForPhi);
216 unsigned AddOpc;
217 VPIRFlags AddFlags;
218 if (IVTy->isPointerTy()) {
219 AddOpc = VPInstruction::WidePtrAdd;
220 AddFlags = GEPNoWrapFlags::none();
221 } else if (IVTy->isFloatingPointTy()) {
222 AddOpc = ID.getInductionOpcode();
223 AddFlags = FMF;
224 } else {
225 AddOpc = Instruction::Add;
226 AddFlags = WrapFlags;
227 if (cast<VPWidenIntOrFpInductionRecipe>(Val: IV)->isCanonical())
228 AddFlags = VPIRFlags::WrapFlagsTy(/*NUW=*/true, /*NSW=*/false);
229 }
230 for (unsigned Part = 1; Part != UF; ++Part) {
231 std::string Name =
232 Part > 1 ? "step.add." + std::to_string(val: Part) : "step.add";
233
234 VPInstruction *Add =
235 Builder.createNaryOp(Opcode: AddOpc,
236 Operands: {
237 Prev,
238 VectorStep,
239 },
240 Flags: AddFlags, DL: IV->getDebugLoc(), Name);
241 ToSkip.insert(Ptr: Add);
242 addRecipeForPart(OrigR: IV, CopyR: Add, Part);
243 Prev = Add;
244 }
245 IV->addOperand(Operand: VectorStep);
246 IV->addOperand(Operand: Prev);
247}
248
249void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
250 VPBasicBlock::iterator InsertPtForPhi) {
251 // First-order recurrences pass a single vector or scalar through their header
252 // phis, irrespective of interleaving.
253 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: R))
254 return;
255
256 // Generate step vectors for each unrolled part.
257 if (auto *IV = dyn_cast<VPWidenInductionRecipe>(Val: R)) {
258 unrollWidenInductionByUF(IV, InsertPtForPhi);
259 return;
260 }
261
262 auto *RdxPhi = dyn_cast<VPReductionPHIRecipe>(Val: R);
263 if (RdxPhi && RdxPhi->isOrdered())
264 return;
265
266 auto InsertPt = std::next(x: R->getIterator());
267 for (unsigned Part = 1; Part != UF; ++Part) {
268 VPRecipeBase *Copy = R->clone();
269 Copy->insertBefore(BB&: *R->getParent(), IP: InsertPt);
270 addRecipeForPart(OrigR: R, CopyR: Copy, Part);
271 if (RdxPhi) {
272 // If the start value is a ReductionStartVector, use the identity value
273 // (second operand) for unrolled parts. If the scaling factor is > 1,
274 // create a new ReductionStartVector with the scale factor and both
275 // operands set to the identity value.
276 if (auto *VPI = dyn_cast<VPInstruction>(Val: RdxPhi->getStartValue())) {
277 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
278 "unexpected start VPInstruction");
279 if (Part != 1)
280 continue;
281 VPValue *StartV;
282 if (match(V: VPI->getOperand(N: 2), P: m_One())) {
283 StartV = VPI->getOperand(N: 1);
284 } else {
285 auto *C = VPI->clone();
286 C->setOperand(I: 0, New: C->getOperand(N: 1));
287 C->insertAfter(InsertPos: VPI);
288 StartV = C;
289 }
290 for (unsigned Part = 1; Part != UF; ++Part)
291 VPV2Parts[VPI][Part - 1] = StartV;
292 }
293 } else {
294 assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
295 "unexpected header phi recipe not needing unrolled part");
296 }
297 }
298}
299
300/// Handle non-header-phi recipes.
301void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
302 if (match(V: &R, P: m_CombineOr(L: m_BranchOnCond(), R: m_BranchOnCount())))
303 return;
304
305 if (auto *VPI = dyn_cast<VPInstruction>(Val: &R)) {
306 if (vputils::onlyFirstPartUsed(Def: VPI)) {
307 addUniformForAllParts(R: VPI);
308 return;
309 }
310 }
311 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
312 if (isa<StoreInst>(Val: RepR->getUnderlyingValue()) &&
313 RepR->getOperand(N: 1)->isDefinedOutsideLoopRegions()) {
314 // Stores to an invariant address only need to store the last part.
315 remapOperands(R: &R, Part: UF - 1);
316 return;
317 }
318 if (match(V: RepR,
319 P: m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
320 addUniformForAllParts(R: RepR);
321 return;
322 }
323 }
324
325 // Unroll non-uniform recipes.
326 auto InsertPt = std::next(x: R.getIterator());
327 VPBasicBlock &VPBB = *R.getParent();
328 for (unsigned Part = 1; Part != UF; ++Part) {
329 VPRecipeBase *Copy = R.clone();
330 Copy->insertBefore(BB&: VPBB, IP: InsertPt);
331 addRecipeForPart(OrigR: &R, CopyR: Copy, Part);
332
333 // Phi operands are updated once all other recipes have been unrolled.
334 if (isa<VPWidenPHIRecipe>(Val: Copy))
335 continue;
336
337 VPValue *Op;
338 if (match(V: &R, P: m_VPInstruction<VPInstruction::FirstOrderRecurrenceSplice>(
339 Ops: m_VPValue(), Ops: m_VPValue(V&: Op)))) {
340 Copy->setOperand(I: 0, New: getValueForPart(V: Op, Part: Part - 1));
341 Copy->setOperand(I: 1, New: getValueForPart(V: Op, Part));
342 continue;
343 }
344 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Val: &R)) {
345 VPBuilder Builder(VPR);
346 const DataLayout &DL = Plan.getDataLayout();
347 Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: VPR));
348 Type *VFTy = TypeInfo.inferScalarType(V: &Plan.getVF());
349 VPValue *VF = Builder.createScalarZExtOrTrunc(
350 Op: &Plan.getVF(), ResultTy: IndexTy, SrcTy: VFTy, DL: DebugLoc::getUnknown());
351 // VFxUF does not wrap, so VF * Part also cannot wrap.
352 VPValue *VFxPart = Builder.createOverflowingOp(
353 Opcode: Instruction::Mul, Operands: {VF, Plan.getConstantInt(Ty: IndexTy, Val: Part)},
354 WrapFlags: {true, true});
355 Copy->setOperand(I: 0, New: VPR->getOperand(N: 0));
356 Copy->addOperand(Operand: VFxPart);
357 continue;
358 }
359 if (auto *Red = dyn_cast<VPReductionRecipe>(Val: &R)) {
360 auto *Phi = dyn_cast<VPReductionPHIRecipe>(Val: R.getOperand(N: 0));
361 if (Phi && Phi->isOrdered()) {
362 auto &Parts = VPV2Parts[Phi];
363 if (Part == 1) {
364 Parts.clear();
365 Parts.push_back(Elt: Red);
366 }
367 Parts.push_back(Elt: Copy->getVPSingleValue());
368 Phi->setOperand(I: 1, New: Copy->getVPSingleValue());
369 }
370 }
371 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: Copy)) {
372 // Materialize PartN offset for VectorEndPointer.
373 VEPR->setOperand(I: 0, New: R.getOperand(N: 0));
374 VEPR->setOperand(I: 1, New: R.getOperand(N: 1));
375 VEPR->materializeOffset(Part);
376 continue;
377 }
378
379 remapOperands(R: Copy, Part);
380
381 if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(Val: Copy))
382 addStartIndexForScalarSteps(Steps: ScalarIVSteps, Part, Plan, TypeInfo);
383
384 // Add operand indicating the part to generate code for, to recipes still
385 // requiring it.
386 if (isa<VPWidenCanonicalIVRecipe>(Val: Copy))
387 Copy->addOperand(Operand: getConstantInt(Part));
388
389 if (match(V: Copy,
390 P: m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
391 VPBuilder Builder(Copy);
392 VPValue *ScaledByPart = Builder.createOverflowingOp(
393 Opcode: Instruction::Mul, Operands: {Copy->getOperand(N: 1), getConstantInt(Part)});
394 Copy->setOperand(I: 1, New: ScaledByPart);
395 }
396 }
397 if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(Val: &R)) {
398 // Materialize Part0 offset for VectorEndPointer.
399 VEPR->materializeOffset();
400 }
401}
402
403void UnrollState::unrollBlock(VPBlockBase *VPB) {
404 auto *VPR = dyn_cast<VPRegionBlock>(Val: VPB);
405 if (VPR) {
406 if (VPR->isReplicator())
407 return unrollReplicateRegionByUF(VPR);
408
409 // Traverse blocks in region in RPO to ensure defs are visited before uses
410 // across blocks.
411 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>>
412 RPOT(VPR->getEntry());
413 for (VPBlockBase *VPB : RPOT)
414 unrollBlock(VPB);
415 return;
416 }
417
418 // VPB is a VPBasicBlock; unroll it, i.e., unroll its recipes.
419 auto *VPBB = cast<VPBasicBlock>(Val: VPB);
420 auto InsertPtForPhi = VPBB->getFirstNonPhi();
421 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
422 if (ToSkip.contains(Ptr: &R) || isa<VPIRInstruction>(Val: &R))
423 continue;
424
425 // Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
426 // Compute*Result which combine all parts to compute the final value.
427 VPValue *Op1;
428 if (match(V: &R, P: m_VPInstruction<VPInstruction::AnyOf>(Ops: m_VPValue(V&: Op1))) ||
429 match(V: &R, P: m_FirstActiveLane(Op0: m_VPValue(V&: Op1))) ||
430 match(V: &R, P: m_LastActiveLane(Op0: m_VPValue(V&: Op1))) ||
431 match(V: &R,
432 P: m_ComputeAnyOfResult(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue(V&: Op1))) ||
433 match(V: &R, P: m_ComputeReductionResult(Op0: m_VPValue(V&: Op1)))) {
434 addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
435 for (unsigned Part = 1; Part != UF; ++Part)
436 R.addOperand(Operand: getValueForPart(V: Op1, Part));
437 continue;
438 }
439 VPValue *Op0;
440 if (match(V: &R, P: m_ExtractLane(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)))) {
441 addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
442 for (unsigned Part = 1; Part != UF; ++Part)
443 R.addOperand(Operand: getValueForPart(V: Op1, Part));
444 continue;
445 }
446
447 VPValue *Op2;
448 if (match(V: &R, P: m_ExtractLastActive(Op0: m_VPValue(), Op1: m_VPValue(V&: Op1),
449 Op2: m_VPValue(V&: Op2)))) {
450 addUniformForAllParts(R: cast<VPInstruction>(Val: &R));
451 for (unsigned Part = 1; Part != UF; ++Part) {
452 R.addOperand(Operand: getValueForPart(V: Op1, Part));
453 R.addOperand(Operand: getValueForPart(V: Op2, Part));
454 }
455 continue;
456 }
457
458 if (Plan.hasScalarVFOnly()) {
459 if (match(V: &R, P: m_ExtractLastPart(Op0: m_VPValue(V&: Op0))) ||
460 match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
461 auto *I = cast<VPInstruction>(Val: &R);
462 bool IsPenultimatePart =
463 I->getOpcode() == VPInstruction::ExtractPenultimateElement;
464 unsigned PartIdx = IsPenultimatePart ? UF - 2 : UF - 1;
465 // For scalar VF, directly use the scalar part value.
466 I->replaceAllUsesWith(New: getValueForPart(V: Op0, Part: PartIdx));
467 continue;
468 }
469 }
470 // For vector VF, the penultimate element is always extracted from the last part.
471 if (match(V: &R, P: m_ExtractLastLaneOfLastPart(Op0: m_VPValue(V&: Op0))) ||
472 match(V: &R, P: m_ExtractPenultimateElement(Op0: m_VPValue(V&: Op0)))) {
473 addUniformForAllParts(R: cast<VPSingleDefRecipe>(Val: &R));
474 R.setOperand(I: 0, New: getValueForPart(V: Op0, Part: UF - 1));
475 continue;
476 }
477
478 auto *SingleDef = dyn_cast<VPSingleDefRecipe>(Val: &R);
479 if (SingleDef && vputils::isUniformAcrossVFsAndUFs(V: SingleDef)) {
480 addUniformForAllParts(R: SingleDef);
481 continue;
482 }
483
484 if (auto *H = dyn_cast<VPHeaderPHIRecipe>(Val: &R)) {
485 unrollHeaderPHIByUF(R: H, InsertPtForPhi);
486 continue;
487 }
488
489 unrollRecipeByUF(R);
490 }
491}
492
493void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
494 assert(UF > 0 && "Unroll factor must be positive");
495 Plan.setUF(UF);
496 llvm::scope_exit Cleanup([&Plan, UF]() {
497 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
498 // Remove recipes that are redundant after unrolling.
499 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
500 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
501 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
502 if (VPI &&
503 VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
504 VPI->getOperand(N: 1) == &Plan.getVF()) {
505 VPI->replaceAllUsesWith(New: VPI->getOperand(N: 0));
506 VPI->eraseFromParent();
507 }
508 }
509 }
510
511 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(V: Plan.getTripCount());
512 Plan.getUF().replaceAllUsesWith(New: Plan.getConstantInt(Ty: TCTy, Val: UF));
513 });
514 if (UF == 1) {
515 return;
516 }
517
518 UnrollState Unroller(Plan, UF);
519
520 // Iterate over all blocks in the plan starting from Entry, and unroll
521 // recipes inside them. This includes the vector preheader and middle blocks,
522 // which may set up or post-process per-part values.
523 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
524 Plan.getEntry());
525 for (VPBlockBase *VPB : RPOT)
526 Unroller.unrollBlock(VPB);
527
528 unsigned Part = 1;
529 // Remap operands of cloned header phis to update backedge values. The header
530 // phis cloned during unrolling are just after the header phi for part 0.
531 // Reset Part to 1 when reaching the first (part 0) recipe of a block.
532 for (VPRecipeBase &H :
533 Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
534 // The second operand of Fixed Order Recurrence phi's, feeding the spliced
535 // value across the backedge, needs to remap to the last part of the spliced
536 // value.
537 if (isa<VPFirstOrderRecurrencePHIRecipe>(Val: &H)) {
538 Unroller.remapOperand(R: &H, OpIdx: 1, Part: UF - 1);
539 continue;
540 }
541 if (Unroller.contains(VPV: H.getVPSingleValue())) {
542 Part = 1;
543 continue;
544 }
545 Unroller.remapOperands(R: &H, Part);
546 Part++;
547 }
548
549 VPlanTransforms::removeDeadRecipes(Plan);
550}
551
552/// Add a lane offset to the start index of \p Steps.
553static void addLaneToStartIndex(VPScalarIVStepsRecipe *Steps, unsigned Lane,
554 VPlan &Plan, VPRecipeBase *InsertPt) {
555 assert(Lane > 0 && "Zero lane adds no offset to start index");
556 VPTypeAnalysis TypeInfo(Plan);
557 Type *BaseIVTy = TypeInfo.inferScalarType(V: Steps->getOperand(N: 0));
558
559 VPValue *OldStartIndex = Steps->getStartIndex();
560 VPValue *LaneOffset;
561 unsigned AddOpcode;
562 // TODO: Retrieve the flags from Steps unconditionally.
563 VPIRFlags Flags;
564 if (BaseIVTy->isFloatingPointTy()) {
565 int SignedLane = static_cast<int>(Lane);
566 if (!OldStartIndex && Steps->getInductionOpcode() == Instruction::FSub)
567 SignedLane = -SignedLane;
568 LaneOffset = Plan.getOrAddLiveIn(V: ConstantFP::get(Ty: BaseIVTy, V: SignedLane));
569 AddOpcode = Steps->getInductionOpcode();
570 Flags = VPIRFlags(FastMathFlags());
571 } else {
572 unsigned BaseIVBits = BaseIVTy->getScalarSizeInBits();
573 LaneOffset = Plan.getConstantInt(
574 Val: APInt(BaseIVBits, Lane, /*isSigned*/ false, /*implicitTrunc*/ true));
575 AddOpcode = Instruction::Add;
576 Flags = VPIRFlags(VPIRFlags::WrapFlagsTy(false, false));
577 }
578
579 VPValue *NewStartIndex = LaneOffset;
580 if (OldStartIndex) {
581 VPBuilder Builder(InsertPt);
582 NewStartIndex =
583 Builder.createNaryOp(Opcode: AddOpcode, Operands: {OldStartIndex, LaneOffset}, Flags);
584 }
585 Steps->setStartIndex(NewStartIndex);
586}
587
588/// Create a single-scalar clone of \p DefR (must be a VPReplicateRecipe,
589/// VPInstruction or VPScalarIVStepsRecipe) for lane \p Lane. Use \p
590/// Def2LaneDefs to look up scalar definitions for operands of \DefR.
591static VPValue *
592cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
593 VPSingleDefRecipe *DefR, VPLane Lane,
594 const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
595 assert((isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(DefR)) &&
596 "DefR must be a VPReplicateRecipe, VPInstruction or "
597 "VPScalarIVStepsRecipe");
598 VPValue *Op;
599 if (match(R: DefR, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)))) {
600 auto LaneDefs = Def2LaneDefs.find(Val: Op);
601 if (LaneDefs != Def2LaneDefs.end())
602 return LaneDefs->second[Lane.getKnownLane()];
603
604 VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
605 return Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
606 }
607
608 // Collect the operands at Lane, creating extracts as needed.
609 SmallVector<VPValue *> NewOps;
610 for (VPValue *Op : DefR->operands()) {
611 // If Op is a definition that has been unrolled, directly use the clone for
612 // the corresponding lane.
613 auto LaneDefs = Def2LaneDefs.find(Val: Op);
614 if (LaneDefs != Def2LaneDefs.end()) {
615 NewOps.push_back(Elt: LaneDefs->second[Lane.getKnownLane()]);
616 continue;
617 }
618 if (Lane.getKind() == VPLane::Kind::ScalableLast) {
619 // Look through mandatory Unpack.
620 [[maybe_unused]] bool Matched =
621 match(V: Op, P: m_VPInstruction<VPInstruction::Unpack>(Ops: m_VPValue(V&: Op)));
622 assert(Matched && "original op must have been Unpack");
623 auto *ExtractPart =
624 Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: {Op});
625 NewOps.push_back(
626 Elt: Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: {ExtractPart}));
627 continue;
628 }
629 if (vputils::isSingleScalar(VPV: Op)) {
630 NewOps.push_back(Elt: Op);
631 continue;
632 }
633
634 // Look through buildvector to avoid unnecessary extracts.
635 if (match(V: Op, P: m_BuildVector())) {
636 NewOps.push_back(
637 Elt: cast<VPInstruction>(Val: Op)->getOperand(N: Lane.getKnownLane()));
638 continue;
639 }
640 VPValue *Idx = Plan.getConstantInt(Ty: IdxTy, Val: Lane.getKnownLane());
641 VPValue *Ext = Builder.createNaryOp(Opcode: Instruction::ExtractElement, Operands: {Op, Idx});
642 NewOps.push_back(Elt: Ext);
643 }
644
645 VPSingleDefRecipe *New;
646 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: DefR)) {
647 // TODO: have cloning of replicate recipes also provide the desired result
648 // coupled with setting its operands to NewOps (deriving IsSingleScalar and
649 // Mask from the operands?)
650 New = new VPReplicateRecipe(RepR->getUnderlyingInstr(), NewOps,
651 /*IsSingleScalar=*/true, /*Mask=*/nullptr,
652 *RepR, *RepR, RepR->getDebugLoc());
653 } else {
654 New = DefR->clone();
655 for (const auto &[Idx, Op] : enumerate(First&: NewOps)) {
656 New->setOperand(I: Idx, New: Op);
657 }
658 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: New)) {
659 // Skip lane 0: an absent start index is implicitly zero.
660 unsigned KnownLane = Lane.getKnownLane();
661 if (KnownLane != 0)
662 addLaneToStartIndex(Steps, Lane: KnownLane, Plan, InsertPt: DefR);
663 }
664 }
665 New->insertBefore(InsertPos: DefR);
666 return New;
667}
668
669/// Convert recipes in region blocks to operate on a single lane 0.
670/// VPReplicateRecipes are converted to single-scalar ones, branch-on-mask is
671/// converted into BranchOnCond and extracts are created as needed.
672static void convertRecipesInRegionBlocksToSingleScalar(VPlan &Plan, Type *IdxTy,
673 VPBlockBase *Entry,
674 ElementCount VF) {
675 VPValue *Idx0 = Plan.getZero(Ty: IdxTy);
676 VPTypeAnalysis TypeInfo(Plan);
677 for (VPBlockBase *VPB : vp_depth_first_shallow(G: Entry)) {
678 for (VPRecipeBase &OldR : make_early_inc_range(Range&: cast<VPBasicBlock>(Val&: *VPB))) {
679 VPBuilder Builder(&OldR);
680 assert(!match(&OldR, m_ExtractElement(m_VPValue(), m_VPValue())) &&
681 "must not contain extracts before conversion");
682
683 // For scalar VF, operands are already scalar; no extraction needed.
684 if (!VF.isScalar()) {
685 for (const auto &[I, Op] : enumerate(First: OldR.operands())) {
686 // Skip operands that don't need extraction: values defined in the
687 // same block (already scalar), or values that are already single
688 // scalars.
689 auto *DefR = Op->getDefiningRecipe();
690 if ((isa_and_present<VPScalarIVStepsRecipe>(Val: DefR) &&
691 DefR->getParent() == VPB) ||
692 vputils::isSingleScalar(VPV: Op))
693 continue;
694
695 // Extract lane zero from values defined outside the region.
696 VPValue *Extract = Builder.createNaryOp(
697 Opcode: Instruction::ExtractElement, Operands: {Op, Idx0}, DL: OldR.getDebugLoc());
698 OldR.setOperand(I, New: Extract);
699 }
700 }
701
702 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &OldR)) {
703 auto *NewR =
704 new VPReplicateRecipe(RepR->getUnderlyingInstr(), RepR->operands(),
705 /* IsSingleScalar=*/true, /*Mask=*/nullptr,
706 *RepR, *RepR, RepR->getDebugLoc());
707 NewR->insertBefore(InsertPos: RepR);
708 RepR->replaceAllUsesWith(New: NewR);
709 RepR->eraseFromParent();
710 } else if (auto *BranchOnMask = dyn_cast<VPBranchOnMaskRecipe>(Val: &OldR)) {
711 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCond,
712 Operands: {BranchOnMask->getOperand(N: 0)},
713 DL: BranchOnMask->getDebugLoc());
714 BranchOnMask->eraseFromParent();
715 } else if (auto *PredPhi = dyn_cast<VPPredInstPHIRecipe>(Val: &OldR)) {
716 VPValue *PredOp = PredPhi->getOperand(N: 0);
717 Type *PredTy = TypeInfo.inferScalarType(V: PredOp);
718 VPValue *PoisonVal = Plan.getOrAddLiveIn(V: PoisonValue::get(T: PredTy));
719
720 VPPhi *NewPhi = Builder.createScalarPhi(IncomingValues: {PoisonVal, PredOp},
721 DL: PredPhi->getDebugLoc());
722 PredPhi->replaceAllUsesWith(New: NewPhi);
723 PredPhi->eraseFromParent();
724 } else {
725 assert((isa<VPScalarIVStepsRecipe>(OldR) ||
726 (isa<VPInstruction>(OldR) &&
727 vputils::isSingleScalar(OldR.getVPSingleValue()))) &&
728 "unexpected unhandled recipe");
729 }
730 }
731 }
732}
733
734/// Update recipes in the cloned blocks rooted at \p NewEntry to match \p Lane,
735/// using the original blocks rooted at \p OldEntry as reference.
736static void processLaneForReplicateRegion(VPlan &Plan, Type *IdxTy,
737 unsigned Lane, VPBasicBlock *OldEntry,
738 VPBasicBlock *NewEntry) {
739 DenseMap<VPValue *, VPValue *> Old2NewVPValues;
740 VPValue *IdxLane = Plan.getConstantInt(Ty: IdxTy, Val: Lane);
741 for (const auto &[OldBB, NewBB] :
742 zip_equal(t: vp_depth_first_shallow(G: OldEntry),
743 u: vp_depth_first_shallow(G: NewEntry))) {
744 for (auto &&[OldR, NewR] :
745 zip_equal(t&: *cast<VPBasicBlock>(Val: OldBB), u&: *cast<VPBasicBlock>(Val: NewBB))) {
746 for (const auto &[OldV, NewV] :
747 zip_equal(t: OldR.definedValues(), u: NewR.definedValues()))
748 Old2NewVPValues[OldV] = NewV;
749
750 // Remap operands to use lane-specific values.
751 for (const auto &[I, OldOp] : enumerate(First: NewR.operands())) {
752 // Use cloned value if operand was defined in the region.
753 if (auto *NewOp = Old2NewVPValues.lookup(Val: OldOp))
754 NewR.setOperand(I, New: NewOp);
755 }
756
757 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Val: &NewR))
758 addLaneToStartIndex(Steps, Lane, Plan, InsertPt: Steps);
759 else if (match(V: &NewR, P: m_ExtractElement(Op0: m_VPValue(), Op1: m_ZeroInt())))
760 NewR.setOperand(I: 1, New: IdxLane);
761 }
762 }
763}
764
765/// Dissolve a single replicate region by replicating its blocks for each lane
766/// of \p VF. The region is disconnected, its blocks are reparented, cloned for
767/// each lane, and reconnected in sequence.
768static void dissolveReplicateRegion(VPRegionBlock *Region, ElementCount VF,
769 VPlan &Plan, Type *IdxTy) {
770 VPBlockBase *FirstLaneEntry = Region->getEntry();
771 VPBlockBase *FirstLaneExiting = Region->getExiting();
772
773 // Disconnect and dissolve the region.
774 VPBlockBase *Predecessor = Region->getSinglePredecessor();
775 assert(Predecessor && "Replicate region must have a single predecessor");
776 VPBlockBase *Successor = Region->getSingleSuccessor();
777 assert(Successor && "Replicate region must have a single successor");
778 VPBlockUtils::disconnectBlocks(From: Predecessor, To: Region);
779 VPBlockUtils::disconnectBlocks(From: Region, To: Successor);
780
781 VPRegionBlock *ParentRegion = Region->getParent();
782 for (VPBlockBase *VPB : vp_depth_first_shallow(G: FirstLaneEntry))
783 VPB->setParent(ParentRegion);
784
785 // Process the original blocks for lane 0: converting their recipes to
786 // single-scalar.
787 convertRecipesInRegionBlocksToSingleScalar(Plan, IdxTy, Entry: FirstLaneEntry, VF);
788
789 // Clone converted blocks for remaining lanes and process each in reverse
790 // order, connecting each lane's Exiting block to the subsequent lane's entry.
791 VPBlockBase *NextLaneEntry = Successor;
792 unsigned NumLanes = VF.getFixedValue();
793 for (int Lane = NumLanes - 1; Lane > 0; --Lane) {
794 const auto &[CurrentLaneEntry, CurrentLaneExiting] =
795 VPBlockUtils::cloneFrom(Entry: FirstLaneEntry);
796 for (VPBlockBase *VPB : vp_depth_first_shallow(G: CurrentLaneEntry))
797 VPB->setParent(ParentRegion);
798 processLaneForReplicateRegion(Plan, IdxTy, Lane,
799 OldEntry: cast<VPBasicBlock>(Val: FirstLaneEntry),
800 NewEntry: cast<VPBasicBlock>(Val: CurrentLaneEntry));
801 VPBlockUtils::connectBlocks(From: CurrentLaneExiting, To: NextLaneEntry);
802 NextLaneEntry = CurrentLaneEntry;
803 }
804
805 // Connect Predecessor to FirstLaneEntry, and FirstLaneRegionExit to
806 // NextLaneEntry which is the second lane region entry. The latter is
807 // done last so that earlier clonings from FirstLaneEntry stop at
808 // FirstLaneExiting.
809 VPBlockUtils::connectBlocks(From: Predecessor, To: FirstLaneEntry);
810 VPBlockUtils::connectBlocks(From: FirstLaneExiting, To: NextLaneEntry);
811}
812
813/// Collect and dissolve all replicate regions in the vector loop, replicating
814/// their blocks and recipes for each lane of \p VF.
815static void replicateReplicateRegionsByVF(VPlan &Plan, ElementCount VF,
816 Type *IdxTy) {
817 // Collect all replicate regions before modifying the CFG.
818 SmallVector<VPRegionBlock *> ReplicateRegions;
819 for (VPRegionBlock *Region : VPBlockUtils::blocksOnly<VPRegionBlock>(
820 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
821 // Skip regions with live-outs when vectorizing as packing scalar results
822 // back into vectors is not yet implemented.
823 if (Region->isReplicator() &&
824 (VF.isScalar() || Region->getExitingBasicBlock()->empty()))
825 ReplicateRegions.push_back(Elt: Region);
826 }
827
828 assert((ReplicateRegions.empty() || !VF.isScalable()) &&
829 "cannot replicate across scalable VFs");
830
831 // Dissolve replicate regions by replicating their blocks for each lane.
832 for (VPRegionBlock *Region : ReplicateRegions)
833 dissolveReplicateRegion(Region, VF, Plan, IdxTy);
834
835 VPlanTransforms::mergeBlocksIntoPredecessors(Plan);
836}
837
/// Replicate per-lane recipes in \p Plan for each lane of \p VF: replicating
/// recipes are rewritten into VF single-scalar clones and replicate regions
/// are dissolved into per-lane block chains.
void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
  // Lane indices materialized during replication are 32-bit integers.
  Type *IdxTy = IntegerType::get(
      C&: Plan.getScalarHeader()->getIRBasicBlock()->getContext(), NumBits: 32);

  if (Plan.hasScalarVFOnly()) {
    // When Plan is only unrolled by UF, replicating by VF amounts to dissolving
    // replicate regions.
    replicateReplicateRegionsByVF(Plan, VF, IdxTy);
    return;
  }

  // Visit all VPBBs outside the loop region and directly inside the top-level
  // loop region.
  auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: Plan.getEntry()));
  auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
      Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()));
  auto VPBBsToUnroll =
      concat<VPBasicBlock *>(Ranges&: VPBBsOutsideLoopRegion, Ranges&: VPBBsInsideLoopRegion);
  // A mapping of current VPValue definitions to collections of new VPValues
  // defined per lane. Serves to hook-up potential users of current VPValue
  // definition that are replicated-per-VF later.
  DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
  // The removal of current recipes being replaced by new ones needs to be
  // delayed after Def2LaneDefs is no longer in use.
  SmallVector<VPRecipeBase *> ToRemove;
  for (VPBasicBlock *VPBB : VPBBsToUnroll) {
    // Use early-inc iteration: the loop body may erase R (see the
    // zero-users case below).
    for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
      // Only replicate VPInstructions, VPReplicateRecipes and
      // VPScalarIVStepsRecipes. Skip replicate recipes that are already
      // single-scalar, and VPInstructions that neither generate a value per
      // all lanes nor have the Unpack opcode.
      if (!isa<VPInstruction, VPReplicateRecipe, VPScalarIVStepsRecipe>(Val: &R) ||
          (isa<VPReplicateRecipe>(Val: &R) &&
           cast<VPReplicateRecipe>(Val: &R)->isSingleScalar()) ||
          (isa<VPInstruction>(Val: &R) &&
           !cast<VPInstruction>(Val: &R)->doesGeneratePerAllLanes() &&
           cast<VPInstruction>(Val: &R)->getOpcode() != VPInstruction::Unpack))
        continue;

      auto *DefR = cast<VPSingleDefRecipe>(Val: &R);
      VPBuilder Builder(DefR);
      if (DefR->getNumUsers() == 0) {
        // No users to hook up: create the single-scalar clones for all lanes
        // and erase DefR immediately.
        for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
          cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs);
        DefR->eraseFromParent();
        continue;
      }
      // Create single-scalar version of DefR for all lanes.
      SmallVector<VPValue *> LaneDefs;
      for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
        LaneDefs.push_back(
            Elt: cloneForLane(Plan, Builder, IdxTy, DefR, Lane: VPLane(I), Def2LaneDefs));

      Def2LaneDefs[DefR] = LaneDefs;
      // Users that only demand the first lane can use the definition for lane
      // 0.
      DefR->replaceUsesWithIf(New: LaneDefs[0], ShouldReplace: [DefR](VPUser &U, unsigned) {
        return U.usesFirstLaneOnly(Op: DefR);
      });

      // Update each build vector user that currently has DefR as its only
      // operand, to have all LaneDefs as its operands.
      for (VPUser *U : to_vector(Range: DefR->users())) {
        auto *VPI = dyn_cast<VPInstruction>(Val: U);
        if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
                     VPI->getOpcode() != VPInstruction::BuildStructVector))
          continue;
        assert(VPI->getNumOperands() == 1 &&
               "Build(Struct)Vector must have a single operand before "
               "replicating by VF");
        // Replace the single DefR operand with the lane-0 def and append the
        // remaining lanes' defs as additional operands.
        VPI->setOperand(I: 0, New: LaneDefs[0]);
        for (VPValue *LaneDef : drop_begin(RangeOrContainer&: LaneDefs))
          VPI->addOperand(Operand: LaneDef);
      }
      // DefR may still be looked up via Def2LaneDefs, so defer its removal.
      ToRemove.push_back(Elt: DefR);
    }
  }
  // Erase the replaced recipes, in reverse order of collection.
  for (auto *R : reverse(C&: ToRemove))
    R->eraseFromParent();

  // Finally dissolve any replicate regions into per-lane block chains.
  replicateReplicateRegionsByVF(Plan, VF, IdxTy);
}
918