1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlan.h"
15#include "VPlanAnalysis.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Twine.h"
19#include "llvm/Analysis/IVDescriptors.h"
20#include "llvm/IR/BasicBlock.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/Instruction.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/Type.h"
25#include "llvm/IR/Value.h"
26#include "llvm/Support/Casting.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/Debug.h"
29#include "llvm/Support/raw_ostream.h"
30#include "llvm/Transforms/Utils/BasicBlockUtils.h"
31#include "llvm/Transforms/Utils/LoopUtils.h"
32#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
33#include <cassert>
34
35using namespace llvm;
36
37using VectorParts = SmallVector<Value *, 2>;
38
39namespace llvm {
40extern cl::opt<bool> EnableVPlanNativePath;
41}
42extern cl::opt<unsigned> ForceTargetInstructionCost;
43
44#define LV_NAME "loop-vectorize"
45#define DEBUG_TYPE LV_NAME
46
47bool VPRecipeBase::mayWriteToMemory() const {
48 switch (getVPDefID()) {
49 case VPInterleaveSC:
50 return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0;
51 case VPWidenStoreEVLSC:
52 case VPWidenStoreSC:
53 return true;
54 case VPReplicateSC:
55 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
56 ->mayWriteToMemory();
57 case VPWidenCallSC:
58 return !cast<VPWidenCallRecipe>(Val: this)
59 ->getCalledScalarFunction()
60 ->onlyReadsMemory();
61 case VPBranchOnMaskSC:
62 case VPScalarIVStepsSC:
63 case VPPredInstPHISC:
64 return false;
65 case VPBlendSC:
66 case VPReductionEVLSC:
67 case VPReductionSC:
68 case VPWidenCanonicalIVSC:
69 case VPWidenCastSC:
70 case VPWidenGEPSC:
71 case VPWidenIntOrFpInductionSC:
72 case VPWidenLoadEVLSC:
73 case VPWidenLoadSC:
74 case VPWidenPHISC:
75 case VPWidenSC:
76 case VPWidenSelectSC: {
77 const Instruction *I =
78 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
79 (void)I;
80 assert((!I || !I->mayWriteToMemory()) &&
81 "underlying instruction may write to memory");
82 return false;
83 }
84 default:
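    // Conservatively assume any other recipe may write to memory.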
85 return true;
86 }
87}
88
89bool VPRecipeBase::mayReadFromMemory() const {
90 switch (getVPDefID()) {
91 case VPWidenLoadEVLSC:
92 case VPWidenLoadSC:
93 return true;
94 case VPReplicateSC:
95 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
96 ->mayReadFromMemory();
97 case VPWidenCallSC:
98 return !cast<VPWidenCallRecipe>(Val: this)
99 ->getCalledScalarFunction()
100 ->onlyWritesMemory();
101 case VPBranchOnMaskSC:
102 case VPPredInstPHISC:
103 case VPScalarIVStepsSC:
104 case VPWidenStoreEVLSC:
105 case VPWidenStoreSC:
106 return false;
107 case VPBlendSC:
108 case VPReductionEVLSC:
109 case VPReductionSC:
110 case VPWidenCanonicalIVSC:
111 case VPWidenCastSC:
112 case VPWidenGEPSC:
113 case VPWidenIntOrFpInductionSC:
114 case VPWidenPHISC:
115 case VPWidenSC:
116 case VPWidenSelectSC: {
117 const Instruction *I =
118 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
119 (void)I;
120 assert((!I || !I->mayReadFromMemory()) &&
121 "underlying instruction may read from memory");
122 return false;
123 }
124 default:
125 return true;
126 }
127}
128
129bool VPRecipeBase::mayHaveSideEffects() const {
130 switch (getVPDefID()) {
131 case VPDerivedIVSC:
132 case VPPredInstPHISC:
133 case VPScalarCastSC:
134 return false;
135 case VPInstructionSC:
136 switch (cast<VPInstruction>(Val: this)->getOpcode()) {
137 case Instruction::Or:
138 case Instruction::ICmp:
139 case Instruction::Select:
140 case VPInstruction::Not:
141 case VPInstruction::CalculateTripCountMinusVF:
142 case VPInstruction::CanonicalIVIncrementForPart:
143 case VPInstruction::ExtractFromEnd:
144 case VPInstruction::FirstOrderRecurrenceSplice:
145 case VPInstruction::LogicalAnd:
146 case VPInstruction::PtrAdd:
147 return false;
148 default:
149 return true;
150 }
151 case VPWidenCallSC: {
152 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
153 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
154 }
155 case VPBlendSC:
156 case VPReductionEVLSC:
157 case VPReductionSC:
158 case VPScalarIVStepsSC:
159 case VPWidenCanonicalIVSC:
160 case VPWidenCastSC:
161 case VPWidenGEPSC:
162 case VPWidenIntOrFpInductionSC:
163 case VPWidenPHISC:
164 case VPWidenPointerInductionSC:
165 case VPWidenSC:
166 case VPWidenSelectSC: {
167 const Instruction *I =
168 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
169 (void)I;
170 assert((!I || !I->mayHaveSideEffects()) &&
171 "underlying instruction has side-effects");
172 return false;
173 }
174 case VPInterleaveSC:
175 return mayWriteToMemory();
176 case VPWidenLoadEVLSC:
177 case VPWidenLoadSC:
178 case VPWidenStoreEVLSC:
179 case VPWidenStoreSC:
180 assert(
181 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
182 mayWriteToMemory() &&
183 "mayHaveSideffects result for ingredient differs from this "
184 "implementation");
185 return mayWriteToMemory();
186 case VPReplicateSC: {
187 auto *R = cast<VPReplicateRecipe>(Val: this);
188 return R->getUnderlyingInstr()->mayHaveSideEffects();
189 }
190 default:
191 return true;
192 }
193}
194
195void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
196 VPValue *ExitValue = getOperand(N: 0);
197 auto Lane = vputils::isUniformAfterVectorization(VPV: ExitValue)
198 ? VPLane::getFirstLane()
199 : VPLane::getLastLaneForVF(VF: State.VF);
200 VPBasicBlock *MiddleVPBB =
201 cast<VPBasicBlock>(Val: Plan.getVectorLoopRegion()->getSingleSuccessor());
202 VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
203 auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
  // Values leaving the vector loop reach live-out phis in the exiting block
  // via the middle block.
206 auto *PredVPBB = !ExitingVPBB || ExitingVPBB->getEnclosingLoopRegion()
207 ? MiddleVPBB
208 : ExitingVPBB;
209 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
210 // Set insertion point in PredBB in case an extract needs to be generated.
211 // TODO: Model extracts explicitly.
212 State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt());
213 Value *V = State.get(Def: ExitValue, Instance: VPIteration(State.UF - 1, Lane));
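  // Update the phi's existing incoming value for PredBB, or add a new one.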
214 if (Phi->getBasicBlockIndex(BB: PredBB) != -1)
215 Phi->setIncomingValueForBlock(BB: PredBB, V);
216 else
217 Phi->addIncoming(V, BB: PredBB);
218}
219
220#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
221void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
222 O << "Live-out ";
223 getPhi()->printAsOperand(O);
224 O << " = ";
225 getOperand(0)->printAsOperand(O, SlotTracker);
226 O << "\n";
227}
228#endif
229
230void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(InsertPos->getParent() &&
233 "Insertion position not in any VPBasicBlock");
234 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
235}
236
237void VPRecipeBase::insertBefore(VPBasicBlock &BB,
238 iplist<VPRecipeBase>::iterator I) {
239 assert(!Parent && "Recipe already in some VPBasicBlock");
240 assert(I == BB.end() || I->getParent() == &BB);
241 BB.insert(Recipe: this, InsertPt: I);
242}
243
244void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
245 assert(!Parent && "Recipe already in some VPBasicBlock");
246 assert(InsertPos->getParent() &&
247 "Insertion position not in any VPBasicBlock");
248 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
249}
250
251void VPRecipeBase::removeFromParent() {
252 assert(getParent() && "Recipe not in any VPBasicBlock");
253 getParent()->getRecipeList().remove(IT: getIterator());
254 Parent = nullptr;
255}
256
257iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
258 assert(getParent() && "Recipe not in any VPBasicBlock");
259 return getParent()->getRecipeList().erase(where: getIterator());
260}
261
262void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
263 removeFromParent();
264 insertAfter(InsertPos);
265}
266
267void VPRecipeBase::moveBefore(VPBasicBlock &BB,
268 iplist<VPRecipeBase>::iterator I) {
269 removeFromParent();
270 insertBefore(BB, I);
271}
272
273/// Return the underlying instruction to be used for computing \p R's cost via
274/// the legacy cost model. Return nullptr if there's no suitable instruction.
275static Instruction *getInstructionForCost(const VPRecipeBase *R) {
276 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: R))
277 return dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
278 if (auto *IG = dyn_cast<VPInterleaveRecipe>(Val: R))
279 return IG->getInsertPos();
280 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: R))
281 return &WidenMem->getIngredient();
282 return nullptr;
283}
284
285InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
286 if (auto *UI = getInstructionForCost(R: this))
287 if (Ctx.skipCostComputation(UI, IsVector: VF.isVector()))
288 return 0;
289
290 InstructionCost RecipeCost = computeCost(VF, Ctx);
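  // If ForceTargetInstructionCost was specified, it overrides any valid
  // computed cost.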
291 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
292 RecipeCost.isValid())
293 RecipeCost = InstructionCost(ForceTargetInstructionCost);
294
295 LLVM_DEBUG({
296 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
297 dump();
298 });
299 return RecipeCost;
300}
301
302InstructionCost VPRecipeBase::computeCost(ElementCount VF,
303 VPCostContext &Ctx) const {
  // Compute the cost of the recipe by falling back to the legacy cost model
  // using the underlying instruction. If there is no underlying instruction,
  // return 0.
307 Instruction *UI = getInstructionForCost(R: this);
308 if (UI && isa<VPReplicateRecipe>(Val: this)) {
    // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
    // transform; avoid computing its cost multiple times for now.
311 Ctx.SkipCostComputation.insert(Ptr: UI);
312 }
313 return UI ? Ctx.getLegacyCost(UI, VF) : 0;
314}
315
316FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
317 assert(OpType == OperationType::FPMathOp &&
318 "recipe doesn't have fast math flags");
319 FastMathFlags Res;
320 Res.setAllowReassoc(FMFs.AllowReassoc);
321 Res.setNoNaNs(FMFs.NoNaNs);
322 Res.setNoInfs(FMFs.NoInfs);
323 Res.setNoSignedZeros(FMFs.NoSignedZeros);
324 Res.setAllowReciprocal(FMFs.AllowReciprocal);
325 Res.setAllowContract(FMFs.AllowContract);
326 Res.setApproxFunc(FMFs.ApproxFunc);
327 return Res;
328}
329
330VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
331 VPValue *A, VPValue *B, DebugLoc DL,
332 const Twine &Name)
333 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
334 Pred, DL),
335 Opcode(Opcode), Name(Name.str()) {
336 assert(Opcode == Instruction::ICmp &&
337 "only ICmp predicates supported at the moment");
338}
339
340VPInstruction::VPInstruction(unsigned Opcode,
341 std::initializer_list<VPValue *> Operands,
342 FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
343 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
344 Opcode(Opcode), Name(Name.str()) {
345 // Make sure the VPInstruction is a floating-point operation.
346 assert(isFPMathOp() && "this op can't take fast-math flags");
347}
348
349bool VPInstruction::doesGeneratePerAllLanes() const {
350 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
351}
352
353bool VPInstruction::canGenerateScalarForFirstLane() const {
354 if (Instruction::isBinaryOp(Opcode: getOpcode()))
355 return true;
356 if (isSingleScalar() || isVectorToScalar())
357 return true;
358 switch (Opcode) {
359 case Instruction::ICmp:
360 case VPInstruction::BranchOnCond:
361 case VPInstruction::BranchOnCount:
362 case VPInstruction::CalculateTripCountMinusVF:
363 case VPInstruction::CanonicalIVIncrementForPart:
364 case VPInstruction::PtrAdd:
365 case VPInstruction::ExplicitVectorLength:
366 return true;
367 default:
368 return false;
369 }
370}
371
372Value *VPInstruction::generatePerLane(VPTransformState &State,
373 const VPIteration &Lane) {
374 IRBuilderBase &Builder = State.Builder;
375
376 assert(getOpcode() == VPInstruction::PtrAdd &&
377 "only PtrAdd opcodes are supported for now");
378 return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Instance: Lane),
379 Offset: State.get(Def: getOperand(N: 1), Instance: Lane), Name);
380}
381
382Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
383 IRBuilderBase &Builder = State.Builder;
384
385 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
386 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
387 Value *A = State.get(Def: getOperand(N: 0), Part, IsScalar: OnlyFirstLaneUsed);
388 Value *B = State.get(Def: getOperand(N: 1), Part, IsScalar: OnlyFirstLaneUsed);
389 auto *Res =
390 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
391 if (auto *I = dyn_cast<Instruction>(Val: Res))
392 setFlags(I);
393 return Res;
394 }
395
396 switch (getOpcode()) {
397 case VPInstruction::Not: {
398 Value *A = State.get(Def: getOperand(N: 0), Part);
399 return Builder.CreateNot(V: A, Name);
400 }
401 case Instruction::ICmp: {
402 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
403 Value *A = State.get(Def: getOperand(N: 0), Part, IsScalar: OnlyFirstLaneUsed);
404 Value *B = State.get(Def: getOperand(N: 1), Part, IsScalar: OnlyFirstLaneUsed);
405 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
406 }
407 case Instruction::Select: {
408 Value *Cond = State.get(Def: getOperand(N: 0), Part);
409 Value *Op1 = State.get(Def: getOperand(N: 1), Part);
410 Value *Op2 = State.get(Def: getOperand(N: 2), Part);
411 return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name);
412 }
413 case VPInstruction::ActiveLaneMask: {
414 // Get first lane of vector induction variable.
415 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0));
    // Get the original loop trip count.
417 Value *ScalarTC = State.get(Def: getOperand(N: 1), Instance: VPIteration(Part, 0));
418
419 // If this part of the active lane mask is scalar, generate the CMP directly
420 // to avoid unnecessary extracts.
421 if (State.VF.isScalar())
422 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
423 Name);
424
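    // Otherwise emit the llvm.get.active.lane.mask intrinsic, which produces
    // a <VF x i1> predicate.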
425 auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext());
426 auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF);
427 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
428 Types: {PredTy, ScalarTC->getType()},
429 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
430 }
431 case VPInstruction::FirstOrderRecurrenceSplice: {
432 // Generate code to combine the previous and current values in vector v3.
433 //
434 // vector.ph:
435 // v_init = vector(..., ..., ..., a[-1])
436 // br vector.body
437 //
438 // vector.body
439 // i = phi [0, vector.ph], [i+4, vector.body]
440 // v1 = phi [v_init, vector.ph], [v2, vector.body]
441 // v2 = a[i, i+1, i+2, i+3];
442 // v3 = vector(v1(3), v2(0, 1, 2))
443
444 // For the first part, use the recurrence phi (v1), otherwise v2.
445 auto *V1 = State.get(Def: getOperand(N: 0), Part: 0);
446 Value *PartMinus1 = Part == 0 ? V1 : State.get(Def: getOperand(N: 1), Part: Part - 1);
447 if (!PartMinus1->getType()->isVectorTy())
448 return PartMinus1;
449 Value *V2 = State.get(Def: getOperand(N: 1), Part);
450 return Builder.CreateVectorSplice(V1: PartMinus1, V2, Imm: -1, Name);
451 }
452 case VPInstruction::CalculateTripCountMinusVF: {
453 if (Part != 0)
454 return State.get(Def: this, Part: 0, /*IsScalar*/ true);
455
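    // Materialize TC > (VF * UF) ? TC - (VF * UF) : 0 using a sub, compare
    // and select.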
456 Value *ScalarTC = State.get(Def: getOperand(N: 0), Instance: {0, 0});
457 Value *Step =
458 createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: State.UF);
459 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step);
460 Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step);
461 Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0);
462 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
463 }
464 case VPInstruction::ExplicitVectorLength: {
465 // Compute EVL
466 auto GetEVL = [=](VPTransformState &State, Value *AVL) {
467 assert(AVL->getType()->isIntegerTy() &&
468 "Requested vector length should be an integer.");
469
470 // TODO: Add support for MaxSafeDist for correct loop emission.
471 assert(State.VF.isScalable() && "Expected scalable vector factor.");
472 Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue());
473
474 Value *EVL = State.Builder.CreateIntrinsic(
475 RetTy: State.Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
476 Args: {AVL, VFArg, State.Builder.getTrue()});
477 return EVL;
478 };
    // TODO: Restructure this code with an explicit remainder loop; vsetvli can
    // then be outside of the main loop.
481 assert(Part == 0 && "No unrolling expected for predicated vectorization.");
482 // Compute VTC - IV as the AVL (requested vector length).
483 Value *Index = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
484 Value *TripCount = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
485 Value *AVL = State.Builder.CreateSub(LHS: TripCount, RHS: Index);
486 Value *EVL = GetEVL(State, AVL);
487 return EVL;
488 }
489 case VPInstruction::CanonicalIVIncrementForPart: {
490 auto *IV = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
491 if (Part == 0)
492 return IV;
493
    // The canonical IV is incremented by the vectorization factor (number of
    // SIMD elements) times the unroll part.
496 Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part);
497 return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(),
498 HasNSW: hasNoSignedWrap());
499 }
500 case VPInstruction::BranchOnCond: {
501 if (Part != 0)
502 return nullptr;
503
504 Value *Cond = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0));
    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to the backward destination for exiting blocks now
    // and to the forward destination(s) later when they are created.
508 BranchInst *CondBr =
509 Builder.CreateCondBr(Cond, True: Builder.GetInsertBlock(), False: nullptr);
510 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
511 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
512
513 if (!getParent()->isExiting())
514 return CondBr;
515
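    // For exiting blocks, the backward destination is the header of the
    // enclosing region; hook it up now.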
516 VPRegionBlock *ParentRegion = getParent()->getParent();
517 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
518 CondBr->setSuccessor(idx: 1, NewSucc: State.CFG.VPBB2IRBB[Header]);
519 return CondBr;
520 }
521 case VPInstruction::BranchOnCount: {
522 if (Part != 0)
523 return nullptr;
524 // First create the compare.
525 Value *IV = State.get(Def: getOperand(N: 0), Part, /*IsScalar*/ true);
526 Value *TC = State.get(Def: getOperand(N: 1), Part, /*IsScalar*/ true);
527 Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC);
528
529 // Now create the branch.
530 auto *Plan = getParent()->getPlan();
531 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
532 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
533
534 // Replace the temporary unreachable terminator with a new conditional
535 // branch, hooking it up to backward destination (the header) now and to the
536 // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as its first argument, so we
    // pass the current block and reset successor 0 to nullptr afterwards.
539 BranchInst *CondBr = Builder.CreateCondBr(Cond, True: Builder.GetInsertBlock(),
540 False: State.CFG.VPBB2IRBB[Header]);
541 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
542 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
543 return CondBr;
544 }
545 case VPInstruction::ComputeReductionResult: {
546 if (Part != 0)
547 return State.get(Def: this, Part: 0, /*IsScalar*/ true);
548
549 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
550 // and will be removed by breaking up the recipe further.
551 auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0));
552 auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
553 // Get its reduction variable descriptor.
554 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
555
556 RecurKind RK = RdxDesc.getRecurrenceKind();
557
558 VPValue *LoopExitingDef = getOperand(N: 1);
559 Type *PhiTy = OrigPhi->getType();
560 VectorParts RdxParts(State.UF);
561 for (unsigned Part = 0; Part < State.UF; ++Part)
562 RdxParts[Part] = State.get(Def: LoopExitingDef, Part, IsScalar: PhiR->isInLoop());
563
564 // If the vector reduction can be performed in a smaller type, we truncate
565 // then extend the loop exit value to enable InstCombine to evaluate the
566 // entire expression in the smaller type.
567 // TODO: Handle this in truncateToMinBW.
568 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
569 Type *RdxVecTy = VectorType::get(ElementType: RdxDesc.getRecurrenceType(), EC: State.VF);
570 for (unsigned Part = 0; Part < State.UF; ++Part)
571 RdxParts[Part] = Builder.CreateTrunc(V: RdxParts[Part], DestTy: RdxVecTy);
572 }
573 // Reduce all of the unrolled parts into a single vector.
574 Value *ReducedPartRdx = RdxParts[0];
575 unsigned Op = RecurrenceDescriptor::getOpcode(Kind: RK);
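    // AnyOf reductions combine the per-part values with a plain Or.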
576 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK))
577 Op = Instruction::Or;
578
579 if (PhiR->isOrdered()) {
580 ReducedPartRdx = RdxParts[State.UF - 1];
581 } else {
582 // Floating-point operations should have some FMF to enable the reduction.
583 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
584 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
585 for (unsigned Part = 1; Part < State.UF; ++Part) {
586 Value *RdxPart = RdxParts[Part];
587 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
588 ReducedPartRdx = Builder.CreateBinOp(
589 Opc: (Instruction::BinaryOps)Op, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
590 else
591 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
592 }
593 }
594
    // Create the reduction after the loop. Note that in-loop reductions create
    // the target reduction in the loop using a Reduction recipe.
597 if ((State.VF.isVector() ||
598 RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) &&
599 !PhiR->isInLoop()) {
600 ReducedPartRdx =
601 createTargetReduction(B&: Builder, Desc: RdxDesc, Src: ReducedPartRdx, OrigPhi);
602 // If the reduction can be performed in a smaller type, we need to extend
603 // the reduction to the wider type before we branch to the original loop.
604 if (PhiTy != RdxDesc.getRecurrenceType())
605 ReducedPartRdx = RdxDesc.isSigned()
606 ? Builder.CreateSExt(V: ReducedPartRdx, DestTy: PhiTy)
607 : Builder.CreateZExt(V: ReducedPartRdx, DestTy: PhiTy);
608 }
609
610 // If there were stores of the reduction value to a uniform memory address
611 // inside the loop, create the final store here.
612 if (StoreInst *SI = RdxDesc.IntermediateStore) {
613 auto *NewSI = Builder.CreateAlignedStore(
614 Val: ReducedPartRdx, Ptr: SI->getPointerOperand(), Align: SI->getAlign());
615 propagateMetadata(I: NewSI, VL: SI);
616 }
617
618 return ReducedPartRdx;
619 }
620 case VPInstruction::ExtractFromEnd: {
621 if (Part != 0)
622 return State.get(Def: this, Part: 0, /*IsScalar*/ true);
623
624 auto *CI = cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue());
625 unsigned Offset = CI->getZExtValue();
626 assert(Offset > 0 && "Offset from end must be positive");
627 Value *Res;
628 if (State.VF.isVector()) {
629 assert(Offset <= State.VF.getKnownMinValue() &&
630 "invalid offset to extract from");
631 // Extract lane VF - Offset from the operand.
632 Res = State.get(
633 Def: getOperand(N: 0),
634 Instance: VPIteration(State.UF - 1, VPLane::getLaneFromEnd(VF: State.VF, Offset)));
635 } else {
636 assert(Offset <= State.UF && "invalid offset to extract from");
      // When the loop is unrolled without vectorizing, retrieve part
      // UF - Offset.
638 Res = State.get(Def: getOperand(N: 0), Part: State.UF - Offset);
639 }
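    // Only rename the result if an extractelement was actually generated.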
640 if (isa<ExtractElementInst>(Val: Res))
641 Res->setName(Name);
642 return Res;
643 }
644 case VPInstruction::LogicalAnd: {
645 Value *A = State.get(Def: getOperand(N: 0), Part);
646 Value *B = State.get(Def: getOperand(N: 1), Part);
647 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
648 }
649 case VPInstruction::PtrAdd: {
650 assert(vputils::onlyFirstLaneUsed(this) &&
651 "can only generate first lane for PtrAdd");
652 Value *Ptr = State.get(Def: getOperand(N: 0), Part, /* IsScalar */ true);
653 Value *Addend = State.get(Def: getOperand(N: 1), Part, /* IsScalar */ true);
654 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name);
655 }
656 case VPInstruction::ResumePhi: {
657 if (Part != 0)
658 return State.get(Def: this, Part: 0, /*IsScalar*/ true);
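    // Create a phi merging the value from the VPlan-modelled predecessor with
    // the value reaching the block from all other, not yet modelled,
    // predecessors.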
659 Value *IncomingFromVPlanPred =
660 State.get(Def: getOperand(N: 0), Part, /* IsScalar */ true);
661 Value *IncomingFromOtherPreds =
662 State.get(Def: getOperand(N: 1), Part, /* IsScalar */ true);
663 auto *NewPhi =
664 Builder.CreatePHI(Ty: IncomingFromOtherPreds->getType(), NumReservedValues: 2, Name);
665 BasicBlock *VPlanPred =
666 State.CFG
667 .VPBB2IRBB[cast<VPBasicBlock>(Val: getParent()->getSinglePredecessor())];
668 NewPhi->addIncoming(V: IncomingFromVPlanPred, BB: VPlanPred);
669 for (auto *OtherPred : predecessors(BB: Builder.GetInsertBlock())) {
670 assert(OtherPred != VPlanPred &&
671 "VPlan predecessors should not be connected yet");
672 NewPhi->addIncoming(V: IncomingFromOtherPreds, BB: OtherPred);
673 }
674 return NewPhi;
675 }
676
677 default:
678 llvm_unreachable("Unsupported opcode for instruction");
679 }
680}
681
682bool VPInstruction::isVectorToScalar() const {
683 return getOpcode() == VPInstruction::ExtractFromEnd ||
684 getOpcode() == VPInstruction::ComputeReductionResult;
685}
686
687bool VPInstruction::isSingleScalar() const {
688 return getOpcode() == VPInstruction::ResumePhi;
689}
690
691#if !defined(NDEBUG)
692bool VPInstruction::isFPMathOp() const {
693 // Inspired by FPMathOperator::classof. Notable differences are that we don't
694 // support Call, PHI and Select opcodes here yet.
695 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
696 Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
697 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
698 Opcode == Instruction::FCmp || Opcode == Instruction::Select;
699}
700#endif
701
702void VPInstruction::execute(VPTransformState &State) {
703 assert(!State.Instance && "VPInstruction executing an Instance");
704 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
705 assert((hasFastMathFlags() == isFPMathOp() ||
706 getOpcode() == Instruction::Select) &&
707 "Recipe not a FPMathOp but has fast-math flags?");
708 if (hasFastMathFlags())
709 State.Builder.setFastMathFlags(getFastMathFlags());
710 State.setDebugLocFrom(getDebugLoc());
711 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
712 (vputils::onlyFirstLaneUsed(Def: this) ||
713 isVectorToScalar() || isSingleScalar());
714 bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
715 bool OnlyFirstPartUsed = vputils::onlyFirstPartUsed(Def: this);
716 for (unsigned Part = 0; Part < State.UF; ++Part) {
717 if (GeneratesPerAllLanes) {
718 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
719 Lane != NumLanes; ++Lane) {
720 Value *GeneratedValue = generatePerLane(State, Lane: VPIteration(Part, Lane));
721 assert(GeneratedValue && "generatePerLane must produce a value");
722 State.set(Def: this, V: GeneratedValue, Instance: VPIteration(Part, Lane));
723 }
724 continue;
725 }
726
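    // When only the first part is used, reuse the value generated for part 0
    // for all remaining parts.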
727 if (Part != 0 && OnlyFirstPartUsed && hasResult()) {
728 Value *Part0 = State.get(Def: this, Part: 0, /*IsScalar*/ GeneratesPerFirstLaneOnly);
729 State.set(Def: this, V: Part0, Part,
730 /*IsScalar*/ GeneratesPerFirstLaneOnly);
731 continue;
732 }
733
734 Value *GeneratedValue = generatePerPart(State, Part);
735 if (!hasResult())
736 continue;
737 assert(GeneratedValue && "generatePerPart must produce a value");
738 assert((GeneratedValue->getType()->isVectorTy() ==
739 !GeneratesPerFirstLaneOnly ||
740 State.VF.isScalar()) &&
741 "scalar value but not only first lane defined");
742 State.set(Def: this, V: GeneratedValue, Part,
743 /*IsScalar*/ GeneratesPerFirstLaneOnly);
744 }
745}
746
747bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
748 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
749 if (Instruction::isBinaryOp(Opcode: getOpcode()))
750 return vputils::onlyFirstLaneUsed(Def: this);
751
752 switch (getOpcode()) {
753 default:
754 return false;
755 case Instruction::ICmp:
756 case VPInstruction::PtrAdd:
757 // TODO: Cover additional opcodes.
758 return vputils::onlyFirstLaneUsed(Def: this);
759 case VPInstruction::ActiveLaneMask:
760 case VPInstruction::ExplicitVectorLength:
761 case VPInstruction::CalculateTripCountMinusVF:
762 case VPInstruction::CanonicalIVIncrementForPart:
763 case VPInstruction::BranchOnCount:
764 case VPInstruction::BranchOnCond:
765 case VPInstruction::ResumePhi:
766 return true;
767 };
768 llvm_unreachable("switch should return");
769}
770
771bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
772 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
773 if (Instruction::isBinaryOp(Opcode: getOpcode()))
774 return vputils::onlyFirstPartUsed(Def: this);
775
776 switch (getOpcode()) {
777 default:
778 return false;
779 case Instruction::ICmp:
780 case Instruction::Select:
781 return vputils::onlyFirstPartUsed(Def: this);
782 case VPInstruction::BranchOnCount:
783 case VPInstruction::BranchOnCond:
784 case VPInstruction::CanonicalIVIncrementForPart:
785 return true;
786 };
787 llvm_unreachable("switch should return");
788}
789
790#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
791void VPInstruction::dump() const {
792 VPSlotTracker SlotTracker(getParent()->getPlan());
793 print(dbgs(), "", SlotTracker);
794}
795
796void VPInstruction::print(raw_ostream &O, const Twine &Indent,
797 VPSlotTracker &SlotTracker) const {
798 O << Indent << "EMIT ";
799
800 if (hasResult()) {
801 printAsOperand(O, SlotTracker);
802 O << " = ";
803 }
804
805 switch (getOpcode()) {
806 case VPInstruction::Not:
807 O << "not";
808 break;
809 case VPInstruction::SLPLoad:
810 O << "combined load";
811 break;
812 case VPInstruction::SLPStore:
813 O << "combined store";
814 break;
815 case VPInstruction::ActiveLaneMask:
816 O << "active lane mask";
817 break;
818 case VPInstruction::ResumePhi:
819 O << "resume-phi";
820 break;
821 case VPInstruction::ExplicitVectorLength:
822 O << "EXPLICIT-VECTOR-LENGTH";
823 break;
824 case VPInstruction::FirstOrderRecurrenceSplice:
825 O << "first-order splice";
826 break;
827 case VPInstruction::BranchOnCond:
828 O << "branch-on-cond";
829 break;
830 case VPInstruction::CalculateTripCountMinusVF:
831 O << "TC > VF ? TC - VF : 0";
832 break;
833 case VPInstruction::CanonicalIVIncrementForPart:
834 O << "VF * Part +";
835 break;
836 case VPInstruction::BranchOnCount:
837 O << "branch-on-count";
838 break;
839 case VPInstruction::ExtractFromEnd:
840 O << "extract-from-end";
841 break;
842 case VPInstruction::ComputeReductionResult:
843 O << "compute-reduction-result";
844 break;
845 case VPInstruction::LogicalAnd:
846 O << "logical-and";
847 break;
848 case VPInstruction::PtrAdd:
849 O << "ptradd";
850 break;
851 default:
852 O << Instruction::getOpcodeName(getOpcode());
853 }
854
855 printFlags(O);
856 printOperands(O, SlotTracker);
857
858 if (auto DL = getDebugLoc()) {
859 O << ", !dbg ";
860 DL.print(O);
861 }
862}
863#endif
864
865void VPWidenCallRecipe::execute(VPTransformState &State) {
866 assert(State.VF.isVector() && "not widening");
867 Function *CalledScalarFn = getCalledScalarFunction();
868 assert(!isDbgInfoIntrinsic(CalledScalarFn->getIntrinsicID()) &&
869 "DbgInfoIntrinsic should have been dropped during VPlan construction");
870 State.setDebugLocFrom(getDebugLoc());
871
872 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
873 FunctionType *VFTy = nullptr;
874 if (Variant)
875 VFTy = Variant->getFunctionType();
876 for (unsigned Part = 0; Part < State.UF; ++Part) {
877 SmallVector<Type *, 2> TysForDecl;
878 // Add return type if intrinsic is overloaded on it.
879 if (UseIntrinsic &&
880 isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1))
881 TysForDecl.push_back(Elt: VectorType::get(
882 ElementType: CalledScalarFn->getReturnType()->getScalarType(), EC: State.VF));
883 SmallVector<Value *, 4> Args;
884 for (const auto &I : enumerate(First: arg_operands())) {
885 // Some intrinsics have a scalar argument - don't replace it with a
886 // vector.
887 Value *Arg;
888 if (UseIntrinsic &&
889 isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index()))
890 Arg = State.get(Def: I.value(), Instance: VPIteration(0, 0));
891 // Some vectorized function variants may also take a scalar argument,
892 // e.g. linear parameters for pointers. This needs to be the scalar value
893 // from the start of the respective part when interleaving.
894 else if (VFTy && !VFTy->getParamType(i: I.index())->isVectorTy())
895 Arg = State.get(Def: I.value(), Instance: VPIteration(Part, 0));
896 else
897 Arg = State.get(Def: I.value(), Part);
898 if (UseIntrinsic &&
899 isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index()))
900 TysForDecl.push_back(Elt: Arg->getType());
901 Args.push_back(Elt: Arg);
902 }
903
904 Function *VectorF;
905 if (UseIntrinsic) {
906 // Use vector version of the intrinsic.
907 Module *M = State.Builder.GetInsertBlock()->getModule();
908 VectorF = Intrinsic::getDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
909 assert(VectorF && "Can't retrieve vector intrinsic.");
910 } else {
911#ifndef NDEBUG
912 assert(Variant != nullptr && "Can't create vector function.");
913#endif
914 VectorF = Variant;
915 }
916
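    // If the recipe wraps an original call, carry over its operand bundles,
    // fast-math flags and metadata.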
917 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingInstr());
918 SmallVector<OperandBundleDef, 1> OpBundles;
919 if (CI)
920 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
921
922 CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);
923
924 if (isa<FPMathOperator>(Val: V))
925 V->copyFastMathFlags(I: CI);
926
927 if (!V->getType()->isVoidTy())
928 State.set(Def: this, V, Part);
929 State.addMetadata(To: V, From: CI);
930 }
931}
932
933#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
934void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
935 VPSlotTracker &SlotTracker) const {
936 O << Indent << "WIDEN-CALL ";
937
938 Function *CalledFn = getCalledScalarFunction();
939 if (CalledFn->getReturnType()->isVoidTy())
940 O << "void ";
941 else {
942 printAsOperand(O, SlotTracker);
943 O << " = ";
944 }
945
946 O << "call @" << CalledFn->getName() << "(";
947 interleaveComma(arg_operands(), O, [&O, &SlotTracker](VPValue *Op) {
948 Op->printAsOperand(O, SlotTracker);
949 });
950 O << ")";
951
952 if (VectorIntrinsicID)
953 O << " (using vector intrinsic)";
954 else {
955 O << " (using library function";
956 if (Variant->hasName())
957 O << ": " << Variant->getName();
958 O << ")";
959 }
960}
961
962void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
963 VPSlotTracker &SlotTracker) const {
964 O << Indent << "WIDEN-SELECT ";
965 printAsOperand(O, SlotTracker);
966 O << " = select ";
967 getOperand(0)->printAsOperand(O, SlotTracker);
968 O << ", ";
969 getOperand(1)->printAsOperand(O, SlotTracker);
970 O << ", ";
971 getOperand(2)->printAsOperand(O, SlotTracker);
972 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
973}
974#endif
975
976void VPWidenSelectRecipe::execute(VPTransformState &State) {
977 State.setDebugLocFrom(getDebugLoc());
978
979 // The condition can be loop invariant but still defined inside the
980 // loop. This means that we can't just use the original 'cond' value.
981 // We have to take the 'vectorized' value and pick the first lane.
  // InstCombine will make this a no-op.
983 auto *InvarCond =
984 isInvariantCond() ? State.get(Def: getCond(), Instance: VPIteration(0, 0)) : nullptr;
985
986 for (unsigned Part = 0; Part < State.UF; ++Part) {
987 Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond(), Part);
988 Value *Op0 = State.get(Def: getOperand(N: 1), Part);
989 Value *Op1 = State.get(Def: getOperand(N: 2), Part);
990 Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
991 State.set(Def: this, V: Sel, Part);
992 State.addMetadata(To: Sel, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
993 }
994}
995
996VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
997 const FastMathFlags &FMF) {
998 AllowReassoc = FMF.allowReassoc();
999 NoNaNs = FMF.noNaNs();
1000 NoInfs = FMF.noInfs();
1001 NoSignedZeros = FMF.noSignedZeros();
1002 AllowReciprocal = FMF.allowReciprocal();
1003 AllowContract = FMF.allowContract();
1004 ApproxFunc = FMF.approxFunc();
1005}
1006
1007#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1008void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
1009 switch (OpType) {
1010 case OperationType::Cmp:
1011 O << " " << CmpInst::getPredicateName(getPredicate());
1012 break;
1013 case OperationType::DisjointOp:
1014 if (DisjointFlags.IsDisjoint)
1015 O << " disjoint";
1016 break;
1017 case OperationType::PossiblyExactOp:
1018 if (ExactFlags.IsExact)
1019 O << " exact";
1020 break;
1021 case OperationType::OverflowingBinOp:
1022 if (WrapFlags.HasNUW)
1023 O << " nuw";
1024 if (WrapFlags.HasNSW)
1025 O << " nsw";
1026 break;
1027 case OperationType::FPMathOp:
1028 getFastMathFlags().print(O);
1029 break;
1030 case OperationType::GEPOp:
1031 if (GEPFlags.IsInBounds)
1032 O << " inbounds";
1033 break;
1034 case OperationType::NonNegOp:
1035 if (NonNegFlags.NonNeg)
1036 O << " nneg";
1037 break;
1038 case OperationType::Other:
1039 break;
1040 }
1041 if (getNumOperands() > 0)
1042 O << " ";
1043}
1044#endif
1045
1046void VPWidenRecipe::execute(VPTransformState &State) {
1047 State.setDebugLocFrom(getDebugLoc());
1048 auto &Builder = State.Builder;
1049 switch (Opcode) {
1050 case Instruction::Call:
1051 case Instruction::Br:
1052 case Instruction::PHI:
1053 case Instruction::GetElementPtr:
1054 case Instruction::Select:
1055 llvm_unreachable("This instruction is handled by a different recipe.");
1056 case Instruction::UDiv:
1057 case Instruction::SDiv:
1058 case Instruction::SRem:
1059 case Instruction::URem:
1060 case Instruction::Add:
1061 case Instruction::FAdd:
1062 case Instruction::Sub:
1063 case Instruction::FSub:
1064 case Instruction::FNeg:
1065 case Instruction::Mul:
1066 case Instruction::FMul:
1067 case Instruction::FDiv:
1068 case Instruction::FRem:
1069 case Instruction::Shl:
1070 case Instruction::LShr:
1071 case Instruction::AShr:
1072 case Instruction::And:
1073 case Instruction::Or:
1074 case Instruction::Xor: {
1075 // Just widen unops and binops.
1076 for (unsigned Part = 0; Part < State.UF; ++Part) {
1077 SmallVector<Value *, 2> Ops;
1078 for (VPValue *VPOp : operands())
1079 Ops.push_back(Elt: State.get(Def: VPOp, Part));
1080
1081 Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);
1082
1083 if (auto *VecOp = dyn_cast<Instruction>(Val: V))
1084 setFlags(VecOp);
1085
1086 // Use this vector value for all users of the original instruction.
1087 State.set(Def: this, V, Part);
1088 State.addMetadata(To: V, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
1089 }
1090
1091 break;
1092 }
1093 case Instruction::Freeze: {
1094 for (unsigned Part = 0; Part < State.UF; ++Part) {
1095 Value *Op = State.get(Def: getOperand(N: 0), Part);
1096
1097 Value *Freeze = Builder.CreateFreeze(V: Op);
1098 State.set(Def: this, V: Freeze, Part);
1099 }
1100 break;
1101 }
1102 case Instruction::ICmp:
1103 case Instruction::FCmp: {
1104 // Widen compares. Generate vector compares.
1105 bool FCmp = Opcode == Instruction::FCmp;
1106 for (unsigned Part = 0; Part < State.UF; ++Part) {
1107 Value *A = State.get(Def: getOperand(N: 0), Part);
1108 Value *B = State.get(Def: getOperand(N: 1), Part);
1109 Value *C = nullptr;
1110 if (FCmp) {
1111 // Propagate fast math flags.
1112 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1113 if (auto *I = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()))
1114 Builder.setFastMathFlags(I->getFastMathFlags());
1115 C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
1116 } else {
1117 C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
1118 }
1119 State.set(Def: this, V: C, Part);
1120 State.addMetadata(To: C, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
1121 }
1122
1123 break;
1124 }
1125 default:
1126 // This instruction is not vectorized by simple widening.
1127 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1128 << Instruction::getOpcodeName(Opcode));
1129 llvm_unreachable("Unhandled instruction!");
1130 } // end of switch.
1131
1132#if !defined(NDEBUG)
1133 // Verify that VPlan type inference results agree with the type of the
1134 // generated values.
1135 for (unsigned Part = 0; Part < State.UF; ++Part) {
1136 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
1137 State.VF) == State.get(this, Part)->getType() &&
1138 "inferred type and type from generated instructions do not match");
1139 }
1140#endif
1141}
1142
1143#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1144void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
1145 VPSlotTracker &SlotTracker) const {
1146 O << Indent << "WIDEN ";
1147 printAsOperand(O, SlotTracker);
1148 O << " = " << Instruction::getOpcodeName(Opcode);
1149 printFlags(O);
1150 printOperands(O, SlotTracker);
1151}
1152#endif
1153
1154void VPWidenCastRecipe::execute(VPTransformState &State) {
1155 State.setDebugLocFrom(getDebugLoc());
1156 auto &Builder = State.Builder;
  // Vectorize casts.
1158 assert(State.VF.isVector() && "Not vectorizing?");
1159 Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
1160 VPValue *Op = getOperand(N: 0);
1161 for (unsigned Part = 0; Part < State.UF; ++Part) {
1162 if (Part > 0 && Op->isLiveIn()) {
1163 // FIXME: Remove once explicit unrolling is implemented using VPlan.
1164 State.set(Def: this, V: State.get(Def: this, Part: 0), Part);
1165 continue;
1166 }
1167 Value *A = State.get(Def: Op, Part);
1168 Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
1169 State.set(Def: this, V: Cast, Part);
1170 State.addMetadata(To: Cast, From: cast_or_null<Instruction>(Val: getUnderlyingValue()));
1171 }
1172}
1173
1174#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1175void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1176 VPSlotTracker &SlotTracker) const {
1177 O << Indent << "WIDEN-CAST ";
1178 printAsOperand(O, SlotTracker);
1179 O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1180 printFlags(O);
1181 printOperands(O, SlotTracker);
1182 O << " to " << *getResultType();
1183}
1184#endif
1185
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
1190static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1191 Instruction::BinaryOps BinOp, ElementCount VF,
1192 IRBuilderBase &Builder) {
1193 assert(VF.isVector() && "only vector VFs are supported");
1194
1195 // Create and check the types.
1196 auto *ValVTy = cast<VectorType>(Val: Val->getType());
1197 ElementCount VLen = ValVTy->getElementCount();
1198
1199 Type *STy = Val->getType()->getScalarType();
1200 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1201 "Induction Step must be an integer or FP");
1202 assert(Step->getType() == STy && "Step has wrong type");
1203
1204 SmallVector<Constant *, 8> Indices;
1205
1206 // Create a vector of consecutive numbers from zero to VF.
1207 VectorType *InitVecValVTy = ValVTy;
1208 if (STy->isFloatingPointTy()) {
1209 Type *InitVecValSTy =
1210 IntegerType::get(C&: STy->getContext(), NumBits: STy->getScalarSizeInBits());
1211 InitVecValVTy = VectorType::get(ElementType: InitVecValSTy, EC: VLen);
1212 }
1213 Value *InitVec = Builder.CreateStepVector(DstType: InitVecValVTy);
1214
1215 // Splat the StartIdx
1216 Value *StartIdxSplat = Builder.CreateVectorSplat(EC: VLen, V: StartIdx);
1217
1218 if (STy->isIntegerTy()) {
1219 InitVec = Builder.CreateAdd(LHS: InitVec, RHS: StartIdxSplat);
1220 Step = Builder.CreateVectorSplat(EC: VLen, V: Step);
1221 assert(Step->getType() == Val->getType() && "Invalid step vec");
1222 // FIXME: The newly created binary instructions should contain nsw/nuw
1223 // flags, which can be found from the original scalar operations.
1224 Step = Builder.CreateMul(LHS: InitVec, RHS: Step);
1225 return Builder.CreateAdd(LHS: Val, RHS: Step, Name: "induction");
1226 }
1227
1228 // Floating point induction.
1229 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1230 "Binary Opcode should be specified for FP induction");
1231 InitVec = Builder.CreateUIToFP(V: InitVec, DestTy: ValVTy);
1232 InitVec = Builder.CreateFAdd(L: InitVec, R: StartIdxSplat);
1233
1234 Step = Builder.CreateVectorSplat(EC: VLen, V: Step);
1235 Value *MulOp = Builder.CreateFMul(L: InitVec, R: Step);
1236 return Builder.CreateBinOp(Opc: BinOp, LHS: Val, RHS: MulOp, Name: "induction");
1237}
1238
1239/// A helper function that returns an integer or floating-point constant with
1240/// value C.
1241static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1242 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C)
1243 : ConstantFP::get(Ty, V: C);
1244}
1245
1246static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1247 ElementCount VF) {
1248 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1249 Type *IntTy = IntegerType::get(C&: FTy->getContext(), NumBits: FTy->getScalarSizeInBits());
1250 Value *RuntimeVF = getRuntimeVF(B, Ty: IntTy, VF);
1251 return B.CreateUIToFP(V: RuntimeVF, DestTy: FTy);
1252}
1253
1254void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1255 assert(!State.Instance && "Int or FP induction being replicated.");
1256
1257 Value *Start = getStartValue()->getLiveInIRValue();
1258 const InductionDescriptor &ID = getInductionDescriptor();
1259 TruncInst *Trunc = getTruncInst();
1260 IRBuilderBase &Builder = State.Builder;
1261 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1262 assert(State.VF.isVector() && "must have vector VF");
1263
1264 // The value from the original loop to which we are mapping the new induction
1265 // variable.
1266 Instruction *EntryVal = Trunc ? cast<Instruction>(Val: Trunc) : IV;
1267
1268 // Fast-math-flags propagate from the original induction instruction.
1269 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1270 if (ID.getInductionBinOp() && isa<FPMathOperator>(Val: ID.getInductionBinOp()))
1271 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1272
1273 // Now do the actual transformations, and start with fetching the step value.
1274 Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0));
1275
1276 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1277 "Expected either an induction phi-node or a truncate of it!");
1278
1279 // Construct the initial value of the vector IV in the vector loop preheader
1280 auto CurrIP = Builder.saveIP();
1281 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
1282 Builder.SetInsertPoint(VectorPH->getTerminator());
1283 if (isa<TruncInst>(Val: EntryVal)) {
1284 assert(Start->getType()->isIntegerTy() &&
1285 "Truncation requires an integer type");
1286 auto *TruncType = cast<IntegerType>(Val: EntryVal->getType());
1287 Step = Builder.CreateTrunc(V: Step, DestTy: TruncType);
1288 Start = Builder.CreateCast(Op: Instruction::Trunc, V: Start, DestTy: TruncType);
1289 }
1290
1291 Value *Zero = getSignedIntOrFpConstant(Ty: Start->getType(), C: 0);
1292 Value *SplatStart = Builder.CreateVectorSplat(EC: State.VF, V: Start);
1293 Value *SteppedStart = getStepVector(
1294 Val: SplatStart, StartIdx: Zero, Step, BinOp: ID.getInductionOpcode(), VF: State.VF, Builder&: State.Builder);
1295
1296 // We create vector phi nodes for both integer and floating-point induction
1297 // variables. Here, we determine the kind of arithmetic we will perform.
1298 Instruction::BinaryOps AddOp;
1299 Instruction::BinaryOps MulOp;
1300 if (Step->getType()->isIntegerTy()) {
1301 AddOp = Instruction::Add;
1302 MulOp = Instruction::Mul;
1303 } else {
1304 AddOp = ID.getInductionOpcode();
1305 MulOp = Instruction::FMul;
1306 }
1307
1308 // Multiply the vectorization factor by the step using integer or
1309 // floating-point arithmetic as appropriate.
1310 Type *StepType = Step->getType();
1311 Value *RuntimeVF;
1312 if (Step->getType()->isFloatingPointTy())
1313 RuntimeVF = getRuntimeVFAsFloat(B&: Builder, FTy: StepType, VF: State.VF);
1314 else
1315 RuntimeVF = getRuntimeVF(B&: Builder, Ty: StepType, VF: State.VF);
1316 Value *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: Step, RHS: RuntimeVF);
1317
1318 // Create a vector splat to use in the induction update.
1319 //
1320 // FIXME: If the step is non-constant, we create the vector splat with
1321 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1322 // handle a constant vector splat.
1323 Value *SplatVF = isa<Constant>(Val: Mul)
1324 ? ConstantVector::getSplat(EC: State.VF, Elt: cast<Constant>(Val: Mul))
1325 : Builder.CreateVectorSplat(EC: State.VF, V: Mul);
1326 Builder.restoreIP(IP: CurrIP);
1327
1328 // We may need to add the step a number of times, depending on the unroll
1329 // factor. The last of those goes into the PHI.
1330 PHINode *VecInd = PHINode::Create(Ty: SteppedStart->getType(), NumReservedValues: 2, NameStr: "vec.ind");
1331 VecInd->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
1332 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1333 Instruction *LastInduction = VecInd;
1334 for (unsigned Part = 0; Part < State.UF; ++Part) {
1335 State.set(Def: this, V: LastInduction, Part);
1336
1337 if (isa<TruncInst>(Val: EntryVal))
1338 State.addMetadata(To: LastInduction, From: EntryVal);
1339
1340 LastInduction = cast<Instruction>(
1341 Val: Builder.CreateBinOp(Opc: AddOp, LHS: LastInduction, RHS: SplatVF, Name: "step.add"));
1342 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1343 }
1344
1345 LastInduction->setName("vec.ind.next");
1346 VecInd->addIncoming(V: SteppedStart, BB: VectorPH);
1347 // Add induction update using an incorrect block temporarily. The phi node
1348 // will be fixed after VPlan execution. Note that at this point the latch
1349 // block cannot be used, as it does not exist yet.
1350 // TODO: Model increment value in VPlan, by turning the recipe into a
1351 // multi-def and a subclass of VPHeaderPHIRecipe.
1352 VecInd->addIncoming(V: LastInduction, BB: VectorPH);
1353}
1354
1355#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1356void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1357 VPSlotTracker &SlotTracker) const {
1358 O << Indent << "WIDEN-INDUCTION";
1359 if (getTruncInst()) {
1360 O << "\\l\"";
1361 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
1362 O << " +\n" << Indent << "\" ";
1363 getVPValue(0)->printAsOperand(O, SlotTracker);
1364 } else
1365 O << " " << VPlanIngredient(IV);
1366
1367 O << ", ";
1368 getStepValue()->printAsOperand(O, SlotTracker);
1369}
1370#endif
1371
1372bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1373 // The step may be defined by a recipe in the preheader (e.g. if it requires
1374 // SCEV expansion), but for the canonical induction the step is required to be
1375 // 1, which is represented as live-in.
1376 if (getStepValue()->getDefiningRecipe())
1377 return false;
1378 auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue());
1379 auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue());
1380 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(Val: &*getParent()->begin());
1381 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
1382 getScalarType() == CanIV->getScalarType();
1383}
1384
1385#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1386void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1387 VPSlotTracker &SlotTracker) const {
1388 O << Indent;
1389 printAsOperand(O, SlotTracker);
1390 O << Indent << "= DERIVED-IV ";
1391 getStartValue()->printAsOperand(O, SlotTracker);
1392 O << " + ";
1393 getOperand(1)->printAsOperand(O, SlotTracker);
1394 O << " * ";
1395 getStepValue()->printAsOperand(O, SlotTracker);
1396}
1397#endif
1398
1399void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1400 // Fast-math-flags propagate from the original induction instruction.
1401 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1402 if (hasFastMathFlags())
1403 State.Builder.setFastMathFlags(getFastMathFlags());
1404
  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps; Step is the size of the step.
1407
1408 Value *BaseIV = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
1409 Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0));
1410 IRBuilderBase &Builder = State.Builder;
1411
1412 // Ensure step has the same type as that of scalar IV.
1413 Type *BaseIVTy = BaseIV->getType()->getScalarType();
1414 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1415
1416 // We build scalar steps for both integer and floating-point induction
1417 // variables. Here, we determine the kind of arithmetic we will perform.
1418 Instruction::BinaryOps AddOp;
1419 Instruction::BinaryOps MulOp;
1420 if (BaseIVTy->isIntegerTy()) {
1421 AddOp = Instruction::Add;
1422 MulOp = Instruction::Mul;
1423 } else {
1424 AddOp = InductionOpcode;
1425 MulOp = Instruction::FMul;
1426 }
1427
1428 // Determine the number of scalars we need to generate for each unroll
1429 // iteration.
1430 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
1431 // Compute the scalar steps and save the results in State.
1432 Type *IntStepTy =
1433 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
1434 Type *VecIVTy = nullptr;
1435 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
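  // For scalable VFs that need all lanes, pre-compute splats of the step and
  // base IV; they are reused for every unrolled part below.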
1436 if (!FirstLaneOnly && State.VF.isScalable()) {
1437 VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF);
1438 UnitStepVec =
1439 Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF));
1440 SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step);
1441 SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV);
1442 }
1443
1444 unsigned StartPart = 0;
1445 unsigned EndPart = State.UF;
1446 unsigned StartLane = 0;
1447 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1448 if (State.Instance) {
1449 StartPart = State.Instance->Part;
1450 EndPart = StartPart + 1;
1451 StartLane = State.Instance->Lane.getKnownLane();
1452 EndLane = StartLane + 1;
1453 }
1454 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1455 Value *StartIdx0 = createStepForVF(B&: Builder, Ty: IntStepTy, VF: State.VF, Step: Part);
1456
1457 if (!FirstLaneOnly && State.VF.isScalable()) {
1458 auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0);
1459 auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec);
1460 if (BaseIVTy->isFloatingPointTy())
1461 InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy);
1462 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep);
1463 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul);
1464 State.set(Def: this, V: Add, Part);
      // It's also useful to record the lane values for the known minimum
      // number of elements, so we do that below. This improves code quality
      // when extracting the first element, for example.
1468 }
1469
1470 if (BaseIVTy->isFloatingPointTy())
1471 StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy);
1472
1473 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1474 Value *StartIdx = Builder.CreateBinOp(
1475 Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane));
1476 // The step returned by `createStepForVF` is a runtime-evaluated value
1477 // when VF is scalable. Otherwise, it should be folded into a Constant.
1478 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1479 "Expected StartIdx to be folded to a constant when VF is not "
1480 "scalable");
1481 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
1482 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
1483 State.set(Def: this, V: Add, Instance: VPIteration(Part, Lane));
1484 }
1485 }
1486}
1487
1488#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1489void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1490 VPSlotTracker &SlotTracker) const {
1491 O << Indent;
1492 printAsOperand(O, SlotTracker);
1493 O << " = SCALAR-STEPS ";
1494 printOperands(O, SlotTracker);
1495}
1496#endif
1497
1498void VPWidenGEPRecipe::execute(VPTransformState &State) {
1499 assert(State.VF.isVector() && "not widening");
1500 auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr());
1501 // Construct a vector GEP by widening the operands of the scalar GEP as
1502 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1503 // results in a vector of pointers when at least one operand of the GEP
1504 // is vector-typed. Thus, to keep the representation compact, we only use
1505 // vector-typed operands for loop-varying values.
1506
1507 if (areAllOperandsInvariant()) {
1508 // If we are vectorizing, but the GEP has only loop-invariant operands,
1509 // the GEP we build (by only using vector-typed operands for
1510 // loop-varying values) would be a scalar pointer. Thus, to ensure we
1511 // produce a vector of pointers, we need to either arbitrarily pick an
1512 // operand to broadcast, or broadcast a clone of the original GEP.
1513 // Here, we broadcast a clone of the original.
1514 //
1515 // TODO: If at some point we decide to scalarize instructions having
1516 // loop-invariant operands, this special case will no longer be
1517 // required. We would add the scalarization decision to
1518 // collectLoopScalars() and teach getVectorValue() to broadcast
1519 // the lane-zero scalar value.
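    // For example (a sketch with hypothetical names), with VF == 4 this emits
    // a single scalar GEP from the lane-0 operand values and then broadcasts
    // it once per unroll part, roughly:
    //   %gep   = getelementptr inbounds %Ty, ptr %base, i64 %idx
    //   %splat = <4 x ptr> broadcast of %gep (insertelement + shufflevector)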
1520 SmallVector<Value *> Ops;
1521 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1522 Ops.push_back(Elt: State.get(Def: getOperand(N: I), Instance: VPIteration(0, 0)));
1523
1524 auto *NewGEP =
1525 State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
1526 IdxList: ArrayRef(Ops).drop_front(), Name: "", NW: isInBounds());
1527 for (unsigned Part = 0; Part < State.UF; ++Part) {
1528 Value *EntryPart = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
1529 State.set(Def: this, V: EntryPart, Part);
1530 State.addMetadata(To: EntryPart, From: GEP);
1531 }
1532 } else {
1533 // If the GEP has at least one loop-varying operand, we are sure to
1534 // produce a vector of pointers. But if we are only unrolling, we want
1535 // to produce a scalar GEP for each unroll part. Thus, the GEP we
1536 // produce with the code below will be scalar (if VF == 1) or vector
1537 // (otherwise). Note that for the unroll-only case, we still maintain
1538 // values in the vector mapping with initVector, as we do for other
1539 // instructions.
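    // As a rough sketch (hypothetical operands): with VF == 4, an invariant
    // base pointer and a loop-varying index vector, each unroll part emits
    // roughly
    //   %vgep = getelementptr inbounds %Ty, ptr %base, <4 x i64> %vec.index
    // which produces a <4 x ptr> result.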
1540 for (unsigned Part = 0; Part < State.UF; ++Part) {
1541 // The pointer operand of the new GEP. If it's loop-invariant, we
1542 // won't broadcast it.
1543 auto *Ptr = isPointerLoopInvariant()
1544 ? State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0))
1545 : State.get(Def: getOperand(N: 0), Part);
1546
1547 // Collect all the indices for the new GEP. If any index is
1548 // loop-invariant, we won't broadcast it.
1549 SmallVector<Value *, 4> Indices;
1550 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1551 VPValue *Operand = getOperand(N: I);
1552 if (isIndexLoopInvariant(I: I - 1))
1553 Indices.push_back(Elt: State.get(Def: Operand, Instance: VPIteration(0, 0)));
1554 else
1555 Indices.push_back(Elt: State.get(Def: Operand, Part));
1556 }
1557
      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
1560 auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr,
1561 IdxList: Indices, Name: "", NW: isInBounds());
1562 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1563 "NewGEP is not a pointer vector");
1564 State.set(Def: this, V: NewGEP, Part);
1565 State.addMetadata(To: NewGEP, From: GEP);
1566 }
1567 }
1568}
1569
1570#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1571void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1572 VPSlotTracker &SlotTracker) const {
1573 O << Indent << "WIDEN-GEP ";
1574 O << (isPointerLoopInvariant() ? "Inv" : "Var");
1575 for (size_t I = 0; I < getNumOperands() - 1; ++I)
1576 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1577
1578 O << " ";
1579 printAsOperand(O, SlotTracker);
1580 O << " = getelementptr";
1581 printFlags(O);
1582 printOperands(O, SlotTracker);
1583}
1584#endif
1585
void VPVectorPointerRecipe::execute(VPTransformState &State) {
1587 auto &Builder = State.Builder;
1588 State.setDebugLocFrom(getDebugLoc());
1589 for (unsigned Part = 0; Part < State.UF; ++Part) {
1590 // Calculate the pointer for the specific unroll-part.
1591 Value *PartPtr = nullptr;
1592 // Use i32 for the gep index type when the value is constant,
1593 // or query DataLayout for a more suitable index type otherwise.
1594 const DataLayout &DL =
1595 Builder.GetInsertBlock()->getDataLayout();
1596 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1597 ? DL.getIndexType(PtrTy: IndexedTy->getPointerTo())
1598 : Builder.getInt32Ty();
1599 Value *Ptr = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
1600 bool InBounds = isInBounds();
1601 if (IsReverse) {
      // If the address is consecutive but reversed, then the wide load or
      // store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
1606 Value *RunTimeVF = getRuntimeVF(B&: Builder, Ty: IndexTy, VF: State.VF);
1607 // NumElt = -Part * RunTimeVF
1608 Value *NumElt = Builder.CreateMul(
1609 LHS: ConstantInt::get(Ty: IndexTy, V: -(int64_t)Part), RHS: RunTimeVF);
1610 // LastLane = 1 - RunTimeVF
1611 Value *LastLane =
1612 Builder.CreateSub(LHS: ConstantInt::get(Ty: IndexTy, V: 1), RHS: RunTimeVF);
1613 PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "", NW: InBounds);
1614 PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: PartPtr, IdxList: LastLane, Name: "", NW: InBounds);
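      // E.g. (a sketch) for a fixed-width VF of 4 and Part == 1:
      //   NumElt   = -1 * 4 = -4
      //   LastLane = 1 - 4  = -3
      // so PartPtr ends up at element -7 of Ptr, the lowest address covered
      // by the reversed second part.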
1615 } else {
1616 Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: Part);
1617 PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "", NW: InBounds);
1618 }
1619
1620 State.set(Def: this, V: PartPtr, Part, /*IsScalar*/ true);
1621 }
1622}
1623
1624#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1625void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1626 VPSlotTracker &SlotTracker) const {
1627 O << Indent;
1628 printAsOperand(O, SlotTracker);
1629 O << " = vector-pointer ";
1630 if (IsReverse)
1631 O << "(reverse) ";
1632
1633 printOperands(O, SlotTracker);
1634}
1635#endif
1636
1637void VPBlendRecipe::execute(VPTransformState &State) {
1638 State.setDebugLocFrom(getDebugLoc());
1639 // We know that all PHIs in non-header blocks are converted into
1640 // selects, so we don't have to worry about the insertion order and we
1641 // can just use the builder.
1642 // At this point we generate the predication tree. There may be
1643 // duplications since this is a simple recursive scan, but future
1644 // optimizations will clean it up.
1645
1646 unsigned NumIncoming = getNumIncomingValues();
1647
1648 // Generate a sequence of selects of the form:
1649 // SELECT(Mask3, In3,
1650 // SELECT(Mask2, In2,
1651 // SELECT(Mask1, In1,
1652 // In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are therefore essentially undef, take their value from In0.
1655 VectorParts Entry(State.UF);
1656 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
1657 for (unsigned In = 0; In < NumIncoming; ++In) {
1658 for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single-edge PHIs (blocks); use an identity 'select' for
      // the first PHI operand.
1661 Value *In0 = State.get(Def: getIncomingValue(Idx: In), Part, IsScalar: OnlyFirstLaneUsed);
1662 if (In == 0)
1663 Entry[Part] = In0; // Initialize with the first incoming value.
1664 else {
1665 // Select between the current value and the previous incoming edge
1666 // based on the incoming mask.
1667 Value *Cond = State.get(Def: getMask(Idx: In), Part, IsScalar: OnlyFirstLaneUsed);
1668 Entry[Part] =
1669 State.Builder.CreateSelect(C: Cond, True: In0, False: Entry[Part], Name: "predphi");
1670 }
1671 }
1672 }
1673 for (unsigned Part = 0; Part < State.UF; ++Part)
1674 State.set(Def: this, V: Entry[Part], Part, IsScalar: OnlyFirstLaneUsed);
1675}
1676
1677#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1678void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1679 VPSlotTracker &SlotTracker) const {
1680 O << Indent << "BLEND ";
1681 printAsOperand(O, SlotTracker);
1682 O << " =";
1683 if (getNumIncomingValues() == 1) {
1684 // Not a User of any mask: not really blending, this is a
1685 // single-predecessor phi.
1686 O << " ";
1687 getIncomingValue(0)->printAsOperand(O, SlotTracker);
1688 } else {
1689 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1690 O << " ";
1691 getIncomingValue(I)->printAsOperand(O, SlotTracker);
1692 if (I == 0)
1693 continue;
1694 O << "/";
1695 getMask(I)->printAsOperand(O, SlotTracker);
1696 }
1697 }
1698}
1699#endif
1700
1701void VPReductionRecipe::execute(VPTransformState &State) {
1702 assert(!State.Instance && "Reduction being replicated.");
1703 Value *PrevInChain = State.get(Def: getChainOp(), Part: 0, /*IsScalar*/ true);
1704 RecurKind Kind = RdxDesc.getRecurrenceKind();
1705 // Propagate the fast-math flags carried by the underlying instruction.
1706 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1707 State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1708 for (unsigned Part = 0; Part < State.UF; ++Part) {
1709 Value *NewVecOp = State.get(Def: getVecOp(), Part);
1710 if (VPValue *Cond = getCondOp()) {
1711 Value *NewCond = State.get(Def: Cond, Part, IsScalar: State.VF.isScalar());
1712 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
1713 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
1714 Value *Iden = RdxDesc.getRecurrenceIdentity(K: Kind, Tp: ElementTy,
1715 FMF: RdxDesc.getFastMathFlags());
1716 if (State.VF.isVector()) {
1717 Iden = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Iden);
1718 }
1719
1720 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Iden);
1721 NewVecOp = Select;
1722 }
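    // E.g. (a sketch) for a conditional integer add reduction with VF == 4,
    // the select created above replaces masked-off lanes with the identity
    // value 0:
    //   %sel = select <4 x i1> %cond, <4 x i32> %vec.op,
    //                 <4 x i32> zeroinitializer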
1723 Value *NewRed;
1724 Value *NextInChain;
1725 if (IsOrdered) {
1726 if (State.VF.isVector())
1727 NewRed = createOrderedReduction(B&: State.Builder, Desc: RdxDesc, Src: NewVecOp,
1728 Start: PrevInChain);
1729 else
1730 NewRed = State.Builder.CreateBinOp(
1731 Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: PrevInChain,
1732 RHS: NewVecOp);
1733 PrevInChain = NewRed;
1734 } else {
1735 PrevInChain = State.get(Def: getChainOp(), Part, /*IsScalar*/ true);
1736 NewRed = createTargetReduction(B&: State.Builder, Desc: RdxDesc, Src: NewVecOp);
1737 }
1738 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
1739 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: RdxDesc.getRecurrenceKind(),
1740 Left: NewRed, Right: PrevInChain);
1741 } else if (IsOrdered)
1742 NextInChain = NewRed;
1743 else
1744 NextInChain = State.Builder.CreateBinOp(
1745 Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: NewRed, RHS: PrevInChain);
1746 State.set(Def: this, V: NextInChain, Part, /*IsScalar*/ true);
1747 }
1748}
1749
1750void VPReductionEVLRecipe::execute(VPTransformState &State) {
1751 assert(!State.Instance && "Reduction being replicated.");
1752 assert(State.UF == 1 &&
1753 "Expected only UF == 1 when vectorizing with explicit vector length.");
1754
1755 auto &Builder = State.Builder;
1756 // Propagate the fast-math flags carried by the underlying instruction.
1757 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
1758 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1759 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
1760
1761 RecurKind Kind = RdxDesc.getRecurrenceKind();
1762 Value *Prev = State.get(Def: getChainOp(), Part: 0, /*IsScalar*/ true);
1763 Value *VecOp = State.get(Def: getVecOp(), Part: 0);
1764 Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
1765
1766 VectorBuilder VBuilder(Builder);
1767 VBuilder.setEVL(EVL);
1768 Value *Mask;
1769 // TODO: move the all-true mask generation into VectorBuilder.
1770 if (VPValue *CondOp = getCondOp())
1771 Mask = State.get(Def: CondOp, Part: 0);
1772 else
1773 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
1774 VBuilder.setMask(Mask);
1775
1776 Value *NewRed;
1777 if (isOrdered()) {
1778 NewRed = createOrderedReduction(VB&: VBuilder, Desc: RdxDesc, Src: VecOp, Start: Prev);
1779 } else {
1780 NewRed = createSimpleTargetReduction(VB&: VBuilder, Src: VecOp, Desc: RdxDesc);
1781 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
1782 NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
1783 else
1784 NewRed = Builder.CreateBinOp(
1785 Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: NewRed, RHS: Prev);
1786 }
1787 State.set(Def: this, V: NewRed, Part: 0, /*IsScalar*/ true);
1788}
1789
1790#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1791void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1792 VPSlotTracker &SlotTracker) const {
1793 O << Indent << "REDUCE ";
1794 printAsOperand(O, SlotTracker);
1795 O << " = ";
1796 getChainOp()->printAsOperand(O, SlotTracker);
1797 O << " +";
1798 if (isa<FPMathOperator>(getUnderlyingInstr()))
1799 O << getUnderlyingInstr()->getFastMathFlags();
1800 O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1801 getVecOp()->printAsOperand(O, SlotTracker);
1802 if (isConditional()) {
1803 O << ", ";
1804 getCondOp()->printAsOperand(O, SlotTracker);
1805 }
1806 O << ")";
1807 if (RdxDesc.IntermediateStore)
1808 O << " (with final reduction value stored in invariant address sank "
1809 "outside of loop)";
1810}
1811
1812void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
1813 VPSlotTracker &SlotTracker) const {
1814 const RecurrenceDescriptor &RdxDesc = getRecurrenceDescriptor();
1815 O << Indent << "REDUCE ";
1816 printAsOperand(O, SlotTracker);
1817 O << " = ";
1818 getChainOp()->printAsOperand(O, SlotTracker);
1819 O << " +";
1820 if (isa<FPMathOperator>(getUnderlyingInstr()))
1821 O << getUnderlyingInstr()->getFastMathFlags();
1822 O << " vp.reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1823 getVecOp()->printAsOperand(O, SlotTracker);
1824 O << ", ";
1825 getEVL()->printAsOperand(O, SlotTracker);
1826 if (isConditional()) {
1827 O << ", ";
1828 getCondOp()->printAsOperand(O, SlotTracker);
1829 }
1830 O << ")";
1831 if (RdxDesc.IntermediateStore)
1832 O << " (with final reduction value stored in invariant address sank "
1833 "outside of loop)";
1834}
1835#endif
1836
1837bool VPReplicateRecipe::shouldPack() const {
  // Check whether the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe; in that case, also pack the scalar values in a
  // vector.
1840 return any_of(Range: users(), P: [](const VPUser *U) {
1841 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
1842 return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) {
1843 return !U->usesScalars(Op: PredR);
1844 });
1845 return false;
1846 });
1847}
1848
1849#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1850void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1851 VPSlotTracker &SlotTracker) const {
1852 O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1853
1854 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1855 printAsOperand(O, SlotTracker);
1856 O << " = ";
1857 }
1858 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1859 O << "call";
1860 printFlags(O);
1861 O << "@" << CB->getCalledFunction()->getName() << "(";
1862 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1863 O, [&O, &SlotTracker](VPValue *Op) {
1864 Op->printAsOperand(O, SlotTracker);
1865 });
1866 O << ")";
1867 } else {
1868 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1869 printFlags(O);
1870 printOperands(O, SlotTracker);
1871 }
1872
1873 if (shouldPack())
1874 O << " (S->V)";
1875}
1876#endif
1877
1878/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
1879/// if it is either defined outside the vector region or its operand is known to
1880/// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
1881/// TODO: Uniformity should be associated with a VPValue and there should be a
1882/// generic way to check.
1883static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
1884 return C->isDefinedOutsideVectorRegions() ||
1885 isa<VPDerivedIVRecipe>(Val: C->getOperand(N: 0)) ||
1886 isa<VPCanonicalIVPHIRecipe>(Val: C->getOperand(N: 0));
1887}
1888
Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1890 assert(vputils::onlyFirstLaneUsed(this) &&
1891 "Codegen only implemented for first lane.");
1892 switch (Opcode) {
1893 case Instruction::SExt:
1894 case Instruction::ZExt:
1895 case Instruction::Trunc: {
1896 // Note: SExt/ZExt not used yet.
1897 Value *Op = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0));
1898 return State.Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: Op, DestTy: ResultTy);
1899 }
1900 default:
1901 llvm_unreachable("opcode not implemented yet");
1902 }
1903}
1904
void VPScalarCastRecipe::execute(VPTransformState &State) {
1906 bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(C: this);
1907 for (unsigned Part = 0; Part != State.UF; ++Part) {
1908 Value *Res;
    // Only generate a single instance if the recipe is uniform across UFs and
    // VFs.
1911 if (Part > 0 && IsUniformAcrossVFsAndUFs)
1912 Res = State.get(Def: this, Instance: VPIteration(0, 0));
1913 else
1914 Res = generate(State, Part);
1915 State.set(Def: this, V: Res, Instance: VPIteration(Part, 0));
1916 }
1917}
1918
1919#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1921 VPSlotTracker &SlotTracker) const {
1922 O << Indent << "SCALAR-CAST ";
1923 printAsOperand(O, SlotTracker);
1924 O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1925 printOperands(O, SlotTracker);
1926 O << " to " << *ResultTy;
1927}
1928#endif
1929
1930void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1931 assert(State.Instance && "Branch on Mask works only on single instance.");
1932
1933 unsigned Part = State.Instance->Part;
1934 unsigned Lane = State.Instance->Lane.getKnownLane();
1935
1936 Value *ConditionBit = nullptr;
1937 VPValue *BlockInMask = getMask();
1938 if (BlockInMask) {
1939 ConditionBit = State.get(Def: BlockInMask, Part);
1940 if (ConditionBit->getType()->isVectorTy())
1941 ConditionBit = State.Builder.CreateExtractElement(
1942 Vec: ConditionBit, Idx: State.Builder.getInt32(C: Lane));
1943 } else // Block in mask is all-one.
1944 ConditionBit = State.Builder.getTrue();
1945
1946 // Replace the temporary unreachable terminator with a new conditional branch,
1947 // whose two destinations will be set later when they are created.
1948 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1949 assert(isa<UnreachableInst>(CurrentTerminator) &&
1950 "Expected to replace unreachable terminator with conditional branch.");
1951 auto *CondBr = BranchInst::Create(IfTrue: State.CFG.PrevBB, IfFalse: nullptr, Cond: ConditionBit);
1952 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
1953 ReplaceInstWithInst(From: CurrentTerminator, To: CondBr);
1954}
1955
1956void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1957 assert(State.Instance && "Predicated instruction PHI works per instance.");
1958 Instruction *ScalarPredInst =
1959 cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Instance: *State.Instance));
1960 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1961 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1962 assert(PredicatingBB && "Predicated block has no single predecessor.");
1963 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1964 "operand must be VPReplicateRecipe");
1965
1966 // By current pack/unpack logic we need to generate only a single phi node: if
1967 // a vector value for the predicated instruction exists at this point it means
1968 // the instruction has vector users only, and a phi for the vector value is
1969 // needed. In this case the recipe of the predicated instruction is marked to
1970 // also do that packing, thereby "hoisting" the insert-element sequence.
1971 // Otherwise, a phi node for the scalar value is needed.
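  // For example (a sketch with hypothetical names), if the predicated block
  // packed the scalar result with
  //   %packed = insertelement <4 x i32> %prev.vec, i32 %scalar, i32 %lane
  // then the phi created below roughly becomes
  //   %vphi = phi <4 x i32> [ %prev.vec, %predicating.bb ],
  //                         [ %packed, %predicated.bb ]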
1972 unsigned Part = State.Instance->Part;
1973 if (State.hasVectorValue(Def: getOperand(N: 0), Part)) {
1974 Value *VectorValue = State.get(Def: getOperand(N: 0), Part);
1975 InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue);
1976 PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2);
1977 VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector.
1978 VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element.
1979 if (State.hasVectorValue(Def: this, Part))
1980 State.reset(Def: this, V: VPhi, Part);
1981 else
1982 State.set(Def: this, V: VPhi, Part);
1983 // NOTE: Currently we need to update the value of the operand, so the next
1984 // predicated iteration inserts its generated value in the correct vector.
1985 State.reset(Def: getOperand(N: 0), V: VPhi, Part);
1986 } else {
1987 Type *PredInstType = getOperand(N: 0)->getUnderlyingValue()->getType();
1988 PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
1989 Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
1990 BB: PredicatingBB);
1991 Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
1992 if (State.hasScalarValue(Def: this, Instance: *State.Instance))
1993 State.reset(Def: this, V: Phi, Instance: *State.Instance);
1994 else
1995 State.set(Def: this, V: Phi, Instance: *State.Instance);
1996 // NOTE: Currently we need to update the value of the operand, so the next
1997 // predicated iteration inserts its generated value in the correct vector.
1998 State.reset(Def: getOperand(N: 0), V: Phi, Instance: *State.Instance);
1999 }
2000}
2001
2002#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2003void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2004 VPSlotTracker &SlotTracker) const {
2005 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
2006 printAsOperand(O, SlotTracker);
2007 O << " = ";
2008 printOperands(O, SlotTracker);
2009}
2010
2011void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
2012 VPSlotTracker &SlotTracker) const {
2013 O << Indent << "WIDEN ";
2014 printAsOperand(O, SlotTracker);
2015 O << " = load ";
2016 printOperands(O, SlotTracker);
2017}
2018
2019void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2020 VPSlotTracker &SlotTracker) const {
2021 O << Indent << "WIDEN ";
2022 printAsOperand(O, SlotTracker);
2023 O << " = vp.load ";
2024 printOperands(O, SlotTracker);
2025}
2026
2027void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
2028 VPSlotTracker &SlotTracker) const {
2029 O << Indent << "WIDEN store ";
2030 printOperands(O, SlotTracker);
2031}
2032
2033void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2034 VPSlotTracker &SlotTracker) const {
2035 O << Indent << "WIDEN vp.store ";
2036 printOperands(O, SlotTracker);
2037}
2038#endif
2039
2040static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
2041 VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector with the same number of elements as DstVTy.
2043 auto VF = DstVTy->getElementCount();
2044 auto *SrcVecTy = cast<VectorType>(Val: V->getType());
2045 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2046 Type *SrcElemTy = SrcVecTy->getElementType();
2047 Type *DstElemTy = DstVTy->getElementType();
2048 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2049 "Vector elements must have same size");
2050
2051 // Do a direct cast if element types are castable.
2052 if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
2053 return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
2054 }
  // V cannot be cast directly to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer vector type, i.e. Ptr <-> Int <-> Float.
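  // E.g. (a sketch) casting <4 x ptr> to <4 x double> on a target with 64-bit
  // pointers roughly becomes
  //   %ints = ptrtoint <4 x ptr> %v to <4 x i64>
  //   %res  = bitcast <4 x i64> %ints to <4 x double>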
2059 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2060 "Only one type should be a pointer type");
2061 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2062 "Only one type should be a floating point type");
2063 Type *IntTy =
2064 IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
2065 auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
2066 Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
2067 return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
2068}
2069
2070/// Return a vector containing interleaved elements from multiple
2071/// smaller input vectors.
2072static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
2073 const Twine &Name) {
2074 unsigned Factor = Vals.size();
2075 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
2076
2077 VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
2078#ifndef NDEBUG
2079 for (Value *Val : Vals)
2080 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
2081#endif
2082
2083 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
2084 // must use intrinsics to interleave.
2085 if (VecTy->isScalableTy()) {
2086 VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VTy: VecTy);
2087 return Builder.CreateIntrinsic(RetTy: WideVecTy, ID: Intrinsic::vector_interleave2,
2088 Args: Vals,
2089 /*FMFSource=*/nullptr, Name);
2090 }
2091
2092 // Fixed length. Start by concatenating all vectors into a wide vector.
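  // E.g. (a sketch) interleaving two <4 x i32> vectors %A and %B:
  //   %wide = shufflevector <4 x i32> %A, <4 x i32> %B,
  //                         <8 x i32> <0, 1, 2, 3, 4, 5, 6, 7>
  //   %ilv  = shufflevector <8 x i32> %wide, poison,
  //                         <8 x i32> <0, 4, 1, 5, 2, 6, 3, 7>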
2093 Value *WideVec = concatenateVectors(Builder, Vecs: Vals);
2094
2095 // Interleave the elements into the wide vector.
2096 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
2097 return Builder.CreateShuffleVector(
2098 V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
2099}
2100
2101// Try to vectorize the interleave group that \p Instr belongs to.
2102//
2103// E.g. Translate following interleaved load group (factor = 3):
2104// for (i = 0; i < N; i+=3) {
2105// R = Pic[i]; // Member of index 0
2106// G = Pic[i+1]; // Member of index 1
2107// B = Pic[i+2]; // Member of index 2
2108// ... // do something to R, G, B
2109// }
2110// To:
2111// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2112// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2113// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2114// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2115//
2116// Or translate following interleaved store group (factor = 3):
2117// for (i = 0; i < N; i+=3) {
2118// ... do something to R, G, B
2119// Pic[i] = R; // Member of index 0
2120// Pic[i+1] = G; // Member of index 1
2121// Pic[i+2] = B; // Member of index 2
2122// }
2123// To:
2124// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2125// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2126// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2127// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2128// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2129void VPInterleaveRecipe::execute(VPTransformState &State) {
2130 assert(!State.Instance && "Interleave group being replicated.");
2131 const InterleaveGroup<Instruction> *Group = IG;
2132 Instruction *Instr = Group->getInsertPos();
2133
2134 // Prepare for the vector type of the interleaved load/store.
2135 Type *ScalarTy = getLoadStoreType(I: Instr);
2136 unsigned InterleaveFactor = Group->getFactor();
2137 auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);
2138
2139 // Prepare for the new pointers.
2140 SmallVector<Value *, 2> AddrParts;
2141 unsigned Index = Group->getIndex(Instr);
2142
2143 // TODO: extend the masked interleaved-group support to reversed access.
2144 VPValue *BlockInMask = getMask();
2145 assert((!BlockInMask || !Group->isReverse()) &&
2146 "Reversed masked interleave-group not supported.");
2147
2148 Value *Idx;
2149 // If the group is reverse, adjust the index to refer to the last vector lane
2150 // instead of the first. We adjust the index from the first vector lane,
2151 // rather than directly getting the pointer for lane VF - 1, because the
2152 // pointer operand of the interleaved access is supposed to be uniform. For
2153 // uniform instructions, we're only required to generate a value for the
2154 // first vector lane in each unroll iteration.
2155 if (Group->isReverse()) {
2156 Value *RuntimeVF =
2157 getRuntimeVF(B&: State.Builder, Ty: State.Builder.getInt32Ty(), VF: State.VF);
2158 Idx = State.Builder.CreateSub(LHS: RuntimeVF, RHS: State.Builder.getInt32(C: 1));
2159 Idx = State.Builder.CreateMul(LHS: Idx,
2160 RHS: State.Builder.getInt32(C: Group->getFactor()));
2161 Idx = State.Builder.CreateAdd(LHS: Idx, RHS: State.Builder.getInt32(C: Index));
2162 Idx = State.Builder.CreateNeg(V: Idx);
2163 } else
2164 Idx = State.Builder.getInt32(C: -Index);
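  // E.g. (a sketch) for a reversed group with factor 3, member index 1 and a
  // fixed VF of 4, the computation above yields
  //   Idx = -((4 - 1) * 3 + 1) = -10.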
2165
2166 VPValue *Addr = getAddr();
2167 for (unsigned Part = 0; Part < State.UF; Part++) {
2168 Value *AddrPart = State.get(Def: Addr, Instance: VPIteration(Part, 0));
2169 if (auto *I = dyn_cast<Instruction>(Val: AddrPart))
2170 State.setDebugLocFrom(I->getDebugLoc());
2171
    // Note that the current instruction could be at any member index, so we
    // need to adjust the address to point to the member at index 0.
2174 //
2175 // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
2176 // b = A[i]; // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2178 //
2179 // E.g. A[i+1] = a; // Member of index 1
2180 // A[i] = b; // Member of index 0
2181 // A[i+2] = c; // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2183
2184 bool InBounds = false;
2185 if (auto *gep = dyn_cast<GetElementPtrInst>(Val: AddrPart->stripPointerCasts()))
2186 InBounds = gep->isInBounds();
2187 AddrPart = State.Builder.CreateGEP(Ty: ScalarTy, Ptr: AddrPart, IdxList: Idx, Name: "", NW: InBounds);
2188 AddrParts.push_back(Elt: AddrPart);
2189 }
2190
2191 State.setDebugLocFrom(Instr->getDebugLoc());
2192 Value *PoisonVec = PoisonValue::get(T: VecTy);
2193
2194 auto CreateGroupMask = [&BlockInMask, &State, &InterleaveFactor](
2195 unsigned Part, Value *MaskForGaps) -> Value * {
2196 if (State.VF.isScalable()) {
2197 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2198 assert(InterleaveFactor == 2 &&
2199 "Unsupported deinterleave factor for scalable vectors");
2200 auto *BlockInMaskPart = State.get(Def: BlockInMask, Part);
2201 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2202 auto *MaskTy = VectorType::get(ElementType: State.Builder.getInt1Ty(),
2203 NumElements: State.VF.getKnownMinValue() * 2, Scalable: true);
2204 return State.Builder.CreateIntrinsic(
2205 RetTy: MaskTy, ID: Intrinsic::vector_interleave2, Args: Ops,
2206 /*FMFSource=*/nullptr, Name: "interleaved.mask");
2207 }
2208
2209 if (!BlockInMask)
2210 return MaskForGaps;
2211
2212 Value *BlockInMaskPart = State.get(Def: BlockInMask, Part);
2213 Value *ShuffledMask = State.Builder.CreateShuffleVector(
2214 V: BlockInMaskPart,
2215 Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getKnownMinValue()),
2216 Name: "interleaved.mask");
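    // E.g. (a sketch) with an interleave factor of 3 and a fixed VF of 4, the
    // replicated mask used above is <0,0,0,1,1,1,2,2,2,3,3,3>, i.e. each lane
    // of the block mask guards all members of its group.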
2217 return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
2218 LHS: ShuffledMask, RHS: MaskForGaps)
2219 : ShuffledMask;
2220 };
2221
2222 const DataLayout &DL = Instr->getDataLayout();
2223 // Vectorize the interleaved load group.
2224 if (isa<LoadInst>(Val: Instr)) {
2225 Value *MaskForGaps = nullptr;
2226 if (NeedsMaskForGaps) {
2227 MaskForGaps = createBitMaskForGaps(Builder&: State.Builder,
2228 VF: State.VF.getKnownMinValue(), Group: *Group);
2229 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2230 }
2231
2232 // For each unroll part, create a wide load for the group.
2233 SmallVector<Value *, 2> NewLoads;
2234 for (unsigned Part = 0; Part < State.UF; Part++) {
2235 Instruction *NewLoad;
2236 if (BlockInMask || MaskForGaps) {
2237 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2238 NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: AddrParts[Part],
2239 Alignment: Group->getAlign(), Mask: GroupMask,
2240 PassThru: PoisonVec, Name: "wide.masked.vec");
2241 } else
2242 NewLoad = State.Builder.CreateAlignedLoad(
2243 Ty: VecTy, Ptr: AddrParts[Part], Align: Group->getAlign(), Name: "wide.vec");
2244 Group->addMetadata(NewInst: NewLoad);
2245 NewLoads.push_back(Elt: NewLoad);
2246 }
2247
2248 ArrayRef<VPValue *> VPDefs = definedValues();
2249 const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2250 if (VecTy->isScalableTy()) {
2251 assert(InterleaveFactor == 2 &&
2252 "Unsupported deinterleave factor for scalable vectors");
2253
2254 for (unsigned Part = 0; Part < State.UF; ++Part) {
2255 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2256 // so must use intrinsics to deinterleave.
2257 Value *DI = State.Builder.CreateIntrinsic(
2258 ID: Intrinsic::vector_deinterleave2, Types: VecTy, Args: NewLoads[Part],
2259 /*FMFSource=*/nullptr, Name: "strided.vec");
2260 unsigned J = 0;
2261 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2262 Instruction *Member = Group->getMember(Index: I);
2263
2264 if (!Member)
2265 continue;
2266
2267 Value *StridedVec = State.Builder.CreateExtractValue(Agg: DI, Idxs: I);
          // If this member has a different type, cast the result to the
          // member's type.
2269 if (Member->getType() != ScalarTy) {
2270 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
2271 StridedVec =
2272 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
2273 }
2274
2275 if (Group->isReverse())
2276 StridedVec =
2277 State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
2278
2279 State.set(Def: VPDefs[J], V: StridedVec, Part);
2280 ++J;
2281 }
2282 }
2283
2284 return;
2285 }
2286
2287 // For each member in the group, shuffle out the appropriate data from the
2288 // wide loads.
2289 unsigned J = 0;
2290 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2291 Instruction *Member = Group->getMember(Index: I);
2292
2293 // Skip the gaps in the group.
2294 if (!Member)
2295 continue;
2296
2297 auto StrideMask =
2298 createStrideMask(Start: I, Stride: InterleaveFactor, VF: State.VF.getKnownMinValue());
2299 for (unsigned Part = 0; Part < State.UF; Part++) {
2300 Value *StridedVec = State.Builder.CreateShuffleVector(
2301 V: NewLoads[Part], Mask: StrideMask, Name: "strided.vec");
2302
        // If this member has a different type, cast the result to the
        // member's type.
2304 if (Member->getType() != ScalarTy) {
2305 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
2306 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
2307 StridedVec =
2308 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
2309 }
2310
2311 if (Group->isReverse())
2312 StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
2313
2314 State.set(Def: VPDefs[J], V: StridedVec, Part);
2315 }
2316 ++J;
2317 }
2318 return;
2319 }
2320
  // The sub-vector type for the current instruction.
2322 auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
2323
2324 // Vectorize the interleaved store group.
2325 Value *MaskForGaps =
2326 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
2327 assert((!MaskForGaps || !State.VF.isScalable()) &&
2328 "masking gaps for scalable vectors is not yet supported.");
2329 ArrayRef<VPValue *> StoredValues = getStoredValues();
2330 for (unsigned Part = 0; Part < State.UF; Part++) {
2331 // Collect the stored vector from each member.
2332 SmallVector<Value *, 4> StoredVecs;
2333 unsigned StoredIdx = 0;
2334 for (unsigned i = 0; i < InterleaveFactor; i++) {
2335 assert((Group->getMember(i) || MaskForGaps) &&
2336 "Fail to get a member from an interleaved store group");
2337 Instruction *Member = Group->getMember(Index: i);
2338
2339 // Skip the gaps in the group.
2340 if (!Member) {
2341 Value *Undef = PoisonValue::get(T: SubVT);
2342 StoredVecs.push_back(Elt: Undef);
2343 continue;
2344 }
2345
2346 Value *StoredVec = State.get(Def: StoredValues[StoredIdx], Part);
2347 ++StoredIdx;
2348
2349 if (Group->isReverse())
2350 StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");
2351
      // If this member has a different type, cast it to the unified
      // sub-vector type.
2353
2354 if (StoredVec->getType() != SubVT)
2355 StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);
2356
2357 StoredVecs.push_back(Elt: StoredVec);
2358 }
2359
2360 // Interleave all the smaller vectors into one wider vector.
2361 Value *IVec =
2362 interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
2363 Instruction *NewStoreInstr;
2364 if (BlockInMask || MaskForGaps) {
2365 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2366 NewStoreInstr = State.Builder.CreateMaskedStore(
2367 Val: IVec, Ptr: AddrParts[Part], Alignment: Group->getAlign(), Mask: GroupMask);
2368 } else
2369 NewStoreInstr = State.Builder.CreateAlignedStore(Val: IVec, Ptr: AddrParts[Part],
2370 Align: Group->getAlign());
2371
2372 Group->addMetadata(NewInst: NewStoreInstr);
2373 }
2374}
2375
2376#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2377void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
2378 VPSlotTracker &SlotTracker) const {
2379 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
2380 IG->getInsertPos()->printAsOperand(O, false);
2381 O << ", ";
2382 getAddr()->printAsOperand(O, SlotTracker);
2383 VPValue *Mask = getMask();
2384 if (Mask) {
2385 O << ", ";
2386 Mask->printAsOperand(O, SlotTracker);
2387 }
2388
2389 unsigned OpIdx = 0;
2390 for (unsigned i = 0; i < IG->getFactor(); ++i) {
2391 if (!IG->getMember(i))
2392 continue;
2393 if (getNumStoreOperands() > 0) {
2394 O << "\n" << Indent << " store ";
2395 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
2396 O << " to index " << i;
2397 } else {
2398 O << "\n" << Indent << " ";
2399 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
2400 O << " = load from index " << i;
2401 }
2402 ++OpIdx;
2403 }
2404}
2405#endif
2406
2407void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
2408 Value *Start = getStartValue()->getLiveInIRValue();
2409 PHINode *EntryPart = PHINode::Create(Ty: Start->getType(), NumReservedValues: 2, NameStr: "index");
2410 EntryPart->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
2411
2412 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
2413 EntryPart->addIncoming(V: Start, BB: VectorPH);
2414 EntryPart->setDebugLoc(getDebugLoc());
2415 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2416 State.set(Def: this, V: EntryPart, Part, /*IsScalar*/ true);
2417}
2418
2419#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2420void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2421 VPSlotTracker &SlotTracker) const {
2422 O << Indent << "EMIT ";
2423 printAsOperand(O, SlotTracker);
2424 O << " = CANONICAL-INDUCTION ";
2425 printOperands(O, SlotTracker);
2426}
2427#endif
2428
2429bool VPCanonicalIVPHIRecipe::isCanonical(
2430 InductionDescriptor::InductionKind Kind, VPValue *Start,
2431 VPValue *Step) const {
2432 // Must be an integer induction.
2433 if (Kind != InductionDescriptor::IK_IntInduction)
2434 return false;
2435 // Start must match the start value of this canonical induction.
2436 if (Start != getStartValue())
2437 return false;
2438
2439 // If the step is defined by a recipe, it is not a ConstantInt.
2440 if (Step->getDefiningRecipe())
2441 return false;
2442
2443 ConstantInt *StepC = dyn_cast<ConstantInt>(Val: Step->getLiveInIRValue());
2444 return StepC && StepC->isOne();
2445}
2446
2447bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
2448 return IsScalarAfterVectorization &&
2449 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
2450}
2451
2452#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2453void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2454 VPSlotTracker &SlotTracker) const {
2455 O << Indent << "EMIT ";
2456 printAsOperand(O, SlotTracker);
2457 O << " = WIDEN-POINTER-INDUCTION ";
2458 getStartValue()->printAsOperand(O, SlotTracker);
2459 O << ", " << *IndDesc.getStep();
2460}
2461#endif
2462
2463void VPExpandSCEVRecipe::execute(VPTransformState &State) {
2464 assert(!State.Instance && "cannot be used in per-lane");
2465 const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
2466 SCEVExpander Exp(SE, DL, "induction");
2467
2468 Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(),
2469 I: &*State.Builder.GetInsertPoint());
2470 assert(!State.ExpandedSCEVs.contains(Expr) &&
2471 "Same SCEV expanded multiple times");
2472 State.ExpandedSCEVs[Expr] = Res;
2473 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
2474 State.set(Def: this, V: Res, Instance: {Part, 0});
2475}
2476
2477#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2478void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
2479 VPSlotTracker &SlotTracker) const {
2480 O << Indent << "EMIT ";
2481 getVPSingleValue()->printAsOperand(O, SlotTracker);
2482 O << " = EXPAND SCEV " << *Expr;
2483}
2484#endif
2485
2486void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
2487 Value *CanonicalIV = State.get(Def: getOperand(N: 0), Part: 0, /*IsScalar*/ true);
2488 Type *STy = CanonicalIV->getType();
2489 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
2490 ElementCount VF = State.VF;
2491 Value *VStart = VF.isScalar()
2492 ? CanonicalIV
2493 : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast");
2494 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2495 Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: Part);
2496 if (VF.isVector()) {
2497 VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
2498 VStep =
2499 Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
2500 }
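    // E.g. (a sketch) with VF == 4, UF == 2 and a canonical IV value %iv, the
    // vec.iv created below is <%iv, %iv+1, %iv+2, %iv+3> for part 0 and
    // <%iv+4, %iv+5, %iv+6, %iv+7> for part 1.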
2501 Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv");
2502 State.set(Def: this, V: CanonicalVectorIV, Part);
2503 }
2504}
2505
2506#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2507void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
2508 VPSlotTracker &SlotTracker) const {
2509 O << Indent << "EMIT ";
2510 printAsOperand(O, SlotTracker);
2511 O << " = WIDEN-CANONICAL-INDUCTION ";
2512 printOperands(O, SlotTracker);
2513}
2514#endif
2515
2516void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
2517 auto &Builder = State.Builder;
2518 // Create a vector from the initial value.
2519 auto *VectorInit = getStartValue()->getLiveInIRValue();
2520
2521 Type *VecTy = State.VF.isScalar()
2522 ? VectorInit->getType()
2523 : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);
2524
2525 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
2526 if (State.VF.isVector()) {
2527 auto *IdxTy = Builder.getInt32Ty();
2528 auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
2529 IRBuilder<>::InsertPointGuard Guard(Builder);
2530 Builder.SetInsertPoint(VectorPH->getTerminator());
2531 auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
2532 auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
2533 VectorInit = Builder.CreateInsertElement(
2534 Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
2535 }
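  // E.g. (a sketch) with a fixed VF of 4 and an initial value %init, the
  // vector.recur.init value built above is <poison, poison, poison, %init>,
  // so the recurrence's previous value sits in the last lane on entry to the
  // vector loop.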
2536
2537 // Create a phi node for the new recurrence.
2538 PHINode *EntryPart = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
2539 EntryPart->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
2540 EntryPart->addIncoming(V: VectorInit, BB: VectorPH);
2541 State.set(Def: this, V: EntryPart, Part: 0);
2542}
2543
2544#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2545void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
2546 VPSlotTracker &SlotTracker) const {
2547 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
2548 printAsOperand(O, SlotTracker);
2549 O << " = phi ";
2550 printOperands(O, SlotTracker);
2551}
2552#endif
2553
2554void VPReductionPHIRecipe::execute(VPTransformState &State) {
2555 auto &Builder = State.Builder;
2556
  // Reductions do not have to start at zero. They can start with any
  // loop-invariant value.
2559 VPValue *StartVPV = getStartValue();
2560 Value *StartV = StartVPV->getLiveInIRValue();
2561
2562 // In order to support recurrences we need to be able to vectorize Phi nodes.
2563 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
2564 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
2565 // this value when we vectorize all of the instructions that use the PHI.
2566 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
2567 Type *VecTy = ScalarPHI ? StartV->getType()
2568 : VectorType::get(ElementType: StartV->getType(), EC: State.VF);
2569
2570 BasicBlock *HeaderBB = State.CFG.PrevBB;
2571 assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
2572 "recipe must be in the vector loop header");
2573 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
2574 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2575 Instruction *EntryPart = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
2576 EntryPart->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
2577 State.set(Def: this, V: EntryPart, Part, IsScalar: IsInLoop);
2578 }
2579
2580 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
2581
2582 Value *Iden = nullptr;
2583 RecurKind RK = RdxDesc.getRecurrenceKind();
2584 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) ||
2585 RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
2586 // MinMax and AnyOf reductions have the start value as their identity.
2587 if (ScalarPHI) {
2588 Iden = StartV;
2589 } else {
2590 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2591 Builder.SetInsertPoint(VectorPH->getTerminator());
2592 StartV = Iden =
2593 Builder.CreateVectorSplat(EC: State.VF, V: StartV, Name: "minmax.ident");
2594 }
2595 } else {
2596 Iden = RdxDesc.getRecurrenceIdentity(K: RK, Tp: VecTy->getScalarType(),
2597 FMF: RdxDesc.getFastMathFlags());
2598
2599 if (!ScalarPHI) {
2600 Iden = Builder.CreateVectorSplat(EC: State.VF, V: Iden);
2601 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
2602 Builder.SetInsertPoint(VectorPH->getTerminator());
2603 Constant *Zero = Builder.getInt32(C: 0);
2604 StartV = Builder.CreateInsertElement(Vec: Iden, NewElt: StartV, Idx: Zero);
2605 }
2606 }
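  // E.g. (a sketch) for an integer add reduction with VF == 4 and start value
  // %start, Iden is zeroinitializer and the preheader incoming value for part
  // 0 becomes <%start, 0, 0, 0> (later parts get the plain identity splat),
  // so only lane 0 carries the initial value into the reduction.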
2607
2608 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
2609 Value *EntryPart = State.get(Def: this, Part, IsScalar: IsInLoop);
2610 // Make sure to add the reduction start value only to the
2611 // first unroll part.
2612 Value *StartVal = (Part == 0) ? StartV : Iden;
2613 cast<PHINode>(Val: EntryPart)->addIncoming(V: StartVal, BB: VectorPH);
2614 }
2615}
2616
2617#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2618void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2619 VPSlotTracker &SlotTracker) const {
2620 O << Indent << "WIDEN-REDUCTION-PHI ";
2621
2622 printAsOperand(O, SlotTracker);
2623 O << " = phi ";
2624 printOperands(O, SlotTracker);
2625}
2626#endif
2627
2628void VPWidenPHIRecipe::execute(VPTransformState &State) {
2629 assert(EnableVPlanNativePath &&
2630 "Non-native vplans are not expected to have VPWidenPHIRecipes.");
2631
2632 Value *Op0 = State.get(Def: getOperand(N: 0), Part: 0);
2633 Type *VecTy = Op0->getType();
2634 Value *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name: "vec.phi");
2635 State.set(Def: this, V: VecPhi, Part: 0);
2636}
2637
2638#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2639void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2640 VPSlotTracker &SlotTracker) const {
2641 O << Indent << "WIDEN-PHI ";
2642
2643 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
2646 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
2647 // values as VPValues.
2648 if (getNumOperands() != OriginalPhi->getNumOperands()) {
2649 O << VPlanIngredient(OriginalPhi);
2650 return;
2651 }
2652
2653 printAsOperand(O, SlotTracker);
2654 O << " = phi ";
2655 printOperands(O, SlotTracker);
2656}
2657#endif
2658
2659// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
2660// remove VPActiveLaneMaskPHIRecipe.
2661void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
2662 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
2663 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
2664 Value *StartMask = State.get(Def: getOperand(N: 0), Part);
2665 PHINode *EntryPart =
2666 State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
2667 EntryPart->addIncoming(V: StartMask, BB: VectorPH);
2668 EntryPart->setDebugLoc(getDebugLoc());
2669 State.set(Def: this, V: EntryPart, Part);
2670 }
2671}
2672
2673#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2674void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2675 VPSlotTracker &SlotTracker) const {
2676 O << Indent << "ACTIVE-LANE-MASK-PHI ";
2677
2678 printAsOperand(O, SlotTracker);
2679 O << " = phi ";
2680 printOperands(O, SlotTracker);
2681}
2682#endif
2683
2684void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
2685 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
2686 assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
2687 Value *Start = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
2688 PHINode *EntryPart =
2689 State.Builder.CreatePHI(Ty: Start->getType(), NumReservedValues: 2, Name: "evl.based.iv");
2690 EntryPart->addIncoming(V: Start, BB: VectorPH);
2691 EntryPart->setDebugLoc(getDebugLoc());
2692 State.set(Def: this, V: EntryPart, Part: 0, /*IsScalar=*/true);
2693}
2694
2695#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2696void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
2697 VPSlotTracker &SlotTracker) const {
2698 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
2699
2700 printAsOperand(O, SlotTracker);
2701 O << " = phi ";
2702 printOperands(O, SlotTracker);
2703}
2704#endif
2705