1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/IVDescriptors.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/IR/BasicBlock.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Instruction.h"
29#include "llvm/IR/Instructions.h"
30#include "llvm/IR/Intrinsics.h"
31#include "llvm/IR/Type.h"
32#include "llvm/IR/Value.h"
33#include "llvm/Support/Casting.h"
34#include "llvm/Support/CommandLine.h"
35#include "llvm/Support/Debug.h"
36#include "llvm/Support/raw_ostream.h"
37#include "llvm/Transforms/Utils/BasicBlockUtils.h"
38#include "llvm/Transforms/Utils/LoopUtils.h"
39#include "llvm/Transforms/Utils/LoopVersioning.h"
40#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
41#include <cassert>
42
43using namespace llvm;
44
45using VectorParts = SmallVector<Value *, 2>;
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
50bool VPRecipeBase::mayWriteToMemory() const {
51 switch (getVPDefID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
54 case VPInstructionSC:
55 return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
56 case VPInterleaveSC:
57 return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0;
58 case VPWidenStoreEVLSC:
59 case VPWidenStoreSC:
60 return true;
61 case VPReplicateSC:
62 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
63 ->mayWriteToMemory();
64 case VPWidenCallSC:
65 return !cast<VPWidenCallRecipe>(Val: this)
66 ->getCalledScalarFunction()
67 ->onlyReadsMemory();
68 case VPWidenIntrinsicSC:
69 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
70 case VPBranchOnMaskSC:
71 case VPFirstOrderRecurrencePHISC:
72 case VPScalarIVStepsSC:
73 case VPPredInstPHISC:
74 return false;
75 case VPBlendSC:
76 case VPReductionEVLSC:
77 case VPReductionSC:
78 case VPVectorPointerSC:
79 case VPWidenCanonicalIVSC:
80 case VPWidenCastSC:
81 case VPWidenGEPSC:
82 case VPWidenIntOrFpInductionSC:
83 case VPWidenLoadEVLSC:
84 case VPWidenLoadSC:
85 case VPWidenPHISC:
86 case VPWidenSC:
87 case VPWidenSelectSC: {
88 const Instruction *I =
89 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
90 (void)I;
91 assert((!I || !I->mayWriteToMemory()) &&
92 "underlying instruction may write to memory");
93 return false;
94 }
95 default:
96 return true;
97 }
98}
99
100bool VPRecipeBase::mayReadFromMemory() const {
101 switch (getVPDefID()) {
102 case VPExpressionSC:
103 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
104 case VPInstructionSC:
105 return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
106 case VPWidenLoadEVLSC:
107 case VPWidenLoadSC:
108 return true;
109 case VPReplicateSC:
110 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
111 ->mayReadFromMemory();
112 case VPWidenCallSC:
113 return !cast<VPWidenCallRecipe>(Val: this)
114 ->getCalledScalarFunction()
115 ->onlyWritesMemory();
116 case VPWidenIntrinsicSC:
117 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
118 case VPBranchOnMaskSC:
119 case VPFirstOrderRecurrencePHISC:
120 case VPPredInstPHISC:
121 case VPScalarIVStepsSC:
122 case VPWidenStoreEVLSC:
123 case VPWidenStoreSC:
124 return false;
125 case VPBlendSC:
126 case VPReductionEVLSC:
127 case VPReductionSC:
128 case VPVectorPointerSC:
129 case VPWidenCanonicalIVSC:
130 case VPWidenCastSC:
131 case VPWidenGEPSC:
132 case VPWidenIntOrFpInductionSC:
133 case VPWidenPHISC:
134 case VPWidenSC:
135 case VPWidenSelectSC: {
136 const Instruction *I =
137 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
138 (void)I;
139 assert((!I || !I->mayReadFromMemory()) &&
140 "underlying instruction may read from memory");
141 return false;
142 }
143 default:
144 return true;
145 }
146}
147
148bool VPRecipeBase::mayHaveSideEffects() const {
149 switch (getVPDefID()) {
150 case VPExpressionSC:
151 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
152 case VPDerivedIVSC:
153 case VPFirstOrderRecurrencePHISC:
154 case VPPredInstPHISC:
155 case VPVectorEndPointerSC:
156 return false;
157 case VPInstructionSC:
158 return mayWriteToMemory();
159 case VPWidenCallSC: {
160 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
161 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
162 }
163 case VPWidenIntrinsicSC:
164 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
165 case VPBlendSC:
166 case VPReductionEVLSC:
167 case VPReductionSC:
168 case VPScalarIVStepsSC:
169 case VPVectorPointerSC:
170 case VPWidenCanonicalIVSC:
171 case VPWidenCastSC:
172 case VPWidenGEPSC:
173 case VPWidenIntOrFpInductionSC:
174 case VPWidenPHISC:
175 case VPWidenPointerInductionSC:
176 case VPWidenSC:
177 case VPWidenSelectSC: {
178 const Instruction *I =
179 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
180 (void)I;
181 assert((!I || !I->mayHaveSideEffects()) &&
182 "underlying instruction has side-effects");
183 return false;
184 }
185 case VPInterleaveSC:
186 return mayWriteToMemory();
187 case VPWidenLoadEVLSC:
188 case VPWidenLoadSC:
189 case VPWidenStoreEVLSC:
190 case VPWidenStoreSC:
191 assert(
192 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
193 mayWriteToMemory() &&
194 "mayHaveSideffects result for ingredient differs from this "
195 "implementation");
196 return mayWriteToMemory();
197 case VPReplicateSC: {
198 auto *R = cast<VPReplicateRecipe>(Val: this);
199 return R->getUnderlyingInstr()->mayHaveSideEffects();
200 }
201 default:
202 return true;
203 }
204}
205
206void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
207 assert(!Parent && "Recipe already in some VPBasicBlock");
208 assert(InsertPos->getParent() &&
209 "Insertion position not in any VPBasicBlock");
210 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
211}
212
213void VPRecipeBase::insertBefore(VPBasicBlock &BB,
214 iplist<VPRecipeBase>::iterator I) {
215 assert(!Parent && "Recipe already in some VPBasicBlock");
216 assert(I == BB.end() || I->getParent() == &BB);
217 BB.insert(Recipe: this, InsertPt: I);
218}
219
220void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
221 assert(!Parent && "Recipe already in some VPBasicBlock");
222 assert(InsertPos->getParent() &&
223 "Insertion position not in any VPBasicBlock");
224 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
225}
226
227void VPRecipeBase::removeFromParent() {
228 assert(getParent() && "Recipe not in any VPBasicBlock");
229 getParent()->getRecipeList().remove(IT: getIterator());
230 Parent = nullptr;
231}
232
233iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
234 assert(getParent() && "Recipe not in any VPBasicBlock");
235 return getParent()->getRecipeList().erase(where: getIterator());
236}
237
238void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
239 removeFromParent();
240 insertAfter(InsertPos);
241}
242
243void VPRecipeBase::moveBefore(VPBasicBlock &BB,
244 iplist<VPRecipeBase>::iterator I) {
245 removeFromParent();
246 insertBefore(BB, I);
247}
248
249InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
250 // Get the underlying instruction for the recipe, if there is one. It is used
251 // to
252 // * decide if cost computation should be skipped for this recipe,
253 // * apply forced target instruction cost.
254 Instruction *UI = nullptr;
255 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
256 UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
257 else if (auto *IG = dyn_cast<VPInterleaveRecipe>(Val: this))
258 UI = IG->getInsertPos();
259 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
260 UI = &WidenMem->getIngredient();
261
262 InstructionCost RecipeCost;
263 if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
264 RecipeCost = 0;
265 } else {
266 RecipeCost = computeCost(VF, Ctx);
267 if (UI && ForceTargetInstructionCost.getNumOccurrences() > 0 &&
268 RecipeCost.isValid())
269 RecipeCost = InstructionCost(ForceTargetInstructionCost);
270 }
271
272 LLVM_DEBUG({
273 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
274 dump();
275 });
276 return RecipeCost;
277}
278
279InstructionCost VPRecipeBase::computeCost(ElementCount VF,
280 VPCostContext &Ctx) const {
281 llvm_unreachable("subclasses should implement computeCost");
282}
283
284bool VPRecipeBase::isPhi() const {
285 return (getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC) ||
286 (isa<VPInstruction>(Val: this) &&
287 cast<VPInstruction>(Val: this)->getOpcode() == Instruction::PHI) ||
288 isa<VPIRPhi>(Val: this);
289}
290
291bool VPRecipeBase::isScalarCast() const {
292 auto *VPI = dyn_cast<VPInstruction>(Val: this);
293 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
294}
295
296InstructionCost
297VPPartialReductionRecipe::computeCost(ElementCount VF,
298 VPCostContext &Ctx) const {
299 std::optional<unsigned> Opcode;
300 VPValue *Op = getOperand(N: 0);
301 VPRecipeBase *OpR = Op->getDefiningRecipe();
302
303 // If the partial reduction is predicated, a select will be operand 0
304 using namespace llvm::VPlanPatternMatch;
305 if (match(V: getOperand(N: 1), P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(V&: Op), Op2: m_VPValue()))) {
306 OpR = Op->getDefiningRecipe();
307 }
308
309 Type *InputTypeA = nullptr, *InputTypeB = nullptr;
310 TTI::PartialReductionExtendKind ExtAType = TTI::PR_None,
311 ExtBType = TTI::PR_None;
312
313 auto GetExtendKind = [](VPRecipeBase *R) {
314 if (!R)
315 return TTI::PR_None;
316 auto *WidenCastR = dyn_cast<VPWidenCastRecipe>(Val: R);
317 if (!WidenCastR)
318 return TTI::PR_None;
319 if (WidenCastR->getOpcode() == Instruction::CastOps::ZExt)
320 return TTI::PR_ZeroExtend;
321 if (WidenCastR->getOpcode() == Instruction::CastOps::SExt)
322 return TTI::PR_SignExtend;
323 return TTI::PR_None;
324 };
325
326 // Pick out opcode, type/ext information and use sub side effects from a widen
327 // recipe.
328 auto HandleWiden = [&](VPWidenRecipe *Widen) {
329 if (match(V: Widen,
330 P: m_Binary<Instruction::Sub>(Op0: m_SpecificInt(V: 0), Op1: m_VPValue(V&: Op)))) {
331 Widen = dyn_cast<VPWidenRecipe>(Val: Op->getDefiningRecipe());
332 }
333 Opcode = Widen->getOpcode();
334 VPRecipeBase *ExtAR = Widen->getOperand(N: 0)->getDefiningRecipe();
335 VPRecipeBase *ExtBR = Widen->getOperand(N: 1)->getDefiningRecipe();
336 InputTypeA = Ctx.Types.inferScalarType(V: ExtAR ? ExtAR->getOperand(N: 0)
337 : Widen->getOperand(N: 0));
338 InputTypeB = Ctx.Types.inferScalarType(V: ExtBR ? ExtBR->getOperand(N: 0)
339 : Widen->getOperand(N: 1));
340 ExtAType = GetExtendKind(ExtAR);
341 ExtBType = GetExtendKind(ExtBR);
342 };
343
344 if (isa<VPWidenCastRecipe>(Val: OpR)) {
345 InputTypeA = Ctx.Types.inferScalarType(V: OpR->getOperand(N: 0));
346 ExtAType = GetExtendKind(OpR);
347 } else if (isa<VPReductionPHIRecipe>(Val: OpR)) {
348 auto RedPhiOp1R = getOperand(N: 1)->getDefiningRecipe();
349 if (isa<VPWidenCastRecipe>(Val: RedPhiOp1R)) {
350 InputTypeA = Ctx.Types.inferScalarType(V: RedPhiOp1R->getOperand(N: 0));
351 ExtAType = GetExtendKind(RedPhiOp1R);
352 } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: RedPhiOp1R))
353 HandleWiden(Widen);
354 } else if (auto Widen = dyn_cast<VPWidenRecipe>(Val: OpR)) {
355 HandleWiden(Widen);
356 } else if (auto Reduction = dyn_cast<VPPartialReductionRecipe>(Val: OpR)) {
357 return Reduction->computeCost(VF, Ctx);
358 }
359 auto *PhiType = Ctx.Types.inferScalarType(V: getOperand(N: 1));
360 return Ctx.TTI.getPartialReductionCost(Opcode: getOpcode(), InputTypeA, InputTypeB,
361 AccumType: PhiType, VF, OpAExtend: ExtAType, OpBExtend: ExtBType,
362 BinOp: Opcode, CostKind: Ctx.CostKind);
363}
364
365void VPPartialReductionRecipe::execute(VPTransformState &State) {
366 auto &Builder = State.Builder;
367
368 assert(getOpcode() == Instruction::Add &&
369 "Unhandled partial reduction opcode");
370
371 Value *BinOpVal = State.get(Def: getOperand(N: 1));
372 Value *PhiVal = State.get(Def: getOperand(N: 0));
373 assert(PhiVal && BinOpVal && "Phi and Mul must be set");
374
375 Type *RetTy = PhiVal->getType();
376
377 CallInst *V = Builder.CreateIntrinsic(
378 RetTy, ID: Intrinsic::experimental_vector_partial_reduce_add,
379 Args: {PhiVal, BinOpVal}, FMFSource: nullptr, Name: "partial.reduce");
380
381 State.set(Def: this, V);
382}
383
384#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
385void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
386 VPSlotTracker &SlotTracker) const {
387 O << Indent << "PARTIAL-REDUCE ";
388 printAsOperand(O, SlotTracker);
389 O << " = " << Instruction::getOpcodeName(getOpcode()) << " ";
390 printOperands(O, SlotTracker);
391}
392#endif
393
394FastMathFlags VPIRFlags::getFastMathFlags() const {
395 assert(OpType == OperationType::FPMathOp &&
396 "recipe doesn't have fast math flags");
397 FastMathFlags Res;
398 Res.setAllowReassoc(FMFs.AllowReassoc);
399 Res.setNoNaNs(FMFs.NoNaNs);
400 Res.setNoInfs(FMFs.NoInfs);
401 Res.setNoSignedZeros(FMFs.NoSignedZeros);
402 Res.setAllowReciprocal(FMFs.AllowReciprocal);
403 Res.setAllowContract(FMFs.AllowContract);
404 Res.setApproxFunc(FMFs.ApproxFunc);
405 return Res;
406}
407
408#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
409void VPSingleDefRecipe::dump() const { VPDef::dump(); }
410#endif
411
412template <unsigned PartOpIdx>
413VPValue *
414VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
415 if (U.getNumOperands() == PartOpIdx + 1)
416 return U.getOperand(N: PartOpIdx);
417 return nullptr;
418}
419
420template <unsigned PartOpIdx>
421unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(VPUser &U) const {
422 if (auto *UnrollPartOp = getUnrollPartOperand(U))
423 return cast<ConstantInt>(UnrollPartOp->getLiveInIRValue())->getZExtValue();
424 return 0;
425}
426
427namespace llvm {
428template class VPUnrollPartAccessor<2>;
429template class VPUnrollPartAccessor<3>;
430}
431
432VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
433 const VPIRFlags &Flags, DebugLoc DL,
434 const Twine &Name)
435 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL),
436 VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {
437 assert(flagsValidForOpcode(getOpcode()) &&
438 "Set flags not supported for the provided opcode");
439 assert((getNumOperandsForOpcode(Opcode) == -1u ||
440 getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
441 "number of operands does not match opcode");
442}
443
444#ifndef NDEBUG
445unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
446 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
447 return 1;
448
449 if (Instruction::isBinaryOp(Opcode))
450 return 2;
451
452 switch (Opcode) {
453 case VPInstruction::StepVector:
454 return 0;
455 case Instruction::Alloca:
456 case Instruction::ExtractValue:
457 case Instruction::Freeze:
458 case Instruction::Load:
459 case VPInstruction::AnyOf:
460 case VPInstruction::BranchOnCond:
461 case VPInstruction::CalculateTripCountMinusVF:
462 case VPInstruction::CanonicalIVIncrementForPart:
463 case VPInstruction::ExplicitVectorLength:
464 case VPInstruction::ExtractLastElement:
465 case VPInstruction::ExtractPenultimateElement:
466 case VPInstruction::FirstActiveLane:
467 case VPInstruction::Not:
468 return 1;
469 case Instruction::ICmp:
470 case Instruction::FCmp:
471 case Instruction::Store:
472 case VPInstruction::ActiveLaneMask:
473 case VPInstruction::BranchOnCount:
474 case VPInstruction::ComputeReductionResult:
475 case VPInstruction::FirstOrderRecurrenceSplice:
476 case VPInstruction::LogicalAnd:
477 case VPInstruction::PtrAdd:
478 case VPInstruction::WideIVStep:
479 return 2;
480 case Instruction::Select:
481 case VPInstruction::ComputeAnyOfResult:
482 case VPInstruction::ReductionStartVector:
483 return 3;
484 case VPInstruction::ComputeFindIVResult:
485 return 4;
486 case Instruction::Call:
487 case Instruction::GetElementPtr:
488 case Instruction::PHI:
489 case Instruction::Switch:
490 // Cannot determine the number of operands from the opcode.
491 return -1u;
492 }
493 llvm_unreachable("all cases should be handled above");
494}
495#endif
496
497bool VPInstruction::doesGeneratePerAllLanes() const {
498 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
499}
500
501bool VPInstruction::canGenerateScalarForFirstLane() const {
502 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
503 return true;
504 if (isSingleScalar() || isVectorToScalar())
505 return true;
506 switch (Opcode) {
507 case Instruction::Freeze:
508 case Instruction::ICmp:
509 case Instruction::PHI:
510 case Instruction::Select:
511 case VPInstruction::BranchOnCond:
512 case VPInstruction::BranchOnCount:
513 case VPInstruction::CalculateTripCountMinusVF:
514 case VPInstruction::CanonicalIVIncrementForPart:
515 case VPInstruction::PtrAdd:
516 case VPInstruction::ExplicitVectorLength:
517 case VPInstruction::AnyOf:
518 return true;
519 default:
520 return false;
521 }
522}
523
524Value *VPInstruction::generatePerLane(VPTransformState &State,
525 const VPLane &Lane) {
526 IRBuilderBase &Builder = State.Builder;
527
528 assert(getOpcode() == VPInstruction::PtrAdd &&
529 "only PtrAdd opcodes are supported for now");
530 return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Lane),
531 Offset: State.get(Def: getOperand(N: 1), Lane), Name);
532}
533
534/// Create a conditional branch using \p Cond branching to the successors of \p
535/// VPBB. Note that the first successor is always forward (i.e. not created yet)
536/// while the second successor may already have been created (if it is a header
537/// block and VPBB is a latch).
538static BranchInst *createCondBranch(Value *Cond, VPBasicBlock *VPBB,
539 VPTransformState &State) {
540 // Replace the temporary unreachable terminator with a new conditional
541 // branch, hooking it up to backward destination (header) for latch blocks
542 // now, and to forward destination(s) later when they are created.
543 // Second successor may be backwards - iff it is already in VPBB2IRBB.
544 VPBasicBlock *SecondVPSucc = cast<VPBasicBlock>(Val: VPBB->getSuccessors()[1]);
545 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
546 BasicBlock *IRBB = State.CFG.VPBB2IRBB[VPBB];
547 BranchInst *CondBr = State.Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
548 // First successor is always forward, reset it to nullptr
549 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
550 IRBB->getTerminator()->eraseFromParent();
551 return CondBr;
552}
553
554Value *VPInstruction::generate(VPTransformState &State) {
555 IRBuilderBase &Builder = State.Builder;
556
557 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
558 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
559 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
560 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
561 auto *Res =
562 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
563 if (auto *I = dyn_cast<Instruction>(Val: Res))
564 applyFlags(I&: *I);
565 return Res;
566 }
567
568 switch (getOpcode()) {
569 case VPInstruction::Not: {
570 Value *A = State.get(Def: getOperand(N: 0));
571 return Builder.CreateNot(V: A, Name);
572 }
573 case Instruction::ExtractElement: {
574 assert(State.VF.isVector() && "Only extract elements from vectors");
575 if (getOperand(N: 1)->isLiveIn()) {
576 unsigned IdxToExtract =
577 cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue())->getZExtValue();
578 return State.get(Def: getOperand(N: 0), Lane: VPLane(IdxToExtract));
579 }
580 Value *Vec = State.get(Def: getOperand(N: 0));
581 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
582 return Builder.CreateExtractElement(Vec, Idx, Name);
583 }
584 case Instruction::Freeze: {
585 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
586 return Builder.CreateFreeze(V: Op, Name);
587 }
588 case Instruction::ICmp: {
589 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
590 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
591 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
592 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
593 }
594 case Instruction::PHI: {
595 llvm_unreachable("should be handled by VPPhi::execute");
596 }
597 case Instruction::Select: {
598 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
599 Value *Cond = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
600 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
601 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
602 return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name);
603 }
604 case VPInstruction::ActiveLaneMask: {
605 // Get first lane of vector induction variable.
606 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
607 // Get the original loop tripcount.
608 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
609
610 // If this part of the active lane mask is scalar, generate the CMP directly
611 // to avoid unnecessary extracts.
612 if (State.VF.isScalar())
613 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
614 Name);
615
616 auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext());
617 auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF);
618 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
619 Types: {PredTy, ScalarTC->getType()},
620 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
621 }
622 case VPInstruction::FirstOrderRecurrenceSplice: {
623 // Generate code to combine the previous and current values in vector v3.
624 //
625 // vector.ph:
626 // v_init = vector(..., ..., ..., a[-1])
627 // br vector.body
628 //
629 // vector.body
630 // i = phi [0, vector.ph], [i+4, vector.body]
631 // v1 = phi [v_init, vector.ph], [v2, vector.body]
632 // v2 = a[i, i+1, i+2, i+3];
633 // v3 = vector(v1(3), v2(0, 1, 2))
634
635 auto *V1 = State.get(Def: getOperand(N: 0));
636 if (!V1->getType()->isVectorTy())
637 return V1;
638 Value *V2 = State.get(Def: getOperand(N: 1));
639 return Builder.CreateVectorSplice(V1, V2, Imm: -1, Name);
640 }
641 case VPInstruction::CalculateTripCountMinusVF: {
642 unsigned UF = getParent()->getPlan()->getUF();
643 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
644 Value *Step = createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: UF);
645 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step);
646 Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step);
647 Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0);
648 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
649 }
650 case VPInstruction::ExplicitVectorLength: {
651 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
652 // be outside of the main loop.
653 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
654 // Compute EVL
655 assert(AVL->getType()->isIntegerTy() &&
656 "Requested vector length should be an integer.");
657
658 assert(State.VF.isScalable() && "Expected scalable vector factor.");
659 Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue());
660
661 Value *EVL = State.Builder.CreateIntrinsic(
662 RetTy: State.Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
663 Args: {AVL, VFArg, State.Builder.getTrue()});
664 return EVL;
665 }
666 case VPInstruction::CanonicalIVIncrementForPart: {
667 unsigned Part = getUnrollPart(U&: *this);
668 auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
669 assert(Part != 0 && "Must have a positive part");
670 // The canonical IV is incremented by the vectorization factor (num of
671 // SIMD elements) times the unroll part.
672 Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part);
673 return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(),
674 HasNSW: hasNoSignedWrap());
675 }
676 case VPInstruction::BranchOnCond: {
677 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
678 auto *Br = createCondBranch(Cond, VPBB: getParent(), State);
679 applyMetadata(I&: *Br);
680 return Br;
681 }
682 case VPInstruction::BranchOnCount: {
683 // First create the compare.
684 Value *IV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
685 Value *TC = State.get(Def: getOperand(N: 1), /*IsScalar*/ true);
686 Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC);
687 return createCondBranch(Cond, VPBB: getParent(), State);
688 }
689 case VPInstruction::Broadcast: {
690 return Builder.CreateVectorSplat(
691 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
692 }
693 case VPInstruction::BuildStructVector: {
694 // For struct types, we need to build a new 'wide' struct type, where each
695 // element is widened, i.e., we create a struct of vectors.
696 auto *StructTy =
697 cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)));
698 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
699 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
700 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
701 FieldIndex++) {
702 Value *ScalarValue =
703 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
704 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
705 VectorValue =
706 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
707 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
708 }
709 }
710 return Res;
711 }
712 case VPInstruction::BuildVector: {
713 auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
714 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
715 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
716 for (const auto &[Idx, Op] : enumerate(First: operands()))
717 Res = State.Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
718 Idx: State.Builder.getInt32(C: Idx));
719 return Res;
720 }
721 case VPInstruction::ReductionStartVector: {
722 if (State.VF.isScalar())
723 return State.get(Def: getOperand(N: 0), IsScalar: true);
724 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
725 Builder.setFastMathFlags(getFastMathFlags());
726 // If this start vector is scaled then it should produce a vector with fewer
727 // elements than the VF.
728 ElementCount VF = State.VF.divideCoefficientBy(
729 RHS: cast<ConstantInt>(Val: getOperand(N: 2)->getLiveInIRValue())->getZExtValue());
730 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
731 Constant *Zero = Builder.getInt32(C: 0);
732 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
733 Idx: Zero);
734 }
735 case VPInstruction::ComputeAnyOfResult: {
736 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
737 // and will be removed by breaking up the recipe further.
738 auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0));
739 auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
740 Value *ReducedPartRdx = State.get(Def: getOperand(N: 2));
741 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
742 ReducedPartRdx = Builder.CreateBinOp(
743 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(
744 Kind: RecurKind::AnyOf),
745 LHS: State.get(Def: getOperand(N: Idx)), RHS: ReducedPartRdx, Name: "bin.rdx");
746 return createAnyOfReduction(B&: Builder, Src: ReducedPartRdx,
747 InitVal: State.get(Def: getOperand(N: 1), Lane: VPLane(0)), OrigPhi);
748 }
749 case VPInstruction::ComputeFindIVResult: {
750 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
751 // and will be removed by breaking up the recipe further.
752 auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0));
753 // Get its reduction variable descriptor.
754 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
755 RecurKind RK = RdxDesc.getRecurrenceKind();
756 assert(RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
757 "Unexpected reduction kind");
758 assert(!PhiR->isInLoop() &&
759 "In-loop FindLastIV reduction is not supported yet");
760
761 // The recipe's operands are the reduction phi, the start value, the
762 // sentinel value, followed by one operand for each part of the reduction.
763 unsigned UF = getNumOperands() - 3;
764 Value *ReducedPartRdx = State.get(Def: getOperand(N: 3));
765 RecurKind MinMaxKind;
766 bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(Kind: RK);
767 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind: RK)) {
768 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
769 } else {
770 assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) &&
771 "Kind must either be FindLastIV or FindFirstIV");
772 assert(IsSigned && "Only FindFirstIV with SMax is currently supported");
773 MinMaxKind = RecurKind::SMin;
774 }
775 for (unsigned Part = 1; Part < UF; ++Part)
776 ReducedPartRdx = createMinMaxOp(Builder, RK: MinMaxKind, Left: ReducedPartRdx,
777 Right: State.get(Def: getOperand(N: 3 + Part)));
778
779 Value *Start = State.get(Def: getOperand(N: 1), IsScalar: true);
780 Value *Sentinel = getOperand(N: 2)->getLiveInIRValue();
781 return createFindLastIVReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK, Start,
782 Sentinel);
783 }
784 case VPInstruction::ComputeReductionResult: {
785 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
786 // and will be removed by breaking up the recipe further.
787 auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0));
788 // Get its reduction variable descriptor.
789 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
790
791 RecurKind RK = RdxDesc.getRecurrenceKind();
792 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
793 "should be handled by ComputeFindIVResult");
794
795 // The recipe's operands are the reduction phi, followed by one operand for
796 // each part of the reduction.
797 unsigned UF = getNumOperands() - 1;
798 VectorParts RdxParts(UF);
799 for (unsigned Part = 0; Part < UF; ++Part)
800 RdxParts[Part] = State.get(Def: getOperand(N: 1 + Part), IsScalar: PhiR->isInLoop());
801
802 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
803 if (hasFastMathFlags())
804 Builder.setFastMathFlags(getFastMathFlags());
805
806 // Reduce all of the unrolled parts into a single vector.
807 Value *ReducedPartRdx = RdxParts[0];
808 if (PhiR->isOrdered()) {
809 ReducedPartRdx = RdxParts[UF - 1];
810 } else {
811 // Floating-point operations should have some FMF to enable the reduction.
812 for (unsigned Part = 1; Part < UF; ++Part) {
813 Value *RdxPart = RdxParts[Part];
814 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
815 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
816 else
817 ReducedPartRdx =
818 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(),
819 LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
820 }
821 }
822
823 // Create the reduction after the loop. Note that inloop reductions create
824 // the target reduction in the loop using a Reduction recipe.
825 if (State.VF.isVector() && !PhiR->isInLoop()) {
826 // TODO: Support in-order reductions based on the recurrence descriptor.
827 // All ops in the reduction inherit fast-math-flags from the recurrence
828 // descriptor.
829 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
830 }
831
832 return ReducedPartRdx;
833 }
834 case VPInstruction::ExtractLastElement:
835 case VPInstruction::ExtractPenultimateElement: {
836 unsigned Offset = getOpcode() == VPInstruction::ExtractLastElement ? 1 : 2;
837 Value *Res;
838 if (State.VF.isVector()) {
839 assert(Offset <= State.VF.getKnownMinValue() &&
840 "invalid offset to extract from");
841 // Extract lane VF - Offset from the operand.
842 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
843 } else {
844 assert(Offset <= 1 && "invalid offset to extract from");
845 Res = State.get(Def: getOperand(N: 0));
846 }
847 if (isa<ExtractElementInst>(Val: Res))
848 Res->setName(Name);
849 return Res;
850 }
851 case VPInstruction::LogicalAnd: {
852 Value *A = State.get(Def: getOperand(N: 0));
853 Value *B = State.get(Def: getOperand(N: 1));
854 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
855 }
856 case VPInstruction::PtrAdd: {
857 assert(vputils::onlyFirstLaneUsed(this) &&
858 "can only generate first lane for PtrAdd");
859 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
860 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
861 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
862 }
863 case VPInstruction::AnyOf: {
864 Value *Res = State.get(Def: getOperand(N: 0));
865 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
866 Res = Builder.CreateOr(LHS: Res, RHS: State.get(Def: Op));
867 return Builder.CreateOrReduce(Src: Res);
868 }
869 case VPInstruction::FirstActiveLane: {
870 if (getNumOperands() == 1) {
871 Value *Mask = State.get(Def: getOperand(N: 0));
872 return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask,
873 ZeroIsPoison: true, Name);
874 }
875 // If there are multiple operands, create a chain of selects to pick the
876 // first operand with an active lane and add the number of lanes of the
877 // preceding operands.
878 Value *RuntimeVF =
879 getRuntimeVF(B&: State.Builder, Ty: State.Builder.getInt64Ty(), VF: State.VF);
880 unsigned LastOpIdx = getNumOperands() - 1;
881 Value *Res = nullptr;
882 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
883 Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
884 ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)), ZeroIsPoison: true, Name);
885 Value *Current = Builder.CreateAdd(
886 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros);
887 if (Res) {
888 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
889 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
890 } else {
891 Res = Current;
892 }
893 }
894
895 return Res;
896 }
897 default:
898 llvm_unreachable("Unsupported opcode for instruction");
899 }
900}
901
902InstructionCost VPInstruction::computeCost(ElementCount VF,
903 VPCostContext &Ctx) const {
904 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
905 Type *ResTy = Ctx.Types.inferScalarType(V: this);
906 if (!vputils::onlyFirstLaneUsed(Def: this))
907 ResTy = toVectorTy(Scalar: ResTy, EC: VF);
908
909 if (!getUnderlyingValue()) {
910 switch (getOpcode()) {
911 case Instruction::FMul:
912 return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind);
913 default:
914 // TODO: Compute cost for VPInstructions without underlying values once
915 // the legacy cost model has been retired.
916 return 0;
917 }
918 }
919
920 assert(!doesGeneratePerAllLanes() &&
921 "Should only generate a vector value or single scalar, not scalars "
922 "for all lanes.");
923 return Ctx.TTI.getArithmeticInstrCost(Opcode: getOpcode(), Ty: ResTy, CostKind: Ctx.CostKind);
924 }
925
926 switch (getOpcode()) {
927 case Instruction::ExtractElement: {
928 // Add on the cost of extracting the element.
929 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
930 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
931 CostKind: Ctx.CostKind);
932 }
933 case VPInstruction::AnyOf: {
934 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
935 return Ctx.TTI.getArithmeticReductionCost(
936 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
937 }
938 case VPInstruction::FirstActiveLane: {
939 // Calculate the cost of determining the lane index.
940 auto *PredTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
941 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
942 Type::getInt64Ty(C&: Ctx.LLVMCtx),
943 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
944 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
945 }
946 case VPInstruction::FirstOrderRecurrenceSplice: {
947 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
948 SmallVector<int> Mask(VF.getKnownMinValue());
949 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
950 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
951
952 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
953 DstTy: cast<VectorType>(Val: VectorTy),
954 SrcTy: cast<VectorType>(Val: VectorTy), Mask,
955 CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1);
956 }
957 case VPInstruction::ActiveLaneMask: {
958 Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
959 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
960 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
961 {ArgTy, ArgTy});
962 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
963 }
964 case VPInstruction::ExplicitVectorLength: {
965 Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0));
966 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
967 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
968 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
969 I32Ty, {Arg0Ty, I32Ty, I1Ty});
970 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
971 }
972 case VPInstruction::ExtractPenultimateElement:
973 if (VF == ElementCount::getScalable(MinVal: 1))
974 return InstructionCost::getInvalid();
975 LLVM_FALLTHROUGH;
976 default:
977 // TODO: Compute cost other VPInstructions once the legacy cost model has
978 // been retired.
979 assert(!getUnderlyingValue() &&
980 "unexpected VPInstruction witht underlying value");
981 return 0;
982 }
983}
984
985bool VPInstruction::isVectorToScalar() const {
986 return getOpcode() == VPInstruction::ExtractLastElement ||
987 getOpcode() == VPInstruction::ExtractPenultimateElement ||
988 getOpcode() == Instruction::ExtractElement ||
989 getOpcode() == VPInstruction::FirstActiveLane ||
990 getOpcode() == VPInstruction::ComputeAnyOfResult ||
991 getOpcode() == VPInstruction::ComputeFindIVResult ||
992 getOpcode() == VPInstruction::ComputeReductionResult ||
993 getOpcode() == VPInstruction::AnyOf;
994}
995
996bool VPInstruction::isSingleScalar() const {
997 return getOpcode() == Instruction::PHI || isScalarCast();
998}
999
1000void VPInstruction::execute(VPTransformState &State) {
1001 assert(!State.Lane && "VPInstruction executing an Lane");
1002 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1003 assert(flagsValidForOpcode(getOpcode()) &&
1004 "Set flags not supported for the provided opcode");
1005 if (hasFastMathFlags())
1006 State.Builder.setFastMathFlags(getFastMathFlags());
1007 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1008 (vputils::onlyFirstLaneUsed(Def: this) ||
1009 isVectorToScalar() || isSingleScalar());
1010 bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
1011 if (GeneratesPerAllLanes) {
1012 for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
1013 Lane != NumLanes; ++Lane) {
1014 Value *GeneratedValue = generatePerLane(State, Lane: VPLane(Lane));
1015 assert(GeneratedValue && "generatePerLane must produce a value");
1016 State.set(Def: this, V: GeneratedValue, Lane: VPLane(Lane));
1017 }
1018 return;
1019 }
1020
1021 Value *GeneratedValue = generate(State);
1022 if (!hasResult())
1023 return;
1024 assert(GeneratedValue && "generate must produce a value");
1025 assert((((GeneratedValue->getType()->isVectorTy() ||
1026 GeneratedValue->getType()->isStructTy()) ==
1027 !GeneratesPerFirstLaneOnly) ||
1028 State.VF.isScalar()) &&
1029 "scalar value but not only first lane defined");
1030 State.set(Def: this, V: GeneratedValue,
1031 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1032}
1033
1034bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
1035 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1036 return false;
1037 switch (getOpcode()) {
1038 case Instruction::ExtractElement:
1039 case Instruction::Freeze:
1040 case Instruction::ICmp:
1041 case Instruction::Select:
1042 case VPInstruction::AnyOf:
1043 case VPInstruction::BuildStructVector:
1044 case VPInstruction::BuildVector:
1045 case VPInstruction::CalculateTripCountMinusVF:
1046 case VPInstruction::CanonicalIVIncrementForPart:
1047 case VPInstruction::ExtractLastElement:
1048 case VPInstruction::ExtractPenultimateElement:
1049 case VPInstruction::FirstActiveLane:
1050 case VPInstruction::FirstOrderRecurrenceSplice:
1051 case VPInstruction::LogicalAnd:
1052 case VPInstruction::Not:
1053 case VPInstruction::PtrAdd:
1054 case VPInstruction::WideIVStep:
1055 case VPInstruction::StepVector:
1056 case VPInstruction::ReductionStartVector:
1057 return false;
1058 default:
1059 return true;
1060 }
1061}
1062
1063bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
1064 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1065 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1066 return vputils::onlyFirstLaneUsed(Def: this);
1067
1068 switch (getOpcode()) {
1069 default:
1070 return false;
1071 case Instruction::ExtractElement:
1072 return Op == getOperand(N: 1);
1073 case Instruction::PHI:
1074 return true;
1075 case Instruction::ICmp:
1076 case Instruction::Select:
1077 case Instruction::Or:
1078 case Instruction::Freeze:
1079 // TODO: Cover additional opcodes.
1080 return vputils::onlyFirstLaneUsed(Def: this);
1081 case VPInstruction::ActiveLaneMask:
1082 case VPInstruction::ExplicitVectorLength:
1083 case VPInstruction::CalculateTripCountMinusVF:
1084 case VPInstruction::CanonicalIVIncrementForPart:
1085 case VPInstruction::BranchOnCount:
1086 case VPInstruction::BranchOnCond:
1087 case VPInstruction::Broadcast:
1088 case VPInstruction::ReductionStartVector:
1089 return true;
1090 case VPInstruction::PtrAdd:
1091 return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
1092 case VPInstruction::ComputeAnyOfResult:
1093 case VPInstruction::ComputeFindIVResult:
1094 return Op == getOperand(N: 1);
1095 };
1096 llvm_unreachable("switch should return");
1097}
1098
1099bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
1100 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1101 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1102 return vputils::onlyFirstPartUsed(Def: this);
1103
1104 switch (getOpcode()) {
1105 default:
1106 return false;
1107 case Instruction::ICmp:
1108 case Instruction::Select:
1109 return vputils::onlyFirstPartUsed(Def: this);
1110 case VPInstruction::BranchOnCount:
1111 case VPInstruction::BranchOnCond:
1112 case VPInstruction::CanonicalIVIncrementForPart:
1113 return true;
1114 };
1115 llvm_unreachable("switch should return");
1116}
1117
1118#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1119void VPInstruction::dump() const {
1120 VPSlotTracker SlotTracker(getParent()->getPlan());
1121 print(dbgs(), "", SlotTracker);
1122}
1123
1124void VPInstruction::print(raw_ostream &O, const Twine &Indent,
1125 VPSlotTracker &SlotTracker) const {
1126 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1127
1128 if (hasResult()) {
1129 printAsOperand(O, SlotTracker);
1130 O << " = ";
1131 }
1132
1133 switch (getOpcode()) {
1134 case VPInstruction::Not:
1135 O << "not";
1136 break;
1137 case VPInstruction::SLPLoad:
1138 O << "combined load";
1139 break;
1140 case VPInstruction::SLPStore:
1141 O << "combined store";
1142 break;
1143 case VPInstruction::ActiveLaneMask:
1144 O << "active lane mask";
1145 break;
1146 case VPInstruction::ExplicitVectorLength:
1147 O << "EXPLICIT-VECTOR-LENGTH";
1148 break;
1149 case VPInstruction::FirstOrderRecurrenceSplice:
1150 O << "first-order splice";
1151 break;
1152 case VPInstruction::BranchOnCond:
1153 O << "branch-on-cond";
1154 break;
1155 case VPInstruction::CalculateTripCountMinusVF:
1156 O << "TC > VF ? TC - VF : 0";
1157 break;
1158 case VPInstruction::CanonicalIVIncrementForPart:
1159 O << "VF * Part +";
1160 break;
1161 case VPInstruction::BranchOnCount:
1162 O << "branch-on-count";
1163 break;
1164 case VPInstruction::Broadcast:
1165 O << "broadcast";
1166 break;
1167 case VPInstruction::BuildStructVector:
1168 O << "buildstructvector";
1169 break;
1170 case VPInstruction::BuildVector:
1171 O << "buildvector";
1172 break;
1173 case VPInstruction::ExtractLastElement:
1174 O << "extract-last-element";
1175 break;
1176 case VPInstruction::ExtractPenultimateElement:
1177 O << "extract-penultimate-element";
1178 break;
1179 case VPInstruction::ComputeAnyOfResult:
1180 O << "compute-anyof-result";
1181 break;
1182 case VPInstruction::ComputeFindIVResult:
1183 O << "compute-find-iv-result";
1184 break;
1185 case VPInstruction::ComputeReductionResult:
1186 O << "compute-reduction-result";
1187 break;
1188 case VPInstruction::LogicalAnd:
1189 O << "logical-and";
1190 break;
1191 case VPInstruction::PtrAdd:
1192 O << "ptradd";
1193 break;
1194 case VPInstruction::AnyOf:
1195 O << "any-of";
1196 break;
1197 case VPInstruction::FirstActiveLane:
1198 O << "first-active-lane";
1199 break;
1200 case VPInstruction::ReductionStartVector:
1201 O << "reduction-start-vector";
1202 break;
1203 default:
1204 O << Instruction::getOpcodeName(getOpcode());
1205 }
1206
1207 printFlags(O);
1208 printOperands(O, SlotTracker);
1209
1210 if (auto DL = getDebugLoc()) {
1211 O << ", !dbg ";
1212 DL.print(O);
1213 }
1214}
1215#endif
1216
1217void VPInstructionWithType::execute(VPTransformState &State) {
1218 State.setDebugLocFrom(getDebugLoc());
1219 if (isScalarCast()) {
1220 Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
1221 Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
1222 V: Op, DestTy: ResultTy);
1223 State.set(Def: this, V: Cast, Lane: VPLane(0));
1224 return;
1225 }
1226 switch (getOpcode()) {
1227 case VPInstruction::StepVector: {
1228 Value *StepVector =
1229 State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
1230 State.set(Def: this, V: StepVector);
1231 break;
1232 }
1233 default:
1234 llvm_unreachable("opcode not implemented yet");
1235 }
1236}
1237
1238#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1239void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,
1240 VPSlotTracker &SlotTracker) const {
1241 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1242 printAsOperand(O, SlotTracker);
1243 O << " = ";
1244
1245 switch (getOpcode()) {
1246 case VPInstruction::WideIVStep:
1247 O << "wide-iv-step ";
1248 printOperands(O, SlotTracker);
1249 break;
1250 case VPInstruction::StepVector:
1251 O << "step-vector " << *ResultTy;
1252 break;
1253 default:
1254 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1255 O << Instruction::getOpcodeName(getOpcode()) << " ";
1256 printOperands(O, SlotTracker);
1257 O << " to " << *ResultTy;
1258 }
1259}
1260#endif
1261
1262void VPPhi::execute(VPTransformState &State) {
1263 State.setDebugLocFrom(getDebugLoc());
1264 PHINode *NewPhi = State.Builder.CreatePHI(
1265 Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
1266 unsigned NumIncoming = getNumIncoming();
1267 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1268 // TODO: Fixup all incoming values of header phis once recipes defining them
1269 // are introduced.
1270 NumIncoming = 1;
1271 }
1272 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1273 Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
1274 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
1275 NewPhi->addIncoming(V: IncV, BB: PredBB);
1276 }
1277 State.set(Def: this, V: NewPhi, Lane: VPLane(0));
1278}
1279
1280#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1281void VPPhi::print(raw_ostream &O, const Twine &Indent,
1282 VPSlotTracker &SlotTracker) const {
1283 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1284 printAsOperand(O, SlotTracker);
1285 O << " = phi ";
1286 printPhiOperands(O, SlotTracker);
1287}
1288#endif
1289
1290VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1291 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1292 return new VPIRPhi(*Phi);
1293 return new VPIRInstruction(I);
1294}
1295
1296void VPIRInstruction::execute(VPTransformState &State) {
1297 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1298 "PHINodes must be handled by VPIRPhi");
1299 // Advance the insert point after the wrapped IR instruction. This allows
1300 // interleaving VPIRInstructions and other recipes.
1301 State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
1302}
1303
1304InstructionCost VPIRInstruction::computeCost(ElementCount VF,
1305 VPCostContext &Ctx) const {
1306 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1307 // hence it does not contribute to the cost-modeling for the VPlan.
1308 return 0;
1309}
1310
1311void VPIRInstruction::extractLastLaneOfFirstOperand(VPBuilder &Builder) {
1312 assert(isa<PHINode>(getInstruction()) &&
1313 "can only update exiting operands to phi nodes");
1314 assert(getNumOperands() > 0 && "must have at least one operand");
1315 VPValue *Exiting = getOperand(N: 0);
1316 if (Exiting->isLiveIn())
1317 return;
1318
1319 Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastElement, Operands: {Exiting});
1320 setOperand(I: 0, New: Exiting);
1321}
1322
1323#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1324void VPIRInstruction::print(raw_ostream &O, const Twine &Indent,
1325 VPSlotTracker &SlotTracker) const {
1326 O << Indent << "IR " << I;
1327}
1328#endif
1329
1330void VPIRPhi::execute(VPTransformState &State) {
1331 PHINode *Phi = &getIRPhi();
1332 for (const auto &[Idx, Op] : enumerate(First: operands())) {
1333 VPValue *ExitValue = Op;
1334 auto Lane = vputils::isSingleScalar(VPV: ExitValue)
1335 ? VPLane::getFirstLane()
1336 : VPLane::getLastLaneForVF(VF: State.VF);
1337 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1338 auto *PredVPBB = Pred->getExitingBasicBlock();
1339 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1340 // Set insertion point in PredBB in case an extract needs to be generated.
1341 // TODO: Model extracts explicitly.
1342 State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt());
1343 Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
1344 // If there is no existing block for PredBB in the phi, add a new incoming
1345 // value. Otherwise update the existing incoming value for PredBB.
1346 if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
1347 Phi->addIncoming(V, BB: PredBB);
1348 else
1349 Phi->setIncomingValueForBlock(BB: PredBB, V);
1350 }
1351
1352 // Advance the insert point after the wrapped IR instruction. This allows
1353 // interleaving VPIRInstructions and other recipes.
1354 State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
1355}
1356
1357void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
1358 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1359 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1360 "Number of phi operands must match number of predecessors");
1361 unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
1362 R->removeOperand(Idx: Position);
1363}
1364
1365#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1366void VPPhiAccessors::printPhiOperands(raw_ostream &O,
1367 VPSlotTracker &SlotTracker) const {
1368 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1369 [this, &O, &SlotTracker](auto Op) {
1370 O << "[ ";
1371 Op.value()->printAsOperand(O, SlotTracker);
1372 O << ", ";
1373 getIncomingBlock(Op.index())->printAsOperand(O);
1374 O << " ]";
1375 });
1376}
1377#endif
1378
1379#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1380void VPIRPhi::print(raw_ostream &O, const Twine &Indent,
1381 VPSlotTracker &SlotTracker) const {
1382 VPIRInstruction::print(O, Indent, SlotTracker);
1383
1384 if (getNumOperands() != 0) {
1385 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1386 interleaveComma(
1387 enumerate(operands()), O, [this, &O, &SlotTracker](auto Op) {
1388 Op.value()->printAsOperand(O, SlotTracker);
1389 O << " from ";
1390 getParent()->getPredecessors()[Op.index()]->printAsOperand(O);
1391 });
1392 O << ")";
1393 }
1394}
1395#endif
1396
1397VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer)
1398 : VPIRMetadata(I) {
1399 if (!LVer || !isa<LoadInst, StoreInst>(Val: &I))
1400 return;
1401 const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(OrigInst: &I);
1402 if (AliasScopeMD)
1403 Metadata.emplace_back(Args: LLVMContext::MD_alias_scope, Args: AliasScopeMD);
1404 if (NoAliasMD)
1405 Metadata.emplace_back(Args: LLVMContext::MD_noalias, Args: NoAliasMD);
1406}
1407
1408void VPIRMetadata::applyMetadata(Instruction &I) const {
1409 for (const auto &[Kind, Node] : Metadata)
1410 I.setMetadata(KindID: Kind, Node);
1411}
1412
1413void VPWidenCallRecipe::execute(VPTransformState &State) {
1414 assert(State.VF.isVector() && "not widening");
1415 assert(Variant != nullptr && "Can't create vector function.");
1416
1417 FunctionType *VFTy = Variant->getFunctionType();
1418 // Add return type if intrinsic is overloaded on it.
1419 SmallVector<Value *, 4> Args;
1420 for (const auto &I : enumerate(First: args())) {
1421 Value *Arg;
1422 // Some vectorized function variants may also take a scalar argument,
1423 // e.g. linear parameters for pointers. This needs to be the scalar value
1424 // from the start of the respective part when interleaving.
1425 if (!VFTy->getParamType(i: I.index())->isVectorTy())
1426 Arg = State.get(Def: I.value(), Lane: VPLane(0));
1427 else
1428 Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value()));
1429 Args.push_back(Elt: Arg);
1430 }
1431
1432 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
1433 SmallVector<OperandBundleDef, 1> OpBundles;
1434 if (CI)
1435 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
1436
1437 CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
1438 applyFlags(I&: *V);
1439 applyMetadata(I&: *V);
1440 V->setCallingConv(Variant->getCallingConv());
1441
1442 if (!V->getType()->isVoidTy())
1443 State.set(Def: this, V);
1444}
1445
1446InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1447 VPCostContext &Ctx) const {
1448 return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
1449 Tys: Variant->getFunctionType()->params(),
1450 CostKind: Ctx.CostKind);
1451}
1452
1453#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1454void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
1455 VPSlotTracker &SlotTracker) const {
1456 O << Indent << "WIDEN-CALL ";
1457
1458 Function *CalledFn = getCalledScalarFunction();
1459 if (CalledFn->getReturnType()->isVoidTy())
1460 O << "void ";
1461 else {
1462 printAsOperand(O, SlotTracker);
1463 O << " = ";
1464 }
1465
1466 O << "call";
1467 printFlags(O);
1468 O << " @" << CalledFn->getName() << "(";
1469 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1470 Op->printAsOperand(O, SlotTracker);
1471 });
1472 O << ")";
1473
1474 O << " (using library function";
1475 if (Variant->hasName())
1476 O << ": " << Variant->getName();
1477 O << ")";
1478}
1479#endif
1480
1481void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1482 assert(State.VF.isVector() && "not widening");
1483
1484 SmallVector<Type *, 2> TysForDecl;
1485 // Add return type if intrinsic is overloaded on it.
1486 if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1, TTI: State.TTI))
1487 TysForDecl.push_back(Elt: VectorType::get(ElementType: getResultType(), EC: State.VF));
1488 SmallVector<Value *, 4> Args;
1489 for (const auto &I : enumerate(First: operands())) {
1490 // Some intrinsics have a scalar argument - don't replace it with a
1491 // vector.
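// For example, the exponent operand of llvm.powi.* is a scalar integer and
// must remain scalar even when the intrinsic itself is widened.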
1492 Value *Arg;
1493 if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
1494 TTI: State.TTI))
1495 Arg = State.get(Def: I.value(), Lane: VPLane(0));
1496 else
1497 Arg = State.get(Def: I.value(), IsScalar: onlyFirstLaneUsed(Op: I.value()));
1498 if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
1499 TTI: State.TTI))
1500 TysForDecl.push_back(Elt: Arg->getType());
1501 Args.push_back(Elt: Arg);
1502 }
1503
1504 // Use vector version of the intrinsic.
1505 Module *M = State.Builder.GetInsertBlock()->getModule();
1506 Function *VectorF =
1507 Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
1508 assert(VectorF &&
1509 "Can't retrieve vector intrinsic or vector-predication intrinsic.");
1510
1511 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
1512 SmallVector<OperandBundleDef, 1> OpBundles;
1513 if (CI)
1514 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
1515
1516 CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);
1517
1518 applyFlags(I&: *V);
1519 applyMetadata(I&: *V);
1520
1521 if (!V->getType()->isVoidTy())
1522 State.set(Def: this, V);
1523}
1524
1525InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1526 VPCostContext &Ctx) const {
1527 // Some backends analyze intrinsic arguments to determine cost. Use the
1528 // underlying value for the operand if it has one. Otherwise try to use the
1529 // operand of the underlying call instruction, if there is one. Otherwise
1530 // clear Arguments.
1531 // TODO: Rework TTI interface to be independent of concrete IR values.
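// For example, a target may return a lower cost when it can prove an
// argument is a constant, so passing the actual IR operands (when they are
// available) can make the estimate more precise.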
1532 SmallVector<const Value *> Arguments;
1533 for (const auto &[Idx, Op] : enumerate(First: operands())) {
1534 auto *V = Op->getUnderlyingValue();
1535 if (!V) {
1536 if (auto *UI = dyn_cast_or_null<CallBase>(Val: getUnderlyingValue())) {
1537 Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
1538 continue;
1539 }
1540 Arguments.clear();
1541 break;
1542 }
1543 Arguments.push_back(Elt: V);
1544 }
1545
1546 Type *RetTy = toVectorizedTy(Ty: Ctx.Types.inferScalarType(V: this), EC: VF);
1547 SmallVector<Type *> ParamTys;
1548 for (unsigned I = 0; I != getNumOperands(); ++I)
1549 ParamTys.push_back(
1550 Elt: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: I)), EC: VF));
1551
1552 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1553 FastMathFlags FMF = hasFastMathFlags() ? getFastMathFlags() : FastMathFlags();
1554 IntrinsicCostAttributes CostAttrs(
1555 VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
1556 dyn_cast_or_null<IntrinsicInst>(Val: getUnderlyingValue()),
1557 InstructionCost::getInvalid(), &Ctx.TLI);
1558 return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
1559}
1560
1561StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1562 return Intrinsic::getBaseName(id: VectorIntrinsicID);
1563}
1564
1565bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
1566 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1567 return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
1568 auto [Idx, V] = X;
1569 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1570 Idx, nullptr);
1571 });
1572}
1573
1574#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1575void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent,
1576 VPSlotTracker &SlotTracker) const {
1577 O << Indent << "WIDEN-INTRINSIC ";
1578 if (ResultTy->isVoidTy()) {
1579 O << "void ";
1580 } else {
1581 printAsOperand(O, SlotTracker);
1582 O << " = ";
1583 }
1584
1585 O << "call";
1586 printFlags(O);
1587 O << getIntrinsicName() << "(";
1588
1589 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1590 Op->printAsOperand(O, SlotTracker);
1591 });
1592 O << ")";
1593}
1594#endif
1595
1596void VPHistogramRecipe::execute(VPTransformState &State) {
1597 IRBuilderBase &Builder = State.Builder;
1598
1599 Value *Address = State.get(Def: getOperand(N: 0));
1600 Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
1601 VectorType *VTy = cast<VectorType>(Val: Address->getType());
1602
1603 // The histogram intrinsic requires a mask even if the recipe doesn't;
1604 // if the mask operand was omitted then all lanes should be executed and
1605 // we just need to synthesize an all-true mask.
1606 Value *Mask = nullptr;
1607 if (VPValue *VPMask = getMask())
1608 Mask = State.get(Def: VPMask);
1609 else
1610 Mask =
1611 Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));
1612
1613 // If this is a subtract, we want to invert the increment amount. We may
1614 // add a separate intrinsic in future, but for now we'll try this.
1615 if (Opcode == Instruction::Sub)
1616 IncAmt = Builder.CreateNeg(V: IncAmt);
1617 else
1618 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1619
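// For illustration (types depend on VF and the operand types): with a fixed
// VF of 4 this emits something like
//   call void @llvm.experimental.vector.histogram.add.v4p0.i32(
//       <4 x ptr> %buckets, i32 %inc, <4 x i1> %mask)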
1620 State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
1621 Types: {VTy, IncAmt->getType()},
1622 Args: {Address, IncAmt, Mask});
1623}
1624
1625InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1626 VPCostContext &Ctx) const {
1627 // FIXME: Take the gather and scatter into account as well. For now we're
1628 // generating the same cost as the fallback path, but we'll likely
1629 // need to create a new TTI method for determining the cost, including
1630 // whether we can use base + vec-of-smaller-indices or just
1631 // vec-of-pointers.
1632 assert(VF.isVector() && "Invalid VF for histogram cost");
1633 Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1634 VPValue *IncAmt = getOperand(N: 1);
1635 Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
1636 VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);
1637
1638 // Assume that a non-constant update value (or a constant != 1) requires
1639 // a multiply, and add that into the cost.
1640 InstructionCost MulCost =
1641 Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
1642 if (IncAmt->isLiveIn()) {
1643 ConstantInt *CI = dyn_cast<ConstantInt>(Val: IncAmt->getLiveInIRValue());
1644
1645 if (CI && CI->getZExtValue() == 1)
1646 MulCost = TTI::TCC_Free;
1647 }
1648
1649 // Find the cost of the histogram operation itself.
1650 Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
1651 Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1652 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
1653 Type::getVoidTy(C&: Ctx.LLVMCtx),
1654 {PtrTy, IncTy, MaskTy});
1655
1656 // Add the costs together with the add/sub operation.
1657 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
1658 Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
1659}
1660
1661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1662void VPHistogramRecipe::print(raw_ostream &O, const Twine &Indent,
1663 VPSlotTracker &SlotTracker) const {
1664 O << Indent << "WIDEN-HISTOGRAM buckets: ";
1665 getOperand(0)->printAsOperand(O, SlotTracker);
1666
1667 if (Opcode == Instruction::Sub)
1668 O << ", dec: ";
1669 else {
1670 assert(Opcode == Instruction::Add);
1671 O << ", inc: ";
1672 }
1673 getOperand(1)->printAsOperand(O, SlotTracker);
1674
1675 if (VPValue *Mask = getMask()) {
1676 O << ", mask: ";
1677 Mask->printAsOperand(O, SlotTracker);
1678 }
1679}
1680
1681void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
1682 VPSlotTracker &SlotTracker) const {
1683 O << Indent << "WIDEN-SELECT ";
1684 printAsOperand(O, SlotTracker);
1685 O << " = select ";
1686 printFlags(O);
1687 getOperand(0)->printAsOperand(O, SlotTracker);
1688 O << ", ";
1689 getOperand(1)->printAsOperand(O, SlotTracker);
1690 O << ", ";
1691 getOperand(2)->printAsOperand(O, SlotTracker);
1692 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
1693}
1694#endif
1695
1696void VPWidenSelectRecipe::execute(VPTransformState &State) {
1697 // The condition can be loop invariant but still defined inside the
1698 // loop. This means that we can't just use the original 'cond' value.
1699 // We have to take the 'vectorized' value and pick the first lane.
1700 // Instcombine will make this a no-op.
1701 auto *InvarCond =
1702 isInvariantCond() ? State.get(Def: getCond(), Lane: VPLane(0)) : nullptr;
1703
1704 Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond());
1705 Value *Op0 = State.get(Def: getOperand(N: 1));
1706 Value *Op1 = State.get(Def: getOperand(N: 2));
1707 Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
1708 State.set(Def: this, V: Sel);
1709 if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
1710 if (isa<FPMathOperator>(Val: I))
1711 applyFlags(I&: *I);
1712 applyMetadata(I&: *I);
1713 }
1714}
1715
1716InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
1717 VPCostContext &Ctx) const {
1718 SelectInst *SI = cast<SelectInst>(Val: getUnderlyingValue());
1719 bool ScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
1720 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1721 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1722
1723 VPValue *Op0, *Op1;
1724 using namespace llvm::VPlanPatternMatch;
1725 if (!ScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1726 (match(V: this, P: m_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))) ||
1727 match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1))))) {
1728 // select x, y, false --> x & y
1729 // select x, true, y --> x | y
1730 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
1731 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);
1732
1733 SmallVector<const Value *, 2> Operands;
1734 if (all_of(Range: operands(),
1735 P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1736 Operands.append(in_start: SI->op_begin(), in_end: SI->op_end());
1737 bool IsLogicalOr = match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
1738 return Ctx.TTI.getArithmeticInstrCost(
1739 Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: VectorTy,
1740 CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
1741 }
1742
1743 Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1744 if (!ScalarCond)
1745 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
1746
1747 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1748 if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
1749 Pred = Cmp->getPredicate();
1750 return Ctx.TTI.getCmpSelInstrCost(
1751 Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
1752 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
1753}
1754
1755VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
1756 AllowReassoc = FMF.allowReassoc();
1757 NoNaNs = FMF.noNaNs();
1758 NoInfs = FMF.noInfs();
1759 NoSignedZeros = FMF.noSignedZeros();
1760 AllowReciprocal = FMF.allowReciprocal();
1761 AllowContract = FMF.allowContract();
1762 ApproxFunc = FMF.approxFunc();
1763}
1764
1765#if !defined(NDEBUG)
1766bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
1767 switch (OpType) {
1768 case OperationType::OverflowingBinOp:
1769 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
1770 Opcode == Instruction::Mul ||
1771 Opcode == VPInstruction::CanonicalIVIncrementForPart;
1772 case OperationType::DisjointOp:
1773 return Opcode == Instruction::Or;
1774 case OperationType::PossiblyExactOp:
1775 return Opcode == Instruction::AShr;
1776 case OperationType::GEPOp:
1777 return Opcode == Instruction::GetElementPtr ||
1778 Opcode == VPInstruction::PtrAdd;
1779 case OperationType::FPMathOp:
1780 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1781 Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
1782 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
1783 Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
1784 Opcode == VPInstruction::WideIVStep ||
1785 Opcode == VPInstruction::ReductionStartVector ||
1786 Opcode == VPInstruction::ComputeReductionResult;
1787 case OperationType::NonNegOp:
1788 return Opcode == Instruction::ZExt;
1790 case OperationType::Cmp:
1791 return Opcode == Instruction::ICmp;
1792 case OperationType::Other:
1793 return true;
1794 }
1795 llvm_unreachable("Unknown OperationType enum");
1796}
1797#endif
1798
1799#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1800void VPIRFlags::printFlags(raw_ostream &O) const {
1801 switch (OpType) {
1802 case OperationType::Cmp:
1803 O << " " << CmpInst::getPredicateName(getPredicate());
1804 break;
1805 case OperationType::DisjointOp:
1806 if (DisjointFlags.IsDisjoint)
1807 O << " disjoint";
1808 break;
1809 case OperationType::PossiblyExactOp:
1810 if (ExactFlags.IsExact)
1811 O << " exact";
1812 break;
1813 case OperationType::OverflowingBinOp:
1814 if (WrapFlags.HasNUW)
1815 O << " nuw";
1816 if (WrapFlags.HasNSW)
1817 O << " nsw";
1818 break;
1819 case OperationType::FPMathOp:
1820 getFastMathFlags().print(O);
1821 break;
1822 case OperationType::GEPOp:
1823 if (GEPFlags.isInBounds())
1824 O << " inbounds";
1825 else if (GEPFlags.hasNoUnsignedSignedWrap())
1826 O << " nusw";
1827 if (GEPFlags.hasNoUnsignedWrap())
1828 O << " nuw";
1829 break;
1830 case OperationType::NonNegOp:
1831 if (NonNegFlags.NonNeg)
1832 O << " nneg";
1833 break;
1834 case OperationType::Other:
1835 break;
1836 }
1837 O << " ";
1838}
1839#endif
1840
1841void VPWidenRecipe::execute(VPTransformState &State) {
1842 auto &Builder = State.Builder;
1843 switch (Opcode) {
1844 case Instruction::Call:
1845 case Instruction::Br:
1846 case Instruction::PHI:
1847 case Instruction::GetElementPtr:
1848 case Instruction::Select:
1849 llvm_unreachable("This instruction is handled by a different recipe.");
1850 case Instruction::UDiv:
1851 case Instruction::SDiv:
1852 case Instruction::SRem:
1853 case Instruction::URem:
1854 case Instruction::Add:
1855 case Instruction::FAdd:
1856 case Instruction::Sub:
1857 case Instruction::FSub:
1858 case Instruction::FNeg:
1859 case Instruction::Mul:
1860 case Instruction::FMul:
1861 case Instruction::FDiv:
1862 case Instruction::FRem:
1863 case Instruction::Shl:
1864 case Instruction::LShr:
1865 case Instruction::AShr:
1866 case Instruction::And:
1867 case Instruction::Or:
1868 case Instruction::Xor: {
1869 // Just widen unops and binops.
1870 SmallVector<Value *, 2> Ops;
1871 for (VPValue *VPOp : operands())
1872 Ops.push_back(Elt: State.get(Def: VPOp));
1873
1874 Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);
1875
1876 if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
1877 applyFlags(I&: *VecOp);
1878 applyMetadata(I&: *VecOp);
1879 }
1880
1881 // Use this vector value for all users of the original instruction.
1882 State.set(Def: this, V);
1883 break;
1884 }
1885 case Instruction::ExtractValue: {
1886 assert(getNumOperands() == 2 && "expected single level extractvalue");
1887 Value *Op = State.get(Def: getOperand(N: 0));
1888 auto *CI = cast<ConstantInt>(Val: getOperand(N: 1)->getLiveInIRValue());
1889 Value *Extract = Builder.CreateExtractValue(Agg: Op, Idxs: CI->getZExtValue());
1890 State.set(Def: this, V: Extract);
1891 break;
1892 }
1893 case Instruction::Freeze: {
1894 Value *Op = State.get(Def: getOperand(N: 0));
1895 Value *Freeze = Builder.CreateFreeze(V: Op);
1896 State.set(Def: this, V: Freeze);
1897 break;
1898 }
1899 case Instruction::ICmp:
1900 case Instruction::FCmp: {
1901 // Widen compares. Generate vector compares.
1902 bool FCmp = Opcode == Instruction::FCmp;
1903 Value *A = State.get(Def: getOperand(N: 0));
1904 Value *B = State.get(Def: getOperand(N: 1));
1905 Value *C = nullptr;
1906 if (FCmp) {
1907 // Propagate fast math flags.
1908 C = Builder.CreateFCmpFMF(
1909 P: getPredicate(), LHS: A, RHS: B,
1910 FMFSource: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
1911 } else {
1912 C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
1913 }
1914 if (auto *I = dyn_cast<Instruction>(Val: C))
1915 applyMetadata(I&: *I);
1916 State.set(Def: this, V: C);
1917 break;
1918 }
1919 default:
1920 // This instruction is not vectorized by simple widening.
1921 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
1922 << Instruction::getOpcodeName(Opcode));
1923 llvm_unreachable("Unhandled instruction!");
1924 } // end of switch.
1925
1926#if !defined(NDEBUG)
1927 // Verify that VPlan type inference results agree with the type of the
1928 // generated values.
1929 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
1930 State.get(this)->getType() &&
1931 "inferred type and type from generated instructions do not match");
1932#endif
1933}
1934
1935InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1936 VPCostContext &Ctx) const {
1937 switch (Opcode) {
1938 case Instruction::FNeg: {
1939 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1940 return Ctx.TTI.getArithmeticInstrCost(
1941 Opcode, Ty: VectorTy, CostKind: Ctx.CostKind,
1942 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1943 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None});
1944 }
1945
1946 case Instruction::UDiv:
1947 case Instruction::SDiv:
1948 case Instruction::SRem:
1949 case Instruction::URem:
1950 // More complex computation, let the legacy cost-model handle this for now.
1951 return Ctx.getLegacyCost(UI: cast<Instruction>(Val: getUnderlyingValue()), VF);
1952 case Instruction::Add:
1953 case Instruction::FAdd:
1954 case Instruction::Sub:
1955 case Instruction::FSub:
1956 case Instruction::Mul:
1957 case Instruction::FMul:
1958 case Instruction::FDiv:
1959 case Instruction::FRem:
1960 case Instruction::Shl:
1961 case Instruction::LShr:
1962 case Instruction::AShr:
1963 case Instruction::And:
1964 case Instruction::Or:
1965 case Instruction::Xor: {
1966 VPValue *RHS = getOperand(N: 1);
1967 // Certain instructions can be cheaper to vectorize if they have a constant
1968 // second vector operand. One example of this are shifts on x86.
1969 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS);
1970
1971 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1972 getOperand(N: 1)->isDefinedOutsideLoopRegions())
1973 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1974 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1975 Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
1976
1977 SmallVector<const Value *, 4> Operands;
1978 if (CtxI)
1979 Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
1980 return Ctx.TTI.getArithmeticInstrCost(
1981 Opcode, Ty: VectorTy, CostKind: Ctx.CostKind,
1982 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1983 Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
1984 }
1985 case Instruction::Freeze: {
1986 // The cost of a freeze is unknown. Assume that it is the same as 'mul'.
1987 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1988 return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy,
1989 CostKind: Ctx.CostKind);
1990 }
1991 case Instruction::ExtractValue: {
1992 return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
1993 CostKind: Ctx.CostKind);
1994 }
1995 case Instruction::ICmp:
1996 case Instruction::FCmp: {
1997 Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
1998 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1999 return Ctx.TTI.getCmpSelInstrCost(
2000 Opcode, ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy), VecPred: getPredicate(),
2001 CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
2002 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
2003 }
2004 default:
2005 llvm_unreachable("Unsupported opcode for instruction");
2006 }
2007}
2008
2009#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2010void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
2011 VPSlotTracker &SlotTracker) const {
2012 O << Indent << "WIDEN ";
2013 printAsOperand(O, SlotTracker);
2014 O << " = " << Instruction::getOpcodeName(Opcode);
2015 printFlags(O);
2016 printOperands(O, SlotTracker);
2017}
2018#endif
2019
2020void VPWidenCastRecipe::execute(VPTransformState &State) {
2021 auto &Builder = State.Builder;
2022 // Vectorize casts.
2023 assert(State.VF.isVector() && "Not vectorizing?");
2024 Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
2025 VPValue *Op = getOperand(N: 0);
2026 Value *A = State.get(Def: Op);
2027 Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
2028 State.set(Def: this, V: Cast);
2029 if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
2030 applyFlags(I&: *CastOp);
2031 applyMetadata(I&: *CastOp);
2032 }
2033}
2034
2035InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2036 VPCostContext &Ctx) const {
2037 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2038 // the legacy cost model, including truncates/extends when evaluating a
2039 // reduction in a smaller type.
2040 if (!getUnderlyingValue())
2041 return 0;
2042 // Computes the CastContextHint from a recipe that may access memory.
2043 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
2044 if (VF.isScalar())
2045 return TTI::CastContextHint::Normal;
2046 if (isa<VPInterleaveRecipe>(Val: R))
2047 return TTI::CastContextHint::Interleave;
2048 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R))
2049 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
2050 : TTI::CastContextHint::Normal;
2051 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
2052 if (WidenMemoryRecipe == nullptr)
2053 return TTI::CastContextHint::None;
2054 if (!WidenMemoryRecipe->isConsecutive())
2055 return TTI::CastContextHint::GatherScatter;
2056 if (WidenMemoryRecipe->isReverse())
2057 return TTI::CastContextHint::Reversed;
2058 if (WidenMemoryRecipe->isMasked())
2059 return TTI::CastContextHint::Masked;
2060 return TTI::CastContextHint::Normal;
2061 };
2062
2063 VPValue *Operand = getOperand(N: 0);
2064 TTI::CastContextHint CCH = TTI::CastContextHint::None;
2065 // For Trunc/FPTrunc, get the context from the only user.
2066 if ((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
2067 !hasMoreThanOneUniqueUser() && getNumUsers() > 0) {
2068 if (auto *StoreRecipe = dyn_cast<VPRecipeBase>(Val: *user_begin()))
2069 CCH = ComputeCCH(StoreRecipe);
2070 }
2071 // For Z/Sext, get the context from the operand.
2072 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
2073 Opcode == Instruction::FPExt) {
2074 if (Operand->isLiveIn())
2075 CCH = TTI::CastContextHint::Normal;
2076 else if (Operand->getDefiningRecipe())
2077 CCH = ComputeCCH(Operand->getDefiningRecipe());
2078 }
2079
2080 auto *SrcTy =
2081 cast<VectorType>(Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Operand), EC: VF));
2082 auto *DestTy = cast<VectorType>(Val: toVectorTy(Scalar: getResultType(), EC: VF));
2083 // Arm TTI will use the underlying instruction to determine the cost.
2084 return Ctx.TTI.getCastInstrCost(
2085 Opcode, Dst: DestTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
2086 I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
2087}
2088
2089#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2090void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
2091 VPSlotTracker &SlotTracker) const {
2092 O << Indent << "WIDEN-CAST ";
2093 printAsOperand(O, SlotTracker);
2094 O << " = " << Instruction::getOpcodeName(Opcode);
2095 printFlags(O);
2096 printOperands(O, SlotTracker);
2097 O << " to " << *getResultType();
2098}
2099#endif
2100
2101InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
2102 VPCostContext &Ctx) const {
2103 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
2104}
2105
2106/// A helper function that returns an integer or floating-point constant with
2107/// value C.
2108static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
2109 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C)
2110 : ConstantFP::get(Ty, V: C);
2111}
2112
2113#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2114void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
2115 VPSlotTracker &SlotTracker) const {
2116 O << Indent;
2117 printAsOperand(O, SlotTracker);
2118 O << " = WIDEN-INDUCTION ";
2119 printOperands(O, SlotTracker);
2120
2121 if (auto *TI = getTruncInst())
2122 O << " (truncated to " << *TI->getType() << ")";
2123}
2124#endif
2125
2126bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2127 // The step may be defined by a recipe in the preheader (e.g. if it requires
2128 // SCEV expansion), but for the canonical induction the step is required to be
2129 // 1, which is represented as a live-in.
2130 if (getStepValue()->getDefiningRecipe())
2131 return false;
2132 auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue());
2133 auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue());
2134 auto *CanIV = cast<VPCanonicalIVPHIRecipe>(Val: &*getParent()->begin());
2135 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2136 getScalarType() == CanIV->getScalarType();
2137}
2138
2139#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2140void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
2141 VPSlotTracker &SlotTracker) const {
2142 O << Indent;
2143 printAsOperand(O, SlotTracker);
2144 O << " = DERIVED-IV ";
2145 getStartValue()->printAsOperand(O, SlotTracker);
2146 O << " + ";
2147 getOperand(1)->printAsOperand(O, SlotTracker);
2148 O << " * ";
2149 getStepValue()->printAsOperand(O, SlotTracker);
2150}
2151#endif
2152
2153void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
2154 // Fast-math-flags propagate from the original induction instruction.
2155 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2156 if (hasFastMathFlags())
2157 State.Builder.setFastMathFlags(getFastMathFlags());
2158
2159 // Compute the scalar induction steps. BaseIV is the scalar induction
2160 // variable on which to base the steps, and Step is the size of each step.
2161
2162 Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
2163 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
2164 IRBuilderBase &Builder = State.Builder;
2165
2166 // Ensure step has the same type as that of scalar IV.
2167 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2168 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2169
2170 // We build scalar steps for both integer and floating-point induction
2171 // variables. Here, we determine the kind of arithmetic we will perform.
2172 Instruction::BinaryOps AddOp;
2173 Instruction::BinaryOps MulOp;
2174 if (BaseIVTy->isIntegerTy()) {
2175 AddOp = Instruction::Add;
2176 MulOp = Instruction::Mul;
2177 } else {
2178 AddOp = InductionOpcode;
2179 MulOp = Instruction::FMul;
2180 }
2181
2182 // Determine the number of scalars we need to generate for each unroll
2183 // iteration.
2184 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
2185 // Compute the scalar steps and save the results in State.
2186 Type *IntStepTy =
2187 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
2188 Type *VecIVTy = nullptr;
2189 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2190 if (!FirstLaneOnly && State.VF.isScalable()) {
2191 VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF);
2192 UnitStepVec =
2193 Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF));
2194 SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step);
2195 SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV);
2196 }
2197
2198 unsigned StartLane = 0;
2199 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2200 if (State.Lane) {
2201 StartLane = State.Lane->getKnownLane();
2202 EndLane = StartLane + 1;
2203 }
2204 Value *StartIdx0;
2205 if (getUnrollPart(U&: *this) == 0)
2206 StartIdx0 = ConstantInt::get(Ty: IntStepTy, V: 0);
2207 else {
2208 StartIdx0 = State.get(Def: getOperand(N: 2), IsScalar: true);
2209 if (getUnrollPart(U&: *this) != 1) {
2210 StartIdx0 =
2211 Builder.CreateMul(LHS: StartIdx0, RHS: ConstantInt::get(Ty: StartIdx0->getType(),
2212 V: getUnrollPart(U&: *this)));
2213 }
2214 StartIdx0 = Builder.CreateSExtOrTrunc(V: StartIdx0, DestTy: IntStepTy);
2215 }
2216
2217 if (!FirstLaneOnly && State.VF.isScalable()) {
2218 auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0);
2219 auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec);
2220 if (BaseIVTy->isFloatingPointTy())
2221 InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy);
2222 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep);
2223 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul);
2224 State.set(Def: this, V: Add);
2225 // It's useful to record the lane values too for the known minimum number
2226 // of elements so we do those below. This improves the code quality when
2227 // trying to extract the first element, for example.
2228 }
2229
2230 if (BaseIVTy->isFloatingPointTy())
2231 StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy);
2232
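// Each lane L receives BaseIV + (StartIdx0 + L) * Step. For example (values
// made up), with unroll part 0 and Step == 4 the first four lanes evaluate to
// BaseIV, BaseIV + 4, BaseIV + 8 and BaseIV + 12.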
2233 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2234 Value *StartIdx = Builder.CreateBinOp(
2235 Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane));
2236 // The step returned by `createStepForVF` is a runtime-evaluated value
2237 // when VF is scalable. Otherwise, it should be folded into a Constant.
2238 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2239 "Expected StartIdx to be folded to a constant when VF is not "
2240 "scalable");
2241 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
2242 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
2243 State.set(Def: this, V: Add, Lane: VPLane(Lane));
2244 }
2245}
2246
2247#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2248void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
2249 VPSlotTracker &SlotTracker) const {
2250 O << Indent;
2251 printAsOperand(O, SlotTracker);
2252 O << " = SCALAR-STEPS ";
2253 printOperands(O, SlotTracker);
2254}
2255#endif
2256
2257void VPWidenGEPRecipe::execute(VPTransformState &State) {
2258 assert(State.VF.isVector() && "not widening");
2259 auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr());
2260 // Construct a vector GEP by widening the operands of the scalar GEP as
2261 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2262 // results in a vector of pointers when at least one operand of the GEP
2263 // is vector-typed. Thus, to keep the representation compact, we only use
2264 // vector-typed operands for loop-varying values.
2265
2266 if (areAllOperandsInvariant()) {
2267 // If we are vectorizing, but the GEP has only loop-invariant operands,
2268 // the GEP we build (by only using vector-typed operands for
2269 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2270 // produce a vector of pointers, we need to either arbitrarily pick an
2271 // operand to broadcast, or broadcast a clone of the original GEP.
2272 // Here, we broadcast a clone of the original.
2273 //
2274 // TODO: If at some point we decide to scalarize instructions having
2275 // loop-invariant operands, this special case will no longer be
2276 // required. We would add the scalarization decision to
2277 // collectLoopScalars() and teach getVectorValue() to broadcast
2278 // the lane-zero scalar value.
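// For example (illustrative IR), for a GEP whose base and indices are all
// loop-invariant we emit the scalar
//   %gep = getelementptr i32, ptr %base, i64 %idx
// once and then splat %gep to obtain a <VF x ptr> vector of identical
// pointers.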
2279 SmallVector<Value *> Ops;
2280 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2281 Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));
2282
2283 auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
2284 IdxList: ArrayRef(Ops).drop_front(), Name: "",
2285 NW: getGEPNoWrapFlags());
2286 Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
2287 State.set(Def: this, V: Splat);
2288 } else {
2289 // If the GEP has at least one loop-varying operand, we are sure to
2290 // produce a vector of pointers unless VF is scalar.
2291 // The pointer operand of the new GEP. If it's loop-invariant, we
2292 // won't broadcast it.
2293 auto *Ptr = isPointerLoopInvariant() ? State.get(Def: getOperand(N: 0), Lane: VPLane(0))
2294 : State.get(Def: getOperand(N: 0));
2295
2296 // Collect all the indices for the new GEP. If any index is
2297 // loop-invariant, we won't broadcast it.
2298 SmallVector<Value *, 4> Indices;
2299 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2300 VPValue *Operand = getOperand(N: I);
2301 if (isIndexLoopInvariant(I: I - 1))
2302 Indices.push_back(Elt: State.get(Def: Operand, Lane: VPLane(0)));
2303 else
2304 Indices.push_back(Elt: State.get(Def: Operand));
2305 }
2306
2307 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2308 // but otherwise it should be a vector.
2309 auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr,
2310 IdxList: Indices, Name: "", NW: getGEPNoWrapFlags());
2311 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2312 "NewGEP is not a pointer vector");
2313 State.set(Def: this, V: NewGEP);
2314 }
2315}
2316
2317#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2318void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
2319 VPSlotTracker &SlotTracker) const {
2320 O << Indent << "WIDEN-GEP ";
2321 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2322 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2323 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2324
2325 O << " ";
2326 printAsOperand(O, SlotTracker);
2327 O << " = getelementptr";
2328 printFlags(O);
2329 printOperands(O, SlotTracker);
2330}
2331#endif
2332
2333static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
2334 unsigned CurrentPart, IRBuilderBase &Builder) {
2335 // Use i32 for the gep index type when the value is constant,
2336 // or query DataLayout for a more suitable index type otherwise.
2337 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2338 return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
2339 ? DL.getIndexType(PtrTy: Builder.getPtrTy(AddrSpace: 0))
2340 : Builder.getInt32Ty();
2341}
2342
2343void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
2344 auto &Builder = State.Builder;
2345 unsigned CurrentPart = getUnrollPart(U&: *this);
2346 bool IsUnitStride = Stride == 1 || Stride == -1;
2347 Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ true,
2348 IsUnitStride, CurrentPart, Builder);
2349
2350 // The wide store needs to start at the last vector element.
2351 Value *RunTimeVF = State.get(Def: getVFValue(), Lane: VPLane(0));
2352 if (IndexTy != RunTimeVF->getType())
2353 RunTimeVF = Builder.CreateZExtOrTrunc(V: RunTimeVF, DestTy: IndexTy);
2354 // NumElt = Stride * CurrentPart * RunTimeVF
2355 Value *NumElt = Builder.CreateMul(
2356 LHS: ConstantInt::get(Ty: IndexTy, V: Stride * (int64_t)CurrentPart), RHS: RunTimeVF);
2357 // LastLane = Stride * (RunTimeVF - 1)
2358 Value *LastLane = Builder.CreateSub(LHS: RunTimeVF, RHS: ConstantInt::get(Ty: IndexTy, V: 1));
2359 if (Stride != 1)
2360 LastLane = Builder.CreateMul(LHS: ConstantInt::get(Ty: IndexTy, V: Stride), RHS: LastLane);
2361 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
2362 Value *ResultPtr =
2363 Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "", NW: getGEPNoWrapFlags());
2364 ResultPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: ResultPtr, IdxList: LastLane, Name: "",
2365 NW: getGEPNoWrapFlags());
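// For example, for a reversed unit-stride access (Stride == -1) and part 0
// this computes Ptr - (RunTimeVF - 1), i.e. the lowest address touched by the
// reversed wide access.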
2366
2367 State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
2368}
2369
2370#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2371void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2372 VPSlotTracker &SlotTracker) const {
2373 O << Indent;
2374 printAsOperand(O, SlotTracker);
2375 O << " = vector-end-pointer";
2376 printFlags(O);
2377 printOperands(O, SlotTracker);
2378}
2379#endif
2380
2381void VPVectorPointerRecipe::execute(VPTransformState &State) {
2382 auto &Builder = State.Builder;
2383 unsigned CurrentPart = getUnrollPart(U&: *this);
2384 Type *IndexTy = getGEPIndexTy(IsScalable: State.VF.isScalable(), /*IsReverse*/ false,
2385 /*IsUnitStride*/ true, CurrentPart, Builder);
2386 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
2387
2388 Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: CurrentPart);
2389 Value *ResultPtr =
2390 Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "", NW: getGEPNoWrapFlags());
2391
2392 State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
2393}
2394
2395#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2396void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
2397 VPSlotTracker &SlotTracker) const {
2398 O << Indent;
2399 printAsOperand(O, SlotTracker);
2400 O << " = vector-pointer ";
2401
2402 printOperands(O, SlotTracker);
2403}
2404#endif
2405
2406void VPBlendRecipe::execute(VPTransformState &State) {
2407 assert(isNormalized() && "Expected blend to be normalized!");
2408 // We know that all PHIs in non-header blocks are converted into
2409 // selects, so we don't have to worry about the insertion order and we
2410 // can just use the builder.
2411 // At this point we generate the predication tree. There may be
2412 // duplications since this is a simple recursive scan, but future
2413 // optimizations will clean it up.
2414
2415 unsigned NumIncoming = getNumIncomingValues();
2416
2417 // Generate a sequence of selects of the form:
2418 // SELECT(Mask3, In3,
2419 // SELECT(Mask2, In2,
2420 // SELECT(Mask1, In1,
2421 // In0)))
2422 // Note that Mask0 is never used: lanes for which no path reaches this phi
2423 // (and which are essentially undef) take their value from In0.
2424 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
2425 Value *Result = nullptr;
2426 for (unsigned In = 0; In < NumIncoming; ++In) {
2427 // We might have single edge PHIs (blocks) - use an identity
2428 // 'select' for the first PHI operand.
2429 Value *In0 = State.get(Def: getIncomingValue(Idx: In), IsScalar: OnlyFirstLaneUsed);
2430 if (In == 0)
2431 Result = In0; // Initialize with the first incoming value.
2432 else {
2433 // Select between the current value and the previous incoming edge
2434 // based on the incoming mask.
2435 Value *Cond = State.get(Def: getMask(Idx: In), IsScalar: OnlyFirstLaneUsed);
2436 Result = State.Builder.CreateSelect(C: Cond, True: In0, False: Result, Name: "predphi");
2437 }
2438 }
2439 State.set(Def: this, V: Result, IsScalar: OnlyFirstLaneUsed);
2440}
2441
2442InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2443 VPCostContext &Ctx) const {
2444 // When only the first lane is used, cost the blend the same way the legacy
2445 // cost model does.
2446 if (vputils::onlyFirstLaneUsed(Def: this))
2447 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
2448
2449 Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
2450 Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF);
2451 return (getNumIncomingValues() - 1) *
2452 Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
2453 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
2454}
2455
2456#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2457void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
2458 VPSlotTracker &SlotTracker) const {
2459 O << Indent << "BLEND ";
2460 printAsOperand(O, SlotTracker);
2461 O << " =";
2462 if (getNumIncomingValues() == 1) {
2463 // Not a User of any mask: not really blending, this is a
2464 // single-predecessor phi.
2465 O << " ";
2466 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2467 } else {
2468 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2469 O << " ";
2470 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2471 if (I == 0)
2472 continue;
2473 O << "/";
2474 getMask(I)->printAsOperand(O, SlotTracker);
2475 }
2476 }
2477}
2478#endif
2479
2480void VPReductionRecipe::execute(VPTransformState &State) {
2481 assert(!State.Lane && "Reduction being replicated.");
2482 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2483 RecurKind Kind = getRecurrenceKind();
2484 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2485 "In-loop AnyOf reductions aren't currently supported");
2486 // Propagate the fast-math flags carried by the underlying instruction.
2487 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2488 State.Builder.setFastMathFlags(getFastMathFlags());
2489 Value *NewVecOp = State.get(Def: getVecOp());
2490 if (VPValue *Cond = getCondOp()) {
2491 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
2492 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
2493 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2494
2495 Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
2496 if (State.VF.isVector())
2497 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
2498
2499 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
2500 NewVecOp = Select;
2501 }
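// For example, for a conditional integer add reduction the masked-off lanes
// are replaced with the identity value 0, so the reduction below leaves the
// running value unchanged for those lanes.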
2502 Value *NewRed;
2503 Value *NextInChain;
2504 if (IsOrdered) {
2505 if (State.VF.isVector())
2506 NewRed =
2507 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
2508 else
2509 NewRed = State.Builder.CreateBinOp(
2510 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2511 LHS: PrevInChain, RHS: NewVecOp);
2512 PrevInChain = NewRed;
2513 NextInChain = NewRed;
2514 } else {
2515 PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2516 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
2517 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2518 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
2519 else
2520 NextInChain = State.Builder.CreateBinOp(
2521 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
2522 RHS: PrevInChain);
2523 }
2524 State.set(Def: this, V: NextInChain, /*IsScalar*/ true);
2525}
2526
2527void VPReductionEVLRecipe::execute(VPTransformState &State) {
2528 assert(!State.Lane && "Reduction being replicated.");
2529
2530 auto &Builder = State.Builder;
2531 // Propagate the fast-math flags carried by the underlying instruction.
2532 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2533 Builder.setFastMathFlags(getFastMathFlags());
2534
2535 RecurKind Kind = getRecurrenceKind();
2536 Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
2537 Value *VecOp = State.get(Def: getVecOp());
2538 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
2539
2540 Value *Mask;
2541 if (VPValue *CondOp = getCondOp())
2542 Mask = State.get(Def: CondOp);
2543 else
2544 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
2545
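// Both reduction forms below take the mask and the explicit vector length,
// e.g. (illustrative types)
//   call i32 @llvm.vp.reduce.add.nxv4i32(i32 %start, <vscale x 4 x i32> %v,
//                                        <vscale x 4 x i1> %mask, i32 %evl)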
2546 Value *NewRed;
2547 if (isOrdered()) {
2548 NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
2549 } else {
2550 NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
2551 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2552 NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
2553 else
2554 NewRed = Builder.CreateBinOp(
2555 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
2556 RHS: Prev);
2557 }
2558 State.set(Def: this, V: NewRed, /*IsScalar*/ true);
2559}
2560
2561InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2562 VPCostContext &Ctx) const {
2563 RecurKind RdxKind = getRecurrenceKind();
2564 Type *ElementTy = Ctx.Types.inferScalarType(V: this);
2565 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
2566 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
2567 FastMathFlags FMFs = getFastMathFlags();
2568 std::optional<FastMathFlags> OptionalFMF =
2569 ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;
2570
2571 // TODO: Support any-of reductions.
2572 assert(
2573 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2574 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2575 "Any-of reduction not implemented in VPlan-based cost model currently.");
2576
2577 // Note that TTI should model the cost of moving the result to a scalar
2578 // register and the BinOp cost in getMinMaxReductionCost().
2579 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
2580 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
2581 return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
2582 }
2583
2584 // Note that TTI should model the cost of moving the result to a scalar
2585 // register and the BinOp cost in getArithmeticReductionCost().
2586 return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
2587 CostKind: Ctx.CostKind);
2588}
2589
2590VPExpressionRecipe::VPExpressionRecipe(
2591 ExpressionTypes ExpressionType,
2592 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2593 : VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2594 ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
2595 ExpressionRecipes.begin(), ExpressionRecipes.end())
2596 .takeVector()),
2597 ExpressionType(ExpressionType) {
2598 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2599 assert(
2600 none_of(ExpressionRecipes,
2601 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2602 "expression cannot contain recipes with side-effects");
2603
2604 // Maintain a copy of the expression recipes as a set of users.
2605 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2606 for (auto *R : ExpressionRecipes)
2607 ExpressionRecipesAsSetOfUsers.insert(Ptr: R);
2608
2609 // Recipes in the expression, except the last one, must only be used by
2610 // (other) recipes inside the expression. If there are other users, external
2611 // to the expression, use a clone of the recipe for external users.
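// For example, if the extend feeding an extended reduction is also consumed
// by a recipe outside the expression (say a widened store), that outside user
// is redirected to a clone of the extend while the original is folded into
// this expression.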
2612 for (VPSingleDefRecipe *R : ExpressionRecipes) {
2613 if (R != ExpressionRecipes.back() &&
2614 any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2615 return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
2616 })) {
2617 // There are users outside of the expression. Clone the recipe and use the
2618 // clone for those external users.
2619 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2620 R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
2621 VPUser &U, unsigned) {
2622 return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
2623 });
2624 CopyForExtUsers->insertBefore(InsertPos: R);
2625 }
2626 if (R->getParent())
2627 R->removeFromParent();
2628 }
2629
2630 // Internalize all external operands to the expression recipes. To do so,
2631 // create new temporary VPValues for all operands defined by a recipe outside
2632 // the expression. The original operands are added as operands of the
2633 // VPExpressionRecipe itself.
2634 for (auto *R : ExpressionRecipes) {
2635 for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
2636 auto *Def = Op->getDefiningRecipe();
2637 if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
2638 continue;
2639 addOperand(Operand: Op);
2640 LiveInPlaceholders.push_back(Elt: new VPValue());
2641 R->setOperand(I: Idx, New: LiveInPlaceholders.back());
2642 }
2643 }
2644}
2645
2646void VPExpressionRecipe::decompose() {
2647 for (auto *R : ExpressionRecipes)
2648 R->insertBefore(InsertPos: this);
2649
2650 for (const auto &[Idx, Op] : enumerate(First: operands()))
2651 LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);
2652
2653 replaceAllUsesWith(New: ExpressionRecipes.back());
2654 ExpressionRecipes.clear();
2655}
2656
2657InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
2658 VPCostContext &Ctx) const {
2659 Type *RedTy = Ctx.Types.inferScalarType(V: this);
2660 auto *SrcVecTy = cast<VectorType>(
2661 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
2662 assert(RedTy->isIntegerTy() &&
2663 "VPExpressionRecipe only supports integer types currently.");
2664 switch (ExpressionType) {
2665 case ExpressionTypes::ExtendedReduction: {
2666 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2667 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind());
2668 return Ctx.TTI.getExtendedReductionCost(
2669 Opcode,
2670 IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
2671 Instruction::ZExt,
2672 ResTy: RedTy, Ty: SrcVecTy, FMF: std::nullopt, CostKind: Ctx.CostKind);
2673 }
2674 case ExpressionTypes::MulAccReduction:
2675 return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
2676
2677 case ExpressionTypes::ExtMulAccReduction:
2678 return Ctx.TTI.getMulAccReductionCost(
2679 IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
2680 Instruction::ZExt,
2681 ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
2682 }
2683 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
2684}
2685
2686bool VPExpressionRecipe::mayReadOrWriteMemory() const {
2687 return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
2688 return R->mayReadFromMemory() || R->mayWriteToMemory();
2689 });
2690}
2691
2692bool VPExpressionRecipe::mayHaveSideEffects() const {
2693 assert(
2694 none_of(ExpressionRecipes,
2695 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2696 "expression cannot contain recipes with side-effects");
2697 return false;
2698}
2699
2700#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2701
2702void VPExpressionRecipe::print(raw_ostream &O, const Twine &Indent,
2703 VPSlotTracker &SlotTracker) const {
2704 O << Indent << "EXPRESSION ";
2705 printAsOperand(O, SlotTracker);
2706 O << " = ";
2707 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
2708 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
2709
2710 switch (ExpressionType) {
2711 case ExpressionTypes::ExtendedReduction: {
2712 getOperand(1)->printAsOperand(O, SlotTracker);
2713 O << " +";
2714 O << " reduce." << Instruction::getOpcodeName(Opcode) << " (";
2715 getOperand(0)->printAsOperand(O, SlotTracker);
2716 Red->printFlags(O);
2717
2718 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2719 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2720 << *Ext0->getResultType();
2721 if (Red->isConditional()) {
2722 O << ", ";
2723 Red->getCondOp()->printAsOperand(O, SlotTracker);
2724 }
2725 O << ")";
2726 break;
2727 }
2728 case ExpressionTypes::MulAccReduction:
2729 case ExpressionTypes::ExtMulAccReduction: {
2730 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
2731 O << " + ";
2732 O << "reduce."
2733 << Instruction::getOpcodeName(
2734 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
2735 << " (";
2736 O << "mul";
2737 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
2738 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
2739 : ExpressionRecipes[0]);
2740 Mul->printFlags(O);
2741 if (IsExtended)
2742 O << "(";
2743 getOperand(0)->printAsOperand(O, SlotTracker);
2744 if (IsExtended) {
2745 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
2746 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
2747 << *Ext0->getResultType() << "), (";
2748 } else {
2749 O << ", ";
2750 }
2751 getOperand(1)->printAsOperand(O, SlotTracker);
2752 if (IsExtended) {
2753 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
2754 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
2755 << *Ext1->getResultType() << ")";
2756 }
2757 if (Red->isConditional()) {
2758 O << ", ";
2759 Red->getCondOp()->printAsOperand(O, SlotTracker);
2760 }
2761 O << ")";
2762 break;
2763 }
2764 }
2765}
2766
2767void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
2768 VPSlotTracker &SlotTracker) const {
2769 O << Indent << "REDUCE ";
2770 printAsOperand(O, SlotTracker);
2771 O << " = ";
2772 getChainOp()->printAsOperand(O, SlotTracker);
2773 O << " +";
2774 printFlags(O);
2775 O << " reduce."
2776 << Instruction::getOpcodeName(
2777 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
2778 << " (";
2779 getVecOp()->printAsOperand(O, SlotTracker);
2780 if (isConditional()) {
2781 O << ", ";
2782 getCondOp()->printAsOperand(O, SlotTracker);
2783 }
2784 O << ")";
2785}
2786
2787void VPReductionEVLRecipe::print(raw_ostream &O, const Twine &Indent,
2788 VPSlotTracker &SlotTracker) const {
2789 O << Indent << "REDUCE ";
2790 printAsOperand(O, SlotTracker);
2791 O << " = ";
2792 getChainOp()->printAsOperand(O, SlotTracker);
2793 O << " +";
2794 printFlags(O);
2795 O << " vp.reduce."
2796 << Instruction::getOpcodeName(
2797 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
2798 << " (";
2799 getVecOp()->printAsOperand(O, SlotTracker);
2800 O << ", ";
2801 getEVL()->printAsOperand(O, SlotTracker);
2802 if (isConditional()) {
2803 O << ", ";
2804 getCondOp()->printAsOperand(O, SlotTracker);
2805 }
2806 O << ")";
2807}
2808
2809#endif
2810
2811/// A helper function to scalarize a single Instruction in the innermost loop.
2812/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
2813/// operands from \p RepRecipe instead of \p Instr's operands.
2814static void scalarizeInstruction(const Instruction *Instr,
2815 VPReplicateRecipe *RepRecipe,
2816 const VPLane &Lane, VPTransformState &State) {
2817 assert((!Instr->getType()->isAggregateType() ||
2818 canVectorizeTy(Instr->getType())) &&
2819 "Expected vectorizable or non-aggregate type.");
2820
2821 // Does this instruction return a value?
2822 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2823
2824 Instruction *Cloned = Instr->clone();
2825 if (!IsVoidRetTy) {
2826 Cloned->setName(Instr->getName() + ".cloned");
2827#if !defined(NDEBUG)
2828 // Verify that VPlan type inference results agree with the type of the
2829 // generated values.
2830 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2831 "inferred type and type from generated instructions do not match");
2832#endif
2833 }
2834
2835 RepRecipe->applyFlags(I&: *Cloned);
2836 RepRecipe->applyMetadata(I&: *Cloned);
2837
2838 if (auto DL = RepRecipe->getDebugLoc())
2839 State.setDebugLocFrom(DL);
2840
2841 // Replace the operands of the cloned instructions with their scalar
2842 // equivalents in the new loop.
2843 for (const auto &I : enumerate(First: RepRecipe->operands())) {
2844 auto InputLane = Lane;
2845 VPValue *Operand = I.value();
2846 if (vputils::isSingleScalar(VPV: Operand))
2847 InputLane = VPLane::getFirstLane();
2848 Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
2849 }
2850
2851 // Place the cloned scalar in the new loop.
2852 State.Builder.Insert(I: Cloned);
2853
2854 State.set(Def: RepRecipe, V: Cloned, Lane);
2855
  // If we just cloned a new assumption, add it to the assumption cache.
2857 if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
2858 State.AC->registerAssumption(CI: II);
2859
2860 assert(
2861 (RepRecipe->getParent()->getParent() ||
2862 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
2863 all_of(RepRecipe->operands(),
             [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected the recipe to be within a region or to have all of its "
      "operands defined outside the vectorized region.");
2867}
2868
2869void VPReplicateRecipe::execute(VPTransformState &State) {
2870 Instruction *UI = getUnderlyingInstr();
2871
2872 if (!State.Lane) {
2873 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
2874 "must have already been unrolled");
2875 scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
2876 return;
2877 }
2878
2879 assert((State.VF.isScalar() || !isSingleScalar()) &&
2880 "uniform recipe shouldn't be predicated");
2881 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
2882 scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
2883 // Insert scalar instance packing it into a vector.
2884 if (State.VF.isVector() && shouldPack()) {
2885 Value *WideValue =
2886 State.Lane->isFirstLane()
2887 ? PoisonValue::get(T: VectorType::get(ElementType: UI->getType(), EC: State.VF))
2888 : State.get(Def: this);
2889 State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
2890 Lane: *State.Lane));
2891 }
2892}
2893
2894bool VPReplicateRecipe::shouldPack() const {
2895 // Find if the recipe is used by a widened recipe via an intervening
2896 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
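  // A hypothetical sketch of such a VPlan fragment (recipe and value names are
  // illustrative only):
  //   REPLICATE ir<%d> = sdiv ...                ; predicated scalar recipe
  //   PHI-PREDICATED-INSTRUCTION vp<%p> = ir<%d>
  //   WIDEN ir<%a> = add vp<%p>, ...             ; uses vp<%p> as a vector
  // Here the replicated scalars must also be packed into a vector, which is
  // shown as "(S->V)" when the recipe is printed.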
2897 return any_of(Range: users(), P: [](const VPUser *U) {
2898 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
2899 return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) {
2900 return !U->usesScalars(Op: PredR);
2901 });
2902 return false;
2903 });
2904}
2905
2906InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
2907 VPCostContext &Ctx) const {
2908 Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing its cost multiple times for now.
2911 Ctx.SkipCostComputation.insert(Ptr: UI);
2912
2913 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2914 Type *ResultTy = Ctx.Types.inferScalarType(V: this);
2915 switch (UI->getOpcode()) {
2916 case Instruction::GetElementPtr:
2917 // We mark this instruction as zero-cost because the cost of GEPs in
2918 // vectorized code depends on whether the corresponding memory instruction
2919 // is scalarized or not. Therefore, we handle GEPs with the memory
2920 // instruction cost.
2921 return 0;
2922 case Instruction::Add:
2923 case Instruction::Sub:
2924 case Instruction::FAdd:
2925 case Instruction::FSub:
2926 case Instruction::Mul:
2927 case Instruction::FMul:
2928 case Instruction::FDiv:
2929 case Instruction::FRem:
2930 case Instruction::Shl:
2931 case Instruction::LShr:
2932 case Instruction::AShr:
2933 case Instruction::And:
2934 case Instruction::Or:
2935 case Instruction::Xor: {
2936 auto Op2Info = Ctx.getOperandInfo(V: getOperand(N: 1));
2937 SmallVector<const Value *, 4> Operands(UI->operand_values());
2938 return Ctx.TTI.getArithmeticInstrCost(
2939 Opcode: UI->getOpcode(), Ty: ResultTy, CostKind,
2940 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
2941 Opd2Info: Op2Info, Args: Operands, CxtI: UI, TLibInfo: &Ctx.TLI) *
2942 (isSingleScalar() ? 1 : VF.getFixedValue());
2943 }
2944 }
2945
2946 return Ctx.getLegacyCost(UI, VF);
2947}
2948
2949#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2950void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
2951 VPSlotTracker &SlotTracker) const {
2952 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
2953
2954 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
2955 printAsOperand(O, SlotTracker);
2956 O << " = ";
2957 }
2958 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
2959 O << "call";
2960 printFlags(O);
2961 O << "@" << CB->getCalledFunction()->getName() << "(";
2962 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
2963 O, [&O, &SlotTracker](VPValue *Op) {
2964 Op->printAsOperand(O, SlotTracker);
2965 });
2966 O << ")";
2967 } else {
2968 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
2969 printFlags(O);
2970 printOperands(O, SlotTracker);
2971 }
2972
2973 if (shouldPack())
2974 O << " (S->V)";
2975}
2976#endif
2977
2978void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
2979 assert(State.Lane && "Branch on Mask works only on single instance.");
2980
2981 VPValue *BlockInMask = getOperand(N: 0);
2982 Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);
2983
2984 // Replace the temporary unreachable terminator with a new conditional branch,
2985 // whose two destinations will be set later when they are created.
2986 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
2987 assert(isa<UnreachableInst>(CurrentTerminator) &&
2988 "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr =
      State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
2991 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
2992 CurrentTerminator->eraseFromParent();
2993}
2994
2995InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
2996 VPCostContext &Ctx) const {
2997 // The legacy cost model doesn't assign costs to branches for individual
2998 // replicate regions. Match the current behavior in the VPlan cost model for
2999 // now.
3000 return 0;
3001}
3002
3003void VPPredInstPHIRecipe::execute(VPTransformState &State) {
3004 assert(State.Lane && "Predicated instruction PHI works per instance.");
3005 Instruction *ScalarPredInst =
3006 cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
3007 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3008 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3009 assert(PredicatingBB && "Predicated block has no single predecessor.");
3010 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
3011 "operand must be VPReplicateRecipe");
3012
3013 // By current pack/unpack logic we need to generate only a single phi node: if
3014 // a vector value for the predicated instruction exists at this point it means
3015 // the instruction has vector users only, and a phi for the vector value is
3016 // needed. In this case the recipe of the predicated instruction is marked to
3017 // also do that packing, thereby "hoisting" the insert-element sequence.
3018 // Otherwise, a phi node for the scalar value is needed.
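  // As a rough sketch of the packed case (IR names are illustrative), with the
  // insert-element produced in the predicated block:
  //   predicated.bb:
  //     %ins = insertelement <4 x i32> %prev.vec, i32 %scalar, i32 2
  //   continue.bb:
  //     %vphi = phi <4 x i32> [ %prev.vec, %predicating.bb ],
  //                           [ %ins, %predicated.bb ]
  // %vphi then feeds both the vector users and the next predicated lane.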
3019 if (State.hasVectorValue(Def: getOperand(N: 0))) {
3020 Value *VectorValue = State.get(Def: getOperand(N: 0));
3021 InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue);
3022 PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2);
3023 VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector.
3024 VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element.
3025 if (State.hasVectorValue(Def: this))
3026 State.reset(Def: this, V: VPhi);
3027 else
3028 State.set(Def: this, V: VPhi);
3029 // NOTE: Currently we need to update the value of the operand, so the next
3030 // predicated iteration inserts its generated value in the correct vector.
3031 State.reset(Def: getOperand(N: 0), V: VPhi);
3032 } else {
3033 if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
3034 return;
3035
3036 Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
3037 PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
3038 Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
3039 BB: PredicatingBB);
3040 Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
3041 if (State.hasScalarValue(Def: this, Lane: *State.Lane))
3042 State.reset(Def: this, V: Phi, Lane: *State.Lane);
3043 else
3044 State.set(Def: this, V: Phi, Lane: *State.Lane);
3045 // NOTE: Currently we need to update the value of the operand, so the next
3046 // predicated iteration inserts its generated value in the correct vector.
3047 State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
3048 }
3049}
3050
3051#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3052void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3053 VPSlotTracker &SlotTracker) const {
3054 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3055 printAsOperand(O, SlotTracker);
3056 O << " = ";
3057 printOperands(O, SlotTracker);
3058}
3059#endif
3060
3061InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
3062 VPCostContext &Ctx) const {
3063 Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
3064 const Align Alignment =
3065 getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient));
3066 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
3067 ->getAddressSpace();
3068 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
3069 ? Instruction::Load
3070 : Instruction::Store;
3071
3072 if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, the ARM backend uses the underlying IR to calculate the cost
    // of gather/scatter instructions.
3076 const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
    assert(!Reverse &&
           "Inconsecutive memory accesses should not be reversed.");
3079 return Ctx.TTI.getAddressComputationCost(Ty) +
3080 Ctx.TTI.getGatherScatterOpCost(Opcode, DataTy: Ty, Ptr, VariableMask: IsMasked, Alignment,
3081 CostKind: Ctx.CostKind, I: &Ingredient);
3082 }
3083
3084 InstructionCost Cost = 0;
3085 if (IsMasked) {
3086 Cost +=
3087 Ctx.TTI.getMaskedMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind);
3088 } else {
3089 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3090 V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
3091 : getOperand(N: 1));
3092 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
3093 OpdInfo: OpInfo, I: &Ingredient);
3094 }
3095 if (!Reverse)
3096 return Cost;
3097
  return Cost + Ctx.TTI.getShuffleCost(
                    Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty),
                    SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0);
3101}
3102
3103void VPWidenLoadRecipe::execute(VPTransformState &State) {
3104 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3105 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3106 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
3107 bool CreateGather = !isConsecutive();
3108
3109 auto &Builder = State.Builder;
3110 Value *Mask = nullptr;
3111 if (auto *VPMask = getMask()) {
    // Mask reversal is only needed when the mask is not all-ones. An all-ones
    // mask is represented by a null mask, and the reverse of an all-ones mask
    // is still all-ones.
3114 Mask = State.get(Def: VPMask);
3115 if (isReverse())
3116 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3117 }
3118
3119 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
3120 Value *NewLI;
3121 if (CreateGather) {
3122 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
3123 Name: "wide.masked.gather");
3124 } else if (Mask) {
3125 NewLI =
3126 Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
3127 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
3128 } else {
3129 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
3130 }
3131 applyMetadata(I&: *cast<Instruction>(Val: NewLI));
3132 if (Reverse)
3133 NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse");
3134 State.set(Def: this, V: NewLI);
3135}
3136
3137#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3138void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
3139 VPSlotTracker &SlotTracker) const {
3140 O << Indent << "WIDEN ";
3141 printAsOperand(O, SlotTracker);
3142 O << " = load ";
3143 printOperands(O, SlotTracker);
3144}
3145#endif
3146
/// Use an all-true mask for the reverse rather than the actual mask, as this
/// avoids a dependence without affecting the result.
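///
/// A minimal sketch of the call this emits (types and names are illustrative):
///   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
///              <vscale x 4 x i32> %v, <vscale x 4 x i1> splat (i1 true),
///              i32 %evl)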
3149static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3150 Value *EVL, const Twine &Name) {
3151 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
3152 Value *AllTrueMask =
3153 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
3154 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
3155 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
3156}
3157
3158void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3159 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3160 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3161 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
3162 bool CreateGather = !isConsecutive();
3163
3164 auto &Builder = State.Builder;
3165 CallInst *NewLI;
3166 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3167 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
3168 Value *Mask = nullptr;
3169 if (VPValue *VPMask = getMask()) {
3170 Mask = State.get(Def: VPMask);
3171 if (isReverse())
3172 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3173 } else {
3174 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3175 }
3176
3177 if (CreateGather) {
3178 NewLI =
3179 Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
3180 FMFSource: nullptr, Name: "wide.masked.gather");
3181 } else {
3182 NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load,
3183 Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
3184 }
3185 NewLI->addParamAttr(
3186 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
3187 applyMetadata(I&: *NewLI);
3188 Instruction *Res = NewLI;
3189 if (isReverse())
3190 Res = createReverseEVL(Builder, Operand: Res, EVL, Name: "vp.reverse");
3191 State.set(Def: this, V: Res);
3192}
3193
3194InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
3195 VPCostContext &Ctx) const {
3196 if (!Consecutive || IsMasked)
3197 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3198
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
3204 Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
3205 const Align Alignment =
3206 getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient));
3207 unsigned AS =
3208 getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient));
3209 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3210 Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind);
3211 if (!Reverse)
3212 return Cost;
3213
3214 return Cost + Ctx.TTI.getShuffleCost(
3215 Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty),
3216 SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0);
3217}
3218
3219#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3220void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
3221 VPSlotTracker &SlotTracker) const {
3222 O << Indent << "WIDEN ";
3223 printAsOperand(O, SlotTracker);
3224 O << " = vp.load ";
3225 printOperands(O, SlotTracker);
3226}
3227#endif
3228
3229void VPWidenStoreRecipe::execute(VPTransformState &State) {
3230 VPValue *StoredVPValue = getStoredValue();
3231 bool CreateScatter = !isConsecutive();
3232 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
3233
3234 auto &Builder = State.Builder;
3235
3236 Value *Mask = nullptr;
3237 if (auto *VPMask = getMask()) {
    // Mask reversal is only needed when the mask is not all-ones. An all-ones
    // mask is represented by a null mask, and the reverse of an all-ones mask
    // is still all-ones.
3240 Mask = State.get(Def: VPMask);
3241 if (isReverse())
3242 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3243 }
3244
3245 Value *StoredVal = State.get(Def: StoredVPValue);
3246 if (isReverse()) {
3247 // If we store to reverse consecutive memory locations, then we need
3248 // to reverse the order of elements in the stored value.
3249 StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse");
3250 // We don't want to update the value in the map as it might be used in
3251 // another expression. So don't call resetVectorValue(StoredVal).
3252 }
3253 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
3254 Instruction *NewSI = nullptr;
3255 if (CreateScatter)
3256 NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
3257 else if (Mask)
3258 NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
3259 else
3260 NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
3261 applyMetadata(I&: *NewSI);
3262}
3263
3264#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3265void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
3266 VPSlotTracker &SlotTracker) const {
3267 O << Indent << "WIDEN store ";
3268 printOperands(O, SlotTracker);
3269}
3270#endif
3271
3272void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
3273 VPValue *StoredValue = getStoredValue();
3274 bool CreateScatter = !isConsecutive();
3275 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
3276
3277 auto &Builder = State.Builder;
3278
3279 CallInst *NewSI = nullptr;
3280 Value *StoredVal = State.get(Def: StoredValue);
3281 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3282 if (isReverse())
3283 StoredVal = createReverseEVL(Builder, Operand: StoredVal, EVL, Name: "vp.reverse");
3284 Value *Mask = nullptr;
3285 if (VPValue *VPMask = getMask()) {
3286 Mask = State.get(Def: VPMask);
3287 if (isReverse())
3288 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3289 } else {
3290 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3291 }
3292 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
3293 if (CreateScatter) {
3294 NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
3295 ID: Intrinsic::vp_scatter,
3296 Args: {StoredVal, Addr, Mask, EVL});
3297 } else {
3298 NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
3299 ID: Intrinsic::vp_store,
3300 Args: {StoredVal, Addr, Mask, EVL});
3301 }
3302 NewSI->addParamAttr(
3303 ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
3304 applyMetadata(I&: *NewSI);
3305}
3306
3307InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
3308 VPCostContext &Ctx) const {
3309 if (!Consecutive || IsMasked)
3310 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3311
  // We need to use getMaskedMemoryOpCost() instead of getMemoryOpCost() here
  // because the EVL recipes use EVL to replace the tail mask, whereas the
  // legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMaskedMemoryOpCost() once we no
  // longer need to compare against the legacy cost model.
3317 Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
3318 const Align Alignment =
3319 getLoadStoreAlignment(I: const_cast<Instruction *>(&Ingredient));
3320 unsigned AS =
3321 getLoadStoreAddressSpace(I: const_cast<Instruction *>(&Ingredient));
3322 InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
3323 Opcode: Instruction::Store, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind);
3324 if (!Reverse)
3325 return Cost;
3326
3327 return Cost + Ctx.TTI.getShuffleCost(
3328 Kind: TargetTransformInfo::SK_Reverse, DstTy: cast<VectorType>(Val: Ty),
3329 SrcTy: cast<VectorType>(Val: Ty), Mask: {}, CostKind: Ctx.CostKind, Index: 0);
3330}
3331
3332#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3333void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
3334 VPSlotTracker &SlotTracker) const {
3335 O << Indent << "WIDEN vp.store ";
3336 printOperands(O, SlotTracker);
3337}
3338#endif
3339
3340static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
3341 VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
3343 auto VF = DstVTy->getElementCount();
3344 auto *SrcVecTy = cast<VectorType>(Val: V->getType());
3345 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3346 Type *SrcElemTy = SrcVecTy->getElementType();
3347 Type *DstElemTy = DstVTy->getElementType();
3348 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3349 "Vector elements must have same size");
3350
3351 // Do a direct cast if element types are castable.
3352 if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
3353 return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
3354 }
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
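  // A sketch of the two-step cast, assuming 64-bit pointers (illustrative):
  //   %ints = ptrtoint <4 x ptr> %v to <4 x i64>
  //   %fps  = bitcast <4 x i64> %ints to <4 x double>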
3359 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3360 "Only one type should be a pointer type");
3361 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3362 "Only one type should be a floating point type");
3363 Type *IntTy =
3364 IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
3365 auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
3366 Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
3367 return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
3368}
3369
3370/// Return a vector containing interleaved elements from multiple
3371/// smaller input vectors.
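///
/// E.g. for a fixed-length factor-2 interleave (illustrative values):
///   Vals[0] = <A0, A1, A2, A3>, Vals[1] = <B0, B1, B2, B3>
///   result  = <A0, B0, A1, B1, A2, B2, A3, B3>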
3372static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
3373 const Twine &Name) {
3374 unsigned Factor = Vals.size();
3375 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3376
3377 VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
3378#ifndef NDEBUG
3379 for (Value *Val : Vals)
3380 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3381#endif
3382
3383 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3384 // must use intrinsics to interleave.
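  // For example, a factor-2 interleave of two <vscale x 4 x i32> vectors is
  // emitted roughly as (names are illustrative):
  //   %res = call <vscale x 8 x i32>
  //              @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a,
  //                                               <vscale x 4 x i32> %b)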
3385 if (VecTy->isScalableTy()) {
3386 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3387 VectorType *InterleaveTy =
3388 VectorType::get(ElementType: VecTy->getElementType(),
3389 EC: VecTy->getElementCount().multiplyCoefficientBy(RHS: Factor));
3390 return Builder.CreateIntrinsic(RetTy: InterleaveTy,
3391 ID: getInterleaveIntrinsicID(Factor), Args: Vals,
3392 /*FMFSource=*/nullptr, Name);
3393 }
3394
3395 // Fixed length. Start by concatenating all vectors into a wide vector.
3396 Value *WideVec = concatenateVectors(Builder, Vecs: Vals);
3397
3398 // Interleave the elements into the wide vector.
3399 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
3400 return Builder.CreateShuffleVector(
3401 V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
3402}
3403
3404// Try to vectorize the interleave group that \p Instr belongs to.
3405//
3406// E.g. Translate following interleaved load group (factor = 3):
3407// for (i = 0; i < N; i+=3) {
3408// R = Pic[i]; // Member of index 0
3409// G = Pic[i+1]; // Member of index 1
3410// B = Pic[i+2]; // Member of index 2
3411// ... // do something to R, G, B
3412// }
3413// To:
3414// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
3415// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
3416// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
3417// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
3418//
3419// Or translate following interleaved store group (factor = 3):
3420// for (i = 0; i < N; i+=3) {
3421// ... do something to R, G, B
3422// Pic[i] = R; // Member of index 0
3423// Pic[i+1] = G; // Member of index 1
3424// Pic[i+2] = B; // Member of index 2
3425// }
3426// To:
3427// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
3428// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
3429// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
3430// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
3431// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
3432void VPInterleaveRecipe::execute(VPTransformState &State) {
3433 assert(!State.Lane && "Interleave group being replicated.");
3434 const InterleaveGroup<Instruction> *Group = IG;
3435 Instruction *Instr = Group->getInsertPos();
3436
3437 // Prepare for the vector type of the interleaved load/store.
3438 Type *ScalarTy = getLoadStoreType(I: Instr);
3439 unsigned InterleaveFactor = Group->getFactor();
3440 auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);
3441
3442 VPValue *BlockInMask = getMask();
3443 VPValue *Addr = getAddr();
3444 Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
3445 Value *PoisonVec = PoisonValue::get(T: VecTy);
3446
3447 auto CreateGroupMask = [&BlockInMask, &State,
3448 &InterleaveFactor](Value *MaskForGaps) -> Value * {
3449 if (State.VF.isScalable()) {
3450 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor <= 8 &&
             "Unsupported interleave factor for scalable vectors");
3453 auto *ResBlockInMask = State.get(Def: BlockInMask);
3454 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
3455 return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
3456 }
3457
3458 if (!BlockInMask)
3459 return MaskForGaps;
3460
3461 Value *ResBlockInMask = State.get(Def: BlockInMask);
3462 Value *ShuffledMask = State.Builder.CreateShuffleVector(
3463 V: ResBlockInMask,
3464 Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
3465 Name: "interleaved.mask");
3466 return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
3467 LHS: ShuffledMask, RHS: MaskForGaps)
3468 : ShuffledMask;
3469 };
3470
3471 const DataLayout &DL = Instr->getDataLayout();
3472 // Vectorize the interleaved load group.
3473 if (isa<LoadInst>(Val: Instr)) {
3474 Value *MaskForGaps = nullptr;
3475 if (NeedsMaskForGaps) {
3476 MaskForGaps =
3477 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
3478 assert(MaskForGaps && "Mask for Gaps is required but it is null");
3479 }
3480
3481 Instruction *NewLoad;
3482 if (BlockInMask || MaskForGaps) {
3483 Value *GroupMask = CreateGroupMask(MaskForGaps);
3484 NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
3485 Alignment: Group->getAlign(), Mask: GroupMask,
3486 PassThru: PoisonVec, Name: "wide.masked.vec");
3487 } else
3488 NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
3489 Align: Group->getAlign(), Name: "wide.vec");
3490 Group->addMetadata(NewInst: NewLoad);
3491
3492 ArrayRef<VPValue *> VPDefs = definedValues();
3493 const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
3494 if (VecTy->isScalableTy()) {
3495 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
3496 // so must use intrinsics to deinterleave.
3497 assert(InterleaveFactor <= 8 &&
3498 "Unsupported deinterleave factor for scalable vectors");
3499 Value *Deinterleave = State.Builder.CreateIntrinsic(
3500 ID: getDeinterleaveIntrinsicID(Factor: InterleaveFactor), Types: NewLoad->getType(),
3501 Args: NewLoad,
3502 /*FMFSource=*/nullptr, Name: "strided.vec");
3503
3504 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
3505 Instruction *Member = Group->getMember(Index: I);
3506 Value *StridedVec = State.Builder.CreateExtractValue(Agg: Deinterleave, Idxs: I);
3507 if (!Member) {
          // This member is a gap in the group; the extracted value has no
          // users, so erase it.
3509 cast<Instruction>(Val: StridedVec)->eraseFromParent();
3510 continue;
3511 }
        // If this member has a different type, cast the result to that type.
3513 if (Member->getType() != ScalarTy) {
3514 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
3515 StridedVec =
3516 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
3517 }
3518
3519 if (Group->isReverse())
3520 StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
3521
3522 State.set(Def: VPDefs[J], V: StridedVec);
3523 ++J;
3524 }
3525
3526 return;
3527 }
3528 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
3529
3530 // For each member in the group, shuffle out the appropriate data from the
3531 // wide loads.
3532 unsigned J = 0;
3533 for (unsigned I = 0; I < InterleaveFactor; ++I) {
3534 Instruction *Member = Group->getMember(Index: I);
3535
3536 // Skip the gaps in the group.
3537 if (!Member)
3538 continue;
3539
3540 auto StrideMask =
3541 createStrideMask(Start: I, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
3542 Value *StridedVec =
3543 State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask, Name: "strided.vec");
3544
      // If this member has a different type, cast the result to that type.
3546 if (Member->getType() != ScalarTy) {
3547 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
3548 StridedVec =
3549 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
3550 }
3551
3552 if (Group->isReverse())
3553 StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
3554
3555 State.set(Def: VPDefs[J], V: StridedVec);
3556 ++J;
3557 }
3558 return;
3559 }
3560
  // The sub-vector type for the current instruction.
3562 auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
3563
3564 // Vectorize the interleaved store group.
3565 Value *MaskForGaps =
3566 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
3567 assert((!MaskForGaps || !State.VF.isScalable()) &&
3568 "masking gaps for scalable vectors is not yet supported.");
3569 ArrayRef<VPValue *> StoredValues = getStoredValues();
3570 // Collect the stored vector from each member.
3571 SmallVector<Value *, 4> StoredVecs;
3572 unsigned StoredIdx = 0;
3573 for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Failed to get a member from an interleaved store group");
3576 Instruction *Member = Group->getMember(Index: i);
3577
3578 // Skip the gaps in the group.
3579 if (!Member) {
3580 Value *Undef = PoisonValue::get(T: SubVT);
3581 StoredVecs.push_back(Elt: Undef);
3582 continue;
3583 }
3584
3585 Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
3586 ++StoredIdx;
3587
3588 if (Group->isReverse())
3589 StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");
3590
    // If this member has a different type, cast it to the unified sub-vector
    // type.
3593 if (StoredVec->getType() != SubVT)
3594 StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);
3595
3596 StoredVecs.push_back(Elt: StoredVec);
3597 }
3598
3599 // Interleave all the smaller vectors into one wider vector.
3600 Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
3601 Instruction *NewStoreInstr;
3602 if (BlockInMask || MaskForGaps) {
3603 Value *GroupMask = CreateGroupMask(MaskForGaps);
3604 NewStoreInstr = State.Builder.CreateMaskedStore(
3605 Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
3606 } else
3607 NewStoreInstr =
3608 State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());
3609
3610 Group->addMetadata(NewInst: NewStoreInstr);
3611}
3612
3613#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3614void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
3615 VPSlotTracker &SlotTracker) const {
3616 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
3617 IG->getInsertPos()->printAsOperand(O, false);
3618 O << ", ";
3619 getAddr()->printAsOperand(O, SlotTracker);
3620 VPValue *Mask = getMask();
3621 if (Mask) {
3622 O << ", ";
3623 Mask->printAsOperand(O, SlotTracker);
3624 }
3625
3626 unsigned OpIdx = 0;
3627 for (unsigned i = 0; i < IG->getFactor(); ++i) {
3628 if (!IG->getMember(i))
3629 continue;
3630 if (getNumStoreOperands() > 0) {
3631 O << "\n" << Indent << " store ";
3632 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
3633 O << " to index " << i;
3634 } else {
3635 O << "\n" << Indent << " ";
3636 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
3637 O << " = load from index " << i;
3638 }
3639 ++OpIdx;
3640 }
3641}
3642#endif
3643
3644InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
3645 VPCostContext &Ctx) const {
3646 Instruction *InsertPos = getInsertPos();
3647 // Find the VPValue index of the interleave group. We need to skip gaps.
3648 unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
3650 if (auto *Member = IG->getMember(Index: Idx)) {
3651 if (Member == InsertPos)
3652 break;
3653 InsertPosIdx++;
3654 }
3655 Type *ValTy = Ctx.Types.inferScalarType(
3656 V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx)
3657 : getStoredValues()[InsertPosIdx]);
3658 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
3659 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
3660
3661 unsigned InterleaveFactor = IG->getFactor();
3662 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
3663
3664 // Holds the indices of existing members in the interleaved group.
3665 SmallVector<unsigned, 4> Indices;
3666 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
3667 if (IG->getMember(Index: IF))
3668 Indices.push_back(Elt: IF);
3669
3670 // Calculate the cost of the whole interleaved group.
3671 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
3672 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices,
3673 Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps);
3674
3675 if (!IG->isReverse())
3676 return Cost;
3677
3678 return Cost + IG->getNumMembers() *
3679 Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse,
3680 DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind,
3681 Index: 0);
3682}
3683
3684#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3685void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3686 VPSlotTracker &SlotTracker) const {
3687 O << Indent << "EMIT ";
3688 printAsOperand(O, SlotTracker);
3689 O << " = CANONICAL-INDUCTION ";
3690 printOperands(O, SlotTracker);
3691}
3692#endif
3693
3694bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
3695 return IsScalarAfterVectorization &&
3696 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
3697}
3698
3699void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
3700 assert(getInductionDescriptor().getKind() ==
3701 InductionDescriptor::IK_PtrInduction &&
3702 "Not a pointer induction according to InductionDescriptor!");
3703 assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
3704 "Unexpected type.");
3705 assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
3706 "Recipe should have been replaced");
3707
3708 unsigned CurrentPart = getUnrollPart(U&: *this);
3709
3710 // Build a pointer phi
3711 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
3712 Type *ScStValueType = ScalarStartValue->getType();
3713
3714 BasicBlock *VectorPH =
3715 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
3716 PHINode *NewPointerPhi = nullptr;
3717 if (CurrentPart == 0) {
3718 IRBuilder<>::InsertPointGuard Guard(State.Builder);
3719 if (State.Builder.GetInsertPoint() !=
3720 State.Builder.GetInsertBlock()->getFirstNonPHIIt())
3721 State.Builder.SetInsertPoint(
3722 State.Builder.GetInsertBlock()->getFirstNonPHIIt());
3723 NewPointerPhi = State.Builder.CreatePHI(Ty: ScStValueType, NumReservedValues: 2, Name: "pointer.phi");
3724 NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH);
3725 NewPointerPhi->setDebugLoc(getDebugLoc());
3726 } else {
3727 // The recipe has been unrolled. In that case, fetch the single pointer phi
3728 // shared among all unrolled parts of the recipe.
3729 auto *GEP =
3730 cast<GetElementPtrInst>(Val: State.get(Def: getFirstUnrolledPartOperand()));
3731 NewPointerPhi = cast<PHINode>(Val: GEP->getPointerOperand());
3732 }
3733
  // A pointer induction, implemented by advancing the pointer with a GEP.
3735 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
3736 Value *ScalarStepValue = State.get(Def: getStepValue(), Lane: VPLane(0));
3737 Type *PhiType = State.TypeAnalysis.inferScalarType(V: getStepValue());
3738 Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF);
3739 // Add induction update using an incorrect block temporarily. The phi node
3740 // will be fixed after VPlan execution. Note that at this point the latch
3741 // block cannot be used, as it does not exist yet.
3742 // TODO: Model increment value in VPlan, by turning the recipe into a
3743 // multi-def and a subclass of VPHeaderPHIRecipe.
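  // A rough sketch of the scalar part of the induction (names illustrative):
  //   %pointer.phi = phi ptr [ %start, %vector.ph ], [ %ptr.ind, %vector.ph ]
  //   ...
  //   %ptr.ind = getelementptr i8, ptr %pointer.phi, i64 %step.times.elems
  // where the second incoming block of %pointer.phi is temporarily incorrect
  // and is fixed to the latch after VPlan execution, as described above.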
3744 if (CurrentPart == 0) {
3745 // The recipe represents the first part of the pointer induction. Create the
3746 // GEP to increment the phi across all unrolled parts.
3747 Value *NumUnrolledElems = State.get(Def: getOperand(N: 2), IsScalar: true);
3748
3749 Value *InductionGEP = GetElementPtrInst::Create(
3750 PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
3751 IdxList: State.Builder.CreateMul(
3752 LHS: ScalarStepValue,
3753 RHS: State.Builder.CreateTrunc(V: NumUnrolledElems, DestTy: PhiType)),
3754 NameStr: "ptr.ind", InsertBefore: InductionLoc);
3755
3756 NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH);
3757 }
3758
3759 // Create actual address geps that use the pointer phi as base and a
3760 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
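  // E.g. for VF=4, part 0 and a hypothetical scalar step %s, this emits
  // roughly (illustrative, assuming an i64 offset type):
  //   %off.splat  = splat of (RuntimeVF * part), i.e. zero for part 0
  //   %offsets    = add <4 x i64> %off.splat, <i64 0, i64 1, i64 2, i64 3>
  //   %scaled     = mul <4 x i64> %offsets, splat of %s
  //   %vector.gep = getelementptr i8, ptr %pointer.phi, <4 x i64> %scaled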
3761 Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF);
3762 Value *StartOffsetScalar = State.Builder.CreateMul(
3763 LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: CurrentPart));
3764 Value *StartOffset =
3765 State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar);
3766 // Create a vector of consecutive numbers from zero to VF.
3767 StartOffset = State.Builder.CreateAdd(
3768 LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType));
3769
3770 assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
3771 "scalar step must be the same across all parts");
3772 Value *GEP = State.Builder.CreateGEP(
3773 Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
3774 IdxList: State.Builder.CreateMul(LHS: StartOffset, RHS: State.Builder.CreateVectorSplat(
3775 EC: State.VF, V: ScalarStepValue)),
3776 Name: "vector.gep");
3777 State.set(Def: this, V: GEP);
3778}
3779
3780#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3781void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
3782 VPSlotTracker &SlotTracker) const {
3783 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
3784 "unexpected number of operands");
3785 O << Indent << "EMIT ";
3786 printAsOperand(O, SlotTracker);
3787 O << " = WIDEN-POINTER-INDUCTION ";
3788 getStartValue()->printAsOperand(O, SlotTracker);
3789 O << ", ";
3790 getStepValue()->printAsOperand(O, SlotTracker);
3791 O << ", ";
3792 getOperand(2)->printAsOperand(O, SlotTracker);
3793 if (getNumOperands() == 5) {
3794 O << ", ";
3795 getOperand(3)->printAsOperand(O, SlotTracker);
3796 O << ", ";
3797 getOperand(4)->printAsOperand(O, SlotTracker);
3798 }
3799}
3800#endif
3801
3802void VPExpandSCEVRecipe::execute(VPTransformState &State) {
3803 assert(!State.Lane && "cannot be used in per-lane");
3804 const DataLayout &DL = SE.getDataLayout();
3805 SCEVExpander Exp(SE, DL, "induction", /*PreserveLCSSA=*/true);
3806 Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(),
3807 I: &*State.Builder.GetInsertPoint());
3808 State.set(Def: this, V: Res, Lane: VPLane(0));
3809}
3810
3811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3812void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
3813 VPSlotTracker &SlotTracker) const {
3814 O << Indent << "EMIT ";
3815 printAsOperand(O, SlotTracker);
3816 O << " = EXPAND SCEV " << *Expr;
3817}
3818#endif
3819
3820void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
3821 Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
3822 Type *STy = CanonicalIV->getType();
3823 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
3824 ElementCount VF = State.VF;
3825 Value *VStart = VF.isScalar()
3826 ? CanonicalIV
3827 : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast");
3828 Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U&: *this));
3829 if (VF.isVector()) {
3830 VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
3831 VStep =
3832 Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
3833 }
3834 Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv");
3835 State.set(Def: this, V: CanonicalVectorIV);
3836}
3837
3838#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3839void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
3840 VPSlotTracker &SlotTracker) const {
3841 O << Indent << "EMIT ";
3842 printAsOperand(O, SlotTracker);
3843 O << " = WIDEN-CANONICAL-INDUCTION ";
3844 printOperands(O, SlotTracker);
3845}
3846#endif
3847
3848void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
3849 auto &Builder = State.Builder;
3850 // Create a vector from the initial value.
3851 auto *VectorInit = getStartValue()->getLiveInIRValue();
3852
3853 Type *VecTy = State.VF.isScalar()
3854 ? VectorInit->getType()
3855 : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);
3856
3857 BasicBlock *VectorPH =
3858 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
3859 if (State.VF.isVector()) {
3860 auto *IdxTy = Builder.getInt32Ty();
3861 auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
3862 IRBuilder<>::InsertPointGuard Guard(Builder);
3863 Builder.SetInsertPoint(VectorPH->getTerminator());
3864 auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
3865 auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
3866 VectorInit = Builder.CreateInsertElement(
3867 Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
3868 }
3869
3870 // Create a phi node for the new recurrence.
3871 PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
3872 Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
3873 Phi->addIncoming(V: VectorInit, BB: VectorPH);
3874 State.set(Def: this, V: Phi);
3875}
3876
3877InstructionCost
3878VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
3879 VPCostContext &Ctx) const {
3880 if (VF.isScalar())
3881 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
3882
3883 return 0;
3884}
3885
3886#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3887void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
3888 VPSlotTracker &SlotTracker) const {
3889 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
3890 printAsOperand(O, SlotTracker);
3891 O << " = phi ";
3892 printOperands(O, SlotTracker);
3893}
3894#endif
3895
3896void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
3899 VPValue *StartVPV = getStartValue();
3900
3901 // In order to support recurrences we need to be able to vectorize Phi nodes.
3902 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3903 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
3904 // this value when we vectorize all of the instructions that use the PHI.
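  // A minimal sketch of stage #1 for a VF=4 float reduction (illustrative):
  //   vector.body:
  //     %vec.phi = phi <4 x float> [ %start, %vector.ph ], [ %red.next, ... ]
  // The loop-carried incoming value (%red.next here) is only filled in later,
  // once the reduction computation itself has been vectorized.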
3905 BasicBlock *VectorPH =
3906 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
3907 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
3908 Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI);
3909 Type *VecTy = StartV->getType();
3910
3911 BasicBlock *HeaderBB = State.CFG.PrevBB;
3912 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
3913 "recipe must be in the vector loop header");
3914 auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
3915 Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
3916 State.set(Def: this, V: Phi, IsScalar: IsInLoop);
3917
3918 Phi->addIncoming(V: StartV, BB: VectorPH);
3919}
3920
3921#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3922void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3923 VPSlotTracker &SlotTracker) const {
3924 O << Indent << "WIDEN-REDUCTION-PHI ";
3925
3926 printAsOperand(O, SlotTracker);
3927 O << " = phi ";
3928 printOperands(O, SlotTracker);
3929 if (VFScaleFactor != 1)
3930 O << " (VF scaled by 1/" << VFScaleFactor << ")";
3931}
3932#endif
3933
3934void VPWidenPHIRecipe::execute(VPTransformState &State) {
3935 Value *Op0 = State.get(Def: getOperand(N: 0));
3936 Type *VecTy = Op0->getType();
3937 Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name);
3938 // Manually move it with the other PHIs in case PHI recipes above this one
3939 // also inserted non-phi instructions.
3940 // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
3941 // convertToConcreteRecipes.
3942 VecPhi->moveBefore(InsertPos: State.Builder.GetInsertBlock()->getFirstNonPHIIt());
3943 State.set(Def: this, V: VecPhi);
3944}
3945
3946#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3947void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3948 VPSlotTracker &SlotTracker) const {
3949 O << Indent << "WIDEN-PHI ";
3950
3951 printAsOperand(O, SlotTracker);
3952 O << " = phi ";
3953 printPhiOperands(O, SlotTracker);
3954}
3955#endif
3956
3957// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
3958// remove VPActiveLaneMaskPHIRecipe.
3959void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
3960 BasicBlock *VectorPH =
3961 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
3962 Value *StartMask = State.get(Def: getOperand(N: 0));
3963 PHINode *Phi =
3964 State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
3965 Phi->addIncoming(V: StartMask, BB: VectorPH);
3966 State.set(Def: this, V: Phi);
3967}
3968
3969#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3970void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3971 VPSlotTracker &SlotTracker) const {
3972 O << Indent << "ACTIVE-LANE-MASK-PHI ";
3973
3974 printAsOperand(O, SlotTracker);
3975 O << " = phi ";
3976 printOperands(O, SlotTracker);
3977}
3978#endif
3979
3980#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3981void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
3982 VPSlotTracker &SlotTracker) const {
3983 O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";
3984
3985 printAsOperand(O, SlotTracker);
3986 O << " = phi ";
3987 printOperands(O, SlotTracker);
3988}
3989#endif
3990