1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanHelpers.h"
17#include "VPlanPatternMatch.h"
18#include "VPlanUtils.h"
19#include "llvm/ADT/STLExtras.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/ADT/SmallVectorExtras.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/IVDescriptors.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
34#include "llvm/Support/Casting.h"
35#include "llvm/Support/CommandLine.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Transforms/Utils/BasicBlockUtils.h"
39#include "llvm/Transforms/Utils/LoopUtils.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
45using VectorParts = SmallVector<Value *, 2>;
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
50#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
51// It is sometimes necessary to disable printing of metadata in tests in order
52// to avoid non-deterministic behaviour due to metadata introduced by VPlan
53// that wasn't present in the original scalar IR.
54static cl::opt<bool> VPlanPrintMetadata(
55 "vplan-print-metadata", cl::init(true), cl::Hidden,
56 cl::desc("Controls the printing of recipe metadata when debugging."));
57#endif
58
59bool VPRecipeBase::mayWriteToMemory() const {
60 switch (getVPRecipeID()) {
61 case VPExpressionSC:
62 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
63 case VPInstructionSC: {
64 auto *VPI = cast<VPInstruction>(Val: this);
65 // Loads read from memory but don't write to memory.
66 if (VPI->getOpcode() == Instruction::Load)
67 return false;
68 return VPI->opcodeMayReadOrWriteFromMemory();
69 }
70 case VPInterleaveEVLSC:
71 case VPInterleaveSC:
72 return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
73 case VPWidenStoreEVLSC:
74 case VPWidenStoreSC:
75 return true;
76 case VPReplicateSC:
77 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
78 ->mayWriteToMemory();
79 case VPWidenCallSC:
80 return !cast<VPWidenCallRecipe>(Val: this)
81 ->getCalledScalarFunction()
82 ->onlyReadsMemory();
83 case VPWidenMemIntrinsicSC:
84 case VPWidenIntrinsicSC:
85 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
86 case VPActiveLaneMaskPHISC:
87 case VPCurrentIterationPHISC:
88 case VPBranchOnMaskSC:
89 case VPDerivedIVSC:
90 case VPFirstOrderRecurrencePHISC:
91 case VPReductionPHISC:
92 case VPScalarIVStepsSC:
93 case VPPredInstPHISC:
94 return false;
95 case VPBlendSC:
96 case VPReductionEVLSC:
97 case VPReductionSC:
98 case VPVectorPointerSC:
99 case VPWidenCanonicalIVSC:
100 case VPWidenCastSC:
101 case VPWidenGEPSC:
102 case VPWidenIntOrFpInductionSC:
103 case VPWidenLoadEVLSC:
104 case VPWidenLoadSC:
105 case VPWidenPHISC:
106 case VPWidenPointerInductionSC:
107 case VPWidenSC: {
108 const Instruction *I =
109 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
110 (void)I;
111 assert((!I || !I->mayWriteToMemory()) &&
112 "underlying instruction may write to memory");
113 return false;
114 }
115 default:
116 return true;
117 }
118}
119
120bool VPRecipeBase::mayReadFromMemory() const {
121 switch (getVPRecipeID()) {
122 case VPExpressionSC:
123 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
124 case VPInstructionSC:
125 return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
126 case VPWidenLoadEVLSC:
127 case VPWidenLoadSC:
128 return true;
129 case VPReplicateSC:
130 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
131 ->mayReadFromMemory();
132 case VPWidenCallSC:
133 return !cast<VPWidenCallRecipe>(Val: this)
134 ->getCalledScalarFunction()
135 ->onlyWritesMemory();
136 case VPWidenMemIntrinsicSC:
137 case VPWidenIntrinsicSC:
138 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
139 case VPBranchOnMaskSC:
140 case VPDerivedIVSC:
141 case VPCurrentIterationPHISC:
142 case VPFirstOrderRecurrencePHISC:
143 case VPReductionPHISC:
144 case VPPredInstPHISC:
145 case VPScalarIVStepsSC:
146 case VPWidenStoreEVLSC:
147 case VPWidenStoreSC:
148 return false;
149 case VPBlendSC:
150 case VPReductionEVLSC:
151 case VPReductionSC:
152 case VPVectorPointerSC:
153 case VPWidenCanonicalIVSC:
154 case VPWidenCastSC:
155 case VPWidenGEPSC:
156 case VPWidenIntOrFpInductionSC:
157 case VPWidenPHISC:
158 case VPWidenPointerInductionSC:
159 case VPWidenSC: {
160 const Instruction *I =
161 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
162 (void)I;
163 assert((!I || !I->mayReadFromMemory()) &&
164 "underlying instruction may read from memory");
165 return false;
166 }
167 default:
168 // FIXME: Return false if the recipe represents an interleaved store.
169 return true;
170 }
171}
172
173bool VPRecipeBase::mayHaveSideEffects() const {
174 switch (getVPRecipeID()) {
175 case VPExpressionSC:
176 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
177 case VPActiveLaneMaskPHISC:
178 case VPDerivedIVSC:
179 case VPCurrentIterationPHISC:
180 case VPFirstOrderRecurrencePHISC:
181 case VPReductionPHISC:
182 case VPPredInstPHISC:
183 case VPVectorEndPointerSC:
184 return false;
185 case VPInstructionSC: {
186 auto *VPI = cast<VPInstruction>(Val: this);
187 return mayWriteToMemory() ||
188 VPI->getOpcode() == VPInstruction::BranchOnCount ||
189 VPI->getOpcode() == VPInstruction::BranchOnCond ||
190 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
191 }
192 case VPWidenCallSC: {
193 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
194 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
195 }
196 case VPWidenMemIntrinsicSC:
197 case VPWidenIntrinsicSC:
198 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
199 case VPBlendSC:
200 case VPReductionEVLSC:
201 case VPReductionSC:
202 case VPScalarIVStepsSC:
203 case VPVectorPointerSC:
204 case VPWidenCanonicalIVSC:
205 case VPWidenCastSC:
206 case VPWidenGEPSC:
207 case VPWidenIntOrFpInductionSC:
208 case VPWidenPHISC:
209 case VPWidenPointerInductionSC:
210 case VPWidenSC: {
211 const Instruction *I =
212 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
213 (void)I;
214 assert((!I || !I->mayHaveSideEffects()) &&
215 "underlying instruction has side-effects");
216 return false;
217 }
218 case VPInterleaveEVLSC:
219 case VPInterleaveSC:
220 return mayWriteToMemory();
221 case VPWidenLoadEVLSC:
222 case VPWidenLoadSC:
223 case VPWidenStoreEVLSC:
224 case VPWidenStoreSC:
225 assert(
226 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
227 mayWriteToMemory() &&
228 "mayHaveSideffects result for ingredient differs from this "
229 "implementation");
230 return mayWriteToMemory();
231 case VPReplicateSC: {
232 auto *R = cast<VPReplicateRecipe>(Val: this);
233 return R->getUnderlyingInstr()->mayHaveSideEffects();
234 }
235 default:
236 return true;
237 }
238}
239
240bool VPRecipeBase::isSafeToSpeculativelyExecute() const {
241 switch (getVPRecipeID()) {
242 default:
243 return false;
244 case VPInstructionSC: {
245 unsigned Opcode = cast<VPInstruction>(Val: this)->getOpcode();
246 if (Instruction::isCast(Opcode))
247 return true;
248
249 switch (Opcode) {
250 default:
251 return false;
252 case Instruction::Add:
253 case Instruction::Sub:
254 case Instruction::Mul:
255 case Instruction::GetElementPtr:
256 return true;
257 }
258 }
259 }
260}
261
262void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
263 assert(!Parent && "Recipe already in some VPBasicBlock");
264 assert(InsertPos->getParent() &&
265 "Insertion position not in any VPBasicBlock");
266 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
267}
268
269void VPRecipeBase::insertBefore(VPBasicBlock &BB,
270 iplist<VPRecipeBase>::iterator I) {
271 assert(!Parent && "Recipe already in some VPBasicBlock");
272 assert(I == BB.end() || I->getParent() == &BB);
273 BB.insert(Recipe: this, InsertPt: I);
274}
275
276void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
277 assert(!Parent && "Recipe already in some VPBasicBlock");
278 assert(InsertPos->getParent() &&
279 "Insertion position not in any VPBasicBlock");
280 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
281}
282
283void VPRecipeBase::removeFromParent() {
284 assert(getParent() && "Recipe not in any VPBasicBlock");
285 getParent()->getRecipeList().remove(IT: getIterator());
286 Parent = nullptr;
287}
288
289iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
290 assert(getParent() && "Recipe not in any VPBasicBlock");
291 return getParent()->getRecipeList().erase(where: getIterator());
292}
293
294void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
295 removeFromParent();
296 insertAfter(InsertPos);
297}
298
299void VPRecipeBase::moveBefore(VPBasicBlock &BB,
300 iplist<VPRecipeBase>::iterator I) {
301 removeFromParent();
302 insertBefore(BB, I);
303}
304
305InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
306 // Get the underlying instruction for the recipe, if there is one. It is used
307 // to
308 // * decide if cost computation should be skipped for this recipe,
309 // * apply forced target instruction cost.
310 Instruction *UI = nullptr;
311 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
312 UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
313 else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
314 UI = IG->getInsertPos();
315 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
316 UI = &WidenMem->getIngredient();
317
318 InstructionCost RecipeCost;
319 if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
320 RecipeCost = 0;
321 } else {
322 RecipeCost = computeCost(VF, Ctx);
323 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
324 RecipeCost.isValid()) {
325 if (UI)
326 RecipeCost = InstructionCost(ForceTargetInstructionCost);
327 else
328 RecipeCost = InstructionCost(0);
329 }
330 }
331
332 LLVM_DEBUG({
333 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
334 dump();
335 });
336 return RecipeCost;
337}
338
339InstructionCost VPRecipeBase::computeCost(ElementCount VF,
340 VPCostContext &Ctx) const {
341 llvm_unreachable("subclasses should implement computeCost");
342}
343
344bool VPRecipeBase::isPhi() const {
345 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
346 isa<VPPhi, VPIRPhi>(Val: this);
347}
348
349void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
350 assert(OpType == Other.OpType && "OpType must match");
351 switch (OpType) {
352 case OperationType::OverflowingBinOp:
353 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
354 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
355 break;
356 case OperationType::Trunc:
357 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
358 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
359 break;
360 case OperationType::DisjointOp:
361 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
362 break;
363 case OperationType::PossiblyExactOp:
364 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
365 break;
366 case OperationType::GEPOp:
367 GEPFlagsStorage &= Other.GEPFlagsStorage;
368 break;
369 case OperationType::FPMathOp:
370 case OperationType::FCmp:
371 assert((OpType != OperationType::FCmp ||
372 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
373 "Cannot drop CmpPredicate");
374 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
375 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
376 break;
377 case OperationType::NonNegOp:
378 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
379 break;
380 case OperationType::Cmp:
381 assert(CmpPredStorage == Other.CmpPredStorage &&
382 "Cannot drop CmpPredicate");
383 break;
384 case OperationType::ReductionOp:
385 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
386 "Cannot change RecurKind");
387 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
388 "Cannot change IsOrdered");
389 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
390 "Cannot change IsInLoop");
391 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
392 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
393 break;
394 case OperationType::Other:
395 break;
396 }
397}
398
399FastMathFlags VPIRFlags::getFastMathFlagsOrNone() const {
400 if (!hasFastMathFlags())
401 return {};
402 const FastMathFlagsTy &F = getFMFsRef();
403 FastMathFlags Res;
404 Res.setAllowReassoc(F.AllowReassoc);
405 Res.setNoNaNs(F.NoNaNs);
406 Res.setNoInfs(F.NoInfs);
407 Res.setNoSignedZeros(F.NoSignedZeros);
408 Res.setAllowReciprocal(F.AllowReciprocal);
409 Res.setAllowContract(F.AllowContract);
410 Res.setApproxFunc(F.ApproxFunc);
411 return Res;
412}
413
414#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
415void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
416
417void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
418 VPSlotTracker &SlotTracker) const {
419 printRecipe(O, Indent, SlotTracker);
420 if (auto DL = getDebugLoc()) {
421 O << ", !dbg ";
422 DL.print(O);
423 }
424
425 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
426 Metadata->print(O, SlotTracker);
427}
428#endif
429
430VPExpandSCEVRecipe::VPExpandSCEVRecipe(const SCEV *Expr)
431 : VPSingleDefRecipe(VPRecipeBase::VPExpandSCEVSC, {}, Expr->getType()),
432 Expr(Expr) {}
433
434/// For call VPInstruction operands, return the operand index of the called
435/// function. The function is either the last operand (for unmasked calls) or
436/// the second-to-last operand (for masked calls).
437static unsigned getCalledFnOperandIndex(ArrayRef<VPValue *> Operands) {
438 unsigned NumOps = Operands.size();
439 auto *LastOp = dyn_cast<VPIRValue>(Val: Operands[NumOps - 1]);
440 if (LastOp && isa<Function>(Val: LastOp->getValue()))
441 return NumOps - 1;
442 assert(isa<Function>(cast<VPIRValue>(Operands[NumOps - 2])->getValue()) &&
443 "expected function operand");
444 return NumOps - 2;
445}
446
447/// For call VPInstruction operands, return the called function.
448static Function *getCalledFunction(ArrayRef<VPValue *> Operands) {
449 unsigned Idx = getCalledFnOperandIndex(Operands);
450 return cast<Function>(Val: cast<VPIRValue>(Val: Operands[Idx])->getValue());
451}
452
453Type *llvm::computeScalarTypeForInstruction(unsigned Opcode,
454 ArrayRef<VPValue *> Operands) {
455 assert(!Operands.empty() &&
456 "zero-operand VPInstruction opcodes must pass explicit ResultTy");
457 // Assert operand \p Idx (if present and typed) has type \p ExpectedTy.
458 [[maybe_unused]] auto AssertOperandType = [&Operands](unsigned Idx,
459 Type *ExpectedTy) {
460 if (!ExpectedTy || Operands.size() <= Idx)
461 return;
462 [[maybe_unused]] Type *OpTy = Operands[Idx]->getScalarType();
463 assert((!OpTy || OpTy == ExpectedTy) &&
464 "different types inferred for different operands");
465 };
466
467 Type *Op0Ty = Operands[0]->getScalarType();
468 LLVMContext &Ctx = Op0Ty->getContext();
469 switch (Opcode) {
470 case VPInstruction::BranchOnCond:
471 assert(Op0Ty->isIntegerTy(1) && "expected bool condition");
472 return Type::getVoidTy(C&: Ctx);
473 case VPInstruction::BranchOnTwoConds:
474 assert(Op0Ty->isIntegerTy(1) && "expected bool condition");
475 AssertOperandType(1, IntegerType::get(C&: Ctx, NumBits: 1));
476 return Type::getVoidTy(C&: Ctx);
477 case VPInstruction::BranchOnCount:
478 assert(Op0Ty->isIntegerTy() && "expected integer operand");
479 AssertOperandType(1, Op0Ty);
480 return Type::getVoidTy(C&: Ctx);
481 case VPInstruction::CalculateTripCountMinusVF:
482 case VPInstruction::CanonicalIVIncrementForPart:
483 assert(Op0Ty->isIntegerTy() && "expected integer operand");
484 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
485 AssertOperandType(Idx, Op0Ty);
486 return Op0Ty;
487 case Instruction::Switch:
488 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
489 AssertOperandType(Idx, Op0Ty);
490 return Type::getVoidTy(C&: Ctx);
491 case Instruction::Store:
492 return Type::getVoidTy(C&: Ctx);
493 case Instruction::ICmp:
494 assert(Op0Ty->isIntOrPtrTy() && "expected integer or pointer operand");
495 AssertOperandType(1, Op0Ty);
496 return IntegerType::get(C&: Ctx, NumBits: 1);
497 case Instruction::FCmp:
498 assert(Op0Ty->isFloatingPointTy() && "expected floating-point operand");
499 AssertOperandType(1, Op0Ty);
500 return IntegerType::get(C&: Ctx, NumBits: 1);
501 case VPInstruction::ActiveLaneMask:
502 assert(Op0Ty->isIntegerTy() && "expected integer operand");
503 AssertOperandType(1, Op0Ty);
504 return IntegerType::get(C&: Ctx, NumBits: 1);
505 case VPInstruction::MaskedCond:
506 assert(Op0Ty->isIntegerTy(1) && "expected bool operand");
507 return IntegerType::get(C&: Ctx, NumBits: 1);
508 case VPInstruction::LogicalAnd:
509 case VPInstruction::LogicalOr:
510 assert(Op0Ty->isIntegerTy(1) && "expected bool operand");
511 AssertOperandType(1, Op0Ty);
512 return IntegerType::get(C&: Ctx, NumBits: 1);
513 case VPInstruction::AnyOf:
514 assert(Op0Ty->isIntegerTy(1) && "expected bool operand");
515 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
516 AssertOperandType(Idx, Op0Ty);
517 return IntegerType::get(C&: Ctx, NumBits: 1);
518 case VPInstruction::ExplicitVectorLength:
519 assert(Op0Ty->isIntegerTy() && "expected integer operand");
520 return IntegerType::get(C&: Ctx, NumBits: 32);
521 case Instruction::Select: {
522 assert((!Op0Ty || Op0Ty->isIntegerTy(1)) &&
523 "select condition must be bool");
524 Type *Op1Ty = Operands[1]->getScalarType();
525 AssertOperandType(2, Op1Ty);
526 return Op1Ty;
527 }
528 case Instruction::InsertElement:
529 // The inserted scalar (operand 1) must match the vector element type;
530 // operand 2 must be an integer.
531 AssertOperandType(1, Op0Ty);
532 assert(Operands[2]->getScalarType()->isIntegerTy() &&
533 "expected integer operand");
534 return Op0Ty;
535 case VPInstruction::ReductionStartVector:
536 // The start value and the identity value (operands 0 and 1) fill the same
537 // vector and must match in type; operand 2 is the scaling factor.
538 AssertOperandType(1, Op0Ty);
539 return Op0Ty;
540 case VPInstruction::ExtractLane: {
541 assert(Operands.size() >= 2 && "ExtractLane requires a lane operand and "
542 "at least one source vector operand");
543 // Operand 0 is the lane index, used for integer arithmetic.
544 assert(Op0Ty->isIntegerTy() && "expected integer operand");
545 Type *Op1Ty = Operands[1]->getScalarType();
546 for (unsigned Idx = 2; Idx != Operands.size(); ++Idx)
547 AssertOperandType(Idx, Op1Ty);
548 return Op1Ty;
549 }
550 case VPInstruction::PtrAdd:
551 case VPInstruction::WidePtrAdd:
552 assert(Operands[0]->getScalarType()->isPointerTy() &&
553 "expected pointer operand");
554 assert(Operands[1]->getScalarType()->isIntegerTy() &&
555 "expected integer operand");
556 return Op0Ty;
557 case Instruction::ExtractValue: {
558 assert(Operands.size() == 2 && "expected single level extractvalue");
559 auto *StructTy = cast<StructType>(Val: Op0Ty);
560 return StructTy->getTypeAtIndex(
561 N: cast<VPConstantInt>(Val: Operands[1])->getZExtValue());
562 }
563 case VPInstruction::FirstActiveLane:
564 case VPInstruction::LastActiveLane:
565 case VPInstruction::NumActiveLanes:
566 case VPInstruction::IncomingAliasMask:
567 case Instruction::Load:
568 case Instruction::Alloca:
569 llvm_unreachable("type must be passed explicitly");
570 case Instruction::Call:
571 return getCalledFunction(Operands)->getReturnType();
572 default:
573 break;
574 }
575
576 // Opcodes that require all operands to share the same scalar type as the
577 // result.
578 bool AllOperandsSameType =
579 Instruction::isBinaryOp(Opcode) ||
580 is_contained(Set: {VPInstruction::FirstOrderRecurrenceSplice,
581 VPInstruction::BuildVector,
582 VPInstruction::BuildStructVector},
583 Element: Opcode);
584 if (AllOperandsSameType)
585 for (unsigned Idx = 1; Idx != Operands.size(); ++Idx)
586 AssertOperandType(Idx, Op0Ty);
587
588 return Op0Ty;
589}
590
591Type *VPReplicateRecipe::computeScalarType(const Instruction *I,
592 ArrayRef<VPValue *> Operands) {
593 unsigned Opcode = I->getOpcode();
594 if (Instruction::isCast(Opcode) ||
595 is_contained(Range: ArrayRef<unsigned>({Instruction::ExtractValue,
596 Instruction::Load, Instruction::Alloca}),
597 Element: Opcode))
598 return I->getType();
599 return computeScalarTypeForInstruction(Opcode, Operands);
600}
601
602VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
603 const VPIRFlags &Flags, const VPIRMetadata &MD,
604 DebugLoc DL, const Twine &Name, Type *ResultTy)
605 : VPRecipeWithIRFlags(
606 VPRecipeBase::VPInstructionSC, Operands,
607 ResultTy ? ResultTy
608 : computeScalarTypeForInstruction(Opcode, Operands),
609 Flags, DL),
610 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
611 assert(flagsValidForOpcode(getOpcode()) &&
612 "Set flags not supported for the provided opcode");
613 assert(hasRequiredFlagsForOpcode(getOpcode()) &&
614 "Opcode requires specific flags to be set");
615 assert((getNumOperandsForOpcode() == -1u ||
616 getNumOperandsForOpcode() == getNumOperands() ||
617 (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) &&
618 "number of operands does not match opcode");
619}
620
621unsigned VPInstruction::getNumOperandsForOpcode() const {
622 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
623 return 1;
624
625 if (Instruction::isBinaryOp(Opcode))
626 return 2;
627
628 switch (Opcode) {
629 case VPInstruction::StepVector:
630 case VPInstruction::VScale:
631 case VPInstruction::IncomingAliasMask:
632 return 0;
633 case Instruction::Alloca:
634 case Instruction::ExtractValue:
635 case Instruction::Freeze:
636 case Instruction::Load:
637 case VPInstruction::BranchOnCond:
638 case VPInstruction::Broadcast:
639 case VPInstruction::ExitingIVValue:
640 case VPInstruction::ExplicitVectorLength:
641 case VPInstruction::ExtractLastLane:
642 case VPInstruction::ExtractLastPart:
643 case VPInstruction::ExtractPenultimateElement:
644 case VPInstruction::MaskedCond:
645 case VPInstruction::Not:
646 case VPInstruction::Reverse:
647 case VPInstruction::Unpack:
648 case VPInstruction::NumActiveLanes:
649 return 1;
650 case Instruction::ICmp:
651 case Instruction::FCmp:
652 case Instruction::ExtractElement:
653 case Instruction::Store:
654 case VPInstruction::BranchOnCount:
655 case VPInstruction::BranchOnTwoConds:
656 case VPInstruction::FirstOrderRecurrenceSplice:
657 case VPInstruction::LogicalAnd:
658 case VPInstruction::LogicalOr:
659 case VPInstruction::PtrAdd:
660 case VPInstruction::WidePtrAdd:
661 case VPInstruction::WideIVStep:
662 case VPInstruction::CalculateTripCountMinusVF:
663 case VPInstruction::ResumeForEpilogue:
664 return 2;
665 case Instruction::InsertElement:
666 case Instruction::Select:
667 case VPInstruction::ActiveLaneMask:
668 case VPInstruction::ReductionStartVector:
669 return 3;
670 case Instruction::Call:
671 return getCalledFnOperandIndex(Operands: ArrayRef<VPValue *>(op_begin(), op_end())) +
672 1;
673 case Instruction::GetElementPtr:
674 case Instruction::PHI:
675 case Instruction::Switch:
676 case VPInstruction::AnyOf:
677 case VPInstruction::BuildStructVector:
678 case VPInstruction::BuildVector:
679 case VPInstruction::CanonicalIVIncrementForPart:
680 case VPInstruction::ComputeReductionResult:
681 case VPInstruction::FirstActiveLane:
682 case VPInstruction::LastActiveLane:
683 case VPInstruction::ExtractLane:
684 case VPInstruction::ExtractLastActive:
685 // Cannot determine the number of operands from the opcode.
686 return -1u;
687 }
688 llvm_unreachable("all cases should be handled above");
689}
690
691bool VPInstruction::doesGeneratePerAllLanes() const {
692 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
693}
694
695bool VPInstruction::canGenerateScalarForFirstLane() const {
696 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
697 return true;
698 if (isSingleScalar() || isVectorToScalar())
699 return true;
700 switch (Opcode) {
701 case Instruction::Freeze:
702 case Instruction::ICmp:
703 case Instruction::PHI:
704 case Instruction::Select:
705 case VPInstruction::BranchOnCond:
706 case VPInstruction::BranchOnTwoConds:
707 case VPInstruction::BranchOnCount:
708 case VPInstruction::CalculateTripCountMinusVF:
709 case VPInstruction::CanonicalIVIncrementForPart:
710 case VPInstruction::PtrAdd:
711 case VPInstruction::ExplicitVectorLength:
712 case VPInstruction::AnyOf:
713 case VPInstruction::Not:
714 return true;
715 default:
716 return false;
717 }
718}
719
720static Instruction::BinaryOps getSubRecurOpcode(RecurKind Kind) {
721 if (Kind == RecurKind::Sub)
722 return Instruction::Add;
723 if (Kind == RecurKind::FSub)
724 return Instruction::FAdd;
725 llvm_unreachable("RecurKind should be Sub/FSub.");
726}
727
728Value *VPInstruction::generate(VPTransformState &State) {
729 IRBuilderBase &Builder = State.Builder;
730
731 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
732 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
733 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
734 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
735 auto *Res =
736 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
737 if (auto *I = dyn_cast<Instruction>(Val: Res))
738 applyFlags(I&: *I);
739 return Res;
740 }
741
742 switch (getOpcode()) {
743 case VPInstruction::Not: {
744 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
745 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
746 return Builder.CreateNot(V: A, Name);
747 }
748 case Instruction::ExtractElement: {
749 assert(State.VF.isVector() && "Only extract elements from vectors");
750 if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1)))
751 return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue()));
752 Value *Vec = State.get(Def: getOperand(N: 0));
753 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
754 return Builder.CreateExtractElement(Vec, Idx, Name);
755 }
756 case Instruction::InsertElement: {
757 assert(State.VF.isVector() && "Can only insert elements into vectors");
758 Value *Vec = State.get(Def: getOperand(N: 0), /*IsScalar=*/false);
759 Value *Elt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
760 Value *Idx = State.get(Def: getOperand(N: 2), /*IsScalar=*/true);
761 return Builder.CreateInsertElement(Vec, NewElt: Elt, Idx, Name);
762 }
763 case Instruction::Freeze: {
764 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
765 return Builder.CreateFreeze(V: Op, Name);
766 }
767 case Instruction::FCmp:
768 case Instruction::ICmp: {
769 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
770 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
771 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
772 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
773 }
774 case Instruction::PHI: {
775 llvm_unreachable("should be handled by VPPhi::execute");
776 }
777 case Instruction::Select: {
778 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
779 Value *Cond =
780 State.get(Def: getOperand(N: 0),
781 IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0)));
782 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
783 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
784 return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlagsOrNone(),
785 Name);
786 }
787 case VPInstruction::ActiveLaneMask: {
788 // Get first lane of vector induction variable.
789 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
790 // Get the original loop tripcount.
791 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
792
793 // If this part of the active lane mask is scalar, generate the CMP directly
794 // to avoid unnecessary extracts.
795 if (State.VF.isScalar())
796 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
797 Name);
798
799 ElementCount EC = State.VF.multiplyCoefficientBy(
800 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
801 auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC);
802 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
803 OverloadTypes: {PredTy, ScalarTC->getType()},
804 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
805 }
806 case VPInstruction::NumActiveLanes: {
807 Value *Op = State.get(Def: getOperand(N: 0));
808 auto *VecTy = cast<VectorType>(Val: Op->getType());
809 assert(VecTy->getScalarSizeInBits() == 1 &&
810 "NumActiveLanes only implemented for i1 vectors");
811
812 Type *Ty = getScalarType();
813 Value *ZExt = Builder.CreateCast(
814 Op: Instruction::ZExt, V: Op, DestTy: VectorType::get(ElementType: Ty, EC: VecTy->getElementCount()));
815 Value *NumActive =
816 Builder.CreateUnaryIntrinsic(ID: Intrinsic::vector_reduce_add, Op: ZExt);
817 return NumActive;
818 }
819 case VPInstruction::FirstOrderRecurrenceSplice: {
820 // Generate code to combine the previous and current values in vector v3.
821 //
822 // vector.ph:
823 // v_init = vector(..., ..., ..., a[-1])
824 // br vector.body
825 //
826 // vector.body
827 // i = phi [0, vector.ph], [i+4, vector.body]
828 // v1 = phi [v_init, vector.ph], [v2, vector.body]
829 // v2 = a[i, i+1, i+2, i+3];
830 // v3 = vector(v1(3), v2(0, 1, 2))
831
832 auto *V1 = State.get(Def: getOperand(N: 0));
833 if (!V1->getType()->isVectorTy())
834 return V1;
835 Value *V2 = State.get(Def: getOperand(N: 1));
836 return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name);
837 }
838 case VPInstruction::CalculateTripCountMinusVF: {
839 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
840 Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
841 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF);
842 Value *Cmp =
843 Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF);
844 Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType());
845 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
846 }
847 case VPInstruction::ExplicitVectorLength: {
848 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
849 // be outside of the main loop.
850 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
851 // Compute EVL
852 assert(AVL->getType()->isIntegerTy() &&
853 "Requested vector length should be an integer.");
854
855 assert(State.VF.isScalable() && "Expected scalable vector factor.");
856 Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue());
857
858 Value *EVL = Builder.CreateIntrinsic(
859 RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
860 Args: {AVL, VFArg, Builder.getTrue()});
861 return EVL;
862 }
863 case VPInstruction::BranchOnCond: {
864 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
865 // Replace the temporary unreachable terminator with a new conditional
866 // branch, hooking it up to backward destination for latch blocks now, and
867 // to forward destination(s) later when they are created.
868 // Second successor may be backwards - iff it is already in VPBB2IRBB.
869 VPBasicBlock *SecondVPSucc =
870 cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]);
871 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
872 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
873 auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
874 // First successor is always forward, reset it to nullptr.
875 Br->setSuccessor(idx: 0, NewSucc: nullptr);
876 IRBB->getTerminator()->eraseFromParent();
877 applyMetadata(I&: *Br);
878 return Br;
879 }
880 case VPInstruction::Broadcast: {
881 return Builder.CreateVectorSplat(
882 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
883 }
884 case VPInstruction::BuildStructVector: {
885 // For struct types, we need to build a new 'wide' struct type, where each
886 // element is widened, i.e., we create a struct of vectors.
887 auto *StructTy = cast<StructType>(Val: getOperand(N: 0)->getScalarType());
888 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
889 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
890 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
891 FieldIndex++) {
892 Value *ScalarValue =
893 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
894 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
895 VectorValue =
896 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
897 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
898 }
899 }
900 return Res;
901 }
902 case VPInstruction::BuildVector: {
903 auto *ScalarTy = getOperand(N: 0)->getScalarType();
904 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
905 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
906 for (const auto &[Idx, Op] : enumerate(First: operands()))
907 Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
908 Idx: Builder.getInt32(C: Idx));
909 return Res;
910 }
911 case VPInstruction::ReductionStartVector: {
912 if (State.VF.isScalar())
913 return State.get(Def: getOperand(N: 0), IsScalar: true);
914 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
915 Builder.setFastMathFlags(getFastMathFlagsOrNone());
916 // If this start vector is scaled then it should produce a vector with fewer
917 // elements than the VF.
918 ElementCount VF = State.VF.divideCoefficientBy(
919 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
920 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
921 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
922 Idx: Builder.getInt32(C: 0));
923 }
924 case VPInstruction::ComputeReductionResult: {
925 RecurKind RK = getRecurKind();
926 bool IsOrdered = isReductionOrdered();
927 bool IsInLoop = isReductionInLoop();
928 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
929 "FindIV should use min/max reduction kinds");
930
931 // The recipe may have multiple operands to be reduced together.
932 unsigned NumOperandsToReduce = getNumOperands();
933 VectorParts RdxParts(NumOperandsToReduce);
934 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
935 RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop);
936
937 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
938 Builder.setFastMathFlags(getFastMathFlagsOrNone());
939
940 // Reduce multiple operands into one.
941 Value *ReducedPartRdx = RdxParts[0];
942 if (IsOrdered) {
943 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
944 } else {
945 // Floating-point operations should have some FMF to enable the reduction.
946 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
947 Value *RdxPart = RdxParts[Part];
948 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
949 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
950 else {
951 // For sub-recurrences, each part's reduction variable is already
952 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
953 Instruction::BinaryOps Opcode =
954 RecurrenceDescriptor::isSubRecurrenceKind(Kind: RK)
955 ? getSubRecurOpcode(Kind: RK)
956 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK);
957 ReducedPartRdx =
958 Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
959 }
960 }
961 }
962
963 // Create the reduction after the loop. Note that inloop reductions create
964 // the target reduction in the loop using a Reduction recipe.
965 if (State.VF.isVector() && !IsInLoop) {
966 // TODO: Support in-order reductions based on the recurrence descriptor.
967 // All ops in the reduction inherit fast-math-flags from the recurrence
968 // descriptor.
969 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
970 }
971
972 return ReducedPartRdx;
973 }
974 case VPInstruction::ExtractLastLane:
975 case VPInstruction::ExtractPenultimateElement: {
976 unsigned Offset =
977 getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
978 Value *Res;
979 if (State.VF.isVector()) {
980 assert(Offset <= State.VF.getKnownMinValue() &&
981 "invalid offset to extract from");
982 // Extract lane VF - Offset from the operand.
983 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
984 } else {
985 // TODO: Remove ExtractLastLane for scalar VFs.
986 assert(Offset <= 1 && "invalid offset to extract from");
987 Res = State.get(Def: getOperand(N: 0));
988 }
989 if (isa<ExtractElementInst>(Val: Res))
990 Res->setName(Name);
991 return Res;
992 }
993 case VPInstruction::LogicalAnd: {
994 Value *A = State.get(Def: getOperand(N: 0));
995 Value *B = State.get(Def: getOperand(N: 1));
996 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
997 }
998 case VPInstruction::LogicalOr: {
999 Value *A = State.get(Def: getOperand(N: 0));
1000 Value *B = State.get(Def: getOperand(N: 1));
1001 return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name);
1002 }
1003 case VPInstruction::PtrAdd: {
1004 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
1005 "can only generate first lane for PtrAdd");
1006 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
1007 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
1008 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
1009 }
1010 case VPInstruction::WidePtrAdd: {
1011 Value *Ptr =
1012 State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0)));
1013 Value *Addend = State.get(Def: getOperand(N: 1));
1014 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
1015 }
1016 case VPInstruction::AnyOf: {
1017 Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0)));
1018 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
1019 Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op)));
1020 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res);
1021 }
1022 case VPInstruction::ExtractLane: {
1023 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
1024 "simplified to ExtractElement.");
1025 Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true);
1026 Type *IdxTy = getOperand(N: 0)->getScalarType();
1027 Value *Res = nullptr;
1028 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
1029
1030 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
1031 Value *VectorStart =
1032 Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1));
1033 Value *VectorIdx = Idx == 1
1034 ? LaneToExtract
1035 : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart);
1036 Value *Ext = State.VF.isScalar()
1037 ? State.get(Def: getOperand(N: Idx))
1038 : Builder.CreateExtractElement(
1039 Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx);
1040 if (Res) {
1041 Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart);
1042 Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res);
1043 } else {
1044 Res = Ext;
1045 }
1046 }
1047 return Res;
1048 }
1049 case VPInstruction::FirstActiveLane: {
1050 Type *Ty = this->getScalarType();
1051 if (getNumOperands() == 1) {
1052 Value *Mask = State.get(Def: getOperand(N: 0));
1053 return Builder.CreateCountTrailingZeroElems(ResTy: Ty, Mask,
1054 /*ZeroIsPoison=*/false, Name);
1055 }
1056 // If there are multiple operands, create a chain of selects to pick the
1057 // first operand with an active lane and add the number of lanes of the
1058 // preceding operands.
1059 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty, VF: State.VF);
1060 unsigned LastOpIdx = getNumOperands() - 1;
1061 Value *Res = nullptr;
1062 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
1063 Value *TrailingZeros =
1064 State.VF.isScalar()
1065 ? Builder.CreateZExt(
1066 V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)),
1067 RHS: Builder.getFalse()),
1068 DestTy: Ty)
1069 : Builder.CreateCountTrailingZeroElems(
1070 ResTy: Ty, Mask: State.get(Def: getOperand(N: Idx)),
1071 /*ZeroIsPoison=*/false, Name);
1072 Value *Current = Builder.CreateAdd(
1073 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty, V: Idx)),
1074 RHS: TrailingZeros);
1075 if (Res) {
1076 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
1077 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
1078 } else {
1079 Res = Current;
1080 }
1081 }
1082
1083 return Res;
1084 }
1085 case VPInstruction::ResumeForEpilogue:
1086 return State.get(Def: getOperand(N: 0), IsScalar: true);
1087 case VPInstruction::Reverse:
1088 return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse");
1089 case VPInstruction::ExtractLastActive: {
1090 Value *Result = State.get(Def: getOperand(N: 0), /*IsScalar=*/true);
1091 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
1092 Value *Data = State.get(Def: getOperand(N: Idx));
1093 Value *Mask = State.get(Def: getOperand(N: Idx + 1));
1094 Type *VTy = Data->getType();
1095
1096 if (State.VF.isScalar())
1097 Result = Builder.CreateSelect(C: Mask, True: Data, False: Result);
1098 else
1099 Result = Builder.CreateIntrinsic(
1100 ID: Intrinsic::experimental_vector_extract_last_active, OverloadTypes: {VTy},
1101 Args: {Data, Mask, Result});
1102 }
1103
1104 return Result;
1105 }
1106 default:
1107 llvm_unreachable("Unsupported opcode for instruction");
1108 }
1109}
1110
1111InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
1112 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
1113 Type *ScalarTy = this->getScalarType();
1114 Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy;
1115 switch (Opcode) {
1116 case Instruction::FNeg:
1117 return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind);
1118 case Instruction::UDiv:
1119 case Instruction::SDiv:
1120 case Instruction::SRem:
1121 case Instruction::URem:
1122 case Instruction::Add:
1123 case Instruction::FAdd:
1124 case Instruction::Sub:
1125 case Instruction::FSub:
1126 case Instruction::Mul:
1127 case Instruction::FMul:
1128 case Instruction::FDiv:
1129 case Instruction::FRem:
1130 case Instruction::Shl:
1131 case Instruction::LShr:
1132 case Instruction::AShr:
1133 case Instruction::And:
1134 case Instruction::Or:
1135 case Instruction::Xor: {
1136 // Certain instructions can be cheaper if they have a constant second
1137 // operand. One example of this are shifts on x86.
1138 VPValue *RHS = getOperand(N: 1);
1139 TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS);
1140
1141 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1142 getOperand(N: 1)->isDefinedOutsideLoopRegions())
1143 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1144
1145 Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
1146 SmallVector<const Value *, 4> Operands;
1147 if (CtxI)
1148 Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
1149 return Ctx.TTI.getArithmeticInstrCost(
1150 Opcode, Ty: ResultTy, CostKind: Ctx.CostKind,
1151 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1152 Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
1153 }
1154 case Instruction::Freeze:
1155 // NOTE: The only way to ask for the cost is via getInstructionCost, which
1156 // requires the actual vector instruction. Instead, both here and in the
1157 // LoopVectorizationCostModel::getInstructionCost the costs mirror the
1158 // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep
1159 // them in sync.
1160 return TTI::TCC_Free;
1161 case Instruction::ExtractValue:
1162 return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
1163 CostKind: Ctx.CostKind);
1164 case Instruction::ICmp:
1165 case Instruction::FCmp: {
1166 Type *ScalarOpTy = getOperand(N: 0)->getScalarType();
1167 Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy;
1168 Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
1169 return Ctx.TTI.getCmpSelInstrCost(
1170 Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(),
1171 CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
1172 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
1173 }
1174 case Instruction::BitCast: {
1175 Type *ScalarTy = this->getScalarType();
1176 if (ScalarTy->isPointerTy())
1177 return 0;
1178 [[fallthrough]];
1179 }
1180 case Instruction::SExt:
1181 case Instruction::ZExt:
1182 case Instruction::FPToUI:
1183 case Instruction::FPToSI:
1184 case Instruction::FPExt:
1185 case Instruction::PtrToInt:
1186 case Instruction::PtrToAddr:
1187 case Instruction::IntToPtr:
1188 case Instruction::SIToFP:
1189 case Instruction::UIToFP:
1190 case Instruction::Trunc:
1191 case Instruction::FPTrunc:
1192 case Instruction::AddrSpaceCast: {
1193 // Computes the CastContextHint from a recipe that may access memory.
1194 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
1195 if (isa<VPInterleaveBase>(Val: R))
1196 return TTI::CastContextHint::Interleave;
1197 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) {
1198 // Only compute CCH for memory operations, matching the legacy model
1199 // which only considers loads/stores for cast context hints.
1200 auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue());
1201 if (!isa<LoadInst, StoreInst>(Val: UI))
1202 return TTI::CastContextHint::None;
1203 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1204 : TTI::CastContextHint::Normal;
1205 }
1206 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
1207 if (WidenMemoryRecipe == nullptr)
1208 return TTI::CastContextHint::None;
1209 if (VF.isScalar())
1210 return TTI::CastContextHint::Normal;
1211 if (!WidenMemoryRecipe->isConsecutive())
1212 return TTI::CastContextHint::GatherScatter;
1213 if (WidenMemoryRecipe->isMasked())
1214 return TTI::CastContextHint::Masked;
1215 return TTI::CastContextHint::Normal;
1216 };
1217
1218 VPValue *Operand = getOperand(N: 0);
1219 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1220 bool IsReverse = false;
1221 // For Trunc/FPTrunc, get the context from the only user.
1222 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1223 if (auto *Recipe = cast_or_null<VPRecipeBase>(Val: getSingleUser())) {
1224 if (match(V: Recipe,
1225 P: m_CombineOr(
1226 Ps: m_Reverse(Op0: m_VPValue()),
1227 Ps: m_Intrinsic<Intrinsic::experimental_vp_reverse>()))) {
1228 IsReverse = true;
1229 Recipe = cast_or_null<VPRecipeBase>(
1230 Val: Recipe->getVPSingleValue()->getSingleUser());
1231 }
1232 if (Recipe)
1233 CCH = ComputeCCH(Recipe);
1234 }
1235 }
1236 // For Z/Sext, get the context from the operand.
1237 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1238 Opcode == Instruction::FPExt) {
1239 if (auto *Recipe = Operand->getDefiningRecipe()) {
1240 VPValue *ReverseOp;
1241 if (match(V: Recipe,
1242 P: m_CombineOr(Ps: m_Reverse(Op0: m_VPValue(V&: ReverseOp)),
1243 Ps: m_Intrinsic<Intrinsic::experimental_vp_reverse>(
1244 Op0: m_VPValue(V&: ReverseOp))))) {
1245 Recipe = ReverseOp->getDefiningRecipe();
1246 IsReverse = true;
1247 }
1248 if (Recipe)
1249 CCH = ComputeCCH(Recipe);
1250 }
1251 }
1252 if (IsReverse && CCH != TTI::CastContextHint::None)
1253 CCH = TTI::CastContextHint::Reversed;
1254
1255 auto *ScalarSrcTy = Operand->getScalarType();
1256 Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy;
1257 // Arm TTI will use the underlying instruction to determine the cost.
1258 return Ctx.TTI.getCastInstrCost(
1259 Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
1260 I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
1261 }
1262 case Instruction::Select: {
1263 SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue());
1264 bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
1265 Type *ScalarTy = this->getScalarType();
1266
1267 VPValue *Op0, *Op1;
1268 bool IsLogicalAnd =
1269 match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
1270 bool IsLogicalOr =
1271 match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
1272 // Also match the inverted forms:
1273 // select x, false, y --> !x & y (still AND)
1274 // select x, y, true --> !x | y (still OR)
1275 IsLogicalAnd |=
1276 match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1)));
1277 IsLogicalOr |=
1278 match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True()));
1279
1280 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1281 (IsLogicalAnd || IsLogicalOr)) {
1282 // select x, y, false --> x & y
1283 // select x, true, y --> x | y
1284 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
1285 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);
1286
1287 SmallVector<const Value *, 2> Operands;
1288 if (SI && all_of(Range: operands(),
1289 P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1290 append_range(C&: Operands, R: SI->operands());
1291 return Ctx.TTI.getArithmeticInstrCost(
1292 Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy,
1293 CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
1294 }
1295
1296 Type *CondTy = getOperand(N: 0)->getScalarType();
1297 if (!IsScalarCond && VF.isVector())
1298 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
1299
1300 llvm::CmpPredicate Pred;
1301 if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())))
1302 if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0)))
1303 if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue()))
1304 Pred = Cmp->getPredicate();
1305 Type *VectorTy = toVectorTy(Scalar: this->getScalarType(), EC: VF);
1306 return Ctx.TTI.getCmpSelInstrCost(
1307 Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
1308 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
1309 }
1310 }
1311 llvm_unreachable("called for unsupported opcode");
1312}
1313
1314InstructionCost VPInstruction::computeCost(ElementCount VF,
1315 VPCostContext &Ctx) const {
1316 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
1317 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1318 // TODO: Compute cost for VPInstructions without underlying values once
1319 // the legacy cost model has been retired.
1320 return 0;
1321 }
1322
1323 assert(!doesGeneratePerAllLanes() &&
1324 "Should only generate a vector value or single scalar, not scalars "
1325 "for all lanes.");
1326 return getCostForRecipeWithOpcode(
1327 Opcode: getOpcode(),
1328 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1329 }
1330
1331 switch (getOpcode()) {
1332 case Instruction::Select: {
1333 llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1334 match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()));
1335 auto *CondTy = getOperand(N: 0)->getScalarType();
1336 auto *VecTy = getOperand(N: 1)->getScalarType();
1337 if (!vputils::onlyFirstLaneUsed(Def: this)) {
1338 CondTy = toVectorTy(Scalar: CondTy, EC: VF);
1339 VecTy = toVectorTy(Scalar: VecTy, EC: VF);
1340 }
1341 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred,
1342 CostKind: Ctx.CostKind);
1343 }
1344 case Instruction::ExtractElement:
1345 case VPInstruction::ExtractLane: {
1346 if (VF.isScalar()) {
1347 // ExtractLane with VF=1 takes care of handling extracting across multiple
1348 // parts.
1349 return 0;
1350 }
1351
1352 // Add on the cost of extracting the element.
1353 auto *VecTy = toVectorTy(Scalar: getOperand(N: 0)->getScalarType(), EC: VF);
1354 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
1355 CostKind: Ctx.CostKind);
1356 }
1357 case VPInstruction::AnyOf: {
1358 auto *VecTy = toVectorTy(Scalar: this->getScalarType(), EC: VF);
1359 return Ctx.TTI.getArithmeticReductionCost(
1360 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
1361 }
1362 case VPInstruction::FirstActiveLane: {
1363 Type *Ty = this->getScalarType();
1364 Type *ScalarTy = getOperand(N: 0)->getScalarType();
1365 if (VF.isScalar())
1366 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1367 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1368 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1369 // Calculate the cost of determining the lane index.
1370 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1371 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1372 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1373 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1374 }
1375 case VPInstruction::LastActiveLane: {
1376 Type *Ty = this->getScalarType();
1377 Type *ScalarTy = getOperand(N: 0)->getScalarType();
1378 if (VF.isScalar())
1379 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1380 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1381 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1382 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1383 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1384 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1385 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1386 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1387 // Add cost of NOT operation on the predicate.
1388 Cost += Ctx.TTI.getArithmeticInstrCost(
1389 Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind,
1390 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1391 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
1392 .Properties: TargetTransformInfo::OP_None});
1393 // Add cost of SUB operation on the index.
1394 Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind: Ctx.CostKind);
1395 return Cost;
1396 }
1397 case VPInstruction::ExtractLastActive: {
1398 Type *ScalarTy = this->getScalarType();
1399 Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1400 Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1401 IntrinsicCostAttributes ICA(
1402 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1403 {VecTy, MaskTy, ScalarTy});
1404 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind);
1405 }
1406 case VPInstruction::FirstOrderRecurrenceSplice: {
1407 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1408 Type *VectorTy = toVectorTy(Scalar: this->getScalarType(), EC: VF);
1409 return Ctx.TTI.getShuffleCost(
1410 Kind: TargetTransformInfo::SK_Splice, DstTy: cast<VectorType>(Val: VectorTy),
1411 SrcTy: cast<VectorType>(Val: VectorTy), Mask: {}, CostKind: Ctx.CostKind, Index: -1);
1412 }
1413 case VPInstruction::ActiveLaneMask: {
1414 Type *ArgTy = getOperand(N: 0)->getScalarType();
1415 unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue();
1416 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier);
1417 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1418 {ArgTy, ArgTy});
1419 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1420 }
1421 case VPInstruction::ExplicitVectorLength: {
1422 Type *Arg0Ty = getOperand(N: 0)->getScalarType();
1423 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
1424 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
1425 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1426 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1427 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1428 }
1429 case VPInstruction::Reverse: {
1430 assert(VF.isVector() && "Reverse operation must be vector type");
1431 Type *EltTy = this->getScalarType();
1432 // Skip the reverse operation cost for the mask.
1433 // FIXME: Remove this once redundant mask reverse operations can be
1434 // eliminated by VPlanTransforms::cse before cost computation.
1435 if (EltTy->isIntegerTy(BitWidth: 1))
1436 return 0;
1437 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: EltTy, EC: VF));
1438 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
1439 SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind,
1440 /*Index=*/0);
1441 }
1442 case VPInstruction::ExtractLastLane: {
1443 // Add on the cost of extracting the element.
1444 auto *VecTy = toVectorTy(Scalar: getOperand(N: 0)->getScalarType(), EC: VF);
1445 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
1446 Val: VecTy, CostKind: Ctx.CostKind, Index: 0);
1447 }
1448 case VPInstruction::Not: {
1449 Type *ValTy = this->getScalarType();
1450 // InstCombine will fold `xor` to the conditional branch.
1451 if (auto *U = const_cast<VPUser *>(getSingleUser()))
1452 if (match(U, P: m_BranchOnCond(Op0: m_VPValue())))
1453 return 0;
1454 if (!vputils::onlyFirstLaneUsed(Def: this))
1455 ValTy = toVectorTy(Scalar: ValTy, EC: VF);
1456 return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Xor, Ty: ValTy,
1457 CostKind: Ctx.CostKind);
1458 }
1459 case VPInstruction::BranchOnCount: {
1460 // If TC <= VF then this is just a branch.
1461 // FIXME: Removing the branch happens in simplifyBranchConditionForVFAndUF
1462 // where it checks TC <= VF * UF, but we don't know UF yet. This means in
1463 // some cases we get a cost that's too high due to counting a cmp that
1464 // later gets removed.
1465 // FIXME: The compare could also be removed if TC = M * vscale,
1466 // VF = N * vscale, and M <= N. Detecting that would require having the
1467 // trip count as a SCEV though.
1468 Value *TC = getParent()->getPlan()->getTripCount()->getUnderlyingValue();
1469 ConstantInt *TCConst = dyn_cast_if_present<ConstantInt>(Val: TC);
1470 if (TCConst && TCConst->getValue().ule(RHS: VF.getKnownMinValue()))
1471 return 0;
1472 // Otherwise BranchOnCount generates ICmpEQ followed by a branch.
1473 Type *ValTy = getOperand(N: 0)->getScalarType();
1474 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy,
1475 CondTy: CmpInst::makeCmpResultType(opnd_type: ValTy),
1476 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1477 }
1478 case Instruction::FCmp:
1479 case Instruction::ICmp:
1480 return getCostForRecipeWithOpcode(
1481 Opcode: getOpcode(),
1482 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1483 case VPInstruction::ExtractPenultimateElement:
1484 if (VF == ElementCount::getScalable(MinVal: 1))
1485 return InstructionCost::getInvalid();
1486 [[fallthrough]];
1487 default:
1488 // TODO: Compute cost other VPInstructions once the legacy cost model has
1489 // been retired.
1490 assert(!getUnderlyingValue() &&
1491 "unexpected VPInstruction witht underlying value");
1492 return 0;
1493 }
1494}
1495
1496bool VPInstruction::isVectorToScalar() const {
1497 return getOpcode() == VPInstruction::ExtractLastLane ||
1498 getOpcode() == VPInstruction::ExtractPenultimateElement ||
1499 getOpcode() == Instruction::ExtractElement ||
1500 getOpcode() == VPInstruction::ExtractLane ||
1501 getOpcode() == VPInstruction::FirstActiveLane ||
1502 getOpcode() == VPInstruction::LastActiveLane ||
1503 getOpcode() == VPInstruction::ExtractLastActive ||
1504 getOpcode() == VPInstruction::ComputeReductionResult ||
1505 getOpcode() == VPInstruction::AnyOf ||
1506 getOpcode() == VPInstruction::NumActiveLanes;
1507}
1508
1509bool VPInstruction::isSingleScalar() const {
1510 switch (getOpcode()) {
1511 case Instruction::Load:
1512 case Instruction::PHI:
1513 case VPInstruction::ExplicitVectorLength:
1514 case VPInstruction::ResumeForEpilogue:
1515 case VPInstruction::VScale:
1516 return true;
1517 default:
1518 return Instruction::isCast(Opcode: getOpcode());
1519 }
1520}
1521
1522void VPInstruction::addOperand(VPValue *Op) {
1523#ifndef NDEBUG
1524 Type *Ty = Op->getScalarType();
1525 switch (getOpcode()) {
1526 case VPInstruction::AnyOf:
1527 case VPInstruction::FirstActiveLane:
1528 case VPInstruction::LastActiveLane:
1529 assert(Ty == getOperand(0)->getScalarType() &&
1530 "types of operand 0 and new operand must match");
1531 break;
1532 case VPInstruction::ComputeReductionResult:
1533 case VPInstruction::BuildVector:
1534 case VPInstruction::BuildStructVector:
1535 assert(Ty == getOperand(0)->getScalarType() &&
1536 "appended operand must match operand 0's scalar type");
1537 break;
1538 case VPInstruction::ExtractLane:
1539 assert(Ty == getOperand(1)->getScalarType() &&
1540 "appended operand must match operand 1's scalar type");
1541 break;
1542 case VPInstruction::ExtractLastActive: {
1543 // The recipe is constructed with 3 operands (result, data, mask). Extra
1544 // operands beyond that are appended in (data, mask) pairs.
1545 constexpr unsigned NumInitialOperands = 3;
1546 assert(getNumOperands() >= NumInitialOperands &&
1547 "ExtractLastActive must have at least the initial 3 operands");
1548 bool IsMaskSlot = ((getNumOperands() - NumInitialOperands) & 1u) == 1u;
1549 assert((IsMaskSlot ? Ty->isIntegerTy(1)
1550 : Ty == getOperand(1)->getScalarType()) &&
1551 "ExtractLastActive expects alternating data/mask operands "
1552 "matching operand 1's type and i1, respectively");
1553 break;
1554 }
1555 default:
1556 llvm_unreachable("opcode does not support growing the operand list "
1557 "outside of construction");
1558 }
1559#endif
1560 VPUser::addOperand(Operand: Op);
1561}
1562
1563void VPInstruction::execute(VPTransformState &State) {
1564 assert(!isMasked() && "cannot execute masked VPInstruction");
1565 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1566 assert(flagsValidForOpcode(getOpcode()) &&
1567 "Set flags not supported for the provided opcode");
1568 assert(hasRequiredFlagsForOpcode(getOpcode()) &&
1569 "Opcode requires specific flags to be set");
1570 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
1571 Value *GeneratedValue = generate(State);
1572 if (!hasResult())
1573 return;
1574 assert(GeneratedValue && "generate must produce a value");
1575 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1576 (vputils::onlyFirstLaneUsed(Def: this) ||
1577 isVectorToScalar() || isSingleScalar());
1578 assert((((GeneratedValue->getType()->isVectorTy() ||
1579 GeneratedValue->getType()->isStructTy()) ==
1580 !GeneratesPerFirstLaneOnly) ||
1581 State.VF.isScalar()) &&
1582 "scalar value but not only first lane defined");
1583 State.set(Def: this, V: GeneratedValue,
1584 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1585 if (getOpcode() == VPInstruction::ResumeForEpilogue ||
1586 getOpcode() == Instruction::Freeze) {
1587 // FIXME: This is a workaround to enable reliable updates of the scalar loop
1588 // resume phis, and to let epilogue vectorization recover the frozen
1589 // reduction start from the main plan. Must be removed once epilogue
1590 // vectorization explicitly connects VPlans.
1591 setUnderlyingValue(GeneratedValue);
1592 }
1593}
1594
1595bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
1596 if (Instruction::isBinaryOp(Opcode: getOpcode()) ||
1597 Instruction::isUnaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1598 return false;
1599 switch (getOpcode()) {
1600 case Instruction::ExtractValue:
1601 case Instruction::InsertValue:
1602 case Instruction::GetElementPtr:
1603 case Instruction::ExtractElement:
1604 case Instruction::InsertElement:
1605 case Instruction::Freeze:
1606 case Instruction::FCmp:
1607 case Instruction::ICmp:
1608 case Instruction::Select:
1609 case Instruction::PHI:
1610 case VPInstruction::AnyOf:
1611 case VPInstruction::BranchOnCond:
1612 case VPInstruction::BranchOnTwoConds:
1613 case VPInstruction::BranchOnCount:
1614 case VPInstruction::Broadcast:
1615 case VPInstruction::BuildStructVector:
1616 case VPInstruction::BuildVector:
1617 case VPInstruction::CalculateTripCountMinusVF:
1618 case VPInstruction::CanonicalIVIncrementForPart:
1619 case VPInstruction::ExtractLane:
1620 case VPInstruction::ExtractLastLane:
1621 case VPInstruction::ExtractLastPart:
1622 case VPInstruction::ExtractPenultimateElement:
1623 case VPInstruction::ActiveLaneMask:
1624 case VPInstruction::IncomingAliasMask:
1625 case VPInstruction::ExitingIVValue:
1626 case VPInstruction::ExplicitVectorLength:
1627 case VPInstruction::FirstActiveLane:
1628 case VPInstruction::LastActiveLane:
1629 case VPInstruction::ExtractLastActive:
1630 case VPInstruction::FirstOrderRecurrenceSplice:
1631 case VPInstruction::LogicalAnd:
1632 case VPInstruction::LogicalOr:
1633 case VPInstruction::MaskedCond:
1634 case VPInstruction::Not:
1635 case VPInstruction::PtrAdd:
1636 case VPInstruction::WideIVStep:
1637 case VPInstruction::WidePtrAdd:
1638 case VPInstruction::StepVector:
1639 case VPInstruction::ReductionStartVector:
1640 case VPInstruction::Reverse:
1641 case VPInstruction::VScale:
1642 case VPInstruction::Unpack:
1643 return false;
1644 case Instruction::Call:
1645 return !getCalledFunction(Operands: ArrayRef<VPValue *>(op_begin(), op_end()))
1646 ->doesNotAccessMemory();
1647 default:
1648 return true;
1649 }
1650}
1651
1652bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
1653 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1654 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1655 return vputils::onlyFirstLaneUsed(Def: this);
1656
1657 switch (getOpcode()) {
1658 default:
1659 return false;
1660 case Instruction::ExtractElement:
1661 return Op == getOperand(N: 1);
1662 case Instruction::InsertElement:
1663 return Op == getOperand(N: 1) || Op == getOperand(N: 2);
1664 case Instruction::PHI:
1665 return true;
1666 case Instruction::FCmp:
1667 case Instruction::ICmp:
1668 case Instruction::Select:
1669 case Instruction::Or:
1670 case Instruction::Freeze:
1671 case VPInstruction::Not:
1672 // TODO: Cover additional opcodes.
1673 return vputils::onlyFirstLaneUsed(Def: this);
1674 case Instruction::Load:
1675 case VPInstruction::ActiveLaneMask:
1676 case VPInstruction::ExplicitVectorLength:
1677 case VPInstruction::CalculateTripCountMinusVF:
1678 case VPInstruction::CanonicalIVIncrementForPart:
1679 case VPInstruction::BranchOnCount:
1680 case VPInstruction::BranchOnCond:
1681 case VPInstruction::BranchOnTwoConds:
1682 case VPInstruction::Broadcast:
1683 case VPInstruction::ReductionStartVector:
1684 case VPInstruction::ResumeForEpilogue:
1685 return true;
1686 case VPInstruction::BuildStructVector:
1687 case VPInstruction::BuildVector:
1688 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1689 // operand, after replicating its operands only the first lane is used.
1690 // Before replicating, it will have only a single operand.
1691 return getNumOperands() > 1;
1692 case VPInstruction::PtrAdd:
1693 return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
1694 case VPInstruction::WidePtrAdd:
1695 // WidePtrAdd supports scalar and vector base addresses.
1696 return false;
1697 case VPInstruction::ExitingIVValue:
1698 case VPInstruction::ExtractLane:
1699 return Op == getOperand(N: 0);
1700 };
1701 llvm_unreachable("switch should return");
1702}
1703
1704bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
1705 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1706 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1707 return vputils::onlyFirstPartUsed(Def: this);
1708
1709 switch (getOpcode()) {
1710 default:
1711 return false;
1712 case Instruction::FCmp:
1713 case Instruction::ICmp:
1714 case Instruction::Select:
1715 return vputils::onlyFirstPartUsed(Def: this);
1716 case VPInstruction::BranchOnCount:
1717 case VPInstruction::BranchOnCond:
1718 case VPInstruction::BranchOnTwoConds:
1719 case VPInstruction::CanonicalIVIncrementForPart:
1720 return true;
1721 };
1722 llvm_unreachable("switch should return");
1723}
1724
1725#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1726void VPInstruction::dump() const {
1727 VPSlotTracker SlotTracker(getParent()->getPlan());
1728 printRecipe(dbgs(), "", SlotTracker);
1729}
1730
1731void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
1732 VPSlotTracker &SlotTracker) const {
1733 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1734
1735 if (hasResult()) {
1736 printAsOperand(O, SlotTracker);
1737 O << " = ";
1738 }
1739
1740 switch (getOpcode()) {
1741 case VPInstruction::Not:
1742 O << "not";
1743 break;
1744 case VPInstruction::ActiveLaneMask:
1745 O << "active lane mask";
1746 break;
1747 case VPInstruction::IncomingAliasMask:
1748 O << "incoming-alias-mask";
1749 break;
1750 case VPInstruction::ExplicitVectorLength:
1751 O << "EXPLICIT-VECTOR-LENGTH";
1752 break;
1753 case VPInstruction::FirstOrderRecurrenceSplice:
1754 O << "first-order splice";
1755 break;
1756 case VPInstruction::BranchOnCond:
1757 O << "branch-on-cond";
1758 break;
1759 case VPInstruction::BranchOnTwoConds:
1760 O << "branch-on-two-conds";
1761 break;
1762 case VPInstruction::CalculateTripCountMinusVF:
1763 O << "TC > VF ? TC - VF : 0";
1764 break;
1765 case VPInstruction::CanonicalIVIncrementForPart:
1766 O << "VF * Part +";
1767 break;
1768 case VPInstruction::BranchOnCount:
1769 O << "branch-on-count";
1770 break;
1771 case VPInstruction::Broadcast:
1772 O << "broadcast";
1773 break;
1774 case VPInstruction::BuildStructVector:
1775 O << "buildstructvector";
1776 break;
1777 case VPInstruction::BuildVector:
1778 O << "buildvector";
1779 break;
1780 case VPInstruction::ExitingIVValue:
1781 O << "exiting-iv-value";
1782 break;
1783 case VPInstruction::MaskedCond:
1784 O << "masked-cond";
1785 break;
1786 case VPInstruction::ExtractLane:
1787 O << "extract-lane";
1788 break;
1789 case VPInstruction::ExtractLastLane:
1790 O << "extract-last-lane";
1791 break;
1792 case VPInstruction::ExtractLastPart:
1793 O << "extract-last-part";
1794 break;
1795 case VPInstruction::ExtractPenultimateElement:
1796 O << "extract-penultimate-element";
1797 break;
1798 case VPInstruction::ComputeReductionResult:
1799 O << "compute-reduction-result";
1800 break;
1801 case VPInstruction::LogicalAnd:
1802 O << "logical-and";
1803 break;
1804 case VPInstruction::LogicalOr:
1805 O << "logical-or";
1806 break;
1807 case VPInstruction::PtrAdd:
1808 O << "ptradd";
1809 break;
1810 case VPInstruction::WidePtrAdd:
1811 O << "wide-ptradd";
1812 break;
1813 case VPInstruction::AnyOf:
1814 O << "any-of";
1815 break;
1816 case VPInstruction::FirstActiveLane:
1817 O << "first-active-lane";
1818 break;
1819 case VPInstruction::LastActiveLane:
1820 O << "last-active-lane";
1821 break;
1822 case VPInstruction::ReductionStartVector:
1823 O << "reduction-start-vector";
1824 break;
1825 case VPInstruction::ResumeForEpilogue:
1826 O << "resume-for-epilogue";
1827 break;
1828 case VPInstruction::Reverse:
1829 O << "reverse";
1830 break;
1831 case VPInstruction::Unpack:
1832 O << "unpack";
1833 break;
1834 case VPInstruction::ExtractLastActive:
1835 O << "extract-last-active";
1836 break;
1837 case VPInstruction::NumActiveLanes:
1838 O << "num-active-lanes";
1839 break;
1840 default:
1841 O << Instruction::getOpcodeName(getOpcode());
1842 }
1843
1844 printFlags(O);
1845 printOperands(O, SlotTracker);
1846}
1847#endif
1848
1849void VPInstructionWithType::execute(VPTransformState &State) {
1850 Type *ResultTy = getResultType();
1851 if (Instruction::isCast(Opcode: getOpcode())) {
1852 Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
1853 Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
1854 V: Op, DestTy: ResultTy);
1855 if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
1856 applyFlags(I&: *CastOp);
1857 applyMetadata(I&: *CastOp);
1858 }
1859 State.set(Def: this, V: Cast, Lane: VPLane(0));
1860 return;
1861 }
1862 switch (getOpcode()) {
1863 case VPInstruction::StepVector: {
1864 Value *StepVector =
1865 State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
1866 State.set(Def: this, V: StepVector);
1867 break;
1868 }
1869 case VPInstruction::VScale: {
1870 Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
1871 State.set(Def: this, V: VScale, IsScalar: true);
1872 break;
1873 }
1874
1875 default:
1876 llvm_unreachable("opcode not implemented yet");
1877 }
1878}
1879
1880InstructionCost VPInstructionWithType::computeCost(ElementCount VF,
1881 VPCostContext &Ctx) const {
1882 // NOTE: At the moment it seems only possible to expose this path for
1883 // the trunc, zext and sext opcodes. However, isScalarCast also covers
1884 // int<>fp conversions, bitcasts, ptr<>int conversions, etc.
1885 if (Instruction::isCast(Opcode: getOpcode()))
1886 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
1887 Ctx);
1888
1889 switch (getOpcode()) {
1890 case VPInstruction::VScale: {
1891 Type *Ty = this->getScalarType();
1892 ArrayRef<Type *> Tys;
1893 IntrinsicCostAttributes Attrs(Intrinsic::vscale, Ty, Tys);
1894 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1895 }
1896 case VPInstruction::StepVector:
1897 // TODO: This isn't quite right since even if the step-vector is hoisted
1898 // out of the loop it has a non-zero cost in the middle block, etc.
1899 // Once the stepvector is correctly hoisted out of the vector loop by the
1900 // licm transform we can add the cost here so that it doesn't incorrectly
1901 // affect the choice of VF.
1902 return 0;
1903 default:
1904 // Although VPInstructionWithType is also used for
1905 // VPInstruction::WideIVStep it isn't currently possible to expose cases
1906 // where the cost is queried.
1907 llvm_unreachable("Unhandled opcode");
1908 }
1909 return 0;
1910}
1911
1912#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1913void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
1914 VPSlotTracker &SlotTracker) const {
1915 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1916 printAsOperand(O, SlotTracker);
1917 O << " = ";
1918
1919 Type *ResultTy = getResultType();
1920 switch (getOpcode()) {
1921 case VPInstruction::WideIVStep:
1922 O << "wide-iv-step ";
1923 printOperands(O, SlotTracker);
1924 break;
1925 case VPInstruction::StepVector:
1926 O << "step-vector " << *ResultTy;
1927 break;
1928 case VPInstruction::VScale:
1929 O << "vscale " << *ResultTy;
1930 break;
1931 case Instruction::Load:
1932 O << "load ";
1933 printOperands(O, SlotTracker);
1934 break;
1935 default:
1936 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1937 O << Instruction::getOpcodeName(getOpcode());
1938 printFlags(O);
1939 printOperands(O, SlotTracker);
1940 O << " to " << *ResultTy;
1941 }
1942}
1943#endif
1944
1945/// Shared execute logic for VPPhi and VPWidenPHIRecipe. Creates a PHI node,
1946/// adds incoming values, and stores the result in State. For header phis, only
1947/// the preheader incoming value is added; the backedge is fixed up later by
1948/// VPlan::execute().
1949static void executePhiRecipe(VPSingleDefRecipe *R, VPPhiAccessors &Phi,
1950 VPTransformState &State, bool IsScalar,
1951 const Twine &Name) {
1952 unsigned NumIncoming = VPBlockUtils::isHeader(VPB: R->getParent(), VPDT: State.VPDT)
1953 ? 1
1954 : Phi.getNumIncoming();
1955 Value *FirstInc = State.get(Def: Phi.getIncomingValue(Idx: 0), IsScalar);
1956 PHINode *NewPhi = State.Builder.CreatePHI(Ty: FirstInc->getType(), NumReservedValues: 2, Name);
1957 NewPhi->addIncoming(V: FirstInc,
1958 BB: State.CFG.VPBB2IRBB.at(Val: Phi.getIncomingBlock(Idx: 0)));
1959 for (unsigned Idx = 1; Idx != NumIncoming; ++Idx)
1960 NewPhi->addIncoming(V: State.get(Def: Phi.getIncomingValue(Idx), IsScalar),
1961 BB: State.CFG.VPBB2IRBB.at(Val: Phi.getIncomingBlock(Idx)));
1962 State.set(Def: R, V: NewPhi, IsScalar);
1963}
1964
1965void VPPhi::execute(VPTransformState &State) {
1966 executePhiRecipe(R: this, Phi&: *this, State, /*IsScalar=*/true, Name: getName());
1967}
1968
1969#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1970void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1971 VPSlotTracker &SlotTracker) const {
1972 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1973 printAsOperand(O, SlotTracker);
1974 O << " = phi";
1975 printFlags(O);
1976 printPhiOperands(O, SlotTracker);
1977}
1978#endif
1979
1980VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1981 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1982 return new VPIRPhi(*Phi);
1983 return new VPIRInstruction(I);
1984}
1985
1986void VPIRInstruction::execute(VPTransformState &State) {
1987 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1988 "PHINodes must be handled by VPIRPhi");
1989 // Advance the insert point after the wrapped IR instruction. This allows
1990 // interleaving VPIRInstructions and other recipes.
1991 State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
1992}
1993
1994InstructionCost VPIRInstruction::computeCost(ElementCount VF,
1995 VPCostContext &Ctx) const {
1996 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1997 // hence it does not contribute to the cost-modeling for the VPlan.
1998 return 0;
1999}
2000
2001#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2002void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
2003 VPSlotTracker &SlotTracker) const {
2004 O << Indent << "IR " << I;
2005}
2006#endif
2007
2008void VPIRPhi::execute(VPTransformState &State) {
2009 PHINode *Phi = &getIRPhi();
2010 for (const auto &[Idx, Op] : enumerate(First: operands())) {
2011 VPValue *ExitValue = Op;
2012 auto Lane = vputils::isSingleScalar(VPV: ExitValue)
2013 ? VPLane::getFirstLane()
2014 : VPLane::getLastLaneForVF(VF: State.VF);
2015 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
2016 auto *PredVPBB = Pred->getExitingBasicBlock();
2017 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
2018 // Set insertion point in PredBB in case an extract needs to be generated.
2019 // TODO: Model extracts explicitly.
2020 State.Builder.SetInsertPoint(PredBB->getTerminator());
2021 Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
2022 // If there is no existing block for PredBB in the phi, add a new incoming
2023 // value. Otherwise update the existing incoming value for PredBB.
2024 if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
2025 Phi->addIncoming(V, BB: PredBB);
2026 else
2027 Phi->setIncomingValueForBlock(BB: PredBB, V);
2028 }
2029
2030 // Advance the insert point after the wrapped IR instruction. This allows
2031 // interleaving VPIRInstructions and other recipes.
2032 State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
2033}
2034
2035void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
2036 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
2037 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
2038 "Number of phi operands must match number of predecessors");
2039 unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
2040 R->removeOperand(Idx: Position);
2041}
2042
2043VPValue *
2044VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
2045 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
2046 return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB));
2047}
2048
2049void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
2050 VPValue *V) const {
2051 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
2052 R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V);
2053}
2054
2055#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2056void VPPhiAccessors::printPhiOperands(raw_ostream &O,
2057 VPSlotTracker &SlotTracker) const {
2058 interleaveComma(enumerate(getAsRecipe()->operands()), O,
2059 [this, &O, &SlotTracker](auto Op) {
2060 O << "[ ";
2061 Op.value()->printAsOperand(O, SlotTracker);
2062 O << ", ";
2063 getIncomingBlock(Op.index())->printAsOperand(O);
2064 O << " ]";
2065 });
2066}
2067#endif
2068
2069#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2070void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
2071 VPSlotTracker &SlotTracker) const {
2072 VPIRInstruction::printRecipe(O, Indent, SlotTracker);
2073
2074 if (getNumOperands() != 0) {
2075 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
2076 interleaveComma(incoming_values_and_blocks(), O,
2077 [&O, &SlotTracker](auto Op) {
2078 std::get<0>(Op)->printAsOperand(O, SlotTracker);
2079 O << " from ";
2080 std::get<1>(Op)->printAsOperand(O);
2081 });
2082 O << ")";
2083 }
2084}
2085#endif
2086
2087void VPIRMetadata::applyMetadata(Instruction &I) const {
2088 for (const auto &[Kind, Node] : Metadata)
2089 I.setMetadata(KindID: Kind, Node);
2090}
2091
2092void VPIRMetadata::intersect(const VPIRMetadata &Other) {
2093 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
2094 for (const auto &[KindA, MDA] : Metadata) {
2095 for (const auto &[KindB, MDB] : Other.Metadata) {
2096 if (KindA == KindB && MDA == MDB) {
2097 MetadataIntersection.emplace_back(Args: KindA, Args: MDA);
2098 break;
2099 }
2100 }
2101 }
2102 Metadata = std::move(MetadataIntersection);
2103}
2104
2105#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2106void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
2107 const Module *M = SlotTracker.getModule();
2108 if (Metadata.empty() || !M || !VPlanPrintMetadata)
2109 return;
2110
2111 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
2112 O << " (";
2113 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
2114 auto [Kind, Node] = KindNodePair;
2115 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
2116 "Unexpected unnamed metadata kind");
2117 O << "!" << MDNames[Kind] << " ";
2118 Node->printAsOperand(O, M);
2119 });
2120 O << ")";
2121}
2122#endif
2123
2124void VPWidenCallRecipe::execute(VPTransformState &State) {
2125 assert(State.VF.isVector() && "not widening");
2126 assert(Variant != nullptr && "Can't create vector function.");
2127
2128 FunctionType *VFTy = Variant->getFunctionType();
2129 // Add return type if intrinsic is overloaded on it.
2130 SmallVector<Value *, 4> Args;
2131 for (const auto &I : enumerate(First: args())) {
2132 Value *Arg;
2133 // Some vectorized function variants may also take a scalar argument,
2134 // e.g. linear parameters for pointers. This needs to be the scalar value
2135 // from the start of the respective part when interleaving.
2136 if (!VFTy->getParamType(i: I.index())->isVectorTy())
2137 Arg = State.get(Def: I.value(), Lane: VPLane(0));
2138 else
2139 Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
2140 Args.push_back(Elt: Arg);
2141 }
2142
2143 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
2144 SmallVector<OperandBundleDef, 1> OpBundles;
2145 if (CI)
2146 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
2147
2148 CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
2149 applyFlags(I&: *V);
2150 applyMetadata(I&: *V);
2151 V->setCallingConv(Variant->getCallingConv());
2152
2153 if (!V->getType()->isVoidTy())
2154 State.set(Def: this, V);
2155}
2156
2157InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
2158 VPCostContext &Ctx) const {
2159 assert(getVectorizedTypeVF(Variant->getReturnType()) == VF &&
2160 "Variant return type must match VF");
2161 return computeCallCost(Variant, Ctx);
2162}
2163
2164InstructionCost VPWidenCallRecipe::computeCallCost(Function *Variant,
2165 VPCostContext &Ctx) {
2166 return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
2167 Tys: Variant->getFunctionType()->params(),
2168 CostKind: Ctx.CostKind);
2169}
2170
2171bool VPWidenCallRecipe::usesFirstLaneOnly(const VPValue *Op) const {
2172 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2173 assert(Variant && "Variant not set");
2174 FunctionType *VFTy = Variant->getFunctionType();
2175 return all_of(Range: enumerate(First: args()), P: [VFTy, &Op](const auto &Arg) {
2176 auto [Idx, V] = Arg;
2177 Type *ArgTy = VFTy->getParamType(i: Idx);
2178 return V != Op || ArgTy->isIntegerTy() || ArgTy->isFloatingPointTy() ||
2179 ArgTy->isPointerTy() || ArgTy->isByteTy();
2180 });
2181}
2182
2183#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2184void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2185 VPSlotTracker &SlotTracker) const {
2186 O << Indent << "WIDEN-CALL ";
2187
2188 Function *CalledFn = getCalledScalarFunction();
2189 if (CalledFn->getReturnType()->isVoidTy())
2190 O << "void ";
2191 else {
2192 printAsOperand(O, SlotTracker);
2193 O << " = ";
2194 }
2195
2196 O << "call";
2197 printFlags(O);
2198 O << " @" << CalledFn->getName() << "(";
2199 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
2200 Op->printAsOperand(O, SlotTracker);
2201 });
2202 O << ")";
2203
2204 O << " (using library function";
2205 if (Variant->hasName())
2206 O << ": " << Variant->getName();
2207 O << ")";
2208}
2209#endif
2210
2211CallInst *VPWidenIntrinsicRecipe::createVectorCall(VPTransformState &State) {
2212 assert(State.VF.isVector() && "not widening");
2213
2214 SmallVector<Type *, 2> TysForDecl;
2215 // Add return type if intrinsic is overloaded on it.
2216 if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
2217 TTI: State.TTI)) {
2218 Type *RetTy = toVectorizedTy(Ty: getScalarType(), EC: State.VF);
2219 ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
2220 for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
2221 if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
2222 RetIdx: Idx, TTI: State.TTI))
2223 TysForDecl.push_back(Elt: Ty);
2224 }
2225 }
2226 SmallVector<Value *, 4> Args;
2227 for (const auto &I : enumerate(First: operands())) {
2228 // Some intrinsics have a scalar argument - don't replace it with a
2229 // vector.
2230 Value *Arg;
2231 if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
2232 TTI: State.TTI))
2233 Arg = State.get(Def: I.value(), Lane: VPLane(0));
2234 else
2235 Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
2236 if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
2237 TTI: State.TTI))
2238 TysForDecl.push_back(Elt: Arg->getType());
2239 Args.push_back(Elt: Arg);
2240 }
2241
2242 // Use vector version of the intrinsic.
2243 Module *M = State.Builder.GetInsertBlock()->getModule();
2244 Function *VectorF =
2245 Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, OverloadTys: TysForDecl);
2246 assert(VectorF &&
2247 "Can't retrieve vector intrinsic or vector-predication intrinsics.");
2248
2249 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
2250 SmallVector<OperandBundleDef, 1> OpBundles;
2251 if (CI)
2252 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
2253
2254 CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);
2255
2256 applyFlags(I&: *V);
2257 applyMetadata(I&: *V);
2258
2259 return V;
2260}
2261
2262void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
2263 CallInst *V = createVectorCall(State);
2264 if (!V->getType()->isVoidTy())
2265 State.set(Def: this, V);
2266}
2267
2268InstructionCost VPWidenIntrinsicRecipe::computeCallCost(
2269 Intrinsic::ID ID, ArrayRef<const VPValue *> Operands,
2270 const VPRecipeWithIRFlags &R, ElementCount VF, VPCostContext &Ctx) {
2271 Type *ScalarRetTy = R.getScalarType();
2272 // Skip the reverse operation cost for the mask.
2273 // FIXME: Remove this once redundant mask reverse operations can be eliminated
2274 // by VPlanTransforms::cse before cost computation.
2275 if (ID == Intrinsic::experimental_vp_reverse && ScalarRetTy->isIntegerTy(BitWidth: 1))
2276 return InstructionCost(0);
2277
2278 // Some backends analyze intrinsic arguments to determine cost. Use the
2279 // underlying value for the operand if it has one. Otherwise try to use the
2280 // operand of the underlying call instruction, if there is one. Otherwise
2281 // clear Arguments.
2282 // TODO: Rework TTI interface to be independent of concrete IR values.
2283 SmallVector<const Value *> Arguments;
2284 for (const auto &[Idx, Op] : enumerate(First&: Operands)) {
2285 auto *V = Op->getUnderlyingValue();
2286 if (!V) {
2287 if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) {
2288 Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
2289 continue;
2290 }
2291 Arguments.clear();
2292 break;
2293 }
2294 Arguments.push_back(Elt: V);
2295 }
2296
2297 Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy;
2298 SmallVector<Type *> ParamTys =
2299 map_to_vector(C&: Operands, F: [&](const VPValue *Op) {
2300 return toVectorTy(Scalar: Op->getScalarType(), EC: VF);
2301 });
2302
2303 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
2304 IntrinsicCostAttributes CostAttrs(
2305 ID, RetTy, Arguments, ParamTys, R.getFastMathFlagsOrNone(),
2306 dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()),
2307 InstructionCost::getInvalid());
2308 return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
2309}
2310
2311InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
2312 VPCostContext &Ctx) const {
2313 SmallVector<const VPValue *> ArgOps(operands());
2314 return computeCallCost(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx);
2315}
2316
2317StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
2318 return Intrinsic::getBaseName(id: VectorIntrinsicID);
2319}
2320
2321bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
2322 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2323 return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
2324 auto [Idx, V] = X;
2325 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
2326 Idx, nullptr);
2327 });
2328}
2329
2330#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2331void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2332 VPSlotTracker &SlotTracker) const {
2333 O << Indent << "WIDEN-INTRINSIC ";
2334 if (getScalarType()->isVoidTy()) {
2335 O << "void ";
2336 } else {
2337 printAsOperand(O, SlotTracker);
2338 O << " = ";
2339 }
2340
2341 O << "call";
2342 printFlags(O);
2343 O << getIntrinsicName() << "(";
2344
2345 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
2346 Op->printAsOperand(O, SlotTracker);
2347 });
2348 O << ")";
2349}
2350#endif
2351
2352void VPWidenMemIntrinsicRecipe::execute(VPTransformState &State) {
2353 CallInst *MemI = createVectorCall(State);
2354 MemI->addParamAttr(
2355 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: MemI->getContext(), Alignment));
2356 State.set(Def: this, V: MemI);
2357}
2358
2359InstructionCost VPWidenMemIntrinsicRecipe::computeMemIntrinsicCost(
2360 Intrinsic::ID IID, Type *Ty, bool IsMasked, Align Alignment,
2361 VPCostContext &Ctx) {
2362 return Ctx.TTI.getMemIntrinsicInstrCost(
2363 MICA: MemIntrinsicCostAttributes(IID, Ty, /*Ptr=*/nullptr, IsMasked, Alignment),
2364 CostKind: Ctx.CostKind);
2365}
2366
2367InstructionCost
2368VPWidenMemIntrinsicRecipe::computeCost(ElementCount VF,
2369 VPCostContext &Ctx) const {
2370 Type *Ty = toVectorTy(Scalar: getScalarType(), EC: VF);
2371 return computeMemIntrinsicCost(IID: getVectorIntrinsicID(), Ty,
2372 IsMasked: !match(V: getOperand(N: 2), P: m_True()), Alignment,
2373 Ctx);
2374}
2375
2376void VPHistogramRecipe::execute(VPTransformState &State) {
2377 IRBuilderBase &Builder = State.Builder;
2378
2379 Value *Address = State.get(Def: getOperand(N: 0));
2380 Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
2381 VectorType *VTy = cast<VectorType>(Val: Address->getType());
2382
2383 // The histogram intrinsic requires a mask even if the recipe doesn't;
2384 // if the mask operand was omitted then all lanes should be executed and
2385 // we just need to synthesize an all-true mask.
2386 Value *Mask = nullptr;
2387 if (VPValue *VPMask = getMask())
2388 Mask = State.get(Def: VPMask);
2389 else
2390 Mask =
2391 Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));
2392
2393 // If this is a subtract, we want to invert the increment amount. We may
2394 // add a separate intrinsic in future, but for now we'll try this.
2395 if (Opcode == Instruction::Sub)
2396 IncAmt = Builder.CreateNeg(V: IncAmt);
2397 else
2398 assert(Opcode == Instruction::Add && "only add or sub supported for now");
2399
2400 Instruction *HistogramInst = State.Builder.CreateIntrinsicWithoutFolding(
2401 ID: Intrinsic::experimental_vector_histogram_add, OverloadTypes: {VTy, IncAmt->getType()},
2402 Args: {Address, IncAmt, Mask});
2403 applyMetadata(I&: *HistogramInst);
2404}
2405
2406InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
2407 VPCostContext &Ctx) const {
2408 // FIXME: Take the gather and scatter into account as well. For now we're
2409 // generating the same cost as the fallback path, but we'll likely
2410 // need to create a new TTI method for determining the cost, including
2411 // whether we can use base + vec-of-smaller-indices or just
2412 // vec-of-pointers.
2413 assert(VF.isVector() && "Invalid VF for histogram cost");
2414 Type *AddressTy = getOperand(N: 0)->getScalarType();
2415 VPValue *IncAmt = getOperand(N: 1);
2416 Type *IncTy = IncAmt->getScalarType();
2417 VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);
2418
2419 // Assume that a non-constant update value (or a constant != 1) requires
2420 // a multiply, and add that into the cost.
2421 InstructionCost MulCost =
2422 Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
2423 if (match(V: IncAmt, P: m_One()))
2424 MulCost = TTI::TCC_Free;
2425
2426 // Find the cost of the histogram operation itself.
2427 Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
2428 Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
2429 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2430 Type::getVoidTy(C&: Ctx.LLVMCtx),
2431 {PtrTy, IncTy, MaskTy});
2432
2433 // Add the costs together with the add/sub operation.
2434 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
2435 Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
2436}
2437
2438#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2439void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2440 VPSlotTracker &SlotTracker) const {
2441 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2442 getOperand(0)->printAsOperand(O, SlotTracker);
2443
2444 if (Opcode == Instruction::Sub)
2445 O << ", dec: ";
2446 else {
2447 assert(Opcode == Instruction::Add);
2448 O << ", inc: ";
2449 }
2450 getOperand(1)->printAsOperand(O, SlotTracker);
2451
2452 if (VPValue *Mask = getMask()) {
2453 O << ", mask: ";
2454 Mask->printAsOperand(O, SlotTracker);
2455 }
2456}
2457#endif
2458
2459VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2460 AllowReassoc = FMF.allowReassoc();
2461 NoNaNs = FMF.noNaNs();
2462 NoInfs = FMF.noInfs();
2463 NoSignedZeros = FMF.noSignedZeros();
2464 AllowReciprocal = FMF.allowReciprocal();
2465 AllowContract = FMF.allowContract();
2466 ApproxFunc = FMF.approxFunc();
2467}
2468
2469VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
2470 switch (Opcode) {
2471 case Instruction::Add:
2472 case Instruction::Sub:
2473 case Instruction::Mul:
2474 case Instruction::Shl:
2475 case VPInstruction::CanonicalIVIncrementForPart:
2476 return WrapFlagsTy(false, false);
2477 case Instruction::Trunc:
2478 return TruncFlagsTy(false, false);
2479 case Instruction::Or:
2480 return DisjointFlagsTy(false);
2481 case Instruction::AShr:
2482 case Instruction::LShr:
2483 case Instruction::UDiv:
2484 case Instruction::SDiv:
2485 return ExactFlagsTy(false);
2486 case Instruction::GetElementPtr:
2487 case VPInstruction::PtrAdd:
2488 case VPInstruction::WidePtrAdd:
2489 return GEPNoWrapFlags::none();
2490 case Instruction::ZExt:
2491 case Instruction::UIToFP:
2492 return NonNegFlagsTy(false);
2493 case Instruction::FAdd:
2494 case Instruction::FSub:
2495 case Instruction::FMul:
2496 case Instruction::FDiv:
2497 case Instruction::FRem:
2498 case Instruction::FNeg:
2499 case Instruction::FPExt:
2500 case Instruction::FPTrunc:
2501 return FastMathFlags();
2502 case Instruction::ICmp:
2503 case Instruction::FCmp:
2504 case VPInstruction::ComputeReductionResult:
2505 llvm_unreachable("opcode requires explicit flags");
2506 default:
2507 return VPIRFlags();
2508 }
2509}
2510
2511#if !defined(NDEBUG)
2512bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2513 switch (OpType) {
2514 case OperationType::OverflowingBinOp:
2515 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2516 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2517 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2518 case OperationType::Trunc:
2519 return Opcode == Instruction::Trunc;
2520 case OperationType::DisjointOp:
2521 return Opcode == Instruction::Or;
2522 case OperationType::PossiblyExactOp:
2523 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2524 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2525 case OperationType::GEPOp:
2526 return Opcode == Instruction::GetElementPtr ||
2527 Opcode == VPInstruction::PtrAdd ||
2528 Opcode == VPInstruction::WidePtrAdd;
2529 case OperationType::FPMathOp:
2530 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2531 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2532 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2533 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2534 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2535 Opcode == Instruction::Select || Opcode == Instruction::SIToFP ||
2536 Opcode == Instruction::UIToFP ||
2537 Opcode == VPInstruction::WideIVStep ||
2538 Opcode == VPInstruction::ReductionStartVector;
2539 case OperationType::FCmp:
2540 return Opcode == Instruction::FCmp;
2541 case OperationType::NonNegOp:
2542 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2543 case OperationType::Cmp:
2544 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2545 case OperationType::ReductionOp:
2546 return Opcode == VPInstruction::ComputeReductionResult;
2547 case OperationType::Other:
2548 return true;
2549 }
2550 llvm_unreachable("Unknown OperationType enum");
2551}
2552
2553bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2554 // Handle opcodes without default flags.
2555 if (Opcode == Instruction::ICmp)
2556 return OpType == OperationType::Cmp;
2557 if (Opcode == Instruction::FCmp)
2558 return OpType == OperationType::FCmp;
2559 if (Opcode == VPInstruction::ComputeReductionResult)
2560 return OpType == OperationType::ReductionOp;
2561
2562 OperationType Required = getDefaultFlags(Opcode).OpType;
2563 return Required == OperationType::Other || Required == OpType;
2564}
2565#endif
2566
2567#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2568static void printRecurrenceKind(raw_ostream &OS, const RecurKind &Kind) {
2569 switch (Kind) {
2570 case RecurKind::None:
2571 OS << "none";
2572 break;
2573 case RecurKind::Add:
2574 OS << "add";
2575 break;
2576 case RecurKind::Sub:
2577 OS << "sub";
2578 break;
2579 case RecurKind::AddChainWithSubs:
2580 OS << "add-chain-with-subs";
2581 break;
2582 case RecurKind::Mul:
2583 OS << "mul";
2584 break;
2585 case RecurKind::Or:
2586 OS << "or";
2587 break;
2588 case RecurKind::And:
2589 OS << "and";
2590 break;
2591 case RecurKind::Xor:
2592 OS << "xor";
2593 break;
2594 case RecurKind::SMin:
2595 OS << "smin";
2596 break;
2597 case RecurKind::SMax:
2598 OS << "smax";
2599 break;
2600 case RecurKind::UMin:
2601 OS << "umin";
2602 break;
2603 case RecurKind::UMax:
2604 OS << "umax";
2605 break;
2606 case RecurKind::FAdd:
2607 OS << "fadd";
2608 break;
2609 case RecurKind::FAddChainWithSubs:
2610 OS << "fadd-chain-with-subs";
2611 break;
2612 case RecurKind::FSub:
2613 OS << "fsub";
2614 break;
2615 case RecurKind::FMul:
2616 OS << "fmul";
2617 break;
2618 case RecurKind::FMin:
2619 OS << "fmin";
2620 break;
2621 case RecurKind::FMax:
2622 OS << "fmax";
2623 break;
2624 case RecurKind::FMinNum:
2625 OS << "fminnum";
2626 break;
2627 case RecurKind::FMaxNum:
2628 OS << "fmaxnum";
2629 break;
2630 case RecurKind::FMinimum:
2631 OS << "fminimum";
2632 break;
2633 case RecurKind::FMaximum:
2634 OS << "fmaximum";
2635 break;
2636 case RecurKind::FMinimumNum:
2637 OS << "fminimumnum";
2638 break;
2639 case RecurKind::FMaximumNum:
2640 OS << "fmaximumnum";
2641 break;
2642 case RecurKind::FMulAdd:
2643 OS << "fmuladd";
2644 break;
2645 case RecurKind::AnyOf:
2646 OS << "any-of";
2647 break;
2648 case RecurKind::FindIV:
2649 OS << "find-iv";
2650 break;
2651 case RecurKind::FindLast:
2652 OS << "find-last";
2653 break;
2654 }
2655}
2656
2657void VPIRFlags::printFlags(raw_ostream &O) const {
2658 switch (OpType) {
2659 case OperationType::Cmp:
2660 O << " " << CmpInst::getPredicateName(getPredicate());
2661 break;
2662 case OperationType::FCmp:
2663 O << " " << CmpInst::getPredicateName(getPredicate());
2664 getFastMathFlagsOrNone().print(O);
2665 break;
2666 case OperationType::DisjointOp:
2667 if (DisjointFlags.IsDisjoint)
2668 O << " disjoint";
2669 break;
2670 case OperationType::PossiblyExactOp:
2671 if (ExactFlags.IsExact)
2672 O << " exact";
2673 break;
2674 case OperationType::OverflowingBinOp:
2675 if (WrapFlags.HasNUW)
2676 O << " nuw";
2677 if (WrapFlags.HasNSW)
2678 O << " nsw";
2679 break;
2680 case OperationType::Trunc:
2681 if (TruncFlags.HasNUW)
2682 O << " nuw";
2683 if (TruncFlags.HasNSW)
2684 O << " nsw";
2685 break;
2686 case OperationType::FPMathOp:
2687 getFastMathFlagsOrNone().print(O);
2688 break;
2689 case OperationType::GEPOp: {
2690 GEPNoWrapFlags Flags = getGEPNoWrapFlags();
2691 if (Flags.isInBounds())
2692 O << " inbounds";
2693 else if (Flags.hasNoUnsignedSignedWrap())
2694 O << " nusw";
2695 if (Flags.hasNoUnsignedWrap())
2696 O << " nuw";
2697 break;
2698 }
2699 case OperationType::NonNegOp:
2700 if (NonNegFlags.NonNeg)
2701 O << " nneg";
2702 break;
2703 case OperationType::ReductionOp: {
2704 O << " (";
2705 printRecurrenceKind(O, getRecurKind());
2706 if (isReductionInLoop())
2707 O << ", in-loop";
2708 if (isReductionOrdered())
2709 O << ", ordered";
2710 O << ")";
2711 getFastMathFlagsOrNone().print(O);
2712 break;
2713 }
2714 case OperationType::Other:
2715 break;
2716 }
2717 O << " ";
2718}
2719#endif
2720
2721void VPWidenRecipe::execute(VPTransformState &State) {
2722 auto &Builder = State.Builder;
2723 switch (Opcode) {
2724 case Instruction::Call:
2725 case Instruction::UncondBr:
2726 case Instruction::CondBr:
2727 case Instruction::PHI:
2728 case Instruction::GetElementPtr:
2729 llvm_unreachable("This instruction is handled by a different recipe.");
2730 case Instruction::UDiv:
2731 case Instruction::SDiv:
2732 case Instruction::SRem:
2733 case Instruction::URem:
2734 case Instruction::Add:
2735 case Instruction::FAdd:
2736 case Instruction::Sub:
2737 case Instruction::FSub:
2738 case Instruction::FNeg:
2739 case Instruction::Mul:
2740 case Instruction::FMul:
2741 case Instruction::FDiv:
2742 case Instruction::FRem:
2743 case Instruction::Shl:
2744 case Instruction::LShr:
2745 case Instruction::AShr:
2746 case Instruction::And:
2747 case Instruction::Or:
2748 case Instruction::Xor: {
2749 // Just widen unops and binops.
2750 SmallVector<Value *, 2> Ops;
2751 for (VPValue *VPOp : operands())
2752 Ops.push_back(Elt: State.get(Def: VPOp));
2753
2754 Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);
2755
2756 if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
2757 applyFlags(I&: *VecOp);
2758 applyMetadata(I&: *VecOp);
2759 }
2760
2761 // Use this vector value for all users of the original instruction.
2762 State.set(Def: this, V);
2763 break;
2764 }
2765 case Instruction::ExtractValue: {
2766 assert(getNumOperands() == 2 && "expected single level extractvalue");
2767 Value *Op = State.get(Def: getOperand(N: 0));
2768 Value *Extract = Builder.CreateExtractValue(
2769 Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue());
2770 State.set(Def: this, V: Extract);
2771 break;
2772 }
2773 case Instruction::Freeze: {
2774 Value *Op = State.get(Def: getOperand(N: 0));
2775 Value *Freeze = Builder.CreateFreeze(V: Op);
2776 State.set(Def: this, V: Freeze);
2777 break;
2778 }
2779 case Instruction::ICmp:
2780 case Instruction::FCmp: {
2781 // Widen compares. Generate vector compares.
2782 bool FCmp = Opcode == Instruction::FCmp;
2783 Value *A = State.get(Def: getOperand(N: 0));
2784 Value *B = State.get(Def: getOperand(N: 1));
2785 Value *C = nullptr;
2786 if (FCmp) {
2787 C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
2788 } else {
2789 C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
2790 }
2791 if (auto *I = dyn_cast<Instruction>(Val: C)) {
2792 applyFlags(I&: *I);
2793 applyMetadata(I&: *I);
2794 }
2795 State.set(Def: this, V: C);
2796 break;
2797 }
2798 case Instruction::Select: {
2799 VPValue *CondOp = getOperand(N: 0);
2800 Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp));
2801 Value *Op0 = State.get(Def: getOperand(N: 1));
2802 Value *Op1 = State.get(Def: getOperand(N: 2));
2803 Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
2804 State.set(Def: this, V: Sel);
2805 if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
2806 if (isa<FPMathOperator>(Val: I))
2807 applyFlags(I&: *I);
2808 applyMetadata(I&: *I);
2809 }
2810 break;
2811 }
2812 default:
2813 // This instruction is not vectorized by simple widening.
2814 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
2815 << Instruction::getOpcodeName(Opcode));
2816 llvm_unreachable("Unhandled instruction!");
2817 } // end of switch.
2818
2819#if !defined(NDEBUG)
2820 // Verify that VPlan type inference results agree with the type of the
2821 // generated values.
2822 assert(VectorType::get(this->getScalarType(), State.VF) ==
2823 State.get(this)->getType() &&
2824 "inferred type and type from generated instructions do not match");
2825#endif
2826}
2827
2828InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
2829 VPCostContext &Ctx) const {
2830 switch (Opcode) {
2831 case Instruction::UDiv:
2832 case Instruction::SDiv:
2833 case Instruction::SRem:
2834 case Instruction::URem:
2835 // If the div/rem operation isn't safe to speculate and requires
2836 // predication, then the only way we can even create a vplan is to insert
2837 // a select on the second input operand to ensure we use the value of 1
2838 // for the inactive lanes. The select will be costed separately.
2839 case Instruction::FNeg:
2840 case Instruction::Add:
2841 case Instruction::FAdd:
2842 case Instruction::Sub:
2843 case Instruction::FSub:
2844 case Instruction::Mul:
2845 case Instruction::FMul:
2846 case Instruction::FDiv:
2847 case Instruction::FRem:
2848 case Instruction::Shl:
2849 case Instruction::LShr:
2850 case Instruction::AShr:
2851 case Instruction::And:
2852 case Instruction::Or:
2853 case Instruction::Xor:
2854 case Instruction::Freeze:
2855 case Instruction::ExtractValue:
2856 case Instruction::ICmp:
2857 case Instruction::FCmp:
2858 case Instruction::Select:
2859 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2860 default:
2861 llvm_unreachable("Unsupported opcode for instruction");
2862 }
2863}
2864
2865#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2866void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2867 VPSlotTracker &SlotTracker) const {
2868 O << Indent << "WIDEN ";
2869 printAsOperand(O, SlotTracker);
2870 O << " = " << Instruction::getOpcodeName(Opcode);
2871 printFlags(O);
2872 printOperands(O, SlotTracker);
2873}
2874#endif
2875
2876void VPWidenCastRecipe::execute(VPTransformState &State) {
2877 auto &Builder = State.Builder;
2878 /// Vectorize casts.
2879 assert(State.VF.isVector() && "Not vectorizing?");
2880 Type *DestTy = VectorType::get(ElementType: getScalarType(), EC: State.VF);
2881 VPValue *Op = getOperand(N: 0);
2882 Value *A = State.get(Def: Op);
2883 Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
2884 State.set(Def: this, V: Cast);
2885 if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
2886 applyFlags(I&: *CastOp);
2887 applyMetadata(I&: *CastOp);
2888 }
2889}
2890
2891InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2892 VPCostContext &Ctx) const {
2893 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2894}
2895
2896#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2897void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2898 VPSlotTracker &SlotTracker) const {
2899 O << Indent << "WIDEN-CAST ";
2900 printAsOperand(O, SlotTracker);
2901 O << " = " << Instruction::getOpcodeName(Opcode);
2902 printFlags(O);
2903 printOperands(O, SlotTracker);
2904 O << " to " << *getScalarType();
2905}
2906#endif
2907
2908InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
2909 VPCostContext &Ctx) const {
2910 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
2911}
2912
2913#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2914void VPWidenIntOrFpInductionRecipe::printRecipe(
2915 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2916 O << Indent;
2917 printAsOperand(O, SlotTracker);
2918 O << " = WIDEN-INDUCTION";
2919 printFlags(O);
2920 printOperands(O, SlotTracker);
2921
2922 if (auto *TI = getTruncInst())
2923 O << " (truncated to " << *TI->getType() << ")";
2924}
2925#endif
2926
2927bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2928 // The step may be defined by a recipe in the preheader (e.g. if it requires
2929 // SCEV expansion), but for the canonical induction the step is required to be
2930 // 1, which is represented as live-in.
2931 return match(V: getStartValue(), P: m_ZeroInt()) &&
2932 match(V: getStepValue(), P: m_One()) &&
2933 getScalarType() == getRegion()->getCanonicalIVType();
2934}
2935
2936InstructionCost VPDerivedIVRecipe::computeCost(ElementCount VF,
2937 VPCostContext &Ctx) const {
2938 // The cost model for this is modelled on expandVPDerivedIV in
2939 // VPlanTransforms.cpp. In order to avoid overly pessimistic costs that can
2940 // negatively affect vectorization it takes into account any expected
2941 // simplifications that happen in simplifyRecipe.
2942 switch (getInductionKind()) {
2943 default:
2944 // TODO: Compute cost for remaining kinds.
2945 break;
2946 case InductionDescriptor::IK_IntInduction: {
2947 // There are currently no tests that expose a path where all lanes are
2948 // used, so it's better to bail out for now.
2949 if (!vputils::onlyFirstLaneUsed(Def: this))
2950 break;
2951
2952 // Start off by assuming we need both mul and add, then refine this.
2953 bool NeedsMul = true, NeedsAdd = true, NeedsShl = false;
2954
2955 // If the start value is zero the add gets folded away.
2956 if (auto *VPV = dyn_cast<VPIRValue>(Val: getStartValue()))
2957 if (auto *StartC = dyn_cast<ConstantInt>(Val: VPV->getValue()))
2958 NeedsAdd = !StartC->isZero();
2959
2960 // For some values of step the arithmetic changes:
2961 // 1. A step of 1 requires no operation.
2962 // 2. A step of -1 requires a negate.
2963 // 3. A power-of-2 step will use a shl, instead of a mul.
2964 Type *StepTy = getStepValue()->getScalarType();
2965 InstructionCost Cost(0);
2966 if (auto *VPV = dyn_cast<VPIRValue>(Val: getStepValue())) {
2967 if (auto *StepC = dyn_cast<ConstantInt>(Val: VPV->getValue())) {
2968 if (StepC->isOne())
2969 NeedsMul = false;
2970 else if (StepC->isMinusOne()) {
2971 // This will most likely end up as a negate in simplifyRecipe, and
2972 // the negate will be combined with the add to make a sub.
2973 // NOTE: This is perhaps an invalid assumption that the cost of an
2974 // 'add' is the same as a 'sub'.
2975 NeedsMul = false;
2976 NeedsAdd = true;
2977 } else if (StepC->getValue().isPowerOf2()) {
2978 // This will most likely end up as a shift-left in simplifyRecipe
2979 NeedsMul = false;
2980 NeedsShl = true;
2981 }
2982 }
2983 }
2984
2985 // Add the cost of the conversion from index to step type if the index
2986 // will be used.
2987 Type *IndexTy = getIndex()->getScalarType();
2988 unsigned StepTySize = StepTy->getScalarSizeInBits();
2989 unsigned IndexTySize = IndexTy->getScalarSizeInBits();
2990 if ((NeedsAdd || NeedsMul || NeedsShl) && StepTySize != IndexTySize) {
2991 unsigned CastOpc =
2992 StepTySize < IndexTySize ? Instruction::Trunc : Instruction::SExt;
2993 Cost += Ctx.TTI.getCastInstrCost(
2994 Opcode: CastOpc, Dst: StepTy, Src: IndexTy, CCH: TTI::CastContextHint::None, CostKind: Ctx.CostKind);
2995 }
2996
2997 if (NeedsMul)
2998 Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: StepTy,
2999 CostKind: Ctx.CostKind);
3000 if (NeedsShl)
3001 Cost += Ctx.TTI.getArithmeticInstrCost(
3002 Opcode: Instruction::Shl, Ty: StepTy, CostKind: Ctx.CostKind,
3003 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
3004 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
3005 .Properties: TargetTransformInfo::OP_None});
3006 if (NeedsAdd)
3007 Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Add, Ty: StepTy,
3008 CostKind: Ctx.CostKind);
3009 return Cost;
3010 }
3011 }
3012
3013 return 0;
3014}
3015
3016#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3017void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3018 VPSlotTracker &SlotTracker) const {
3019 O << Indent;
3020 printAsOperand(O, SlotTracker);
3021 O << " = DERIVED-IV ";
3022 getStartValue()->printAsOperand(O, SlotTracker);
3023 O << " + ";
3024 getOperand(1)->printAsOperand(O, SlotTracker);
3025 O << " * ";
3026 getStepValue()->printAsOperand(O, SlotTracker);
3027}
3028#endif
3029
3030InstructionCost VPScalarIVStepsRecipe::computeCost(ElementCount VF,
3031 VPCostContext &Ctx) const {
3032 // TODO: Add costs for floating point.
3033 Type *BaseIVTy = getOperand(N: 0)->getScalarType();
3034 if (!BaseIVTy->isIntegerTy())
3035 return 0;
3036
3037 // TODO: Add support for predicated regions. Requires scaling the cost by the
3038 // probability of entering the block.
3039 if (getRegion() && getRegion()->isReplicator())
3040 return 0;
3041
3042 // If only the first lane is used, then there won't be any code that remains
3043 // in the loop for the first unrolled part.
3044 if (vputils::onlyFirstLaneUsed(Def: this))
3045 return 0;
3046
3047 // Typically the operations are:
3048 // 1. Add the start index to each lane value.
3049 // 2. Multiply the start index by the step.
3050 // 3. Add the scaled start index to base IV.
3051 // Any code generated for 1 and 2 should be loop invariant and therefore
3052 // hoisted out of the loop. We only need to add on the cost of 3.
3053
3054 // Given the users of VPScalarIVStepsRecipe tend to be scalarized GEPs, i.e.
3055 // %add1 = add i32 %iv, 0
3056 // %add2 = add i32 %iv, 1
3057 // %gep1 = getelementptr i8, ptr %p, i32 %add1
3058 // %gep2 = getelementptr i8, ptr %p, i32 %add2
3059 // it's very likely that these GEPs will all be rewritten to have a common
3060 // base such that what's left is just
3061 // %base_gep = getelementptr i8, ptr %p, i32 %iv
3062 // %gep1 = getelementptr i8, ptr %base_gep, i32 0
3063 // %gep2 = getelementptr i8, ptr %base_gep, i32 1
3064 // Therefore, in reality the cost is somewhere betwen 1*AddCost and
3065 // (NumLanes - 1) * AddCost. For now, assume the cost of a single add.
3066 return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Add, Ty: BaseIVTy,
3067 CostKind: Ctx.CostKind);
3068}
3069
3070void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
3071 // Fast-math-flags propagate from the original induction instruction.
3072 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
3073 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
3074
3075 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
3076 /// variable on which to base the steps, \p Step is the size of the step.
3077
3078 Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
3079 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
3080 IRBuilderBase &Builder = State.Builder;
3081
3082 // Ensure step has the same type as that of scalar IV.
3083 Type *BaseIVTy = BaseIV->getType()->getScalarType();
3084 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
3085
3086 // We build scalar steps for both integer and floating-point induction
3087 // variables. Here, we determine the kind of arithmetic we will perform.
3088 Instruction::BinaryOps AddOp;
3089 Instruction::BinaryOps MulOp;
3090 if (BaseIVTy->isIntegerTy()) {
3091 AddOp = Instruction::Add;
3092 MulOp = Instruction::Mul;
3093 } else {
3094 AddOp = InductionOpcode;
3095 MulOp = Instruction::FMul;
3096 }
3097
3098 // Determine the number of scalars we need to generate.
3099 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
3100 // Compute the scalar steps and save the results in State.
3101
3102 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
3103 Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
3104 : Constant::getNullValue(Ty: BaseIVTy);
3105
3106 for (unsigned Lane = 0; Lane < EndLane; ++Lane) {
3107 // It is okay if the induction variable type cannot hold the lane number,
3108 // we expect truncation in this case.
3109 Constant *LaneValue =
3110 BaseIVTy->isIntegerTy()
3111 ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
3112 /*ImplicitTrunc=*/true)
3113 : ConstantFP::get(Ty: BaseIVTy, V: Lane);
3114 Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
3115 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
3116 "Expected StartIdx to be folded to a constant when VF is not "
3117 "scalable");
3118 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
3119 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
3120 State.set(Def: this, V: Add, Lane: VPLane(Lane));
3121 }
3122}
3123
3124#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3125void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3126 VPSlotTracker &SlotTracker) const {
3127 O << Indent;
3128 printAsOperand(O, SlotTracker);
3129 O << " = SCALAR-STEPS ";
3130 printOperands(O, SlotTracker);
3131}
3132#endif
3133
3134bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
3135 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
3136 return vputils::isSingleScalar(VPV: Op);
3137}
3138
3139void VPWidenGEPRecipe::execute(VPTransformState &State) {
3140 assert(State.VF.isVector() && "not widening");
3141 auto Ops = map_to_vector(C: operands(), F: [&](VPValue *Op) {
3142 return State.get(Def: Op, IsScalar: vputils::isSingleScalar(VPV: Op));
3143 });
3144 auto *GEP =
3145 State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops.front(),
3146 IdxList: drop_begin(RangeOrContainer&: Ops), Name: "wide.gep", NW: getGEPNoWrapFlags());
3147 State.set(Def: this, V: GEP, IsScalar: vputils::isSingleScalar(VPV: this));
3148}
3149
3150#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3151void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3152 VPSlotTracker &SlotTracker) const {
3153 O << Indent << "WIDEN-GEP ";
3154 printAsOperand(O, SlotTracker);
3155 O << " = getelementptr";
3156 printFlags(O);
3157 printOperands(O, SlotTracker);
3158}
3159#endif
3160
3161void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
3162 assert(!getOffset() && "Unexpected offset operand");
3163 VPBuilder Builder(this);
3164 VPlan &Plan = *getParent()->getPlan();
3165 VPValue *VFVal = getVFValue();
3166 const DataLayout &DL = Plan.getDataLayout();
3167 Type *IndexTy = DL.getIndexType(PtrTy: this->getScalarType());
3168 VPValue *Stride =
3169 Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true);
3170 Type *VFTy = VFVal->getScalarType();
3171 VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy,
3172 DL: DebugLoc::getUnknown());
3173
3174 // Offset for Part0 = Offset0 = Stride * (VF - 1).
3175 VPInstruction *VFMinusOne =
3176 Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u),
3177 DL: DebugLoc::getUnknown(), Name: "", WrapFlags: {true, true});
3178 VPInstruction *Offset0 =
3179 Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride});
3180
3181 // Offset for PartN = Offset0 + Part * Stride * VF.
3182 VPValue *PartxStride =
3183 Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true);
3184 VPValue *Offset = Builder.createAdd(
3185 LHS: Offset0,
3186 RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF}));
3187 addOffset(Offset);
3188}
3189
3190void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
3191 auto &Builder = State.Builder;
3192 assert(getOffset() && "Expected prior materialization of offset");
3193 Value *Ptr = State.get(Def: getPointer(), IsScalar: true);
3194 Value *Offset = State.get(Def: getOffset(), IsScalar: true);
3195 Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
3196 NW: getGEPNoWrapFlags());
3197 State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
3198}
3199
3200#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3201void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3202 VPSlotTracker &SlotTracker) const {
3203 O << Indent;
3204 printAsOperand(O, SlotTracker);
3205 O << " = vector-end-pointer";
3206 printFlags(O);
3207 printOperands(O, SlotTracker);
3208}
3209#endif
3210
3211void VPVectorPointerRecipe::execute(VPTransformState &State) {
3212 assert(getVFxPart() &&
3213 "Expected prior simplification of recipe without VFxPart");
3214
3215 auto &Builder = State.Builder;
3216 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
3217 Value *Offset = State.get(Def: getVFxPart(), IsScalar: true);
3218 // TODO: Expand to VPInstruction to support constant folding.
3219 if (!match(V: getStride(), P: m_One())) {
3220 Value *Stride = Builder.CreateZExtOrTrunc(V: State.get(Def: getStride(), IsScalar: true),
3221 DestTy: Offset->getType());
3222 Offset = Builder.CreateMul(LHS: Offset, RHS: Stride);
3223 }
3224 Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
3225 NW: getGEPNoWrapFlags());
3226 State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
3227}
3228
3229#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3230void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3231 VPSlotTracker &SlotTracker) const {
3232 O << Indent;
3233 printAsOperand(O, SlotTracker);
3234 O << " = vector-pointer";
3235 printFlags(O);
3236 printOperands(O, SlotTracker);
3237}
3238#endif
3239
3240InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
3241 VPCostContext &Ctx) const {
3242 // A blend will be expanded to a select VPInstruction, which will generate a
3243 // scalar select if only the first lane is used.
3244 if (vputils::onlyFirstLaneUsed(Def: this))
3245 VF = ElementCount::getFixed(MinVal: 1);
3246
3247 Type *ResultTy = toVectorTy(Scalar: this->getScalarType(), EC: VF);
3248 Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
3249 return (getNumIncomingValues() - 1) *
3250 Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
3251 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
3252}
3253
3254#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3255void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3256 VPSlotTracker &SlotTracker) const {
3257 O << Indent << "BLEND ";
3258 printAsOperand(O, SlotTracker);
3259 O << " =";
3260 printFlags(O);
3261 if (getNumIncomingValues() == 1) {
3262 // Not a User of any mask: not really blending, this is a
3263 // single-predecessor phi.
3264 getIncomingValue(0)->printAsOperand(O, SlotTracker);
3265 } else {
3266 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
3267 if (I != 0)
3268 O << " ";
3269 getIncomingValue(I)->printAsOperand(O, SlotTracker);
3270 if (I == 0 && isNormalized())
3271 continue;
3272 O << "/";
3273 getMask(I)->printAsOperand(O, SlotTracker);
3274 }
3275 }
3276}
3277#endif
3278
3279void VPReductionRecipe::execute(VPTransformState &State) {
3280 RecurKind Kind = getRecurrenceKind();
3281 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
3282 "In-loop AnyOf reductions aren't currently supported");
3283 // Propagate the fast-math flags carried by the underlying instruction.
3284 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
3285 State.Builder.setFastMathFlags(getFastMathFlagsOrNone());
3286 Value *NewVecOp = State.get(Def: getVecOp());
3287 if (VPValue *Cond = getCondOp()) {
3288 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
3289 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
3290 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
3291
3292 Value *Start =
3293 getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlagsOrNone());
3294 if (State.VF.isVector())
3295 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
3296
3297 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
3298 NewVecOp = Select;
3299 }
3300 Value *NewRed;
3301 Value *NextInChain;
3302 if (isOrdered()) {
3303 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
3304 if (State.VF.isVector())
3305 NewRed =
3306 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
3307 else
3308 NewRed = State.Builder.CreateBinOp(
3309 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
3310 LHS: PrevInChain, RHS: NewVecOp);
3311 PrevInChain = NewRed;
3312 NextInChain = NewRed;
3313 } else if (isPartialReduction()) {
3314 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
3315 "Unexpected partial reduction kind");
3316 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
3317 NewRed = State.Builder.CreateIntrinsic(
3318 RetTy: PrevInChain->getType(),
3319 ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
3320 : Intrinsic::vector_partial_reduce_fadd,
3321 Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
3322 Name: "partial.reduce");
3323 PrevInChain = NewRed;
3324 NextInChain = NewRed;
3325 } else {
3326 assert(isInLoop() &&
3327 "The reduction must either be ordered, partial or in-loop");
3328 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
3329 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
3330 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
3331 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
3332 else
3333 NextInChain = State.Builder.CreateBinOp(
3334 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
3335 LHS: PrevInChain, RHS: NewRed);
3336 }
3337 State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
3338}
3339
3340void VPReductionEVLRecipe::execute(VPTransformState &State) {
3341
3342 auto &Builder = State.Builder;
3343 // Propagate the fast-math flags carried by the underlying instruction.
3344 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
3345 Builder.setFastMathFlags(getFastMathFlagsOrNone());
3346
3347 RecurKind Kind = getRecurrenceKind();
3348 Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
3349 Value *VecOp = State.get(Def: getVecOp());
3350 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3351
3352 Value *Mask;
3353 if (VPValue *CondOp = getCondOp())
3354 Mask = State.get(Def: CondOp);
3355 else
3356 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3357
3358 Value *NewRed;
3359 if (isOrdered()) {
3360 NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
3361 } else {
3362 NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
3363 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
3364 NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
3365 else
3366 NewRed = Builder.CreateBinOp(
3367 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
3368 RHS: Prev);
3369 }
3370 State.set(Def: this, V: NewRed, /*IsScalar*/ true);
3371}
3372
3373InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
3374 VPCostContext &Ctx) const {
3375 RecurKind RdxKind = getRecurrenceKind();
3376 Type *ElementTy = this->getScalarType();
3377 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
3378 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
3379 FastMathFlags FMFs = getFastMathFlagsOrNone();
3380 std::optional<FastMathFlags> OptionalFMF =
3381 ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;
3382
3383 if (isPartialReduction()) {
3384 InstructionCost CondCost = 0;
3385 if (isConditional()) {
3386 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
3387 auto *CondTy =
3388 cast<VectorType>(Val: toVectorTy(Scalar: getCondOp()->getScalarType(), EC: VF));
3389 CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
3390 CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
3391 }
3392 return CondCost + Ctx.TTI.getPartialReductionCost(
3393 Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
3394 OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
3395 FMF: OptionalFMF);
3396 }
3397
3398 // TODO: Support any-of reductions.
3399 assert(
3400 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
3401 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
3402 "Any-of reduction not implemented in VPlan-based cost model currently.");
3403
3404 // Note that TTI should model the cost of moving result to the scalar register
3405 // and the BinOp cost in the getMinMaxReductionCost().
3406 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
3407 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
3408 return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
3409 }
3410
3411 // Note that TTI should model the cost of moving result to the scalar register
3412 // and the BinOp cost in the getArithmeticReductionCost().
3413 return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
3414 CostKind: Ctx.CostKind);
3415}
3416
3417VPExpressionRecipe::VPExpressionRecipe(
3418 ExpressionTypes ExpressionType,
3419 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
3420 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {},
3421 cast<VPReductionRecipe>(Val: ExpressionRecipes.back())
3422 ->getChainOp()
3423 ->getScalarType()),
3424 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
3425 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
3426 assert(
3427 none_of(ExpressionRecipes,
3428 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3429 "expression cannot contain recipes with side-effects");
3430
3431 // Maintain a copy of the expression recipes as a set of users.
3432 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
3433 for (auto *R : ExpressionRecipes)
3434 ExpressionRecipesAsSetOfUsers.insert(Ptr: R);
3435
3436 // Recipes in the expression, except the last one, must only be used by
3437 // (other) recipes inside the expression. If there are other users, external
3438 // to the expression, use a clone of the recipe for external users.
3439 for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
3440 if (R != ExpressionRecipes.back() &&
3441 any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
3442 return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
3443 })) {
3444 // There are users outside of the expression. Clone the recipe and use the
3445 // clone those external users.
3446 VPSingleDefRecipe *CopyForExtUsers = R->clone();
3447 R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
3448 VPUser &U, unsigned) {
3449 return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
3450 });
3451 CopyForExtUsers->insertBefore(InsertPos: R);
3452 }
3453 if (R->getParent())
3454 R->removeFromParent();
3455 }
3456
3457 // Internalize all external operands to the expression recipes. To do so,
3458 // create new temporary VPValues for all operands defined by a recipe outside
3459 // the expression. The original operands are added as operands of the
3460 // VPExpressionRecipe itself.
3461 for (auto *R : ExpressionRecipes) {
3462 for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
3463 auto *Def = Op->getDefiningRecipe();
3464 if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
3465 continue;
3466 addOperand(Operand: Op);
3467 LiveInPlaceholders.push_back(Elt: new VPSymbolicValue(Op->getScalarType()));
3468 }
3469 }
3470
3471 // Replace each external operand with the first one created for it in
3472 // LiveInPlaceholders.
3473 for (auto *R : ExpressionRecipes)
3474 for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
3475 R->replaceUsesOfWith(From: LiveIn, To: Tmp);
3476}
3477
3478void VPExpressionRecipe::decompose() {
3479 for (auto *R : ExpressionRecipes)
3480 // Since the list could contain duplicates, make sure the recipe hasn't
3481 // already been inserted.
3482 if (!R->getParent())
3483 R->insertBefore(InsertPos: this);
3484
3485 for (const auto &[Idx, Op] : enumerate(First: operands()))
3486 LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);
3487
3488 replaceAllUsesWith(New: ExpressionRecipes.back());
3489 ExpressionRecipes.clear();
3490}
3491
3492InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
3493 VPCostContext &Ctx) const {
3494 Type *RedTy = this->getScalarType();
3495 auto *SrcVecTy =
3496 cast<VectorType>(Val: toVectorTy(Scalar: getOperand(N: 0)->getScalarType(), EC: VF));
3497 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3498 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind());
3499 switch (ExpressionType) {
3500 case ExpressionTypes::NegatedExtendedReduction:
3501 assert((Opcode == Instruction::Add || Opcode == Instruction::FAdd) &&
3502 "Unexpected opcode");
3503 Opcode = Opcode == Instruction::Add ? Instruction::Sub : Instruction::FSub;
3504 [[fallthrough]];
3505 case ExpressionTypes::ExtendedReduction: {
3506 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3507 auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3508
3509 if (RedR->isPartialReduction())
3510 return Ctx.TTI.getPartialReductionCost(
3511 Opcode, InputTypeA: getOperand(N: 0)->getScalarType(), InputTypeB: nullptr, AccumType: RedTy, VF,
3512 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()),
3513 OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
3514 FMF: RedTy->isFloatingPointTy()
3515 ? std::optional{RedR->getFastMathFlagsOrNone()}
3516 : std::nullopt);
3517 else if (!RedTy->isFloatingPointTy())
3518 // TTI::getExtendedReductionCost only supports integer types.
3519 return Ctx.TTI.getExtendedReductionCost(
3520 Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy,
3521 FMF: std::nullopt, CostKind: Ctx.CostKind);
3522 else
3523 return InstructionCost::getInvalid();
3524 }
3525 case ExpressionTypes::MulAccReduction:
3526 return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy,
3527 CostKind: Ctx.CostKind);
3528
3529 case ExpressionTypes::ExtNegatedMulAccReduction:
3530 switch (Opcode) {
3531 case Instruction::Add:
3532 Opcode = Instruction::Sub;
3533 break;
3534 case Instruction::FAdd:
3535 Opcode = Instruction::FSub;
3536 break;
3537 default:
3538 llvm_unreachable("Unsupported opcode for ExtNegatedMulAccReduction");
3539 }
3540 [[fallthrough]];
3541 case ExpressionTypes::ExtMulAccReduction: {
3542 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3543 if (RedR->isPartialReduction()) {
3544 auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3545 auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]);
3546 auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]);
3547 return Ctx.TTI.getPartialReductionCost(
3548 Opcode, InputTypeA: getOperand(N: 0)->getScalarType(),
3549 InputTypeB: getOperand(N: 1)->getScalarType(), AccumType: RedTy, VF,
3550 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(
3551 CastOpc: Ext0R->getOpcode()),
3552 OpBExtend: TargetTransformInfo::getPartialReductionExtendKind(
3553 CastOpc: Ext1R->getOpcode()),
3554 BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind,
3555 FMF: RedTy->isFloatingPointTy()
3556 ? std::optional{RedR->getFastMathFlagsOrNone()}
3557 : std::nullopt);
3558 }
3559 assert(Opcode != Instruction::FSub && "Only integer types are supported");
3560 return Ctx.TTI.getMulAccReductionCost(
3561 IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
3562 Instruction::ZExt,
3563 RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
3564 }
3565 }
3566 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3567}
3568
3569bool VPExpressionRecipe::mayReadOrWriteMemory() const {
3570 return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
3571 return R->mayReadFromMemory() || R->mayWriteToMemory();
3572 });
3573}
3574
3575bool VPExpressionRecipe::mayHaveSideEffects() const {
3576 assert(
3577 none_of(ExpressionRecipes,
3578 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3579 "expression cannot contain recipes with side-effects");
3580 return false;
3581}
3582
3583bool VPExpressionRecipe::isVectorToScalar() const {
3584 auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3585 return RR && !RR->isPartialReduction();
3586}
3587
3588#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3589
3590void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3591 VPSlotTracker &SlotTracker) const {
3592 O << Indent << "EXPRESSION ";
3593 printAsOperand(O, SlotTracker);
3594 O << " = ";
3595 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3596 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3597 VPValue *RdxStart =
3598 getOperand(getNumOperands() - (Red->isConditional() ? 2 : 1));
3599
3600 switch (ExpressionType) {
3601 case ExpressionTypes::NegatedExtendedReduction:
3602 case ExpressionTypes::ExtendedReduction: {
3603 bool Negated = ExpressionType == ExpressionTypes::NegatedExtendedReduction;
3604 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
3605 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3606 O << Instruction::getOpcodeName(Opcode) << " (";
3607 if (Negated)
3608 O << (Opcode == Instruction::Add ? "sub (0, " : "fneg(");
3609 getOperand(0)->printAsOperand(O, SlotTracker);
3610 if (Negated)
3611 O << ")";
3612 Red->printFlags(O);
3613
3614 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3615 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3616 << *Ext0->getScalarType();
3617 if (Red->isConditional()) {
3618 O << ", ";
3619 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
3620 }
3621 O << ")";
3622 break;
3623 }
3624 case ExpressionTypes::ExtNegatedMulAccReduction: {
3625 RdxStart->printAsOperand(O, SlotTracker);
3626 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3627 O << Instruction::getOpcodeName(
3628 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3629 << " (sub (0, mul";
3630 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3631 Mul->printFlags(O);
3632 O << "(";
3633 getOperand(0)->printAsOperand(O, SlotTracker);
3634 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3635 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3636 << *Ext0->getScalarType() << "), (";
3637 getOperand(1)->printAsOperand(O, SlotTracker);
3638 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3639 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3640 << *Ext1->getScalarType() << ")";
3641 if (Red->isConditional()) {
3642 O << ", ";
3643 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
3644 }
3645 O << "))";
3646 break;
3647 }
3648 case ExpressionTypes::MulAccReduction:
3649 case ExpressionTypes::ExtMulAccReduction: {
3650 RdxStart->printAsOperand(O, SlotTracker);
3651 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3652 O << Instruction::getOpcodeName(
3653 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3654 << " (";
3655 O << "mul";
3656 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3657 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3658 : ExpressionRecipes[0]);
3659 Mul->printFlags(O);
3660 if (IsExtended)
3661 O << "(";
3662 getOperand(0)->printAsOperand(O, SlotTracker);
3663 if (IsExtended) {
3664 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3665 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3666 << *Ext0->getScalarType() << "), (";
3667 } else {
3668 O << ", ";
3669 }
3670 getOperand(1)->printAsOperand(O, SlotTracker);
3671 if (IsExtended) {
3672 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3673 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3674 << *Ext1->getScalarType() << ")";
3675 }
3676 if (Red->isConditional()) {
3677 O << ", ";
3678 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
3679 }
3680 O << ")";
3681 break;
3682 }
3683 }
3684}
3685
3686void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3687 VPSlotTracker &SlotTracker) const {
3688 if (isPartialReduction())
3689 O << Indent << "PARTIAL-REDUCE ";
3690 else
3691 O << Indent << "REDUCE ";
3692 printAsOperand(O, SlotTracker);
3693 O << " = ";
3694 getChainOp()->printAsOperand(O, SlotTracker);
3695 O << " +";
3696 printFlags(O);
3697 O << " reduce.";
3698 printRecurrenceKind(O, getRecurrenceKind());
3699 O << " (";
3700 getVecOp()->printAsOperand(O, SlotTracker);
3701 if (isConditional()) {
3702 O << ", ";
3703 getCondOp()->printAsOperand(O, SlotTracker);
3704 }
3705 O << ")";
3706}
3707
3708void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3709 VPSlotTracker &SlotTracker) const {
3710 O << Indent << "REDUCE ";
3711 printAsOperand(O, SlotTracker);
3712 O << " = ";
3713 getChainOp()->printAsOperand(O, SlotTracker);
3714 O << " +";
3715 printFlags(O);
3716 O << " vp.reduce."
3717 << Instruction::getOpcodeName(
3718 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
3719 << " (";
3720 getVecOp()->printAsOperand(O, SlotTracker);
3721 O << ", ";
3722 getEVL()->printAsOperand(O, SlotTracker);
3723 if (isConditional()) {
3724 O << ", ";
3725 getCondOp()->printAsOperand(O, SlotTracker);
3726 }
3727 O << ")";
3728}
3729
3730#endif
3731
3732void VPReplicateRecipe::execute(VPTransformState &State) {
3733 assert(IsSingleScalar &&
3734 "VPReplicateRecipes must be unrolled before ::execute");
3735 auto *Instr = getUnderlyingInstr();
3736 Instruction *Cloned = Instr->clone();
3737 Type *ResultTy = getScalarType();
3738 if (!ResultTy->isVoidTy()) {
3739 Cloned->setName(Instr->getName() + ".cloned");
3740 // The operands of the replicate recipe may have been narrowed, resulting in
3741 // a narrower result type. Update the type of the cloned instruction to the
3742 // correct type.
3743 if (ResultTy != Cloned->getType())
3744 Cloned->mutateType(Ty: ResultTy);
3745 }
3746
3747 applyFlags(I&: *Cloned);
3748 applyMetadata(I&: *Cloned);
3749
3750 if (hasPredicate())
3751 cast<CmpInst>(Val: Cloned)->setPredicate(getPredicate());
3752
3753 // Replace the operands of the cloned instructions with their scalar
3754 // equivalents in the new loop.
3755 for (const auto &[Idx, V] : enumerate(First: operands()))
3756 Cloned->setOperand(i: Idx, Val: State.get(Def: V, IsScalar: true));
3757
3758 // Place the cloned scalar in the new loop.
3759 State.Builder.Insert(I: Cloned);
3760
3761 State.set(Def: this, V: Cloned, IsScalar: true);
3762
3763 // If we just cloned a new assumption, add it the assumption cache.
3764 if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
3765 State.AC->registerAssumption(CI: II);
3766}
3767
3768/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3769/// which the legacy cost model computes a SCEV expression when computing the
3770/// address cost. Computing SCEVs for VPValues is incomplete and returns
3771/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3772/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3773static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3774 PredicatedScalarEvolution &PSE,
3775 const Loop *L) {
3776 const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
3777 if (isa<SCEVCouldNotCompute>(Val: Addr))
3778 return Addr;
3779
3780 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr;
3781}
3782
3783InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
3784 VPCostContext &Ctx) const {
3785 Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
3786 // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
3787 // transform, avoid computing their cost multiple times for now.
3788 Ctx.SkipCostComputation.insert(Ptr: UI);
3789
3790 if (VF.isScalable() && !isSingleScalar())
3791 return InstructionCost::getInvalid();
3792
3793 switch (UI->getOpcode()) {
3794 case Instruction::Alloca:
3795 if (VF.isScalable())
3796 return InstructionCost::getInvalid();
3797 return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul,
3798 Ty: this->getScalarType(), CostKind: Ctx.CostKind);
3799 case Instruction::GetElementPtr:
3800 // We mark this instruction as zero-cost because the cost of GEPs in
3801 // vectorized code depends on whether the corresponding memory instruction
3802 // is scalarized or not. Therefore, we handle GEPs with the memory
3803 // instruction cost.
3804 return 0;
3805 case Instruction::Call: {
3806 auto *CalledFn =
3807 cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue());
3808 Type *ResultTy = this->getScalarType();
3809 SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands()));
3810 return computeCallCost(CalledFn, ResultTy, ArgOps, IsSingleScalar: isSingleScalar(), VF,
3811 Ctx);
3812 }
3813 case Instruction::Add:
3814 case Instruction::Sub:
3815 case Instruction::FAdd:
3816 case Instruction::FSub:
3817 case Instruction::Mul:
3818 case Instruction::FMul:
3819 case Instruction::FDiv:
3820 case Instruction::FRem:
3821 case Instruction::Shl:
3822 case Instruction::LShr:
3823 case Instruction::AShr:
3824 case Instruction::And:
3825 case Instruction::Or:
3826 case Instruction::Xor:
3827 case Instruction::ICmp:
3828 case Instruction::FCmp:
3829 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
3830 Ctx) *
3831 (isSingleScalar() ? 1 : VF.getFixedValue());
3832 case Instruction::SDiv:
3833 case Instruction::UDiv:
3834 case Instruction::SRem:
3835 case Instruction::URem: {
3836 InstructionCost ScalarCost =
3837 getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx);
3838 if (isSingleScalar())
3839 return ScalarCost;
3840
3841 // If any of the operands is from a different replicate region and has its
3842 // cost skipped, it may have been forced to scalar. Fall back to legacy cost
3843 // model to avoid cost mis-match.
3844 if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) {
3845 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op);
3846 if (!PredR)
3847 return false;
3848 return Ctx.skipCostComputation(
3849 UI: dyn_cast_or_null<Instruction>(
3850 Val: PredR->getOperand(N: 0)->getUnderlyingValue()),
3851 IsVector: VF.isVector());
3852 }))
3853 break;
3854
3855 ScalarCost = ScalarCost * VF.getFixedValue() +
3856 Ctx.getScalarizationOverhead(ResultTy: this->getScalarType(),
3857 Operands: to_vector(Range: operands()), VF);
3858 // If the recipe is not predicated (i.e. not in a replicate region), return
3859 // the scalar cost. Otherwise handle predicated cost.
3860 if (!getRegion()->isReplicator())
3861 return ScalarCost;
3862
3863 // Account for the phi nodes that we will create.
3864 ScalarCost += VF.getFixedValue() *
3865 Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
3866 // Scale the cost by the probability of executing the predicated blocks.
3867 // This assumes the predicated block for each vector lane is equally
3868 // likely.
3869 ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
3870 return ScalarCost;
3871 }
3872 case Instruction::Load:
3873 case Instruction::Store: {
3874 bool IsLoad = UI->getOpcode() == Instruction::Load;
3875 const VPValue *PtrOp = getOperand(N: !IsLoad);
3876 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L);
3877 if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV))
3878 break;
3879
3880 Type *ValTy = (IsLoad ? this : getOperand(N: 0))->getScalarType();
3881 Type *ScalarPtrTy = PtrOp->getScalarType();
3882 const Align Alignment = getLoadStoreAlignment(I: UI);
3883 unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace();
3884 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0));
3885 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3886 bool UsedByLoadStoreAddress =
3887 !PreferVectorizedAddressing && vputils::isUsedByLoadStoreAddress(V: this);
3888 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3889 Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo,
3890 I: UsedByLoadStoreAddress ? UI : nullptr);
3891
3892 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF);
3893 InstructionCost ScalarCost =
3894 ScalarMemOpCost +
3895 Ctx.TTI.getAddressComputationCost(
3896 PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV,
3897 CostKind: Ctx.CostKind);
3898 if (isSingleScalar())
3899 return ScalarCost;
3900
3901 SmallVector<const VPValue *> OpsToScalarize;
3902 Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext());
3903 // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
3904 // don't assign scalarization overhead in general, if the target prefers
3905 // vectorized addressing or the loaded value is used as part of an address
3906 // of another load or store.
3907 if (!UsedByLoadStoreAddress) {
3908 bool EfficientVectorLoadStore =
3909 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3910 if (!(IsLoad && !PreferVectorizedAddressing) &&
3911 !(!IsLoad && EfficientVectorLoadStore))
3912 append_range(C&: OpsToScalarize, R: operands());
3913
3914 if (!EfficientVectorLoadStore)
3915 ResultTy = this->getScalarType();
3916 }
3917
3918 TTI::VectorInstrContext VIC =
3919 IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
3920 InstructionCost Cost =
3921 (ScalarCost * VF.getFixedValue()) +
3922 Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true);
3923
3924 const VPRegionBlock *ParentRegion = getRegion();
3925 if (ParentRegion && ParentRegion->isReplicator()) {
3926 if (!PtrSCEV)
3927 break;
3928 Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
3929 Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Ctx.CostKind);
3930
3931 auto *VecI1Ty = VectorType::get(
3932 ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF);
3933 Cost += Ctx.TTI.getScalarizationOverhead(
3934 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
3935 /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind);
3936
3937 if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) {
3938 // Artificially setting to a high enough value to practically disable
3939 // vectorization with such operations.
3940 return 3000000;
3941 }
3942 }
3943 return Cost;
3944 }
3945 case Instruction::SExt:
3946 case Instruction::ZExt:
3947 case Instruction::FPToUI:
3948 case Instruction::FPToSI:
3949 case Instruction::FPExt:
3950 case Instruction::PtrToInt:
3951 case Instruction::PtrToAddr:
3952 case Instruction::IntToPtr:
3953 case Instruction::SIToFP:
3954 case Instruction::UIToFP:
3955 case Instruction::Trunc:
3956 case Instruction::FPTrunc:
3957 case Instruction::Select:
3958 case Instruction::AddrSpaceCast: {
3959 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
3960 Ctx) *
3961 (isSingleScalar() ? 1 : VF.getFixedValue());
3962 }
3963 case Instruction::ExtractValue:
3964 case Instruction::InsertValue:
3965 return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind);
3966 }
3967
3968 return Ctx.getLegacyCost(UI, VF);
3969}
3970
3971InstructionCost VPReplicateRecipe::computeCallCost(
3972 Function *CalledFn, Type *ResultTy, ArrayRef<const VPValue *> ArgOps,
3973 bool IsSingleScalar, ElementCount VF, VPCostContext &Ctx) {
3974 SmallVector<Type *, 4> Tys = map_to_vector<4>(
3975 C&: ArgOps, F: [&](const VPValue *Op) { return Op->getScalarType(); });
3976
3977 Intrinsic::ID IntrinID = CalledFn->getIntrinsicID();
3978 auto GetIntrinsicCost = [&] {
3979 if (!IntrinID)
3980 return InstructionCost::getInvalid();
3981 return Ctx.TTI.getIntrinsicInstrCost(
3982 ICA: IntrinsicCostAttributes(IntrinID, ResultTy, Tys), CostKind: Ctx.CostKind);
3983 };
3984
3985 if (IntrinID && VPCostContext::isFreeScalarIntrinsic(ID: IntrinID)) {
3986 assert(GetIntrinsicCost() == 0 && "scalarizing intrinsic should be free");
3987 return 0;
3988 }
3989
3990 InstructionCost ScalarCallCost =
3991 Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind);
3992 if (IsSingleScalar) {
3993 ScalarCallCost = std::min(a: ScalarCallCost, b: GetIntrinsicCost());
3994 return ScalarCallCost;
3995 }
3996
3997 // Scalarization overhead is undefined for scalable VFs.
3998 if (VF.isScalable())
3999 return InstructionCost::getInvalid();
4000
4001 return ScalarCallCost * VF.getFixedValue() +
4002 Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF);
4003}
4004
4005#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4006void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4007 VPSlotTracker &SlotTracker) const {
4008 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
4009
4010 if (!getScalarType()->isVoidTy()) {
4011 printAsOperand(O, SlotTracker);
4012 O << " = ";
4013 }
4014 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
4015 O << "call";
4016 printFlags(O);
4017 O << "@" << CB->getCalledFunction()->getName() << "(";
4018 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
4019 O, [&O, &SlotTracker](VPValue *Op) {
4020 Op->printAsOperand(O, SlotTracker);
4021 });
4022 O << ")";
4023 } else {
4024 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
4025 printFlags(O);
4026 printOperands(O, SlotTracker);
4027 }
4028
4029 // Find if the recipe is used by a widened recipe via an intervening
4030 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
4031 if (any_of(users(), [](const VPUser *U) {
4032 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
4033 return !vputils::onlyScalarValuesUsed(PredR);
4034 return false;
4035 }))
4036 O << " (S->V)";
4037}
4038#endif
4039
4040void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
4041 llvm_unreachable("recipe must be removed when dissolving replicate region");
4042}
4043
4044InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
4045 VPCostContext &Ctx) const {
4046 // The legacy cost model doesn't assign costs to branches for individual
4047 // replicate regions. Match the current behavior in the VPlan cost model for
4048 // now.
4049 return 0;
4050}
4051
4052void VPPredInstPHIRecipe::execute(VPTransformState &State) {
4053 llvm_unreachable("recipe must be removed when dissolving replicate region");
4054}
4055
4056#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4057void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4058 VPSlotTracker &SlotTracker) const {
4059 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
4060 printAsOperand(O, SlotTracker);
4061 O << " = ";
4062 printOperands(O, SlotTracker);
4063}
4064#endif
4065
4066VPRecipeBase *VPWidenLoadRecipe::getAsRecipe() { return this; }
4067const VPRecipeBase *VPWidenLoadRecipe::getAsRecipe() const { return this; }
4068
4069VPRecipeBase *VPWidenLoadEVLRecipe::getAsRecipe() { return this; }
4070const VPRecipeBase *VPWidenLoadEVLRecipe::getAsRecipe() const { return this; }
4071
4072VPRecipeBase *VPWidenStoreRecipe::getAsRecipe() { return this; }
4073const VPRecipeBase *VPWidenStoreRecipe::getAsRecipe() const { return this; }
4074
4075VPRecipeBase *VPWidenStoreEVLRecipe::getAsRecipe() { return this; }
4076const VPRecipeBase *VPWidenStoreEVLRecipe::getAsRecipe() const { return this; }
4077
4078InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
4079 VPCostContext &Ctx) const {
4080 const VPRecipeBase *R = getAsRecipe();
4081 bool IsLoad = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: R);
4082 Type *ScalarTy = IsLoad ? cast<VPSingleDefRecipe>(Val: R)->getScalarType()
4083 : R->getOperand(N: 1)->getScalarType();
4084 Type *Ty = toVectorTy(Scalar: ScalarTy, EC: VF);
4085 unsigned AS =
4086 cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace();
4087 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
4088
4089 if (!Consecutive) {
4090 // TODO: Using the original IR may not be accurate.
4091 // Currently, ARM will use the underlying IR to calculate gather/scatter
4092 // instruction cost.
4093 [[maybe_unused]] auto IsReverseMask = [this, R]() {
4094 VPValue *Mask = getMask();
4095 if (!Mask)
4096 return false;
4097
4098 if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(Val: R))
4099 return match(V: Mask, P: m_Intrinsic<Intrinsic::experimental_vp_reverse>());
4100
4101 return match(V: Mask, P: m_Reverse(Op0: m_VPValue()));
4102 };
4103 assert(!IsReverseMask() &&
4104 "Inconsecutive memory access should not have reverse order");
4105 Type *PtrTy = getAddr()->getScalarType();
4106 const Value *Ptr = getAddr()->getUnderlyingValue();
4107
4108 // If the address value is uniform across all lanes, then the address can be
4109 // calculated with scalar type and broadcast.
4110 if (!vputils::isSingleScalar(VPV: getAddr()))
4111 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
4112
4113 unsigned IID = isa<VPWidenLoadRecipe>(Val: R) ? Intrinsic::masked_gather
4114 : isa<VPWidenStoreRecipe>(Val: R) ? Intrinsic::masked_scatter
4115 : isa<VPWidenLoadEVLRecipe>(Val: R) ? Intrinsic::vp_gather
4116 : Intrinsic::vp_scatter;
4117 return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
4118 CostKind: Ctx.CostKind) +
4119 Ctx.TTI.getMemIntrinsicInstrCost(
4120 MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
4121 &Ingredient),
4122 CostKind: Ctx.CostKind);
4123 }
4124
4125 InstructionCost Cost = 0;
4126 if (IsMasked) {
4127 unsigned IID = isa<VPWidenLoadRecipe>(Val: R) ? Intrinsic::masked_load
4128 : Intrinsic::masked_store;
4129 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
4130 MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
4131 } else {
4132 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
4133 V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: R) ? R->getOperand(N: 0)
4134 : R->getOperand(N: 1));
4135 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
4136 OpdInfo: OpInfo, I: &Ingredient);
4137 }
4138 return Cost;
4139}
4140
4141void VPWidenLoadRecipe::execute(VPTransformState &State) {
4142 Type *ScalarDataTy = getScalarType();
4143 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
4144 bool CreateGather = !isConsecutive();
4145
4146 auto &Builder = State.Builder;
4147 Value *Mask = nullptr;
4148 if (auto *VPMask = getMask())
4149 Mask = State.get(Def: VPMask);
4150
4151 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
4152 Value *NewLI;
4153 if (CreateGather) {
4154 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
4155 Name: "wide.masked.gather");
4156 } else if (Mask) {
4157 NewLI =
4158 Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
4159 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
4160 } else {
4161 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
4162 }
4163 applyMetadata(I&: *cast<Instruction>(Val: NewLI));
4164 State.set(Def: this, V: NewLI);
4165}
4166
4167#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4168void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4169 VPSlotTracker &SlotTracker) const {
4170 O << Indent << "WIDEN ";
4171 printAsOperand(O, SlotTracker);
4172 O << " = load ";
4173 printOperands(O, SlotTracker);
4174}
4175#endif
4176
4177void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
4178 Type *ScalarDataTy = getScalarType();
4179 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
4180 bool CreateGather = !isConsecutive();
4181
4182 auto &Builder = State.Builder;
4183 CallInst *NewLI;
4184 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
4185 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
4186 Value *Mask = nullptr;
4187 if (VPValue *VPMask = getMask())
4188 Mask = State.get(Def: VPMask);
4189 else
4190 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
4191
4192 if (CreateGather) {
4193 NewLI = Builder.CreateIntrinsicWithoutFolding(RetTy: DataTy, ID: Intrinsic::vp_gather,
4194 Args: {Addr, Mask, EVL}, FMFSource: nullptr,
4195 Name: "wide.masked.gather");
4196 } else {
4197 NewLI = Builder.CreateIntrinsicWithoutFolding(
4198 RetTy: DataTy, ID: Intrinsic::vp_load, Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
4199 }
4200 NewLI->addParamAttr(
4201 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
4202 applyMetadata(I&: *NewLI);
4203 State.set(Def: this, V: NewLI);
4204}
4205
4206InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
4207 VPCostContext &Ctx) const {
4208 if (!Consecutive || IsMasked)
4209 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4210
4211 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4212 // here because the EVL recipes using EVL to replace the tail mask. But in the
4213 // legacy model, it will always calculate the cost of mask.
4214 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4215 // don't need to compare to the legacy cost model.
4216 Type *Ty = toVectorTy(Scalar: getScalarType(), EC: VF);
4217 unsigned AS =
4218 cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace();
4219 return Ctx.TTI.getMemIntrinsicInstrCost(
4220 MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
4221 CostKind: Ctx.CostKind);
4222}
4223
4224#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4225void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4226 VPSlotTracker &SlotTracker) const {
4227 O << Indent << "WIDEN ";
4228 printAsOperand(O, SlotTracker);
4229 O << " = vp.load ";
4230 printOperands(O, SlotTracker);
4231}
4232#endif
4233
4234void VPWidenStoreRecipe::execute(VPTransformState &State) {
4235 VPValue *StoredVPValue = getStoredValue();
4236 bool CreateScatter = !isConsecutive();
4237
4238 auto &Builder = State.Builder;
4239
4240 Value *Mask = nullptr;
4241 if (auto *VPMask = getMask())
4242 Mask = State.get(Def: VPMask);
4243
4244 Value *StoredVal = State.get(Def: StoredVPValue);
4245 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
4246 Instruction *NewSI = nullptr;
4247 if (CreateScatter)
4248 NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
4249 else if (Mask)
4250 NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
4251 else
4252 NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
4253 applyMetadata(I&: *NewSI);
4254}
4255
4256#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4257void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4258 VPSlotTracker &SlotTracker) const {
4259 O << Indent << "WIDEN store ";
4260 printOperands(O, SlotTracker);
4261}
4262#endif
4263
4264void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
4265 VPValue *StoredValue = getStoredValue();
4266 bool CreateScatter = !isConsecutive();
4267
4268 auto &Builder = State.Builder;
4269
4270 CallInst *NewSI = nullptr;
4271 Value *StoredVal = State.get(Def: StoredValue);
4272 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
4273 Value *Mask = nullptr;
4274 if (VPValue *VPMask = getMask())
4275 Mask = State.get(Def: VPMask);
4276 else
4277 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
4278
4279 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
4280 if (CreateScatter) {
4281 NewSI = Builder.CreateIntrinsicWithoutFolding(
4282 RetTy: Type::getVoidTy(C&: EVL->getContext()), ID: Intrinsic::vp_scatter,
4283 Args: {StoredVal, Addr, Mask, EVL});
4284 } else {
4285 NewSI = Builder.CreateIntrinsicWithoutFolding(
4286 RetTy: Type::getVoidTy(C&: EVL->getContext()), ID: Intrinsic::vp_store,
4287 Args: {StoredVal, Addr, Mask, EVL});
4288 }
4289 NewSI->addParamAttr(
4290 ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
4291 applyMetadata(I&: *NewSI);
4292}
4293
4294InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
4295 VPCostContext &Ctx) const {
4296 if (!Consecutive || IsMasked)
4297 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
4298
4299 // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
4300 // here because the EVL recipes using EVL to replace the tail mask. But in the
4301 // legacy model, it will always calculate the cost of mask.
4302 // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
4303 // don't need to compare to the legacy cost model.
4304 Type *Ty = toVectorTy(Scalar: getStoredValue()->getScalarType(), EC: VF);
4305 unsigned AS =
4306 cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace();
4307 return Ctx.TTI.getMemIntrinsicInstrCost(
4308 MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
4309 CostKind: Ctx.CostKind);
4310}
4311
4312#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4313void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4314 VPSlotTracker &SlotTracker) const {
4315 O << Indent << "WIDEN vp.store ";
4316 printOperands(O, SlotTracker);
4317}
4318#endif
4319
4320static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
4321 VectorType *DstVTy, const DataLayout &DL) {
4322 // Verify that V is a vector type with same number of elements as DstVTy.
4323 auto VF = DstVTy->getElementCount();
4324 auto *SrcVecTy = cast<VectorType>(Val: V->getType());
4325 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4326 Type *SrcElemTy = SrcVecTy->getElementType();
4327 Type *DstElemTy = DstVTy->getElementType();
4328 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4329 "Vector elements must have same size");
4330
4331 // Do a direct cast if element types are castable.
4332 if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
4333 return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
4334 }
4335 // V cannot be directly casted to desired vector type.
4336 // May happen when V is a floating point vector but DstVTy is a vector of
4337 // pointers or vice-versa. Handle this using a two-step bitcast using an
4338 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4339 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4340 "Only one type should be a pointer type");
4341 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4342 "Only one type should be a floating point type");
4343 Type *IntTy =
4344 IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
4345 auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
4346 Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
4347 return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
4348}
4349
4350/// Return a vector containing interleaved elements from multiple
4351/// smaller input vectors.
4352static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
4353 const Twine &Name) {
4354 unsigned Factor = Vals.size();
4355 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4356
4357 VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
4358#ifndef NDEBUG
4359 for (Value *Val : Vals)
4360 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4361#endif
4362
4363 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4364 // must use intrinsics to interleave.
4365 if (VecTy->isScalableTy()) {
4366 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4367 return Builder.CreateVectorInterleave(Ops: Vals, Name);
4368 }
4369
4370 // Fixed length. Start by concatenating all vectors into a wide vector.
4371 Value *WideVec = concatenateVectors(Builder, Vecs: Vals);
4372
4373 // Interleave the elements into the wide vector.
4374 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4375 return Builder.CreateShuffleVector(
4376 V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
4377}
4378
4379// Try to vectorize the interleave group that \p Instr belongs to.
4380//
4381// E.g. Translate following interleaved load group (factor = 3):
4382// for (i = 0; i < N; i+=3) {
4383// R = Pic[i]; // Member of index 0
4384// G = Pic[i+1]; // Member of index 1
4385// B = Pic[i+2]; // Member of index 2
4386// ... // do something to R, G, B
4387// }
4388// To:
4389// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4390// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4391// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4392// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4393//
4394// Or translate following interleaved store group (factor = 3):
4395// for (i = 0; i < N; i+=3) {
4396// ... do something to R, G, B
4397// Pic[i] = R; // Member of index 0
4398// Pic[i+1] = G; // Member of index 1
4399// Pic[i+2] = B; // Member of index 2
4400// }
4401// To:
4402// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4403// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4404// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4405// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4406// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4407void VPInterleaveRecipe::execute(VPTransformState &State) {
4408 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4409 "Masking gaps for scalable vectors is not yet supported.");
4410 const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
4411 Instruction *Instr = Group->getInsertPos();
4412
4413 // Prepare for the vector type of the interleaved load/store.
4414 Type *ScalarTy = getLoadStoreType(I: Instr);
4415 unsigned InterleaveFactor = Group->getFactor();
4416 auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);
4417
4418 VPValue *BlockInMask = getMask();
4419 VPValue *Addr = getAddr();
4420 Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
4421
4422 auto CreateGroupMask = [&BlockInMask, &State,
4423 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4424 if (State.VF.isScalable()) {
4425 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4426 assert(InterleaveFactor <= 8 &&
4427 "Unsupported deinterleave factor for scalable vectors");
4428 auto *ResBlockInMask = State.get(Def: BlockInMask);
4429 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4430 return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
4431 }
4432
4433 if (!BlockInMask)
4434 return MaskForGaps;
4435
4436 Value *ResBlockInMask = State.get(Def: BlockInMask);
4437 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4438 V: ResBlockInMask,
4439 Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
4440 Name: "interleaved.mask");
4441 return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
4442 LHS: ShuffledMask, RHS: MaskForGaps)
4443 : ShuffledMask;
4444 };
4445
4446 const DataLayout &DL = Instr->getDataLayout();
4447 // Vectorize the interleaved load group.
4448 if (isa<LoadInst>(Val: Instr)) {
4449 Value *MaskForGaps = nullptr;
4450 if (needsMaskForGaps()) {
4451 MaskForGaps =
4452 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
4453 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4454 }
4455
4456 Instruction *NewLoad;
4457 if (BlockInMask || MaskForGaps) {
4458 Value *GroupMask = CreateGroupMask(MaskForGaps);
4459 Value *PoisonVec = PoisonValue::get(T: VecTy);
4460 NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
4461 Alignment: Group->getAlign(), Mask: GroupMask,
4462 PassThru: PoisonVec, Name: "wide.masked.vec");
4463 } else
4464 NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
4465 Align: Group->getAlign(), Name: "wide.vec");
4466 applyMetadata(I&: *NewLoad);
4467 // TODO: Also manage existing metadata using VPIRMetadata.
4468 Group->addMetadata(NewInst: NewLoad);
4469
4470 ArrayRef<VPRecipeValue *> VPDefs = definedValues();
4471 if (VecTy->isScalableTy()) {
4472 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4473 // so must use intrinsics to deinterleave.
4474 assert(InterleaveFactor <= 8 &&
4475 "Unsupported deinterleave factor for scalable vectors");
4476 NewLoad = State.Builder.CreateIntrinsicWithoutFolding(
4477 ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
4478 OverloadTypes: NewLoad->getType(), Args: NewLoad,
4479 /*FMFSource=*/nullptr, Name: "strided.vec");
4480 }
4481
4482 auto CreateStridedVector = [&InterleaveFactor, &State,
4483 &NewLoad](unsigned Index) -> Value * {
4484 assert(Index < InterleaveFactor && "Illegal group index");
4485 if (State.VF.isScalable())
4486 return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index);
4487
4488 // For fixed length VF, use shuffle to extract the sub-vectors from the
4489 // wide load.
4490 auto StrideMask =
4491 createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
4492 return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask,
4493 Name: "strided.vec");
4494 };
4495
4496 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4497 Instruction *Member = Group->getMember(Index: I);
4498
4499 // Skip the gaps in the group.
4500 if (!Member)
4501 continue;
4502
4503 Value *StridedVec = CreateStridedVector(I);
4504
4505 // If this member has different type, cast the result type.
4506 if (Member->getType() != ScalarTy) {
4507 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
4508 StridedVec =
4509 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
4510 }
4511
4512 if (Group->isReverse())
4513 StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
4514
4515 State.set(Def: VPDefs[J], V: StridedVec);
4516 ++J;
4517 }
4518 return;
4519 }
4520
4521 // The sub vector type for current instruction.
4522 auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
4523
4524 // Vectorize the interleaved store group.
4525 Value *MaskForGaps =
4526 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
4527 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4528 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4529 ArrayRef<VPValue *> StoredValues = getStoredValues();
4530 // Collect the stored vector from each member.
4531 SmallVector<Value *, 4> StoredVecs;
4532 unsigned StoredIdx = 0;
4533 for (unsigned i = 0; i < InterleaveFactor; i++) {
4534 assert((Group->getMember(i) || MaskForGaps) &&
4535 "Fail to get a member from an interleaved store group");
4536 Instruction *Member = Group->getMember(Index: i);
4537
4538 // Skip the gaps in the group.
4539 if (!Member) {
4540 Value *Undef = PoisonValue::get(T: SubVT);
4541 StoredVecs.push_back(Elt: Undef);
4542 continue;
4543 }
4544
4545 Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
4546 ++StoredIdx;
4547
4548 if (Group->isReverse())
4549 StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");
4550
4551 // If this member has different type, cast it to a unified type.
4552
4553 if (StoredVec->getType() != SubVT)
4554 StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);
4555
4556 StoredVecs.push_back(Elt: StoredVec);
4557 }
4558
4559 // Interleave all the smaller vectors into one wider vector.
4560 Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
4561 Instruction *NewStoreInstr;
4562 if (BlockInMask || MaskForGaps) {
4563 Value *GroupMask = CreateGroupMask(MaskForGaps);
4564 NewStoreInstr = State.Builder.CreateMaskedStore(
4565 Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
4566 } else
4567 NewStoreInstr =
4568 State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());
4569
4570 applyMetadata(I&: *NewStoreInstr);
4571 // TODO: Also manage existing metadata using VPIRMetadata.
4572 Group->addMetadata(NewInst: NewStoreInstr);
4573}
4574
4575#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4576void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4577 VPSlotTracker &SlotTracker) const {
4578 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4579 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4580 getAddr()->printAsOperand(O, SlotTracker);
4581 VPValue *Mask = getMask();
4582 if (Mask) {
4583 O << ", ";
4584 Mask->printAsOperand(O, SlotTracker);
4585 }
4586
4587 unsigned OpIdx = 0;
4588 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4589 if (!IG->getMember(i))
4590 continue;
4591 if (getNumStoreOperands() > 0) {
4592 O << "\n" << Indent << " store ";
4593 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
4594 O << " to index " << i;
4595 } else {
4596 O << "\n" << Indent << " ";
4597 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4598 O << " = load from index " << i;
4599 }
4600 ++OpIdx;
4601 }
4602}
4603#endif
4604
4605void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
4606 assert(State.VF.isScalable() &&
4607 "Only support scalable VF for EVL tail-folding.");
4608 assert(!needsMaskForGaps() &&
4609 "Masking gaps for scalable vectors is not yet supported.");
4610 const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
4611 Instruction *Instr = Group->getInsertPos();
4612
4613 // Prepare for the vector type of the interleaved load/store.
4614 Type *ScalarTy = getLoadStoreType(I: Instr);
4615 unsigned InterleaveFactor = Group->getFactor();
4616 assert(InterleaveFactor <= 8 &&
4617 "Unsupported deinterleave/interleave factor for scalable vectors");
4618 ElementCount WideVF = State.VF * InterleaveFactor;
4619 auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF);
4620
4621 VPValue *Addr = getAddr();
4622 Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
4623 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
4624 Value *InterleaveEVL = State.Builder.CreateMul(
4625 LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl",
4626 /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true);
4627 LLVMContext &Ctx = State.Builder.getContext();
4628
4629 Value *GroupMask = nullptr;
4630 if (VPValue *BlockInMask = getMask()) {
4631 SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask));
4632 GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
4633 } else {
4634 GroupMask =
4635 State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue());
4636 }
4637
4638 // Vectorize the interleaved load group.
4639 if (isa<LoadInst>(Val: Instr)) {
4640 CallInst *NewLoad = State.Builder.CreateIntrinsicWithoutFolding(
4641 RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr,
4642 Name: "wide.vp.load");
4643 NewLoad->addParamAttr(ArgNo: 0,
4644 Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));
4645
4646 applyMetadata(I&: *NewLoad);
4647 // TODO: Also manage existing metadata using VPIRMetadata.
4648 Group->addMetadata(NewInst: NewLoad);
4649
4650 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4651 // so must use intrinsics to deinterleave.
4652 NewLoad = State.Builder.CreateIntrinsicWithoutFolding(
4653 ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
4654 OverloadTypes: NewLoad->getType(), Args: NewLoad,
4655 /*FMFSource=*/nullptr, Name: "strided.vec");
4656
4657 const DataLayout &DL = Instr->getDataLayout();
4658 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4659 Instruction *Member = Group->getMember(Index: I);
4660 // Skip the gaps in the group.
4661 if (!Member)
4662 continue;
4663
4664 Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I);
4665 // If this member has different type, cast the result type.
4666 if (Member->getType() != ScalarTy) {
4667 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
4668 StridedVec =
4669 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
4670 }
4671
4672 State.set(Def: getVPValue(I: J), V: StridedVec);
4673 ++J;
4674 }
4675 return;
4676 } // End for interleaved load.
4677
4678 // The sub vector type for current instruction.
4679 auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
4680 // Vectorize the interleaved store group.
4681 ArrayRef<VPValue *> StoredValues = getStoredValues();
4682 // Collect the stored vector from each member.
4683 SmallVector<Value *, 4> StoredVecs;
4684 const DataLayout &DL = Instr->getDataLayout();
4685 for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
4686 Instruction *Member = Group->getMember(Index: I);
4687 // Skip the gaps in the group.
4688 if (!Member) {
4689 StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT));
4690 continue;
4691 }
4692
4693 Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
4694 // If this member has different type, cast it to a unified type.
4695 if (StoredVec->getType() != SubVT)
4696 StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);
4697
4698 StoredVecs.push_back(Elt: StoredVec);
4699 ++StoredIdx;
4700 }
4701
4702 // Interleave all the smaller vectors into one wider vector.
4703 Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
4704 CallInst *NewStore = State.Builder.CreateIntrinsicWithoutFolding(
4705 RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store,
4706 Args: {IVec, ResAddr, GroupMask, InterleaveEVL});
4707
4708 NewStore->addParamAttr(ArgNo: 1,
4709 Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));
4710
4711 applyMetadata(I&: *NewStore);
4712 // TODO: Also manage existing metadata using VPIRMetadata.
4713 Group->addMetadata(NewInst: NewStore);
4714}
4715
4716#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4717void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4718 VPSlotTracker &SlotTracker) const {
4719 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4720 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << ", ";
4721 getAddr()->printAsOperand(O, SlotTracker);
4722 O << ", ";
4723 getEVL()->printAsOperand(O, SlotTracker);
4724 if (VPValue *Mask = getMask()) {
4725 O << ", ";
4726 Mask->printAsOperand(O, SlotTracker);
4727 }
4728
4729 unsigned OpIdx = 0;
4730 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4731 if (!IG->getMember(i))
4732 continue;
4733 if (getNumStoreOperands() > 0) {
4734 O << "\n" << Indent << " vp.store ";
4735 getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
4736 O << " to index " << i;
4737 } else {
4738 O << "\n" << Indent << " ";
4739 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4740 O << " = vp.load from index " << i;
4741 }
4742 ++OpIdx;
4743 }
4744}
4745#endif
4746
4747InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
4748 VPCostContext &Ctx) const {
4749 Instruction *InsertPos = getInsertPos();
4750 // Find the VPValue index of the interleave group. We need to skip gaps.
4751 unsigned InsertPosIdx = 0;
4752 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4753 if (auto *Member = IG->getMember(Index: Idx)) {
4754 if (Member == InsertPos)
4755 break;
4756 InsertPosIdx++;
4757 }
4758 const VPValue *ValV = getNumDefinedValues() > 0
4759 ? getVPValue(I: InsertPosIdx)
4760 : getStoredValues()[InsertPosIdx];
4761 Type *ValTy = ValV->getScalarType();
4762 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4763 unsigned AS =
4764 cast<PointerType>(Val: getAddr()->getScalarType())->getAddressSpace();
4765
4766 unsigned InterleaveFactor = IG->getFactor();
4767 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
4768
4769 // Holds the indices of existing members in the interleaved group.
4770 SmallVector<unsigned, 4> Indices;
4771 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4772 if (IG->getMember(Index: IF))
4773 Indices.push_back(Elt: IF);
4774
4775 // Calculate the cost of the whole interleaved group.
4776 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4777 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices,
4778 Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps);
4779
4780 if (!IG->isReverse())
4781 return Cost;
4782
4783 return Cost + IG->getNumMembers() *
4784 Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse,
4785 DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind,
4786 Index: 0);
4787}
4788
4789bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
4790 return vputils::onlyScalarValuesUsed(Def: this) &&
4791 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
4792}
4793
4794#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4795void VPWidenPointerInductionRecipe::printRecipe(
4796 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4797 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4798 "unexpected number of operands");
4799 O << Indent << "EMIT ";
4800 printAsOperand(O, SlotTracker);
4801 O << " = WIDEN-POINTER-INDUCTION ";
4802 getStartValue()->printAsOperand(O, SlotTracker);
4803 O << ", ";
4804 getStepValue()->printAsOperand(O, SlotTracker);
4805 O << ", ";
4806 getOperand(2)->printAsOperand(O, SlotTracker);
4807 if (getNumOperands() == 5) {
4808 O << ", ";
4809 getOperand(3)->printAsOperand(O, SlotTracker);
4810 O << ", ";
4811 getOperand(4)->printAsOperand(O, SlotTracker);
4812 }
4813}
4814
4815void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4816 VPSlotTracker &SlotTracker) const {
4817 O << Indent << "EMIT ";
4818 printAsOperand(O, SlotTracker);
4819 O << " = EXPAND SCEV " << *Expr;
4820}
4821#endif
4822
4823#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4824void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4825 VPSlotTracker &SlotTracker) const {
4826 O << Indent << "EMIT ";
4827 printAsOperand(O, SlotTracker);
4828 O << " = WIDEN-CANONICAL-INDUCTION";
4829 printFlags(O);
4830 printOperands(O, SlotTracker);
4831}
4832#endif
4833
4834void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
4835 auto &Builder = State.Builder;
4836 // Create a vector from the initial value.
4837 auto *VectorInit = getStartValue()->getLiveInIRValue();
4838
4839 Type *VecTy = State.VF.isScalar()
4840 ? VectorInit->getType()
4841 : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);
4842
4843 BasicBlock *VectorPH =
4844 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
4845 if (State.VF.isVector()) {
4846 auto *IdxTy = Builder.getInt32Ty();
4847 auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
4848 IRBuilder<>::InsertPointGuard Guard(Builder);
4849 Builder.SetInsertPoint(VectorPH->getTerminator());
4850 auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
4851 auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
4852 VectorInit = Builder.CreateInsertElement(
4853 Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
4854 }
4855
4856 // Create a phi node for the new recurrence.
4857 PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
4858 Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
4859 Phi->addIncoming(V: VectorInit, BB: VectorPH);
4860 State.set(Def: this, V: Phi);
4861}
4862
4863InstructionCost
4864VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
4865 VPCostContext &Ctx) const {
4866 if (VF.isScalar())
4867 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4868
4869 return 0;
4870}
4871
4872#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4873void VPFirstOrderRecurrencePHIRecipe::printRecipe(
4874 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4875 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
4876 printAsOperand(O, SlotTracker);
4877 O << " = phi ";
4878 printOperands(O, SlotTracker);
4879}
4880#endif
4881
4882void VPReductionPHIRecipe::execute(VPTransformState &State) {
4883 // Reductions do not have to start at zero. They can start with
4884 // any loop invariant values.
4885 VPValue *StartVPV = getStartValue();
4886
4887 // In order to support recurrences we need to be able to vectorize Phi nodes.
4888 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4889 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4890 // this value when we vectorize all of the instructions that use the PHI.
4891 BasicBlock *VectorPH =
4892 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
4893 bool ScalarPHI = State.VF.isScalar() || isInLoop();
4894 Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI);
4895 Type *VecTy = StartV->getType();
4896
4897 BasicBlock *HeaderBB = State.CFG.PrevBB;
4898 assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
4899 "recipe must be in the vector loop header");
4900 auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
4901 Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
4902 State.set(Def: this, V: Phi, IsScalar: isInLoop());
4903
4904 Phi->addIncoming(V: StartV, BB: VectorPH);
4905}
4906
4907#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4908void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4909 VPSlotTracker &SlotTracker) const {
4910 O << Indent << "WIDEN-REDUCTION-PHI ";
4911
4912 printAsOperand(O, SlotTracker);
4913 O << " = phi (";
4914 printRecurrenceKind(O, Kind);
4915 O << ")";
4916 printFlags(O);
4917 printOperands(O, SlotTracker);
4918 if (getVFScaleFactor() > 1)
4919 O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
4920}
4921#endif
4922
4923bool VPBlendRecipe::usesFirstLaneOnly(const VPValue *Op) const {
4924 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
4925 return vputils::onlyFirstLaneUsed(Def: this);
4926}
4927
4928void VPWidenPHIRecipe::execute(VPTransformState &State) {
4929 executePhiRecipe(R: this, Phi&: *this, State, /*IsScalar=*/false, Name);
4930}
4931
4932InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
4933 VPCostContext &Ctx) const {
4934 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4935}
4936
4937#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4938void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4939 VPSlotTracker &SlotTracker) const {
4940 O << Indent << "WIDEN-PHI ";
4941
4942 printAsOperand(O, SlotTracker);
4943 O << " = phi ";
4944 printPhiOperands(O, SlotTracker);
4945}
4946#endif
4947
4948void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
4949 BasicBlock *VectorPH =
4950 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
4951 Value *StartMask = State.get(Def: getOperand(N: 0));
4952 PHINode *Phi =
4953 State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
4954 Phi->addIncoming(V: StartMask, BB: VectorPH);
4955 State.set(Def: this, V: Phi);
4956}
4957
4958#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4959void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4960 VPSlotTracker &SlotTracker) const {
4961 O << Indent << "ACTIVE-LANE-MASK-PHI ";
4962
4963 printAsOperand(O, SlotTracker);
4964 O << " = phi ";
4965 printOperands(O, SlotTracker);
4966}
4967#endif
4968
4969#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4970void VPCurrentIterationPHIRecipe::printRecipe(
4971 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4972 O << Indent << "CURRENT-ITERATION-PHI ";
4973
4974 printAsOperand(O, SlotTracker);
4975 O << " = phi ";
4976 printOperands(O, SlotTracker);
4977}
4978#endif
4979