//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/IVDescriptors.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
34#include "llvm/Support/Casting.h"
35#include "llvm/Support/CommandLine.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Transforms/Utils/BasicBlockUtils.h"
39#include "llvm/Transforms/Utils/LoopUtils.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
45using VectorParts = SmallVector<Value *, 2>;
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
50bool VPRecipeBase::mayWriteToMemory() const {
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(Val: this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
63 return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
68 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
71 return !cast<VPWidenCallRecipe>(Val: this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenIntrinsicSC:
75 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
76 case VPActiveLaneMaskPHISC:
77 case VPCanonicalIVPHISC:
78 case VPCurrentIterationPHISC:
79 case VPBranchOnMaskSC:
80 case VPDerivedIVSC:
81 case VPFirstOrderRecurrencePHISC:
82 case VPReductionPHISC:
83 case VPScalarIVStepsSC:
84 case VPPredInstPHISC:
85 return false;
86 case VPBlendSC:
87 case VPReductionEVLSC:
88 case VPReductionSC:
89 case VPVectorPointerSC:
90 case VPWidenCanonicalIVSC:
91 case VPWidenCastSC:
92 case VPWidenGEPSC:
93 case VPWidenIntOrFpInductionSC:
94 case VPWidenLoadEVLSC:
95 case VPWidenLoadSC:
96 case VPWidenPHISC:
97 case VPWidenPointerInductionSC:
98 case VPWidenSC: {
99 const Instruction *I =
100 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
101 (void)I;
102 assert((!I || !I->mayWriteToMemory()) &&
103 "underlying instruction may write to memory");
104 return false;
105 }
106 default:
107 return true;
108 }
109}
110
111bool VPRecipeBase::mayReadFromMemory() const {
112 switch (getVPRecipeID()) {
113 case VPExpressionSC:
114 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
115 case VPInstructionSC:
116 return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
117 case VPWidenLoadEVLSC:
118 case VPWidenLoadSC:
119 return true;
120 case VPReplicateSC:
121 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
122 ->mayReadFromMemory();
123 case VPWidenCallSC:
124 return !cast<VPWidenCallRecipe>(Val: this)
125 ->getCalledScalarFunction()
126 ->onlyWritesMemory();
127 case VPWidenIntrinsicSC:
128 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
129 case VPBranchOnMaskSC:
130 case VPDerivedIVSC:
131 case VPFirstOrderRecurrencePHISC:
132 case VPReductionPHISC:
133 case VPPredInstPHISC:
134 case VPScalarIVStepsSC:
135 case VPWidenStoreEVLSC:
136 case VPWidenStoreSC:
137 return false;
138 case VPBlendSC:
139 case VPReductionEVLSC:
140 case VPReductionSC:
141 case VPVectorPointerSC:
142 case VPWidenCanonicalIVSC:
143 case VPWidenCastSC:
144 case VPWidenGEPSC:
145 case VPWidenIntOrFpInductionSC:
146 case VPWidenPHISC:
147 case VPWidenPointerInductionSC:
148 case VPWidenSC: {
149 const Instruction *I =
150 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
151 (void)I;
152 assert((!I || !I->mayReadFromMemory()) &&
153 "underlying instruction may read from memory");
154 return false;
155 }
156 default:
157 // FIXME: Return false if the recipe represents an interleaved store.
158 return true;
159 }
160}
161
162bool VPRecipeBase::mayHaveSideEffects() const {
163 switch (getVPRecipeID()) {
164 case VPExpressionSC:
165 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
166 case VPActiveLaneMaskPHISC:
167 case VPDerivedIVSC:
168 case VPFirstOrderRecurrencePHISC:
169 case VPReductionPHISC:
170 case VPPredInstPHISC:
171 case VPVectorEndPointerSC:
172 return false;
173 case VPInstructionSC: {
174 auto *VPI = cast<VPInstruction>(Val: this);
175 return mayWriteToMemory() ||
176 VPI->getOpcode() == VPInstruction::BranchOnCount ||
177 VPI->getOpcode() == VPInstruction::BranchOnCond ||
178 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
179 }
180 case VPWidenCallSC: {
181 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
182 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
183 }
184 case VPWidenIntrinsicSC:
185 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
186 case VPBlendSC:
187 case VPReductionEVLSC:
188 case VPReductionSC:
189 case VPScalarIVStepsSC:
190 case VPVectorPointerSC:
191 case VPWidenCanonicalIVSC:
192 case VPWidenCastSC:
193 case VPWidenGEPSC:
194 case VPWidenIntOrFpInductionSC:
195 case VPWidenPHISC:
196 case VPWidenPointerInductionSC:
197 case VPWidenSC: {
198 const Instruction *I =
199 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
200 (void)I;
201 assert((!I || !I->mayHaveSideEffects()) &&
202 "underlying instruction has side-effects");
203 return false;
204 }
205 case VPInterleaveEVLSC:
206 case VPInterleaveSC:
207 return mayWriteToMemory();
208 case VPWidenLoadEVLSC:
209 case VPWidenLoadSC:
210 case VPWidenStoreEVLSC:
211 case VPWidenStoreSC:
212 assert(
213 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
214 mayWriteToMemory() &&
215 "mayHaveSideffects result for ingredient differs from this "
216 "implementation");
217 return mayWriteToMemory();
218 case VPReplicateSC: {
219 auto *R = cast<VPReplicateRecipe>(Val: this);
220 return R->getUnderlyingInstr()->mayHaveSideEffects();
221 }
222 default:
223 return true;
224 }
225}
226
227void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
228 assert(!Parent && "Recipe already in some VPBasicBlock");
229 assert(InsertPos->getParent() &&
230 "Insertion position not in any VPBasicBlock");
231 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
232}
233
234void VPRecipeBase::insertBefore(VPBasicBlock &BB,
235 iplist<VPRecipeBase>::iterator I) {
236 assert(!Parent && "Recipe already in some VPBasicBlock");
237 assert(I == BB.end() || I->getParent() == &BB);
238 BB.insert(Recipe: this, InsertPt: I);
239}
240
241void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
242 assert(!Parent && "Recipe already in some VPBasicBlock");
243 assert(InsertPos->getParent() &&
244 "Insertion position not in any VPBasicBlock");
245 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
246}
247
248void VPRecipeBase::removeFromParent() {
249 assert(getParent() && "Recipe not in any VPBasicBlock");
250 getParent()->getRecipeList().remove(IT: getIterator());
251 Parent = nullptr;
252}
253
254iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
255 assert(getParent() && "Recipe not in any VPBasicBlock");
256 return getParent()->getRecipeList().erase(where: getIterator());
257}
258
259void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
260 removeFromParent();
261 insertAfter(InsertPos);
262}
263
264void VPRecipeBase::moveBefore(VPBasicBlock &BB,
265 iplist<VPRecipeBase>::iterator I) {
266 removeFromParent();
267 insertBefore(BB, I);
268}
269
270InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
271 // Get the underlying instruction for the recipe, if there is one. It is used
272 // to
273 // * decide if cost computation should be skipped for this recipe,
274 // * apply forced target instruction cost.
275 Instruction *UI = nullptr;
276 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
277 UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
278 else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
279 UI = IG->getInsertPos();
280 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
281 UI = &WidenMem->getIngredient();
282
283 InstructionCost RecipeCost;
284 if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
285 RecipeCost = 0;
286 } else {
287 RecipeCost = computeCost(VF, Ctx);
288 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
289 RecipeCost.isValid()) {
290 if (UI)
291 RecipeCost = InstructionCost(ForceTargetInstructionCost);
292 else
293 RecipeCost = InstructionCost(0);
294 }
295 }
296
297 LLVM_DEBUG({
298 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
299 dump();
300 });
301 return RecipeCost;
302}
303
304InstructionCost VPRecipeBase::computeCost(ElementCount VF,
305 VPCostContext &Ctx) const {
306 llvm_unreachable("subclasses should implement computeCost");
307}
308
309bool VPRecipeBase::isPhi() const {
310 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
311 isa<VPPhi, VPIRPhi>(Val: this);
312}
313
314bool VPRecipeBase::isScalarCast() const {
315 auto *VPI = dyn_cast<VPInstruction>(Val: this);
316 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
317}
318
319void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
320 assert(OpType == Other.OpType && "OpType must match");
321 switch (OpType) {
322 case OperationType::OverflowingBinOp:
323 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
324 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
325 break;
326 case OperationType::Trunc:
327 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
328 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
329 break;
330 case OperationType::DisjointOp:
331 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
332 break;
333 case OperationType::PossiblyExactOp:
334 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
335 break;
336 case OperationType::GEPOp:
337 GEPFlagsStorage &= Other.GEPFlagsStorage;
338 break;
339 case OperationType::FPMathOp:
340 case OperationType::FCmp:
341 assert((OpType != OperationType::FCmp ||
342 FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
343 "Cannot drop CmpPredicate");
344 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
345 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
346 break;
347 case OperationType::NonNegOp:
348 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
349 break;
350 case OperationType::Cmp:
351 assert(CmpPredStorage == Other.CmpPredStorage &&
352 "Cannot drop CmpPredicate");
353 break;
354 case OperationType::ReductionOp:
355 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
356 "Cannot change RecurKind");
357 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
358 "Cannot change IsOrdered");
359 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
360 "Cannot change IsInLoop");
361 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
362 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
363 break;
364 case OperationType::Other:
365 break;
366 }
367}
368
369FastMathFlags VPIRFlags::getFastMathFlags() const {
370 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
371 OpType == OperationType::ReductionOp ||
372 OpType == OperationType::Other) &&
373 "recipe doesn't have fast math flags");
374 if (OpType == OperationType::Other)
375 return FastMathFlags();
376 const FastMathFlagsTy &F = getFMFsRef();
377 FastMathFlags Res;
378 Res.setAllowReassoc(F.AllowReassoc);
379 Res.setNoNaNs(F.NoNaNs);
380 Res.setNoInfs(F.NoInfs);
381 Res.setNoSignedZeros(F.NoSignedZeros);
382 Res.setAllowReciprocal(F.AllowReciprocal);
383 Res.setAllowContract(F.AllowContract);
384 Res.setApproxFunc(F.ApproxFunc);
385 return Res;
386}
387
388#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
389void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
390
391void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
392 VPSlotTracker &SlotTracker) const {
393 printRecipe(O, Indent, SlotTracker);
394 if (auto DL = getDebugLoc()) {
395 O << ", !dbg ";
396 DL.print(O);
397 }
398
399 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
400 Metadata->print(O, SlotTracker);
401}
402#endif
403
404template <unsigned PartOpIdx>
405VPValue *
406VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
407 if (U.getNumOperands() == PartOpIdx + 1)
408 return U.getOperand(N: PartOpIdx);
409 return nullptr;
410}
411
412template <unsigned PartOpIdx>
413unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
414 if (auto *UnrollPartOp = getUnrollPartOperand(U))
415 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
416 return 0;
417}
418
419namespace llvm {
420template class VPUnrollPartAccessor<1>;
421template class VPUnrollPartAccessor<2>;
422template class VPUnrollPartAccessor<3>;
423}
424
425VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
426 const VPIRFlags &Flags, const VPIRMetadata &MD,
427 DebugLoc DL, const Twine &Name)
428 : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
429 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
430 assert(flagsValidForOpcode(getOpcode()) &&
431 "Set flags not supported for the provided opcode");
432 assert(hasRequiredFlagsForOpcode(getOpcode()) &&
433 "Opcode requires specific flags to be set");
434 assert((getNumOperandsForOpcode() == -1u ||
435 getNumOperandsForOpcode() == getNumOperands() ||
436 (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) &&
437 "number of operands does not match opcode");
438}
439
440unsigned VPInstruction::getNumOperandsForOpcode() const {
441 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
442 return 1;
443
444 if (Instruction::isBinaryOp(Opcode))
445 return 2;
446
447 switch (Opcode) {
448 case VPInstruction::StepVector:
449 case VPInstruction::VScale:
450 return 0;
451 case Instruction::Alloca:
452 case Instruction::ExtractValue:
453 case Instruction::Freeze:
454 case Instruction::Load:
455 case VPInstruction::BranchOnCond:
456 case VPInstruction::Broadcast:
457 case VPInstruction::ExitingIVValue:
458 case VPInstruction::ExplicitVectorLength:
459 case VPInstruction::ExtractLastLane:
460 case VPInstruction::ExtractLastPart:
461 case VPInstruction::ExtractPenultimateElement:
462 case VPInstruction::MaskedCond:
463 case VPInstruction::Not:
464 case VPInstruction::ResumeForEpilogue:
465 case VPInstruction::Reverse:
466 case VPInstruction::Unpack:
467 return 1;
468 case Instruction::ICmp:
469 case Instruction::FCmp:
470 case Instruction::ExtractElement:
471 case Instruction::Store:
472 case VPInstruction::BranchOnCount:
473 case VPInstruction::BranchOnTwoConds:
474 case VPInstruction::FirstOrderRecurrenceSplice:
475 case VPInstruction::LogicalAnd:
476 case VPInstruction::LogicalOr:
477 case VPInstruction::PtrAdd:
478 case VPInstruction::WidePtrAdd:
479 case VPInstruction::WideIVStep:
480 case VPInstruction::CalculateTripCountMinusVF:
481 return 2;
482 case Instruction::Select:
483 case VPInstruction::ActiveLaneMask:
484 case VPInstruction::ReductionStartVector:
485 return 3;
486 case Instruction::Call: {
487 // For unmasked calls, the last argument will the called function. Use that
488 // to compute the number of operands without the mask.
489 VPValue *LastOp = getOperand(N: getNumOperands() - 1);
490 if (isa<VPIRValue>(Val: LastOp) && isa<Function>(Val: LastOp->getLiveInIRValue()))
491 return getNumOperands();
492 return getNumOperands() - 1;
493 }
494 case Instruction::GetElementPtr:
495 case Instruction::PHI:
496 case Instruction::Switch:
497 case VPInstruction::AnyOf:
498 case VPInstruction::BuildStructVector:
499 case VPInstruction::BuildVector:
500 case VPInstruction::CanonicalIVIncrementForPart:
501 case VPInstruction::ComputeAnyOfResult:
502 case VPInstruction::ComputeReductionResult:
503 case VPInstruction::FirstActiveLane:
504 case VPInstruction::LastActiveLane:
505 case VPInstruction::SLPLoad:
506 case VPInstruction::SLPStore:
507 case VPInstruction::ExtractLane:
508 case VPInstruction::ExtractLastActive:
509 // Cannot determine the number of operands from the opcode.
510 return -1u;
511 }
512 llvm_unreachable("all cases should be handled above");
513}
514
515bool VPInstruction::doesGeneratePerAllLanes() const {
516 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
517}
518
519bool VPInstruction::canGenerateScalarForFirstLane() const {
520 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
521 return true;
522 if (isSingleScalar() || isVectorToScalar())
523 return true;
524 switch (Opcode) {
525 case Instruction::Freeze:
526 case Instruction::ICmp:
527 case Instruction::PHI:
528 case Instruction::Select:
529 case VPInstruction::BranchOnCond:
530 case VPInstruction::BranchOnTwoConds:
531 case VPInstruction::BranchOnCount:
532 case VPInstruction::CalculateTripCountMinusVF:
533 case VPInstruction::CanonicalIVIncrementForPart:
534 case VPInstruction::PtrAdd:
535 case VPInstruction::ExplicitVectorLength:
536 case VPInstruction::AnyOf:
537 case VPInstruction::Not:
538 return true;
539 default:
540 return false;
541 }
542}
543
544Value *VPInstruction::generate(VPTransformState &State) {
545 IRBuilderBase &Builder = State.Builder;
546
547 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
548 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
549 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
550 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
551 auto *Res =
552 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
553 if (auto *I = dyn_cast<Instruction>(Val: Res))
554 applyFlags(I&: *I);
555 return Res;
556 }
557
558 switch (getOpcode()) {
559 case VPInstruction::Not: {
560 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
561 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
562 return Builder.CreateNot(V: A, Name);
563 }
564 case Instruction::ExtractElement: {
565 assert(State.VF.isVector() && "Only extract elements from vectors");
566 if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1)))
567 return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue()));
568 Value *Vec = State.get(Def: getOperand(N: 0));
569 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
570 return Builder.CreateExtractElement(Vec, Idx, Name);
571 }
572 case Instruction::Freeze: {
573 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
574 return Builder.CreateFreeze(V: Op, Name);
575 }
576 case Instruction::FCmp:
577 case Instruction::ICmp: {
578 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
579 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
580 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
581 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
582 }
583 case Instruction::PHI: {
584 llvm_unreachable("should be handled by VPPhi::execute");
585 }
586 case Instruction::Select: {
587 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
588 Value *Cond =
589 State.get(Def: getOperand(N: 0),
590 IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0)));
591 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
592 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
593 return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlags(), Name);
594 }
595 case VPInstruction::ActiveLaneMask: {
596 // Get first lane of vector induction variable.
597 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
598 // Get the original loop tripcount.
599 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
600
601 // If this part of the active lane mask is scalar, generate the CMP directly
602 // to avoid unnecessary extracts.
603 if (State.VF.isScalar())
604 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
605 Name);
606
607 ElementCount EC = State.VF.multiplyCoefficientBy(
608 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
609 auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC);
610 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
611 Types: {PredTy, ScalarTC->getType()},
612 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
613 }
614 case VPInstruction::FirstOrderRecurrenceSplice: {
615 // Generate code to combine the previous and current values in vector v3.
616 //
617 // vector.ph:
618 // v_init = vector(..., ..., ..., a[-1])
619 // br vector.body
620 //
621 // vector.body
622 // i = phi [0, vector.ph], [i+4, vector.body]
623 // v1 = phi [v_init, vector.ph], [v2, vector.body]
624 // v2 = a[i, i+1, i+2, i+3];
625 // v3 = vector(v1(3), v2(0, 1, 2))
626
627 auto *V1 = State.get(Def: getOperand(N: 0));
628 if (!V1->getType()->isVectorTy())
629 return V1;
630 Value *V2 = State.get(Def: getOperand(N: 1));
631 return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name);
632 }
633 case VPInstruction::CalculateTripCountMinusVF: {
634 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
635 Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
636 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF);
637 Value *Cmp =
638 Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF);
639 Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType());
640 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
641 }
642 case VPInstruction::ExplicitVectorLength: {
643 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
644 // be outside of the main loop.
645 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
646 // Compute EVL
647 assert(AVL->getType()->isIntegerTy() &&
648 "Requested vector length should be an integer.");
649
650 assert(State.VF.isScalable() && "Expected scalable vector factor.");
651 Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue());
652
653 Value *EVL = Builder.CreateIntrinsic(
654 RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
655 Args: {AVL, VFArg, Builder.getTrue()});
656 return EVL;
657 }
658 case VPInstruction::CanonicalIVIncrementForPart: {
659 auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
660 auto *VFxPart = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
661 // The canonical IV is incremented by the vectorization factor (num of
662 // SIMD elements) times the unroll part.
663 return Builder.CreateAdd(LHS: IV, RHS: VFxPart, Name, HasNUW: hasNoUnsignedWrap(),
664 HasNSW: hasNoSignedWrap());
665 }
666 case VPInstruction::BranchOnCond: {
667 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
668 // Replace the temporary unreachable terminator with a new conditional
669 // branch, hooking it up to backward destination for latch blocks now, and
670 // to forward destination(s) later when they are created.
671 // Second successor may be backwards - iff it is already in VPBB2IRBB.
672 VPBasicBlock *SecondVPSucc =
673 cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]);
674 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
675 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
676 auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
677 // First successor is always forward, reset it to nullptr.
678 Br->setSuccessor(idx: 0, NewSucc: nullptr);
679 IRBB->getTerminator()->eraseFromParent();
680 applyMetadata(I&: *Br);
681 return Br;
682 }
683 case VPInstruction::Broadcast: {
684 return Builder.CreateVectorSplat(
685 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
686 }
687 case VPInstruction::BuildStructVector: {
688 // For struct types, we need to build a new 'wide' struct type, where each
689 // element is widened, i.e., we create a struct of vectors.
690 auto *StructTy =
691 cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)));
692 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
693 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
694 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
695 FieldIndex++) {
696 Value *ScalarValue =
697 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
698 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
699 VectorValue =
700 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
701 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
702 }
703 }
704 return Res;
705 }
706 case VPInstruction::BuildVector: {
707 auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
708 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
709 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
710 for (const auto &[Idx, Op] : enumerate(First: operands()))
711 Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
712 Idx: Builder.getInt32(C: Idx));
713 return Res;
714 }
715 case VPInstruction::ReductionStartVector: {
716 if (State.VF.isScalar())
717 return State.get(Def: getOperand(N: 0), IsScalar: true);
718 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
719 Builder.setFastMathFlags(getFastMathFlags());
720 // If this start vector is scaled then it should produce a vector with fewer
721 // elements than the VF.
722 ElementCount VF = State.VF.divideCoefficientBy(
723 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
724 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
725 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
726 Idx: Builder.getInt32(C: 0));
727 }
728 case VPInstruction::ComputeAnyOfResult: {
729 Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
730 Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
731 Value *ReducedResult = State.get(Def: getOperand(N: 2));
732 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
733 ReducedResult =
734 Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)),
735 RHS: ReducedResult, Name: "bin.rdx");
736 // If any predicate is true it means that we want to select the new value.
737 if (ReducedResult->getType()->isVectorTy())
738 ReducedResult = Builder.CreateOrReduce(Src: ReducedResult);
739 // The compares in the loop may yield poison, which propagates through the
740 // bitwise ORs. Freeze it here before the condition is used.
741 ReducedResult = Builder.CreateFreeze(V: ReducedResult);
742 return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select");
743 }
744 case VPInstruction::ComputeReductionResult: {
745 RecurKind RK = getRecurKind();
746 bool IsOrdered = isReductionOrdered();
747 bool IsInLoop = isReductionInLoop();
748 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
749 "FindIV should use min/max reduction kinds");
750
751 // The recipe may have multiple operands to be reduced together.
752 unsigned NumOperandsToReduce = getNumOperands();
753 VectorParts RdxParts(NumOperandsToReduce);
754 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
755 RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop);
756
757 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
758 Builder.setFastMathFlags(getFastMathFlags());
759
760 // Reduce multiple operands into one.
761 Value *ReducedPartRdx = RdxParts[0];
762 if (IsOrdered) {
763 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
764 } else {
765 // Floating-point operations should have some FMF to enable the reduction.
766 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
767 Value *RdxPart = RdxParts[Part];
768 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
769 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
770 else {
771 // For sub-recurrences, each part's reduction variable is already
772 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
773 Instruction::BinaryOps Opcode =
774 RK == RecurKind::Sub
775 ? Instruction::Add
776 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK);
777 ReducedPartRdx =
778 Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
779 }
780 }
781 }
782
783 // Create the reduction after the loop. Note that inloop reductions create
784 // the target reduction in the loop using a Reduction recipe.
785 if (State.VF.isVector() && !IsInLoop) {
786 // TODO: Support in-order reductions based on the recurrence descriptor.
787 // All ops in the reduction inherit fast-math-flags from the recurrence
788 // descriptor.
789 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
790 }
791
792 return ReducedPartRdx;
793 }
794 case VPInstruction::ExtractLastLane:
795 case VPInstruction::ExtractPenultimateElement: {
796 unsigned Offset =
797 getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
798 Value *Res;
799 if (State.VF.isVector()) {
800 assert(Offset <= State.VF.getKnownMinValue() &&
801 "invalid offset to extract from");
802 // Extract lane VF - Offset from the operand.
803 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
804 } else {
805 // TODO: Remove ExtractLastLane for scalar VFs.
806 assert(Offset <= 1 && "invalid offset to extract from");
807 Res = State.get(Def: getOperand(N: 0));
808 }
809 if (isa<ExtractElementInst>(Val: Res))
810 Res->setName(Name);
811 return Res;
812 }
813 case VPInstruction::LogicalAnd: {
814 Value *A = State.get(Def: getOperand(N: 0));
815 Value *B = State.get(Def: getOperand(N: 1));
816 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
817 }
818 case VPInstruction::LogicalOr: {
819 Value *A = State.get(Def: getOperand(N: 0));
820 Value *B = State.get(Def: getOperand(N: 1));
821 return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name);
822 }
823 case VPInstruction::PtrAdd: {
824 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
825 "can only generate first lane for PtrAdd");
826 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
827 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
828 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
829 }
830 case VPInstruction::WidePtrAdd: {
831 Value *Ptr =
832 State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0)));
833 Value *Addend = State.get(Def: getOperand(N: 1));
834 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
835 }
836 case VPInstruction::AnyOf: {
837 Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0)));
838 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
839 Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op)));
840 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res);
841 }
842 case VPInstruction::ExtractLane: {
843 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
844 "simplified to ExtractElement.");
845 Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true);
846 Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
847 Value *Res = nullptr;
848 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
849
850 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
851 Value *VectorStart =
852 Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1));
853 Value *VectorIdx = Idx == 1
854 ? LaneToExtract
855 : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart);
856 Value *Ext = State.VF.isScalar()
857 ? State.get(Def: getOperand(N: Idx))
858 : Builder.CreateExtractElement(
859 Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx);
860 if (Res) {
861 Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart);
862 Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res);
863 } else {
864 Res = Ext;
865 }
866 }
867 return Res;
868 }
869 case VPInstruction::FirstActiveLane: {
870 if (getNumOperands() == 1) {
871 Value *Mask = State.get(Def: getOperand(N: 0));
872 return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask,
873 /*ZeroIsPoison=*/false, Name);
874 }
875 // If there are multiple operands, create a chain of selects to pick the
876 // first operand with an active lane and add the number of lanes of the
877 // preceding operands.
878 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt64Ty(), VF: State.VF);
879 unsigned LastOpIdx = getNumOperands() - 1;
880 Value *Res = nullptr;
881 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
882 Value *TrailingZeros =
883 State.VF.isScalar()
884 ? Builder.CreateZExt(
885 V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)),
886 RHS: Builder.getFalse()),
887 DestTy: Builder.getInt64Ty())
888 : Builder.CreateCountTrailingZeroElems(
889 ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)),
890 /*ZeroIsPoison=*/false, Name);
891 Value *Current = Builder.CreateAdd(
892 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros);
893 if (Res) {
894 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
895 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
896 } else {
897 Res = Current;
898 }
899 }
900
901 return Res;
902 }
903 case VPInstruction::ResumeForEpilogue:
904 return State.get(Def: getOperand(N: 0), IsScalar: true);
905 case VPInstruction::Reverse:
906 return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse");
907 case VPInstruction::ExtractLastActive: {
908 Value *Result = State.get(Def: getOperand(N: 0), /*IsScalar=*/true);
909 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
910 Value *Data = State.get(Def: getOperand(N: Idx));
911 Value *Mask = State.get(Def: getOperand(N: Idx + 1));
912 Type *VTy = Data->getType();
913
914 if (State.VF.isScalar())
915 Result = Builder.CreateSelect(C: Mask, True: Data, False: Result);
916 else
917 Result = Builder.CreateIntrinsic(
918 ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy},
919 Args: {Data, Mask, Result});
920 }
921
922 return Result;
923 }
924 default:
925 llvm_unreachable("Unsupported opcode for instruction");
926 }
927}
928
// Return the cost of this recipe by dispatching on the LLVM IR opcode that
// describes its semantics. Handles unary/binary arithmetic, freeze,
// extractvalue, compares, casts and selects; any other opcode is a
// programming error (see llvm_unreachable at the end).
InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
  Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
  // Query costs on the widened result type, unless a scalar VF is costed.
  Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy;
  switch (Opcode) {
  case Instruction::FNeg:
    return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        .Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None};

    if (VF.isVector()) {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand. One example of this are shifts on x86.
      VPValue *RHS = getOperand(N: 1);
      RHSInfo = Ctx.getOperandInfo(V: RHS);

      // An operand defined outside the loop regions is the same for all
      // vector lanes; report it as uniform so TTI can use cheaper lowering.
      if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
          getOperand(N: 1)->isDefinedOutsideLoopRegions())
        RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    }

    // Pass the underlying IR instruction and its operands, when available, so
    // targets can refine the cost based on the concrete instruction.
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, Ty: ResultTy, CostKind: Ctx.CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
  }
  case Instruction::Freeze:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy,
                                          CostKind: Ctx.CostKind);
  case Instruction::ExtractValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
                                             CostKind: Ctx.CostKind);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares are costed on the (possibly widened) operand type, not on the
    // i1 result type.
    Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy;
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(),
        CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
        Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
  }
  case Instruction::BitCast: {
    // Pointer-to-pointer bitcasts are free; all others are costed as casts.
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
    if (ScalarTy->isPointerTy())
      return 0;
    [[fallthrough]];
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Computes the CastContextHint from a recipe that may access memory.
    auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
      if (isa<VPInterleaveBase>(Val: R))
        return TTI::CastContextHint::Interleave;
      if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) {
        // Only compute CCH for memory operations, matching the legacy model
        // which only considers loads/stores for cast context hints.
        auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue());
        if (!isa<LoadInst, StoreInst>(Val: UI))
          return TTI::CastContextHint::None;
        return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                               : TTI::CastContextHint::Normal;
      }
      const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
      if (WidenMemoryRecipe == nullptr)
        return TTI::CastContextHint::None;
      if (VF.isScalar())
        return TTI::CastContextHint::Normal;
      if (!WidenMemoryRecipe->isConsecutive())
        return TTI::CastContextHint::GatherScatter;
      if (WidenMemoryRecipe->isReverse())
        return TTI::CastContextHint::Reversed;
      if (WidenMemoryRecipe->isMasked())
        return TTI::CastContextHint::Masked;
      return TTI::CastContextHint::Normal;
    };

    VPValue *Operand = getOperand(N: 0);
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, get the context from the only user.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
        if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
          return nullptr;
        return dyn_cast<VPRecipeBase>(Val: *R->user_begin());
      };
      if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
        // Look through an intervening reverse to find the memory user.
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue())))
          Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe));
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    // For Z/Sext, get the context from the operand.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (auto *Recipe = Operand->getDefiningRecipe()) {
        VPValue *ReverseOp;
        // Look through an intervening reverse to find the defining memory
        // recipe.
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp))))
          Recipe = ReverseOp->getDefiningRecipe();
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }

    auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand);
    Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy;
    // Arm TTI will use the underlying instruction to determine the cost.
    return Ctx.TTI.getCastInstrCost(
        Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
        I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
  }
  case Instruction::Select: {
    SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue());
    bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);

    // Selects on i1 operands that are really logical and/or can be costed as
    // plain And/Or instructions instead of select.
    VPValue *Op0, *Op1;
    bool IsLogicalAnd =
        match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    bool IsLogicalOr =
        match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    // Also match the inverted forms:
    // select x, false, y --> !x & y (still AND)
    // select x, y, true --> !x | y (still OR)
    IsLogicalAnd |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1)));
    IsLogicalOr |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True()));

    if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
        (IsLogicalAnd || IsLogicalOr)) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);

      SmallVector<const Value *, 2> Operands;
      if (SI && all_of(Range: operands(),
                       P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
        append_range(C&: Operands, R: SI->operands());
      return Ctx.TTI.getArithmeticInstrCost(
          Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy,
          CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
    }

    Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    if (!IsScalarCond && VF.isVector())
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    // Try to recover the compare predicate feeding the select, either from a
    // VPlan compare or from the underlying IR condition value.
    llvm::CmpPredicate Pred;
    if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())))
      if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0)))
        if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue()))
          Pred = Cmp->getPredicate();
    Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
  }
  }
  llvm_unreachable("called for unsupported opcode");
}
1126
1127InstructionCost VPInstruction::computeCost(ElementCount VF,
1128 VPCostContext &Ctx) const {
1129 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
1130 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1131 // TODO: Compute cost for VPInstructions without underlying values once
1132 // the legacy cost model has been retired.
1133 return 0;
1134 }
1135
1136 assert(!doesGeneratePerAllLanes() &&
1137 "Should only generate a vector value or single scalar, not scalars "
1138 "for all lanes.");
1139 return getCostForRecipeWithOpcode(
1140 Opcode: getOpcode(),
1141 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1142 }
1143
1144 switch (getOpcode()) {
1145 case Instruction::Select: {
1146 llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1147 match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()));
1148 auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1149 auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1));
1150 if (!vputils::onlyFirstLaneUsed(Def: this)) {
1151 CondTy = toVectorTy(Scalar: CondTy, EC: VF);
1152 VecTy = toVectorTy(Scalar: VecTy, EC: VF);
1153 }
1154 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred,
1155 CostKind: Ctx.CostKind);
1156 }
1157 case Instruction::ExtractElement:
1158 case VPInstruction::ExtractLane: {
1159 if (VF.isScalar()) {
1160 // ExtractLane with VF=1 takes care of handling extracting across multiple
1161 // parts.
1162 return 0;
1163 }
1164
1165 // Add on the cost of extracting the element.
1166 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1167 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
1168 CostKind: Ctx.CostKind);
1169 }
1170 case VPInstruction::AnyOf: {
1171 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1172 return Ctx.TTI.getArithmeticReductionCost(
1173 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
1174 }
1175 case VPInstruction::FirstActiveLane: {
1176 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1177 if (VF.isScalar())
1178 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1179 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1180 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1181 // Calculate the cost of determining the lane index.
1182 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1183 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1184 Type::getInt64Ty(C&: Ctx.LLVMCtx),
1185 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1186 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1187 }
1188 case VPInstruction::LastActiveLane: {
1189 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1190 if (VF.isScalar())
1191 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1192 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1193 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1194 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1195 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1196 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1197 Type::getInt64Ty(C&: Ctx.LLVMCtx),
1198 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1199 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1200 // Add cost of NOT operation on the predicate.
1201 Cost += Ctx.TTI.getArithmeticInstrCost(
1202 Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind,
1203 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1204 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
1205 .Properties: TargetTransformInfo::OP_None});
1206 // Add cost of SUB operation on the index.
1207 Cost += Ctx.TTI.getArithmeticInstrCost(
1208 Opcode: Instruction::Sub, Ty: Type::getInt64Ty(C&: Ctx.LLVMCtx), CostKind: Ctx.CostKind);
1209 return Cost;
1210 }
1211 case VPInstruction::ExtractLastActive: {
1212 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1213 Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1214 Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1215 IntrinsicCostAttributes ICA(
1216 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1217 {VecTy, MaskTy, ScalarTy});
1218 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind);
1219 }
1220 case VPInstruction::FirstOrderRecurrenceSplice: {
1221 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1222 SmallVector<int> Mask(VF.getKnownMinValue());
1223 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
1224 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1225
1226 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
1227 DstTy: cast<VectorType>(Val: VectorTy),
1228 SrcTy: cast<VectorType>(Val: VectorTy), Mask,
1229 CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1);
1230 }
1231 case VPInstruction::ActiveLaneMask: {
1232 Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1233 unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue();
1234 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier);
1235 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1236 {ArgTy, ArgTy});
1237 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1238 }
1239 case VPInstruction::ExplicitVectorLength: {
1240 Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1241 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
1242 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
1243 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1244 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1245 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1246 }
1247 case VPInstruction::Reverse: {
1248 assert(VF.isVector() && "Reverse operation must be vector type");
1249 auto *VectorTy = cast<VectorType>(
1250 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
1251 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
1252 SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind,
1253 /*Index=*/0);
1254 }
1255 case VPInstruction::ExtractLastLane: {
1256 // Add on the cost of extracting the element.
1257 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1258 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
1259 Val: VecTy, CostKind: Ctx.CostKind, Index: 0);
1260 }
1261 case VPInstruction::ExtractPenultimateElement:
1262 if (VF == ElementCount::getScalable(MinVal: 1))
1263 return InstructionCost::getInvalid();
1264 [[fallthrough]];
1265 default:
1266 // TODO: Compute cost other VPInstructions once the legacy cost model has
1267 // been retired.
1268 assert(!getUnderlyingValue() &&
1269 "unexpected VPInstruction witht underlying value");
1270 return 0;
1271 }
1272}
1273
1274bool VPInstruction::isVectorToScalar() const {
1275 return getOpcode() == VPInstruction::ExtractLastLane ||
1276 getOpcode() == VPInstruction::ExtractPenultimateElement ||
1277 getOpcode() == Instruction::ExtractElement ||
1278 getOpcode() == VPInstruction::ExtractLane ||
1279 getOpcode() == VPInstruction::FirstActiveLane ||
1280 getOpcode() == VPInstruction::LastActiveLane ||
1281 getOpcode() == VPInstruction::ComputeAnyOfResult ||
1282 getOpcode() == VPInstruction::ExtractLastActive ||
1283 getOpcode() == VPInstruction::ComputeReductionResult ||
1284 getOpcode() == VPInstruction::AnyOf;
1285}
1286
1287bool VPInstruction::isSingleScalar() const {
1288 switch (getOpcode()) {
1289 case Instruction::Load:
1290 case Instruction::PHI:
1291 case VPInstruction::ExplicitVectorLength:
1292 case VPInstruction::ResumeForEpilogue:
1293 case VPInstruction::VScale:
1294 return true;
1295 default:
1296 return isScalarCast();
1297 }
1298}
1299
// Generate the IR for this VPInstruction via generate() and record the
// produced value in State, tagging it as a per-first-lane scalar when
// appropriate.
void VPInstruction::execute(VPTransformState &State) {
  assert(!isMasked() && "cannot execute masked VPInstruction");
  assert(!State.Lane && "VPInstruction executing an Lane");
  // Restore the builder's fast-math flags when leaving this scope.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  Value *GeneratedValue = generate(State);
  // Opcodes without a result (e.g. branches) have nothing to record.
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  // A scalar value is recorded when the recipe can produce one and either
  // only the first lane is demanded or the result is inherently scalar.
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(Def: this) ||
                                    isVectorToScalar() || isSingleScalar());
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(Def: this, V: GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
  if (getOpcode() == VPInstruction::ResumeForEpilogue) {
    // FIXME: This is a workaround to enable reliable updates of the scalar loop
    // resume phis, when vectorizing the epilogue. Must be removed once epilogue
    // vectorization explicitly connects VPlans.
    setUnderlyingValue(GeneratedValue);
  }
}
1331
// Conservatively classifies opcodes: only the listed opcodes are known to
// never read or write memory; any opcode not listed is assumed to possibly
// access memory (the default returns true).
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  // All binary arithmetic and cast opcodes are pure.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  default:
    // Unknown opcodes may access memory; err on the safe side.
    return true;
  }
}
1380
// Returns true if only the first lane of operand \p Op is demanded by this
// VPInstruction, allowing the operand to be generated as a single scalar.
bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  // Arithmetic and casts demand only the first lane exactly when only the
  // first lane of their own result is used.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return vputils::onlyFirstLaneUsed(Def: this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    // Only the index operand (operand 1) must be a single scalar.
    return Op == getOperand(N: 1);
  case Instruction::PHI:
    return true;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
  case VPInstruction::Not:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(Def: this);
  case Instruction::Load:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    // These opcodes always consume scalar (first-lane) operand values.
    return true;
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
    // Before replicating by VF, Build(Struct)Vector uses all lanes of the
    // operand, after replicating its operands only the first lane is used.
    // Before replicating, it will have only a single operand.
    return getNumOperands() > 1;
  case VPInstruction::PtrAdd:
    // The base pointer (operand 0) is always scalar; the addend only when
    // just the first lane of the result is used.
    return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
  case VPInstruction::WidePtrAdd:
    // WidePtrAdd supports scalar and vector base addresses.
    return false;
  case VPInstruction::ComputeAnyOfResult:
    return Op == getOperand(N: 0) || Op == getOperand(N: 1);
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExtractLane:
    return Op == getOperand(N: 0);
  };
  llvm_unreachable("switch should return");
}
1430
1431bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
1432 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1433 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1434 return vputils::onlyFirstPartUsed(Def: this);
1435
1436 switch (getOpcode()) {
1437 default:
1438 return false;
1439 case Instruction::FCmp:
1440 case Instruction::ICmp:
1441 case Instruction::Select:
1442 return vputils::onlyFirstPartUsed(Def: this);
1443 case VPInstruction::BranchOnCount:
1444 case VPInstruction::BranchOnCond:
1445 case VPInstruction::BranchOnTwoConds:
1446 case VPInstruction::CanonicalIVIncrementForPart:
1447 return true;
1448 };
1449 llvm_unreachable("switch should return");
1450}
1451
1452#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1453void VPInstruction::dump() const {
1454 VPSlotTracker SlotTracker(getParent()->getPlan());
1455 printRecipe(dbgs(), "", SlotTracker);
1456}
1457
// Print a textual representation of this VPInstruction: the "EMIT" prefix,
// the result (if any), a human-readable opcode name, flags and operands.
void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  // Map VPlan-specific opcodes to readable names; plain IR opcodes fall
  // through to Instruction::getOpcodeName in the default case.
  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::MaskedCond:
    O << "masked-cond";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
1577#endif
1578
// Generate IR for a VPInstructionWithType: scalar casts, step-vector and
// vscale materialization.
void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    // Cast only the first lane; the result is recorded as lane 0.
    Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
    Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
                                           V: Op, DestTy: ResultTy);
    State.set(Def: this, V: Cast, Lane: VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    // Materialize <0, 1, 2, ...> of the requested element type and VF.
    Value *StepVector =
        State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
    State.set(Def: this, V: StepVector);
    break;
  }
  case VPInstruction::VScale: {
    // vscale is a single scalar value.
    Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
    State.set(Def: this, V: VScale, IsScalar: true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
1605
1606#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print a textual representation of a VPInstructionWithType, including its
// result type.
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  case VPInstruction::VScale:
    O << "vscale " << *ResultTy;
    break;
  case Instruction::Load:
    O << "load ";
    printOperands(O, SlotTracker);
    break;
  default:
    // Remaining opcodes are scalar casts, printed as "<cast> <ops> to <ty>".
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
1635#endif
1636
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // Create a scalar IR phi of the inferred scalar type; incoming values are
  // taken from lane 0 of each VPlan incoming value.
  PHINode *NewPhi = State.Builder.CreatePHI(
      Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
  unsigned NumIncoming = getNumIncoming();
  if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
    // TODO: Fixup all incoming values of header phis once recipes defining them
    // are introduced.
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
    NewPhi->addIncoming(V: IncV, BB: PredBB);
  }
  State.set(Def: this, V: NewPhi, Lane: VPLane(0));
}
1654
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "EMIT[-SCALAR] <result> = phi [ <value>, <pred> ], ...".
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printPhiOperands(O, SlotTracker);
}
#endif
1665
1666VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1667 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1668 return new VPIRPhi(*Phi);
1669 return new VPIRInstruction(I);
1670}
1671
void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // The wrapped instruction I already exists in the IR, so nothing is emitted.
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
}
1679
1680InstructionCost VPIRInstruction::computeCost(ElementCount VF,
1681 VPCostContext &Ctx) const {
1682 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1683 // hence it does not contribute to the cost-modeling for the VPlan.
1684 return 0;
1685}
1686
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "IR <wrapped instruction>".
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif
1693
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  // Fix up one incoming value per operand; operand Idx corresponds to the
  // parent block's Idx-th predecessor.
  for (const auto &[Idx, Op] : enumerate(First: operands())) {
    VPValue *ExitValue = Op;
    // Single-scalar values are read from lane 0; otherwise the live-out is
    // the last lane for the current VF.
    auto Lane = vputils::isSingleScalar(VPV: ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(VF: State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB->getTerminator());
    Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
      Phi->addIncoming(V, BB: PredBB);
    else
      Phi->setIncomingValueForBlock(BB: PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
}
1720
// Remove the phi operand corresponding to predecessor \p IncomingBlock; phi
// operands are kept in the parent block's predecessor order.
void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
         "Number of phi operands must match number of predecessors");
  unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
  R->removeOperand(Idx: Position);
}
1728
// Return the incoming value for predecessor \p VPBB, looked up via the parent
// block's predecessor index.
VPValue *
VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB));
}
1734
// Set the incoming value for predecessor \p VPBB to \p V, using the parent
// block's predecessor index.
void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
                                              VPValue *V) const {
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V);
}
1740
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the phi operands as "[ <value>, <pred> ]" pairs, comma separated.
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
                                      VPSlotTracker &SlotTracker) const {
  interleaveComma(enumerate(getAsRecipe()->operands()), O,
                  [this, &O, &SlotTracker](auto Op) {
                    O << "[ ";
                    Op.value()->printAsOperand(O, SlotTracker);
                    O << ", ";
                    getIncomingBlock(Op.index())->printAsOperand(O);
                    O << " ]";
                  });
}
#endif
1754
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  // Print the wrapped IR phi first, then any extra incoming operands that the
  // recipe tracks on top of it.
  VPIRInstruction::printRecipe(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(incoming_values_and_blocks(), O,
                    [&O, &SlotTracker](auto Op) {
                      std::get<0>(Op)->printAsOperand(O, SlotTracker);
                      O << " from ";
                      std::get<1>(Op)->printAsOperand(O);
                    });
    O << ")";
  }
}
#endif
1772
// Attach every recorded (kind, node) metadata pair to \p I.
void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(KindID: Kind, Node);
}
1777
1778void VPIRMetadata::intersect(const VPIRMetadata &Other) {
1779 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1780 for (const auto &[KindA, MDA] : Metadata) {
1781 for (const auto &[KindB, MDB] : Other.Metadata) {
1782 if (KindA == KindB && MDA == MDB) {
1783 MetadataIntersection.emplace_back(Args: KindA, Args: MDA);
1784 break;
1785 }
1786 }
1787 }
1788 Metadata = std::move(MetadataIntersection);
1789}
1790
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the attached metadata as " (!<name> <node>, ...)". Requires a module
// in the slot tracker to resolve metadata kind names; prints nothing without
// one.
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  const Module *M = SlotTracker.getModule();
  if (Metadata.empty() || !M)
    return;

  ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
  O << " (";
  interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
    auto [Kind, Node] = KindNodePair;
    assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
           "Unexpected unnamed metadata kind");
    O << "!" << MDNames[Kind] << " ";
    Node->printAsOperand(O, M);
  });
  O << ")";
}
#endif
1809
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: args())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(i: I.index())->isVectorTy())
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    Args.push_back(Elt: Arg);
  }

  // Carry over operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
  applyFlags(I&: *V);
  applyMetadata(I&: *V);
  V->setCallingConv(Variant->getCallingConv());

  // Void calls define no VPlan value.
  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1842
// Cost the widened call using the vector variant's signature (return type and
// parameter types).
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
                                  Tys: Variant->getFunctionType()->params(),
                                  CostKind: Ctx.CostKind);
}
1849
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "WIDEN-CALL [<res> =] call @<fn>(<args>) (using library function
// [: <variant>])".
void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << " @" << CalledFn->getName() << "(";
  interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  O << " (using library function";
  if (Variant->hasName())
    O << ": " << Variant->getName();
  O << ")";
}
#endif
1877
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");

  // Collect the overload types needed to mangle the intrinsic declaration.
  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
                                             TTI: State.TTI)) {
    Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF);
    ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
    for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
      if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
                                                           RetIdx: Idx, TTI: State.TTI))
        TysForDecl.push_back(Elt: Ty);
    }
  }
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
                                           TTI: State.TTI))
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
                                               TTI: State.TTI))
      TysForDecl.push_back(Elt: Arg->getType());
    Args.push_back(Elt: Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  // Carry over operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);

  applyFlags(I&: *V);
  applyMetadata(I&: *V);

  // Void intrinsics define no VPlan value.
  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1929
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
/// \p VF determines whether scalar or vectorized types are costed; flags are
/// taken from \p R.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
                                            ArrayRef<const VPValue *> Operands,
                                            const VPRecipeWithIRFlags &R,
                                            ElementCount VF,
                                            VPCostContext &Ctx) {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(First&: Operands)) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) {
        Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
        continue;
      }
      // No IR value available for this operand; give TTI no arguments at all.
      Arguments.clear();
      break;
    }
    Arguments.push_back(Elt: V);
  }

  // Infer vectorized return and parameter types for vector VFs.
  Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R);
  Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy;
  SmallVector<Type *> ParamTys;
  for (const VPValue *Op : Operands) {
    ParamTys.push_back(Elt: VF.isVector()
                           ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF)
                           : Ctx.Types.inferScalarType(V: Op));
  }

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
      dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()),
      InstructionCost::getInvalid());
  return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
}
1971
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  // Delegate to the shared intrinsic cost helper with all call operands.
  SmallVector<const VPValue *> ArgOps(operands());
  return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx);
}
1977
1978StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1979 return Intrinsic::getBaseName(id: VectorIntrinsicID);
1980}
1981
// Return true if every occurrence of \p Op among the call's operands is at an
// argument position the intrinsic treats as scalar, i.e. only the first lane
// of \p Op is demanded.
bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
    auto [Idx, V] = X;
    return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
                                                         Idx, nullptr);
  });
}
1990
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "WIDEN-INTRINSIC [<res> =] call <intrinsic>(<operands>)".
void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
#endif
2012
void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  // Operand 0 is the vector of bucket addresses, operand 1 the scalar
  // increment amount.
  Value *Address = State.get(Def: getOperand(N: 0));
  Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Val: Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(Def: VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(V: IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
                                Types: {VTy, IncAmt->getType()},
                                Args: {Address, IncAmt, Mask});
}
2041
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
  VPValue *IncAmt = getOperand(N: 1);
  Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
  VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
  if (auto *CI = dyn_cast<VPConstantInt>(Val: IncAmt))
    if (CI->isOne())
      MulCost = TTI::TCC_Free;

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
  Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(C&: Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
}
2074
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "WIDEN-HISTOGRAM buckets: <addr>, inc|dec: <amt>[, mask: <mask>]".
void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}
#endif
2095
2096VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2097 AllowReassoc = FMF.allowReassoc();
2098 NoNaNs = FMF.noNaNs();
2099 NoInfs = FMF.noInfs();
2100 NoSignedZeros = FMF.noSignedZeros();
2101 AllowReciprocal = FMF.allowReciprocal();
2102 AllowContract = FMF.allowContract();
2103 ApproxFunc = FMF.approxFunc();
2104}
2105
// Return default (all-unset) flags of the kind appropriate for \p Opcode,
// e.g. wrap flags for add/sub/mul/shl. Opcodes whose flags must always be
// provided explicitly (cmp, ComputeReductionResult) are unreachable here.
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case VPInstruction::CanonicalIVIncrementForPart:
    return WrapFlagsTy(false, false);
  case Instruction::Trunc:
    return TruncFlagsTy(false, false);
  case Instruction::Or:
    return DisjointFlagsTy(false);
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return ExactFlagsTy(false);
  case Instruction::GetElementPtr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
    return GEPNoWrapFlags::none();
  case Instruction::ZExt:
  case Instruction::UIToFP:
    return NonNegFlagsTy(false);
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::FNeg:
  case Instruction::FPExt:
  case Instruction::FPTrunc:
    return FastMathFlags();
  case Instruction::ICmp:
  case Instruction::FCmp:
  case VPInstruction::ComputeReductionResult:
    llvm_unreachable("opcode requires explicit flags");
  default:
    return VPIRFlags();
  }
}
2147
2148#if !defined(NDEBUG)
2149bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2150 switch (OpType) {
2151 case OperationType::OverflowingBinOp:
2152 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2153 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2154 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2155 case OperationType::Trunc:
2156 return Opcode == Instruction::Trunc;
2157 case OperationType::DisjointOp:
2158 return Opcode == Instruction::Or;
2159 case OperationType::PossiblyExactOp:
2160 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2161 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2162 case OperationType::GEPOp:
2163 return Opcode == Instruction::GetElementPtr ||
2164 Opcode == VPInstruction::PtrAdd ||
2165 Opcode == VPInstruction::WidePtrAdd;
2166 case OperationType::FPMathOp:
2167 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2168 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2169 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2170 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2171 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2172 Opcode == Instruction::Select ||
2173 Opcode == VPInstruction::WideIVStep ||
2174 Opcode == VPInstruction::ReductionStartVector;
2175 case OperationType::FCmp:
2176 return Opcode == Instruction::FCmp;
2177 case OperationType::NonNegOp:
2178 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2179 case OperationType::Cmp:
2180 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2181 case OperationType::ReductionOp:
2182 return Opcode == VPInstruction::ComputeReductionResult;
2183 case OperationType::Other:
2184 return true;
2185 }
2186 llvm_unreachable("Unknown OperationType enum");
2187}
2188
// Return true if this flag set has the OperationType that \p Opcode requires
// (explicitly for cmp/reduction opcodes, otherwise via the opcode's default
// flag kind). Used only in assertions.
bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
  // Handle opcodes without default flags.
  if (Opcode == Instruction::ICmp)
    return OpType == OperationType::Cmp;
  if (Opcode == Instruction::FCmp)
    return OpType == OperationType::FCmp;
  if (Opcode == VPInstruction::ComputeReductionResult)
    return OpType == OperationType::ReductionOp;

  OperationType Required = getDefaultFlags(Opcode).OpType;
  return Required == OperationType::Other || Required == OpType;
}
#endif
2202
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the flags carried by this recipe in IR-like form (e.g. " nuw nsw",
// " exact", predicate names, fast-math flags, reduction kind); always ends
// with a trailing space.
void VPIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::FCmp:
    // FCmp prints both the predicate and the fast-math flags.
    O << " " << CmpInst::getPredicateName(getPredicate());
    getFastMathFlags().print(O);
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::Trunc:
    if (TruncFlags.HasNUW)
      O << " nuw";
    if (TruncFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp: {
    // inbounds implies nusw, so only one of the two is printed.
    GEPNoWrapFlags Flags = getGEPNoWrapFlags();
    if (Flags.isInBounds())
      O << " inbounds";
    else if (Flags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (Flags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  }
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::ReductionOp: {
    // Reductions print "(<kind>[, in-loop][, ordered])" plus fast-math flags.
    RecurKind RK = getRecurKind();
    O << " (";
    switch (RK) {
    case RecurKind::AnyOf:
      O << "any-of";
      break;
    case RecurKind::FindLast:
      O << "find-last";
      break;
    case RecurKind::SMax:
      O << "smax";
      break;
    case RecurKind::SMin:
      O << "smin";
      break;
    case RecurKind::UMax:
      O << "umax";
      break;
    case RecurKind::UMin:
      O << "umin";
      break;
    case RecurKind::FMinNum:
      O << "fminnum";
      break;
    case RecurKind::FMaxNum:
      O << "fmaxnum";
      break;
    case RecurKind::FMinimum:
      O << "fminimum";
      break;
    case RecurKind::FMaximum:
      O << "fmaximum";
      break;
    case RecurKind::FMinimumNum:
      O << "fminimumnum";
      break;
    case RecurKind::FMaximumNum:
      O << "fmaximumnum";
      break;
    default:
      // Remaining kinds are named after their underlying IR opcode.
      O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
      break;
    }
    if (isReductionInLoop())
      O << ", in-loop";
    if (isReductionOrdered())
      O << ", ordered";
    O << ")";
    getFastMathFlags().print(O);
    break;
  }
  case OperationType::Other:
    break;
  }
  O << " ";
}
#endif
2308
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::UncondBr:
  case Instruction::CondBr:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(Elt: State.get(Def: VPOp));

    Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);

    // The builder may have constant-folded; only instructions take flags and
    // metadata.
    if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
      applyFlags(I&: *VecOp);
      applyMetadata(I&: *VecOp);
    }

    // Use this vector value for all users of the original instruction.
    State.set(Def: this, V);
    break;
  }
  case Instruction::ExtractValue: {
    // Operand 1 is the constant index of the single-level extract.
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Extract = Builder.CreateExtractValue(
        Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue());
    State.set(Def: this, V: Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Freeze = Builder.CreateFreeze(V: Op);
    State.set(Def: this, V: Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(Def: getOperand(N: 0));
    Value *B = State.get(Def: getOperand(N: 1));
    Value *C = nullptr;
    if (FCmp) {
      C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
    } else {
      C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
    }
    if (auto *I = dyn_cast<Instruction>(Val: C)) {
      applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    State.set(Def: this, V: C);
    break;
  }
  case Instruction::Select: {
    // The condition may be a single scalar; the select arms are widened.
    VPValue *CondOp = getOperand(N: 0);
    Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp));
    Value *Op0 = State.get(Def: getOperand(N: 1));
    Value *Op1 = State.get(Def: getOperand(N: 2));
    Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
    State.set(Def: this, V: Sel);
    if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
      // Only FP selects carry fast-math flags.
      if (isa<FPMathOperator>(Val: I))
        applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}
2415
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // All supported opcodes are costed via the shared opcode-based helper.
  switch (Opcode) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If the div/rem operation isn't safe to speculate and requires
    // predication, then the only way we can even create a vplan is to insert
    // a select on the second input operand to ensure we use the value of 1
    // for the inactive lanes. The select will be costed separately.
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
  case Instruction::ExtractValue:
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Select:
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
2452
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "WIDEN <result> = <opcode><flags> <operands>".
void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2463
void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  // The destination type is the result type widened to a vector of VF lanes.
  Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
  VPValue *Op = getOperand(N: 0);
  Value *A = State.get(Def: Op);
  Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
  State.set(Def: this, V: Cast);
  // The builder may have constant-folded; only instructions take flags and
  // metadata.
  if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
    applyFlags(I&: *CastOp);
    applyMetadata(I&: *CastOp);
  }
}
2478
InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
  // the legacy cost model, including truncates/extends when evaluating a
  // reduction in a smaller type.
  if (!getUnderlyingValue())
    return 0;
  return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
}
2488
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print as "WIDEN-CAST <result> = <opcode><flags> <operands> to <type>".
void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif
2500
// Model header phis with TTI's control-flow instruction cost for PHI.
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
2505
2506#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2507void VPWidenIntOrFpInductionRecipe::printRecipe(
2508 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2509 O << Indent;
2510 printAsOperand(O, SlotTracker);
2511 O << " = WIDEN-INDUCTION";
2512 printFlags(O);
2513 printOperands(O, SlotTracker);
2514
2515 if (auto *TI = getTruncInst())
2516 O << " (truncated to " << *TI->getType() << ")";
2517}
2518#endif
2519
2520bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2521 // The step may be defined by a recipe in the preheader (e.g. if it requires
2522 // SCEV expansion), but for the canonical induction the step is required to be
2523 // 1, which is represented as live-in.
2524 auto *StepC = dyn_cast<VPConstantInt>(Val: getStepValue());
2525 auto *StartC = dyn_cast<VPConstantInt>(Val: getStartValue());
2526 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2527 getScalarType() == getRegion()->getCanonicalIVType();
2528}
2529
void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  // Emit the transformed index Start + Index * Step as a single scalar
  // (lane 0) value, according to the induction kind.
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
  Value *DerivedIV = emitTransformedIndex(
      B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
      InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
  DerivedIV->setName(Name);
  State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
}
2546
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe as "<result> = DERIVED-IV <start> + <index> * <step>".
void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif
2560
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  // Compute scalar induction steps: BaseIV is the scalar induction variable
  // on which to base the steps, Step is the size of the step.
  Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    // For FP inductions, the add opcode comes from the original induction
    // (FAdd or FSub), recorded in InductionOpcode.
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
  // Compute the scalar steps and save the results in State.

  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  // When executing a specific lane (replication), generate only that lane.
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  // Base offset for lane 0; defaults to zero when no start index is set.
  Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
                                     : Constant::getNullValue(Ty: BaseIVTy);

  // Each lane's value is BaseIV + (StartIdx0 + Lane) * Step.
  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number,
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(Ty: BaseIVTy, V: Lane);
    Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
    // The step returned by `createStepForVF` is a runtime-evaluated value
    // when VF is scalable. Otherwise, it should be folded into a Constant.
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
    auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
    State.set(Def: this, V: Add, Lane: VPLane(Lane));
  }
}
2621
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe as "<result> = SCALAR-STEPS <operands>".
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif
2631
/// Only the first lane of \p Op is used if the operand is a single scalar,
/// since single-scalar operands are kept scalar when widening the GEP.
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  return vputils::isSingleScalar(VPV: Op);
}
2636
/// Widen a GEP: emit a (vector-of-pointers producing) GEP, using scalar
/// operands for loop-invariant values and vector operands for loop-varying
/// ones.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
  if (AllOperandsAreInvariant) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.

    // Gather all operands as scalars (lane 0).
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));

    // Build the scalar GEP and splat it across all lanes.
    auto *NewGEP =
        State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
                                Name: "", NW: getGEPNoWrapFlags());
    Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
    State.set(Def: this, V: Splat);
    return;
  }

  // If the GEP has at least one loop-varying operand, we are sure to
  // produce a vector of pointers unless VF is scalar.
  // The pointer operand of the new GEP. If it's loop-invariant, we
  // won't broadcast it.
  auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant());

  // Collect all the indices for the new GEP. If any index is
  // loop-invariant, we won't broadcast it.
  SmallVector<Value *, 4> Indices;
  for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
    VPValue *Operand = getOperand(N: I);
    Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1)));
  }

  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
  // but it should be a vector, otherwise.
  auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices,
                                         Name: "", NW: getGEPNoWrapFlags());
  assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
         "NewGEP is not a pointer vector");
  State.set(Def: this, V: NewGEP);
}
2690
2691#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2692void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2693 VPSlotTracker &SlotTracker) const {
2694 O << Indent << "WIDEN-GEP ";
2695 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2696 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2697 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2698
2699 O << " ";
2700 printAsOperand(O, SlotTracker);
2701 O << " = getelementptr";
2702 printFlags(O);
2703 printOperands(O, SlotTracker);
2704}
2705#endif
2706
/// Materialize the offset operand for unroll part \p Part and append it as a
/// new operand: Offset = Stride * (VF - 1) + Part * Stride * VF, computed in
/// the pointer's index type.
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
  assert(!getOffset() && "Unexpected offset operand");
  VPBuilder Builder(this);
  VPlan &Plan = *getParent()->getPlan();
  VPValue *VFVal = getVFValue();
  VPTypeAnalysis TypeInfo(Plan);
  const DataLayout &DL = Plan.getDataLayout();
  // All offset arithmetic is done in the index type of the pointer.
  Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: this));
  VPValue *Stride =
      Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true);
  Type *VFTy = TypeInfo.inferScalarType(V: VFVal);
  VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy,
                                                DL: DebugLoc::getUnknown());

  // Offset for Part0 = Offset0 = Stride * (VF - 1).
  VPInstruction *VFMinusOne =
      Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u),
                        DL: DebugLoc::getUnknown(), Name: "", WrapFlags: {true, true});
  VPInstruction *Offset0 =
      Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride});

  // Offset for PartN = Offset0 + Part * Stride * VF.
  VPValue *PartxStride =
      Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true);
  VPValue *Offset = Builder.createAdd(
      LHS: Offset0,
      RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF}));
  addOperand(Operand: Offset);
}
2736
/// Emit a scalar GEP that offsets the base pointer by the previously
/// materialized offset operand (see materializeOffset).
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() && "Expected prior materialization of offset");
  Value *Ptr = State.get(Def: getPointer(), IsScalar: true);
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  // The result is a single scalar pointer per part.
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2746
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe as "<result> = vector-end-pointer<flags> <operands>".
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2757
/// Emit a scalar GEP that offsets the base pointer (lane 0 of operand 0) by
/// the offset operand.
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() &&
         "Expected prior simplification of recipe without offset");
  Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  // The result is a single scalar pointer per part.
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2768
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe as "<result> = vector-pointer<flags> <operands>".
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2779
2780InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2781 VPCostContext &Ctx) const {
2782 // A blend will be expanded to a select VPInstruction, which will generate a
2783 // scalar select if only the first lane is used.
2784 if (vputils::onlyFirstLaneUsed(Def: this))
2785 VF = ElementCount::getFixed(MinVal: 1);
2786
2787 Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
2788 Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF);
2789 return (getNumIncomingValues() - 1) *
2790 Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
2791 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
2792}
2793
2794#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2795void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2796 VPSlotTracker &SlotTracker) const {
2797 O << Indent << "BLEND ";
2798 printAsOperand(O, SlotTracker);
2799 O << " =";
2800 printFlags(O);
2801 if (getNumIncomingValues() == 1) {
2802 // Not a User of any mask: not really blending, this is a
2803 // single-predecessor phi.
2804 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2805 } else {
2806 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2807 if (I != 0)
2808 O << " ";
2809 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2810 if (I == 0 && isNormalized())
2811 continue;
2812 O << "/";
2813 getMask(I)->printAsOperand(O, SlotTracker);
2814 }
2815 }
2816}
2817#endif
2818
2819void VPReductionRecipe::execute(VPTransformState &State) {
2820 assert(!State.Lane && "Reduction being replicated.");
2821 RecurKind Kind = getRecurrenceKind();
2822 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2823 "In-loop AnyOf reductions aren't currently supported");
2824 // Propagate the fast-math flags carried by the underlying instruction.
2825 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2826 State.Builder.setFastMathFlags(getFastMathFlags());
2827 Value *NewVecOp = State.get(Def: getVecOp());
2828 if (VPValue *Cond = getCondOp()) {
2829 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
2830 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
2831 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2832
2833 Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
2834 if (State.VF.isVector())
2835 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
2836
2837 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
2838 NewVecOp = Select;
2839 }
2840 Value *NewRed;
2841 Value *NextInChain;
2842 if (isOrdered()) {
2843 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2844 if (State.VF.isVector())
2845 NewRed =
2846 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
2847 else
2848 NewRed = State.Builder.CreateBinOp(
2849 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2850 LHS: PrevInChain, RHS: NewVecOp);
2851 PrevInChain = NewRed;
2852 NextInChain = NewRed;
2853 } else if (isPartialReduction()) {
2854 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2855 "Unexpected partial reduction kind");
2856 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
2857 NewRed = State.Builder.CreateIntrinsic(
2858 RetTy: PrevInChain->getType(),
2859 ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2860 : Intrinsic::vector_partial_reduce_fadd,
2861 Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
2862 Name: "partial.reduce");
2863 PrevInChain = NewRed;
2864 NextInChain = NewRed;
2865 } else {
2866 assert(isInLoop() &&
2867 "The reduction must either be ordered, partial or in-loop");
2868 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2869 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
2870 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2871 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
2872 else
2873 NextInChain = State.Builder.CreateBinOp(
2874 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2875 LHS: PrevInChain, RHS: NewRed);
2876 }
2877 State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
2878}
2879
/// Generate IR for one EVL-based (vector-predicated) reduction step.
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(Def: getVecOp());
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));

  // A mask is always passed along with EVL; use an all-true splat when the
  // recipe has no condition.
  Value *Mask;
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(Def: CondOp);
  else
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    // Ordered reductions fold the chain value in directly.
    NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
  } else {
    // Otherwise reduce the vector first, then combine with the previous
    // chain value.
    NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
    else
      NewRed = Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
          RHS: Prev);
  }
  State.set(Def: this, V: NewRed, /*IsScalar*/ true);
}
2913
/// Compute the cost of this reduction: a partial-reduction cost (plus a
/// select for conditional reductions), a min/max reduction cost, or an
/// arithmetic reduction cost, depending on the recurrence kind.
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(V: this);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
  FastMathFlags FMFs = getFastMathFlags();
  // Fast-math flags only apply to floating-point reductions.
  std::optional<FastMathFlags> OptionalFMF =
      ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;

  if (isPartialReduction()) {
    // Conditional partial reductions pay for the select masking the input.
    InstructionCost CondCost = 0;
    if (isConditional()) {
      CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
      auto *CondTy = cast<VectorType>(
          Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF));
      CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
                                            CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
    }
    return CondCost + Ctx.TTI.getPartialReductionCost(
                          Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
                          OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
                          FMF: OptionalFMF);
  }

  // TODO: Support any-of reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getMinMaxReductionCost().
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
                                            CostKind: Ctx.CostKind);
}
2957
/// Bundle \p ExpressionRecipes into a single expression recipe: detach them
/// from their parent blocks, clone any recipe with users outside the
/// expression, and replace operands defined outside the expression with
/// placeholder values, keeping the original external operands as operands of
/// the expression recipe itself.
VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
      ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?");
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(Ptr: R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
    if (R != ExpressionRecipes.back() &&
        any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
        })) {
      // There are users outside of the expression. Clone the recipe and use the
      // clone those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
                                                   VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
      });
      CopyForExtUsers->insertBefore(InsertPos: R);
    }
    // Detach the recipe from its block; it now lives inside the expression.
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
  for (auto *R : ExpressionRecipes) {
    for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
      auto *Def = Op->getDefiningRecipe();
      if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
        continue;
      addOperand(Operand: Op);
      LiveInPlaceholders.push_back(Elt: new VPSymbolicValue());
    }
  }

  // Replace each external operand with the first one created for it in
  // LiveInPlaceholders.
  for (auto *R : ExpressionRecipes)
    for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
      R->replaceUsesOfWith(From: LiveIn, To: Tmp);
}
3015
/// Undo the bundling performed by the constructor: re-insert the contained
/// recipes before this recipe, restore their original external operands, and
/// forward all users of the expression to its final recipe.
void VPExpressionRecipe::decompose() {
  for (auto *R : ExpressionRecipes)
    // Since the list could contain duplicates, make sure the recipe hasn't
    // already been inserted.
    if (!R->getParent())
      R->insertBefore(InsertPos: this);

  // Swap the placeholder values back to the original external operands.
  for (const auto &[Idx, Op] : enumerate(First: operands()))
    LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);

  // The last recipe produces the expression's result.
  replaceAllUsesWith(New: ExpressionRecipes.back());
  ExpressionRecipes.clear();
}
3029
/// Compute the cost of the bundled expression as a whole, using the combined
/// (extended / mul-accumulate / partial) reduction cost hooks instead of
/// summing the individual recipes' costs.
InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Type *RedTy = Ctx.Types.inferScalarType(V: this);
  auto *SrcVecTy = cast<VectorType>(
      Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(
      Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind());
  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // Recipes are {extend, reduction}, so [1] is the reduction; this shadows
    // the outer Opcode with the same value.
    unsigned Opcode = RecurrenceDescriptor::getOpcode(
        Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind());
    auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
    auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());

    if (RedR->isPartialReduction())
      return Ctx.TTI.getPartialReductionCost(
          Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF,
          OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()),
          OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
          FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                      : std::nullopt);
    else if (!RedTy->isFloatingPointTy())
      // TTI::getExtendedReductionCost only supports integer types.
      return Ctx.TTI.getExtendedReductionCost(
          Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy,
          FMF: std::nullopt, CostKind: Ctx.CostKind);
    else
      return InstructionCost::getInvalid();
  }
  case ExpressionTypes::MulAccReduction:
    return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy,
                                          CostKind: Ctx.CostKind);

  case ExpressionTypes::ExtNegatedMulAccReduction:
    // The negation folds the add into a sub; cost it as such.
    assert(Opcode == Instruction::Add && "Unexpected opcode");
    Opcode = Instruction::Sub;
    [[fallthrough]];
  case ExpressionTypes::ExtMulAccReduction: {
    auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
    if (RedR->isPartialReduction()) {
      // Recipes are {ext0, ext1, mul, reduction}.
      auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
      auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]);
      auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]);
      return Ctx.TTI.getPartialReductionCost(
          Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)),
          InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF,
          OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(
              CastOpc: Ext0R->getOpcode()),
          OpBExtend: TargetTransformInfo::getPartialReductionExtendKind(
              CastOpc: Ext1R->getOpcode()),
          BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind,
          FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                      : std::nullopt);
    }
    return Ctx.TTI.getMulAccReductionCost(
        IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
            Instruction::ZExt,
        RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
  }
  }
  llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}
3092
/// The expression touches memory iff any of its bundled recipes does.
bool VPExpressionRecipe::mayReadOrWriteMemory() const {
  return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
    return R->mayReadFromMemory() || R->mayWriteToMemory();
  });
}
3098
/// Expressions never have side effects: the constructor asserts that no
/// bundled recipe does, re-checked here in debug builds.
bool VPExpressionRecipe::mayHaveSideEffects() const {
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");
  return false;
}
3106
3107bool VPExpressionRecipe::isSingleScalar() const {
3108 // Cannot use vputils::isSingleScalar(), because all external operands
3109 // of the expression will be live-ins while bundled.
3110 auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3111 return RR && !RR->isPartialReduction();
3112}
3113
3114#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3115
/// Print the bundled expression in a reduction-style notation, e.g.
/// "EXPRESSION <result> = <chain> + reduce.add (mul (a ext to T), (b ext to
/// T))", with the exact shape depending on the expression type.
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPRESSION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // Operand 1 is the chain, operand 0 the extended value.
    getOperand(1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(Opcode) << " (";
    getOperand(0)->printAsOperand(O, SlotTracker);
    Red->printFlags(O);

    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType();
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  case ExpressionTypes::ExtNegatedMulAccReduction: {
    // The chain operand is last; the multiply is printed negated.
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (sub (0, mul";
    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
    Mul->printFlags(O);
    O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType() << "), (";
    getOperand(1)->printAsOperand(O, SlotTracker);
    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
      << *Ext1->getResultType() << ")";
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << "))";
    break;
  }
  case ExpressionTypes::MulAccReduction:
  case ExpressionTypes::ExtMulAccReduction: {
    // Both variants print a multiply; the extended form additionally wraps
    // each factor in its extend.
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (";
    O << "mul";
    bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
    auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
                                               : ExpressionRecipes[0]);
    Mul->printFlags(O);
    if (IsExtended)
      O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
        << *Ext0->getResultType() << "), (";
    } else {
      O << ", ";
    }
    getOperand(1)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
        << *Ext1->getResultType() << ")";
    }
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  }
}
3203
3204void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3205 VPSlotTracker &SlotTracker) const {
3206 if (isPartialReduction())
3207 O << Indent << "PARTIAL-REDUCE ";
3208 else
3209 O << Indent << "REDUCE ";
3210 printAsOperand(O, SlotTracker);
3211 O << " = ";
3212 getChainOp()->printAsOperand(O, SlotTracker);
3213 O << " +";
3214 printFlags(O);
3215 O << " reduce."
3216 << Instruction::getOpcodeName(
3217 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
3218 << " (";
3219 getVecOp()->printAsOperand(O, SlotTracker);
3220 if (isConditional()) {
3221 O << ", ";
3222 getCondOp()->printAsOperand(O, SlotTracker);
3223 }
3224 O << ")";
3225}
3226
3227void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3228 VPSlotTracker &SlotTracker) const {
3229 O << Indent << "REDUCE ";
3230 printAsOperand(O, SlotTracker);
3231 O << " = ";
3232 getChainOp()->printAsOperand(O, SlotTracker);
3233 O << " +";
3234 printFlags(O);
3235 O << " vp.reduce."
3236 << Instruction::getOpcodeName(
3237 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
3238 << " (";
3239 getVecOp()->printAsOperand(O, SlotTracker);
3240 O << ", ";
3241 getEVL()->printAsOperand(O, SlotTracker);
3242 if (isConditional()) {
3243 O << ", ";
3244 getCondOp()->printAsOperand(O, SlotTracker);
3245 }
3246 O << ")";
3247}
3248
3249#endif
3250
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
    Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting in
    // a narrower result type. Update the type of the cloned instruction to the
    // correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(Ty: ResultTy);
  }

  // Transfer the recipe's flags (e.g. fast-math, wrapping) and metadata onto
  // the clone.
  RepRecipe->applyFlags(I&: *Cloned);
  RepRecipe->applyMetadata(I&: *Cloned);

  // The recipe carries a compare predicate separately; re-apply it here.
  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    // Single-scalar operands only have a lane-0 value; always read that lane.
    if (vputils::isSingleScalar(VPV: Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  State.set(Def: RepRecipe, V: Cloned, Lane);

  // If we just cloned a new assumption, add it to the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    State.AC->registerAssumption(CI: II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
}
3311
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  // No lane is set when generating code outside replicate regions; such
  // recipes must produce a single scalar (per-lane copies have already been
  // created by unrolling).
  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    // Start from poison on the first lane; later lanes insert into the value
    // accumulated so far.
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF))
            : State.get(Def: this);
    State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
                                                       Lane: *State.Lane));
  }
}
3336
3337bool VPReplicateRecipe::shouldPack() const {
3338 // Find if the recipe is used by a widened recipe via an intervening
3339 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3340 return any_of(Range: users(), P: [](const VPUser *U) {
3341 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
3342 return !vputils::onlyScalarValuesUsed(Def: PredR);
3343 return false;
3344 });
3345}
3346
3347/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3348/// which the legacy cost model computes a SCEV expression when computing the
3349/// address cost. Computing SCEVs for VPValues is incomplete and returns
3350/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3351/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3352static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3353 PredicatedScalarEvolution &PSE,
3354 const Loop *L) {
3355 const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
3356 if (isa<SCEVCouldNotCompute>(Val: Addr))
3357 return Addr;
3358
3359 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr;
3360}
3361
/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  // Depth-first worklist walk over the transitive users of V, looking for a
  // recipe that consumes a visited value as a memory address.
  while (!WorkList.empty()) {
    auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Ptr: Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur);
    // Skip blends that use V only through a compare by checking if any incoming
    // value was already visited.
    if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()),
                         P: [&](unsigned I) {
                           return Seen.contains(
                               Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe());
                         }))
      continue;

    for (VPUser *U : Cur->users()) {
      // Interleave groups access memory through their address operand.
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      // Scalarized loads take the address as operand 0, stores as operand 1.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) {
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(N: 0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(N: 1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarization loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(C&: WorkList, R: Cur->users());
  }
  return false;
}
3411
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform, avoid computing their cost multiple times for now.
  Ctx.SkipCostComputation.insert(Ptr: UI);

  // Replicating (non-single-scalar) recipes cannot be costed for scalable
  // vectors, as the number of lanes is unknown at compile time.
  if (VF.isScalable() && !isSingleScalar())
    return InstructionCost::getInvalid();

  switch (UI->getOpcode()) {
  case Instruction::Alloca:
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind);
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Call: {
    // The callee is always the recipe's last operand.
    auto *CalledFn =
        cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue());

    SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands()));
    SmallVector<Type *, 4> Tys;
    for (const VPValue *ArgOp : ArgOps)
      Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp));

    if (CalledFn->isIntrinsic())
      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
      switch (CalledFn->getIntrinsicID()) {
      case Intrinsic::assume:
      case Intrinsic::lifetime_end:
      case Intrinsic::lifetime_start:
      case Intrinsic::sideeffect:
      case Intrinsic::pseudoprobe:
      case Intrinsic::experimental_noalias_scope_decl: {
        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
                                    ElementCount::getFixed(1), Ctx) == 0 &&
               "scalarizing intrinsic should be free");
        return InstructionCost(0);
      }
      default:
        break;
      }

    Type *ResultTy = Ctx.Types.inferScalarType(V: this);
    InstructionCost ScalarCallCost =
        Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind);
    if (isSingleScalar()) {
      // For intrinsics, use the cheaper of the call cost and the intrinsic
      // cost.
      if (CalledFn->isIntrinsic())
        ScalarCallCost = std::min(
            a: ScalarCallCost,
            b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this,
                                 VF: ElementCount::getFixed(MinVal: 1), Ctx));
      return ScalarCallCost;
    }

    // Replicated call: one scalar call per lane, plus the cost of
    // extracting/inserting the scalar arguments and results.
    return ScalarCallCost * VF.getFixedValue() +
           Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF);
  }
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
  case Instruction::FCmp:
    // Simple arithmetic/compare: scalar cost times the number of lanes.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    InstructionCost ScalarCost =
        getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx);
    if (isSingleScalar())
      return ScalarCost;

    // If any of the operands is from a different replicate region and has its
    // cost skipped, it may have been forced to scalar. Fall back to legacy cost
    // model to avoid cost mis-match.
    if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) {
          auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op);
          if (!PredR)
            return false;
          return Ctx.skipCostComputation(
              UI: dyn_cast_or_null<Instruction>(
                  Val: PredR->getOperand(N: 0)->getUnderlyingValue()),
              IsVector: VF.isVector());
        }))
      break;

    ScalarCost = ScalarCost * VF.getFixedValue() +
                 Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this),
                                              Operands: to_vector(Range: operands()), VF);
    // If the recipe is not predicated (i.e. not in a replicate region), return
    // the scalar cost. Otherwise handle predicated cost.
    if (!getRegion()->isReplicator())
      return ScalarCost;

    // Account for the phi nodes that we will create.
    ScalarCost += VF.getFixedValue() *
                  Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
    return ScalarCost;
  }
  case Instruction::Load:
  case Instruction::Store: {
    bool IsLoad = UI->getOpcode() == Instruction::Load;
    // Loads take the address as operand 0, stores as operand 1.
    const VPValue *PtrOp = getOperand(N: !IsLoad);
    const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L);
    // Fall back to the legacy cost model when SCEV computation for the
    // address fails but the legacy model may succeed.
    if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV))
      break;

    Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0));
    Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp);
    const Align Alignment = getLoadStoreAlignment(I: UI);
    unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace();
    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0));
    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
    bool UsedByLoadStoreAddress =
        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this);
    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
        Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo,
        I: UsedByLoadStoreAddress ? UI : nullptr);

    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF);
    InstructionCost ScalarCost =
        ScalarMemOpCost +
        Ctx.TTI.getAddressComputationCost(
            PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV,
            CostKind: Ctx.CostKind);
    if (isSingleScalar())
      return ScalarCost;

    SmallVector<const VPValue *> OpsToScalarize;
    Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext());
    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
    // don't assign scalarization overhead in general, if the target prefers
    // vectorized addressing or the loaded value is used as part of an address
    // of another load or store.
    if (!UsedByLoadStoreAddress) {
      bool EfficientVectorLoadStore =
          Ctx.TTI.supportsEfficientVectorElementLoadStore();
      if (!(IsLoad && !PreferVectorizedAddressing) &&
          !(!IsLoad && EfficientVectorLoadStore))
        append_range(C&: OpsToScalarize, R: operands());

      if (!EfficientVectorLoadStore)
        ResultTy = Ctx.Types.inferScalarType(V: this);
    }

    TTI::VectorInstrContext VIC =
        IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
    InstructionCost Cost =
        (ScalarCost * VF.getFixedValue()) +
        Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true);

    const VPRegionBlock *ParentRegion = getRegion();
    if (ParentRegion && ParentRegion->isReplicator()) {
      // TODO: Handle loop-invariant pointers in predicated blocks. For now,
      // fall back to the legacy cost model.
      if (!PtrSCEV || Ctx.PSE.getSE()->isLoopInvariant(S: PtrSCEV, L: Ctx.L))
        break;
      // Scale by the probability of executing the predicated block and add
      // the cost of the branch and mask extraction.
      Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
      Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Ctx.CostKind);

      auto *VecI1Ty = VectorType::get(
          ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF);
      Cost += Ctx.TTI.getScalarizationOverhead(
          Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
          /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind);

      if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) {
        // Artificially setting to a high enough value to practically disable
        // vectorization with such operations.
        return 3000000;
      }
    }
    return Cost;
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::Select:
  case Instruction::AddrSpaceCast: {
    // Casts and selects: scalar cost times the number of lanes.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind);
  }

  // Unhandled opcodes (and explicit breaks above) defer to the legacy model.
  return Ctx.getLegacyCost(UI, VF);
}
3636
3637#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3638void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3639 VPSlotTracker &SlotTracker) const {
3640 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3641
3642 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3643 printAsOperand(O, SlotTracker);
3644 O << " = ";
3645 }
3646 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3647 O << "call";
3648 printFlags(O);
3649 O << "@" << CB->getCalledFunction()->getName() << "(";
3650 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
3651 O, [&O, &SlotTracker](VPValue *Op) {
3652 Op->printAsOperand(O, SlotTracker);
3653 });
3654 O << ")";
3655 } else {
3656 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
3657 printFlags(O);
3658 printOperands(O, SlotTracker);
3659 }
3660
3661 if (shouldPack())
3662 O << " (S->V)";
3663}
3664#endif
3665
3666void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
3667 assert(State.Lane && "Branch on Mask works only on single instance.");
3668
3669 VPValue *BlockInMask = getOperand(N: 0);
3670 Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);
3671
3672 // Replace the temporary unreachable terminator with a new conditional branch,
3673 // whose two destinations will be set later when they are created.
3674 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3675 assert(isa<UnreachableInst>(CurrentTerminator) &&
3676 "Expected to replace unreachable terminator with conditional branch.");
3677 auto CondBr =
3678 State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
3679 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
3680 CurrentTerminator->eraseFromParent();
3681}
3682
3683InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3684 VPCostContext &Ctx) const {
3685 // The legacy cost model doesn't assign costs to branches for individual
3686 // replicate regions. Match the current behavior in the VPlan cost model for
3687 // now.
3688 return 0;
3689}
3690
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(Def: getOperand(N: 0))) {
    auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0)));
    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
           "Packed operands must generate an insertelement or insertvalue");

    // If VectorI is a struct, it will be a sequence like:
    //   %1 = insertvalue %unmodified, %x, 0
    //   %2 = insertvalue %1, %y, 1
    //   %VectorI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
    if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType()))
      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
        VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0));

    // Phi between the vector before this lane's insert (from the predicating
    // block) and the vector with the inserted element (from the predicated
    // block).
    PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2);
    VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(Def: this))
      State.reset(Def: this, V: VPhi);
    else
      State.set(Def: this, V: VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: VPhi);
  } else {
    // Scalar path: skip lanes other than the first if only lane 0 is used.
    if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
      return;

    Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
    PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
    // Poison on the edge where the predicated block did not execute.
    Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
                     BB: PredicatingBB);
    Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
    if (State.hasScalarValue(Def: this, Lane: *State.Lane))
      State.reset(Def: this, V: Phi, Lane: *State.Lane);
    else
      State.set(Def: this, V: Phi, Lane: *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
  }
}
3749
3750#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  // Prints as: PHI-PREDICATED-INSTRUCTION <result> = <operand>.
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
3758#endif
3759
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
                        ? Instruction::Load
                        : Instruction::Store;

  // Non-consecutive accesses are lowered to gathers/scatters; cost them as
  // the corresponding (masked or VP) intrinsic.
  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    assert(!Reverse &&
           "Inconsecutive memory access should not have the order.");

    const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
    Type *PtrTy = Ptr->getType();

    // If the address value is uniform across all lanes, then the address can be
    // calculated with scalar type and broadcast.
    if (!vputils::isSingleScalar(VPV: getAddr()))
      PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);

    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather
                   : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter
                   : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather
                                                  : Intrinsic::vp_scatter;
    return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
                                             CostKind: Ctx.CostKind) +
           Ctx.TTI.getMemIntrinsicInstrCost(
               MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
                                           &Ingredient),
               CostKind: Ctx.CostKind);
  }

  // Consecutive access: masked load/store intrinsic if masked, otherwise a
  // plain wide memory operation.
  InstructionCost Cost = 0;
  if (IsMasked) {
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load
                                                 : Intrinsic::masked_store;
    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
  } else {
    // Operand info comes from the loaded value (loads) or stored value
    // (stores, operand 1).
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
        V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
                                                             : getOperand(N: 1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
                                    OpdInfo: OpInfo, I: &Ingredient);
  }
  return Cost;
}
3811
3812void VPWidenLoadRecipe::execute(VPTransformState &State) {
3813 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3814 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3815 bool CreateGather = !isConsecutive();
3816
3817 auto &Builder = State.Builder;
3818 Value *Mask = nullptr;
3819 if (auto *VPMask = getMask()) {
3820 // Mask reversal is only needed for non-all-one (null) masks, as reverse
3821 // of a null all-one mask is a null mask.
3822 Mask = State.get(Def: VPMask);
3823 if (isReverse())
3824 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3825 }
3826
3827 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
3828 Value *NewLI;
3829 if (CreateGather) {
3830 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
3831 Name: "wide.masked.gather");
3832 } else if (Mask) {
3833 NewLI =
3834 Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
3835 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
3836 } else {
3837 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
3838 }
3839 applyMetadata(I&: *cast<Instruction>(Val: NewLI));
3840 State.set(Def: this, V: NewLI);
3841}
3842
3843#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Prints as: WIDEN <result> = load <operands>.
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
3851#endif
3852
3853/// Use all-true mask for reverse rather than actual mask, as it avoids a
3854/// dependence w/o affecting the result.
3855static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3856 Value *EVL, const Twine &Name) {
3857 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
3858 Value *AllTrueMask =
3859 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
3860 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
3861 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
3862}
3863
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  CallInst *NewLI;
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
  Value *Mask = nullptr;
  // VP intrinsics require an explicit mask; splat an all-true mask when the
  // recipe is unmasked.
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  }

  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
                                FMFSource: nullptr, Name: "wide.masked.gather");
  } else {
    NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load,
                                    Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
  }
  // Record the alignment as a parameter attribute on the pointer argument
  // (argument 0 for both vp.gather and vp.load).
  NewLI->addParamAttr(
      ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
  applyMetadata(I&: *NewLI);
  Instruction *Res = NewLI;
  State.set(Def: this, V: Res);
}
3896
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  // Gathers and masked loads are costed by the base class.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because EVL recipes use EVL to replace the tail mask, while the
  // legacy model always includes the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
3914
3915#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  // Prints as: WIDEN <result> = vp.load <operands>.
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
3923#endif
3924
3925void VPWidenStoreRecipe::execute(VPTransformState &State) {
3926 VPValue *StoredVPValue = getStoredValue();
3927 bool CreateScatter = !isConsecutive();
3928
3929 auto &Builder = State.Builder;
3930
3931 Value *Mask = nullptr;
3932 if (auto *VPMask = getMask()) {
3933 // Mask reversal is only needed for non-all-one (null) masks, as reverse
3934 // of a null all-one mask is a null mask.
3935 Mask = State.get(Def: VPMask);
3936 if (isReverse())
3937 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3938 }
3939
3940 Value *StoredVal = State.get(Def: StoredVPValue);
3941 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
3942 Instruction *NewSI = nullptr;
3943 if (CreateScatter)
3944 NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
3945 else if (Mask)
3946 NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
3947 else
3948 NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
3949 applyMetadata(I&: *NewSI);
3950}
3951
3952#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Prints as: WIDEN store <operands>.
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
3958#endif
3959
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue);
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  Value *Mask = nullptr;
  // VP intrinsics require an explicit mask; splat an all-true mask when the
  // recipe is unmasked.
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
  } else {
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  }
  Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_scatter,
                                    Args: {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_store,
                                    Args: {StoredVal, Addr, Mask, EVL});
  }
  // Record the alignment as a parameter attribute on the pointer argument
  // (argument 1 for both vp.scatter and vp.store).
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  applyMetadata(I&: *NewSI);
}
3991
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  // Scatters and masked stores are costed by the base class.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because EVL recipes use EVL to replace the tail mask, while the
  // legacy model always includes the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
4009
4010#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print a human-readable representation of this recipe: the "WIDEN vp.store"
/// tag followed by its operands (stored value, address, EVL, optional mask).
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
4016#endif
4017
4018static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
4019 VectorType *DstVTy, const DataLayout &DL) {
4020 // Verify that V is a vector type with same number of elements as DstVTy.
4021 auto VF = DstVTy->getElementCount();
4022 auto *SrcVecTy = cast<VectorType>(Val: V->getType());
4023 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
4024 Type *SrcElemTy = SrcVecTy->getElementType();
4025 Type *DstElemTy = DstVTy->getElementType();
4026 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
4027 "Vector elements must have same size");
4028
4029 // Do a direct cast if element types are castable.
4030 if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
4031 return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
4032 }
4033 // V cannot be directly casted to desired vector type.
4034 // May happen when V is a floating point vector but DstVTy is a vector of
4035 // pointers or vice-versa. Handle this using a two-step bitcast using an
4036 // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
4037 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
4038 "Only one type should be a pointer type");
4039 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
4040 "Only one type should be a floating point type");
4041 Type *IntTy =
4042 IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
4043 auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
4044 Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
4045 return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
4046}
4047
4048/// Return a vector containing interleaved elements from multiple
4049/// smaller input vectors.
4050static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
4051 const Twine &Name) {
4052 unsigned Factor = Vals.size();
4053 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4054
4055 VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
4056#ifndef NDEBUG
4057 for (Value *Val : Vals)
4058 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4059#endif
4060
4061 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4062 // must use intrinsics to interleave.
4063 if (VecTy->isScalableTy()) {
4064 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4065 return Builder.CreateVectorInterleave(Ops: Vals, Name);
4066 }
4067
4068 // Fixed length. Start by concatenating all vectors into a wide vector.
4069 Value *WideVec = concatenateVectors(Builder, Vecs: Vals);
4070
4071 // Interleave the elements into the wide vector.
4072 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4073 return Builder.CreateShuffleVector(
4074 V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
4075}
4076
4077// Try to vectorize the interleave group that \p Instr belongs to.
4078//
4079// E.g. Translate following interleaved load group (factor = 3):
4080// for (i = 0; i < N; i+=3) {
4081// R = Pic[i]; // Member of index 0
4082// G = Pic[i+1]; // Member of index 1
4083// B = Pic[i+2]; // Member of index 2
4084// ... // do something to R, G, B
4085// }
4086// To:
4087// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4088// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4089// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4090// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4091//
4092// Or translate following interleaved store group (factor = 3):
4093// for (i = 0; i < N; i+=3) {
4094// ... do something to R, G, B
4095// Pic[i] = R; // Member of index 0
4096// Pic[i+1] = G; // Member of index 1
4097// Pic[i+2] = B; // Member of index 2
4098// }
4099// To:
4100// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4101// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4102// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4103// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4104// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);

  VPValue *BlockInMask = getMask();
  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));

  // Build the mask for the wide access: the block-in mask (if any) is
  // replicated so each of its lanes covers all InterleaveFactor members of a
  // tuple, then combined with the gap mask (if any).
  auto CreateGroupMask = [&BlockInMask, &State,
                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *ResBlockInMask = State.get(Def: BlockInMask);
      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
      return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *ResBlockInMask = State.get(Def: BlockInMask);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        V: ResBlockInMask,
        Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
        Name: "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
                                                   LHS: ShuffledMask, RHS: MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    Value *MaskForGaps = nullptr;
    if (needsMaskForGaps()) {
      MaskForGaps =
          createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // Emit a single wide (possibly masked) load covering the whole group.
    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(MaskForGaps);
      Value *PoisonVec = PoisonValue::get(T: VecTy);
      NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
                                               Alignment: Group->getAlign(), Mask: GroupMask,
                                               PassThru: PoisonVec, Name: "wide.masked.vec");
    } else
      NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
                                                Align: Group->getAlign(), Name: "wide.vec");
    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    ArrayRef<VPRecipeValue *> VPDefs = definedValues();
    if (VecTy->isScalableTy()) {
      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so must use intrinsics to deinterleave.
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      NewLoad = State.Builder.CreateIntrinsic(
          ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
          Types: NewLoad->getType(), Args: NewLoad,
          /*FMFSource=*/nullptr, Name: "strided.vec");
    }

    // Extract the sub-vector for member Index from the wide load: an
    // extractvalue from the deinterleave result (scalable) or a strided
    // shuffle (fixed-width).
    auto CreateStridedVector = [&InterleaveFactor, &State,
                                &NewLoad](unsigned Index) -> Value * {
      assert(Index < InterleaveFactor && "Illegal group index");
      if (State.VF.isScalable())
        return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index);

      // For fixed length VF, use shuffle to extract the sub-vectors from the
      // wide load.
      auto StrideMask =
          createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
      return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask,
                                               Name: "strided.vec");
    };

    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = CreateStridedVector(I);

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");

      State.set(Def: VPDefs[J], V: StridedVec);
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
         "Mismatch between NeedsMaskForGaps and MaskForGaps");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Fail to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(Index: i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(T: SubVT);
      StoredVecs.push_back(Elt: Undef);
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");

    // If this member has different type, cast it to a unified type.

    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  // Emit a single wide (possibly masked) store covering the whole group.
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());

  applyMetadata(I&: *NewStoreInstr);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStoreInstr);
}
4273
4274#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4275void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4276 VPSlotTracker &SlotTracker) const {
4277 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4278 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4279 IG->getInsertPos()->printAsOperand(O, false);
4280 O << ", ";
4281 getAddr()->printAsOperand(O, SlotTracker);
4282 VPValue *Mask = getMask();
4283 if (Mask) {
4284 O << ", ";
4285 Mask->printAsOperand(O, SlotTracker);
4286 }
4287
4288 unsigned OpIdx = 0;
4289 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4290 if (!IG->getMember(i))
4291 continue;
4292 if (getNumStoreOperands() > 0) {
4293 O << "\n" << Indent << " store ";
4294 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
4295 O << " to index " << i;
4296 } else {
4297 O << "\n" << Indent << " ";
4298 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4299 O << " = load from index " << i;
4300 }
4301 ++OpIdx;
4302 }
4303}
4304#endif
4305
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert(State.VF.isScalable() &&
         "Only support scalable VF for EVL tail-folding.");
  assert(!needsMaskForGaps() &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(InterleaveFactor <= 8 &&
         "Unsupported deinterleave/interleave factor for scalable vectors");
  ElementCount WideVF = State.VF * InterleaveFactor;
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF);

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  // Scale the EVL by the interleave factor: the wide access touches
  // InterleaveFactor elements per original lane. The multiply carries
  // nuw/nsw flags.
  Value *InterleaveEVL = State.Builder.CreateMul(
      LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl",
      /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true);
  LLVMContext &Ctx = State.Builder.getContext();

  // Build the group mask: replicate the block-in mask across all members of
  // a tuple, or use an all-true splat when there is no block-in mask.
  Value *GroupMask = nullptr;
  if (VPValue *BlockInMask = getMask()) {
    SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask));
    GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
  } else {
    GroupMask =
        State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue());
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    CallInst *NewLoad = State.Builder.CreateIntrinsic(
        RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr,
        Name: "wide.vp.load");
    // Attach the group alignment to the pointer argument of vp.load.
    NewLoad->addParamAttr(ArgNo: 0,
                          Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
    // so must use intrinsics to deinterleave.
    NewLoad = State.Builder.CreateIntrinsic(
        ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
        Types: NewLoad->getType(), Args: NewLoad,
        /*FMFSource=*/nullptr, Name: "strided.vec");

    const DataLayout &DL = Instr->getDataLayout();
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);
      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I);
      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      State.set(Def: getVPValue(I: J), V: StridedVec);
      ++J;
    }
    return;
  } // End for interleaved load.

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
  // Vectorize the interleaved store group.
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  const DataLayout &DL = Instr->getDataLayout();
  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
    Instruction *Member = Group->getMember(Index: I);
    // Skip the gaps in the group.
    if (!Member) {
      StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT));
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    // If this member has different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
    ++StoredIdx;
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  CallInst *NewStore =
      State.Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store,
                                    Args: {IVec, ResAddr, GroupMask, InterleaveEVL});
  // Attach the group alignment to the pointer argument of vp.store.
  NewStore->addParamAttr(ArgNo: 1,
                         Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

  applyMetadata(I&: *NewStore);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStore);
}
4416
4417#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4418void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4419 VPSlotTracker &SlotTracker) const {
4420 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4421 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4422 IG->getInsertPos()->printAsOperand(O, false);
4423 O << ", ";
4424 getAddr()->printAsOperand(O, SlotTracker);
4425 O << ", ";
4426 getEVL()->printAsOperand(O, SlotTracker);
4427 if (VPValue *Mask = getMask()) {
4428 O << ", ";
4429 Mask->printAsOperand(O, SlotTracker);
4430 }
4431
4432 unsigned OpIdx = 0;
4433 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4434 if (!IG->getMember(i))
4435 continue;
4436 if (getNumStoreOperands() > 0) {
4437 O << "\n" << Indent << " vp.store ";
4438 getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
4439 O << " to index " << i;
4440 } else {
4441 O << "\n" << Indent << " ";
4442 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4443 O << " = vp.load from index " << i;
4444 }
4445 ++OpIdx;
4446 }
4447}
4448#endif
4449
4450InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
4451 VPCostContext &Ctx) const {
4452 Instruction *InsertPos = getInsertPos();
4453 // Find the VPValue index of the interleave group. We need to skip gaps.
4454 unsigned InsertPosIdx = 0;
4455 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4456 if (auto *Member = IG->getMember(Index: Idx)) {
4457 if (Member == InsertPos)
4458 break;
4459 InsertPosIdx++;
4460 }
4461 Type *ValTy = Ctx.Types.inferScalarType(
4462 V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx)
4463 : getStoredValues()[InsertPosIdx]);
4464 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4465 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
4466 ->getAddressSpace();
4467
4468 unsigned InterleaveFactor = IG->getFactor();
4469 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
4470
4471 // Holds the indices of existing members in the interleaved group.
4472 SmallVector<unsigned, 4> Indices;
4473 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4474 if (IG->getMember(Index: IF))
4475 Indices.push_back(Elt: IF);
4476
4477 // Calculate the cost of the whole interleaved group.
4478 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4479 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices,
4480 Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps);
4481
4482 if (!IG->isReverse())
4483 return Cost;
4484
4485 return Cost + IG->getNumMembers() *
4486 Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse,
4487 DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind,
4488 Index: 0);
4489}
4490
4491#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the canonical induction phi: "EMIT <def> = CANONICAL-INDUCTION <ops>".
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4499#endif
4500
4501bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
4502 return vputils::onlyScalarValuesUsed(Def: this) &&
4503 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
4504}
4505
4506#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the widened pointer induction: start, step, and the remaining
/// operands (3 or 5 in total).
void VPWidenPointerInductionRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
         "unexpected number of operands");
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  // Print the two extra operands when the 5-operand form is used.
  if (getNumOperands() == 5) {
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(4)->printAsOperand(O, SlotTracker);
  }
}
4526
/// Print the SCEV-expansion recipe including the expression being expanded.
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
4533#endif
4534
4535void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
4536 Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
4537 Type *STy = CanonicalIV->getType();
4538 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4539 ElementCount VF = State.VF;
4540 Value *VStart = VF.isScalar()
4541 ? CanonicalIV
4542 : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast");
4543 Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U: *this));
4544 if (VF.isVector()) {
4545 VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
4546 VStep =
4547 Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
4548 }
4549 Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv");
4550 State.set(Def: this, V: CanonicalVectorIV);
4551}
4552
4553#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the widened canonical induction:
/// "EMIT <def> = WIDEN-CANONICAL-INDUCTION <ops>".
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4561#endif
4562
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  if (State.VF.isVector()) {
    // Insert the scalar start value into the last lane of a poison vector.
    // The insert is emitted in the preheader (guarded insert point) so it
    // dominates the phi created below.
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    // Last lane index is RuntimeVF - 1 (VF may only be known at runtime for
    // scalable vectors).
    auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
    auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
    VectorInit = Builder.CreateInsertElement(
        Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
  Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(V: VectorInit, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
4591
4592InstructionCost
4593VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
4594 VPCostContext &Ctx) const {
4595 if (VF.isScalar())
4596 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4597
4598 return 0;
4599}
4600
4601#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the first-order recurrence phi and its operands.
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4609#endif
4610
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  // In-loop reductions and scalar VFs keep the phi scalar; otherwise the
  // start value (and so the phi) is widened.
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
  Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
  State.set(Def: this, V: Phi, IsScalar: isInLoop());

  // Add the start value as the preheader incoming value; the backedge input
  // is filled in at a later stage (see comment above).
  Phi->addIncoming(V: StartV, BB: VectorPH);
}
4635
4636#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the reduction phi, its flags, operands, and (if present) the VF
/// scale factor annotation.
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printOperands(O, SlotTracker);
  // Annotate phis that accumulate at a VF scaled down from the loop VF.
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
4648#endif
4649
4650void VPWidenPHIRecipe::execute(VPTransformState &State) {
4651 Value *Op0 = State.get(Def: getOperand(N: 0));
4652 Type *VecTy = Op0->getType();
4653 Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name);
4654 State.set(Def: this, V: VecPhi);
4655}
4656
/// Cost of a widened phi: the target's cost for a PHI instruction.
InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
4661
4662#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the widened phi with its incoming (phi) operands.
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
4671#endif
4672
4673void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
4674 BasicBlock *VectorPH =
4675 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
4676 Value *StartMask = State.get(Def: getOperand(N: 0));
4677 PHINode *Phi =
4678 State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
4679 Phi->addIncoming(V: StartMask, BB: VectorPH);
4680 State.set(Def: this, V: Phi);
4681}
4682
4683#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the active-lane-mask phi and its operands.
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4692#endif
4693
4694#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the current-iteration phi and its operands.
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "CURRENT-ITERATION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4703#endif
4704