//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "LoopVectorizationPlanner.h"
#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "VPlanHelpers.h"
#include "VPlanPatternMatch.h"
#include "VPlanUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <cassert>

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

using VectorParts = SmallVector<Value *, 2>;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

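// Conservative memory-effect queries for recipes. Recipe kinds without an
// explicit case in the switches below fall through to the default and are
// assumed to have the queried effect.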
50bool VPRecipeBase::mayWriteToMemory() const {
51 switch (getVPRecipeID()) {
52 case VPExpressionSC:
53 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
54 case VPInstructionSC: {
55 auto *VPI = cast<VPInstruction>(Val: this);
56 // Loads read from memory but don't write to memory.
57 if (VPI->getOpcode() == Instruction::Load)
58 return false;
59 return VPI->opcodeMayReadOrWriteFromMemory();
60 }
61 case VPInterleaveEVLSC:
62 case VPInterleaveSC:
63 return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
64 case VPWidenStoreEVLSC:
65 case VPWidenStoreSC:
66 return true;
67 case VPReplicateSC:
68 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
69 ->mayWriteToMemory();
70 case VPWidenCallSC:
71 return !cast<VPWidenCallRecipe>(Val: this)
72 ->getCalledScalarFunction()
73 ->onlyReadsMemory();
74 case VPWidenIntrinsicSC:
75 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
76 case VPActiveLaneMaskPHISC:
77 case VPCanonicalIVPHISC:
78 case VPBranchOnMaskSC:
79 case VPDerivedIVSC:
80 case VPFirstOrderRecurrencePHISC:
81 case VPReductionPHISC:
82 case VPScalarIVStepsSC:
83 case VPPredInstPHISC:
84 return false;
85 case VPBlendSC:
86 case VPReductionEVLSC:
87 case VPReductionSC:
88 case VPVectorPointerSC:
89 case VPWidenCanonicalIVSC:
90 case VPWidenCastSC:
91 case VPWidenGEPSC:
92 case VPWidenIntOrFpInductionSC:
93 case VPWidenLoadEVLSC:
94 case VPWidenLoadSC:
95 case VPWidenPHISC:
96 case VPWidenPointerInductionSC:
97 case VPWidenSC: {
98 const Instruction *I =
99 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
100 (void)I;
101 assert((!I || !I->mayWriteToMemory()) &&
102 "underlying instruction may write to memory");
103 return false;
104 }
105 default:
106 return true;
107 }
108}
109
110bool VPRecipeBase::mayReadFromMemory() const {
111 switch (getVPRecipeID()) {
112 case VPExpressionSC:
113 return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
114 case VPInstructionSC:
115 return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
116 case VPWidenLoadEVLSC:
117 case VPWidenLoadSC:
118 return true;
119 case VPReplicateSC:
120 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
121 ->mayReadFromMemory();
122 case VPWidenCallSC:
123 return !cast<VPWidenCallRecipe>(Val: this)
124 ->getCalledScalarFunction()
125 ->onlyWritesMemory();
126 case VPWidenIntrinsicSC:
127 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
128 case VPBranchOnMaskSC:
129 case VPDerivedIVSC:
130 case VPFirstOrderRecurrencePHISC:
131 case VPReductionPHISC:
132 case VPPredInstPHISC:
133 case VPScalarIVStepsSC:
134 case VPWidenStoreEVLSC:
135 case VPWidenStoreSC:
136 return false;
137 case VPBlendSC:
138 case VPReductionEVLSC:
139 case VPReductionSC:
140 case VPVectorPointerSC:
141 case VPWidenCanonicalIVSC:
142 case VPWidenCastSC:
143 case VPWidenGEPSC:
144 case VPWidenIntOrFpInductionSC:
145 case VPWidenPHISC:
146 case VPWidenPointerInductionSC:
147 case VPWidenSC: {
148 const Instruction *I =
149 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
150 (void)I;
151 assert((!I || !I->mayReadFromMemory()) &&
152 "underlying instruction may read from memory");
153 return false;
154 }
155 default:
156 // FIXME: Return false if the recipe represents an interleaved store.
157 return true;
158 }
159}
160
161bool VPRecipeBase::mayHaveSideEffects() const {
162 switch (getVPRecipeID()) {
163 case VPExpressionSC:
164 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
165 case VPActiveLaneMaskPHISC:
166 case VPDerivedIVSC:
167 case VPFirstOrderRecurrencePHISC:
168 case VPReductionPHISC:
169 case VPPredInstPHISC:
170 case VPVectorEndPointerSC:
171 return false;
172 case VPInstructionSC: {
173 auto *VPI = cast<VPInstruction>(Val: this);
174 return mayWriteToMemory() ||
175 VPI->getOpcode() == VPInstruction::BranchOnCount ||
176 VPI->getOpcode() == VPInstruction::BranchOnCond ||
177 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
178 }
179 case VPWidenCallSC: {
180 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
181 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
182 }
183 case VPWidenIntrinsicSC:
184 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
185 case VPBlendSC:
186 case VPReductionEVLSC:
187 case VPReductionSC:
188 case VPScalarIVStepsSC:
189 case VPVectorPointerSC:
190 case VPWidenCanonicalIVSC:
191 case VPWidenCastSC:
192 case VPWidenGEPSC:
193 case VPWidenIntOrFpInductionSC:
194 case VPWidenPHISC:
195 case VPWidenPointerInductionSC:
196 case VPWidenSC: {
197 const Instruction *I =
198 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
199 (void)I;
200 assert((!I || !I->mayHaveSideEffects()) &&
201 "underlying instruction has side-effects");
202 return false;
203 }
204 case VPInterleaveEVLSC:
205 case VPInterleaveSC:
206 return mayWriteToMemory();
207 case VPWidenLoadEVLSC:
208 case VPWidenLoadSC:
209 case VPWidenStoreEVLSC:
210 case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
216 return mayWriteToMemory();
217 case VPReplicateSC: {
218 auto *R = cast<VPReplicateRecipe>(Val: this);
219 return R->getUnderlyingInstr()->mayHaveSideEffects();
220 }
221 default:
222 return true;
223 }
224}
225
226void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
227 assert(!Parent && "Recipe already in some VPBasicBlock");
228 assert(InsertPos->getParent() &&
229 "Insertion position not in any VPBasicBlock");
230 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
231}
232
233void VPRecipeBase::insertBefore(VPBasicBlock &BB,
234 iplist<VPRecipeBase>::iterator I) {
235 assert(!Parent && "Recipe already in some VPBasicBlock");
236 assert(I == BB.end() || I->getParent() == &BB);
237 BB.insert(Recipe: this, InsertPt: I);
238}
239
240void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
241 assert(!Parent && "Recipe already in some VPBasicBlock");
242 assert(InsertPos->getParent() &&
243 "Insertion position not in any VPBasicBlock");
244 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
245}
246
247void VPRecipeBase::removeFromParent() {
248 assert(getParent() && "Recipe not in any VPBasicBlock");
249 getParent()->getRecipeList().remove(IT: getIterator());
250 Parent = nullptr;
251}
252
253iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
254 assert(getParent() && "Recipe not in any VPBasicBlock");
255 return getParent()->getRecipeList().erase(where: getIterator());
256}
257
258void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
259 removeFromParent();
260 insertAfter(InsertPos);
261}
262
263void VPRecipeBase::moveBefore(VPBasicBlock &BB,
264 iplist<VPRecipeBase>::iterator I) {
265 removeFromParent();
266 insertBefore(BB, I);
267}
268
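// Compute the cost of the recipe for VF. The cost is 0 if the cost model
// skips the underlying instruction; otherwise, if ForceTargetInstructionCost
// is set and the computed cost is valid, recipes with an underlying
// instruction get the forced cost and all other recipes get a cost of 0.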
269InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to:
  //  * decide if cost computation should be skipped for this recipe,
  //  * apply the forced target instruction cost.
274 Instruction *UI = nullptr;
275 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
276 UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
277 else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
278 UI = IG->getInsertPos();
279 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
280 UI = &WidenMem->getIngredient();
281
282 InstructionCost RecipeCost;
283 if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
284 RecipeCost = 0;
285 } else {
286 RecipeCost = computeCost(VF, Ctx);
287 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
288 RecipeCost.isValid()) {
289 if (UI)
290 RecipeCost = InstructionCost(ForceTargetInstructionCost);
291 else
292 RecipeCost = InstructionCost(0);
293 }
294 }
295
296 LLVM_DEBUG({
297 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
298 dump();
299 });
300 return RecipeCost;
301}
302
303InstructionCost VPRecipeBase::computeCost(ElementCount VF,
304 VPCostContext &Ctx) const {
305 llvm_unreachable("subclasses should implement computeCost");
306}
307
308bool VPRecipeBase::isPhi() const {
309 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
310 isa<VPPhi, VPIRPhi>(Val: this);
311}
312
313bool VPRecipeBase::isScalarCast() const {
314 auto *VPI = dyn_cast<VPInstruction>(Val: this);
315 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
316}
317
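// Intersect the flags of this recipe with those of \p Other, keeping only
// flags that are set in both. Flags that cannot be merged (compare
// predicates, recurrence kind and ordering) are asserted to match instead.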
318void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
319 assert(OpType == Other.OpType && "OpType must match");
320 switch (OpType) {
321 case OperationType::OverflowingBinOp:
322 WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
323 WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
324 break;
325 case OperationType::Trunc:
326 TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
327 TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
328 break;
329 case OperationType::DisjointOp:
330 DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
331 break;
332 case OperationType::PossiblyExactOp:
333 ExactFlags.IsExact &= Other.ExactFlags.IsExact;
334 break;
335 case OperationType::GEPOp:
336 GEPFlags &= Other.GEPFlags;
337 break;
338 case OperationType::FPMathOp:
339 case OperationType::FCmp:
340 assert((OpType != OperationType::FCmp ||
341 FCmpFlags.Pred == Other.FCmpFlags.Pred) &&
342 "Cannot drop CmpPredicate");
343 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
344 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
345 break;
346 case OperationType::NonNegOp:
347 NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
348 break;
349 case OperationType::Cmp:
350 assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
351 break;
352 case OperationType::ReductionOp:
353 assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
354 "Cannot change RecurKind");
355 assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
356 "Cannot change IsOrdered");
357 assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
358 "Cannot change IsInLoop");
359 getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
360 getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
361 break;
362 case OperationType::Other:
363 assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
364 break;
365 }
366}
367
368FastMathFlags VPIRFlags::getFastMathFlags() const {
369 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
370 OpType == OperationType::ReductionOp) &&
371 "recipe doesn't have fast math flags");
372 const FastMathFlagsTy &F = getFMFsRef();
373 FastMathFlags Res;
374 Res.setAllowReassoc(F.AllowReassoc);
375 Res.setNoNaNs(F.NoNaNs);
376 Res.setNoInfs(F.NoInfs);
377 Res.setNoSignedZeros(F.NoSignedZeros);
378 Res.setAllowReciprocal(F.AllowReciprocal);
379 Res.setAllowContract(F.AllowContract);
380 Res.setApproxFunc(F.ApproxFunc);
381 return Res;
382}
383
384#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
385void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
386
387void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
388 VPSlotTracker &SlotTracker) const {
389 printRecipe(O, Indent, SlotTracker);
390 if (auto DL = getDebugLoc()) {
391 O << ", !dbg ";
392 DL.print(O);
393 }
394
395 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
396 Metadata->print(O, SlotTracker);
397}
398#endif
399
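// After unrolling, some recipes carry their unroll part as a trailing
// constant operand at index PartOpIdx; return that operand if present,
// otherwise nullptr (getUnrollPart then defaults to part 0).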
400template <unsigned PartOpIdx>
401VPValue *
402VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
403 if (U.getNumOperands() == PartOpIdx + 1)
404 return U.getOperand(N: PartOpIdx);
405 return nullptr;
406}
407
408template <unsigned PartOpIdx>
409unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
410 if (auto *UnrollPartOp = getUnrollPartOperand(U))
411 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
412 return 0;
413}
414
415namespace llvm {
416template class VPUnrollPartAccessor<1>;
417template class VPUnrollPartAccessor<2>;
418template class VPUnrollPartAccessor<3>;
419}
420
421VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
422 const VPIRFlags &Flags, const VPIRMetadata &MD,
423 DebugLoc DL, const Twine &Name)
424 : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
425 VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
426 assert(flagsValidForOpcode(getOpcode()) &&
427 "Set flags not supported for the provided opcode");
428 assert(hasRequiredFlagsForOpcode(getOpcode()) &&
429 "Opcode requires specific flags to be set");
430 assert((getNumOperandsForOpcode(Opcode) == -1u ||
431 getNumOperandsForOpcode(Opcode) == getNumOperands()) &&
432 "number of operands does not match opcode");
433}
434
435#ifndef NDEBUG
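// Return the number of operands implied by \p Opcode, or -1u if the opcode
// does not determine a fixed operand count. Only used to assert operand
// counts in the VPInstruction constructor.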
436unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
437 if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
438 return 1;
439
440 if (Instruction::isBinaryOp(Opcode))
441 return 2;
442
443 switch (Opcode) {
444 case VPInstruction::StepVector:
445 case VPInstruction::VScale:
446 return 0;
447 case Instruction::Alloca:
448 case Instruction::ExtractValue:
449 case Instruction::Freeze:
450 case Instruction::Load:
451 case VPInstruction::BranchOnCond:
452 case VPInstruction::Broadcast:
453 case VPInstruction::BuildStructVector:
454 case VPInstruction::BuildVector:
455 case VPInstruction::CalculateTripCountMinusVF:
456 case VPInstruction::CanonicalIVIncrementForPart:
457 case VPInstruction::ComputeReductionResult:
458 case VPInstruction::ExplicitVectorLength:
459 case VPInstruction::ExtractLastLane:
460 case VPInstruction::ExtractLastPart:
461 case VPInstruction::ExtractPenultimateElement:
462 case VPInstruction::Not:
463 case VPInstruction::ResumeForEpilogue:
464 case VPInstruction::Reverse:
465 case VPInstruction::Unpack:
466 return 1;
467 case Instruction::ICmp:
468 case Instruction::FCmp:
469 case Instruction::ExtractElement:
470 case Instruction::Store:
471 case VPInstruction::BranchOnCount:
472 case VPInstruction::BranchOnTwoConds:
473 case VPInstruction::FirstOrderRecurrenceSplice:
474 case VPInstruction::LogicalAnd:
475 case VPInstruction::PtrAdd:
476 case VPInstruction::WidePtrAdd:
477 case VPInstruction::WideIVStep:
478 return 2;
479 case Instruction::Select:
480 case VPInstruction::ActiveLaneMask:
481 case VPInstruction::ComputeAnyOfResult:
482 case VPInstruction::ReductionStartVector:
483 case VPInstruction::ExtractLastActive:
484 return 3;
485 case Instruction::Call:
486 case Instruction::GetElementPtr:
487 case Instruction::PHI:
488 case Instruction::Switch:
489 case VPInstruction::AnyOf:
490 case VPInstruction::FirstActiveLane:
491 case VPInstruction::LastActiveLane:
492 case VPInstruction::SLPLoad:
493 case VPInstruction::SLPStore:
494 case VPInstruction::ExtractLane:
495 // Cannot determine the number of operands from the opcode.
496 return -1u;
497 }
498 llvm_unreachable("all cases should be handled above");
499}
500#endif
501
502bool VPInstruction::doesGeneratePerAllLanes() const {
503 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
504}
505
506bool VPInstruction::canGenerateScalarForFirstLane() const {
507 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
508 return true;
509 if (isSingleScalar() || isVectorToScalar())
510 return true;
511 switch (Opcode) {
512 case Instruction::Freeze:
513 case Instruction::ICmp:
514 case Instruction::PHI:
515 case Instruction::Select:
516 case VPInstruction::BranchOnCond:
517 case VPInstruction::BranchOnTwoConds:
518 case VPInstruction::BranchOnCount:
519 case VPInstruction::CalculateTripCountMinusVF:
520 case VPInstruction::CanonicalIVIncrementForPart:
521 case VPInstruction::PtrAdd:
522 case VPInstruction::ExplicitVectorLength:
523 case VPInstruction::AnyOf:
524 case VPInstruction::Not:
525 return true;
526 default:
527 return false;
528 }
529}
530
531Value *VPInstruction::generate(VPTransformState &State) {
532 IRBuilderBase &Builder = State.Builder;
533
534 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
535 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
536 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
537 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
538 auto *Res =
539 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
540 if (auto *I = dyn_cast<Instruction>(Val: Res))
541 applyFlags(I&: *I);
542 return Res;
543 }
544
545 switch (getOpcode()) {
546 case VPInstruction::Not: {
547 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
548 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
549 return Builder.CreateNot(V: A, Name);
550 }
551 case Instruction::ExtractElement: {
552 assert(State.VF.isVector() && "Only extract elements from vectors");
553 if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1)))
554 return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue()));
555 Value *Vec = State.get(Def: getOperand(N: 0));
556 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
557 return Builder.CreateExtractElement(Vec, Idx, Name);
558 }
559 case Instruction::Freeze: {
560 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
561 return Builder.CreateFreeze(V: Op, Name);
562 }
563 case Instruction::FCmp:
564 case Instruction::ICmp: {
565 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
566 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
567 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
568 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
569 }
570 case Instruction::PHI: {
571 llvm_unreachable("should be handled by VPPhi::execute");
572 }
573 case Instruction::Select: {
574 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
575 Value *Cond =
576 State.get(Def: getOperand(N: 0),
577 IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0)));
578 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
579 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
580 return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name);
581 }
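  // ActiveLaneMask: lane i of the result is (VIVElem0 + i) <u ScalarTC. For
  // vector VFs this lowers to llvm.get.active.lane.mask; roughly, assuming
  // i64 operands and VF 4:
  //   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %iv0,
  //                                                             i64 %tc)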
582 case VPInstruction::ActiveLaneMask: {
583 // Get first lane of vector induction variable.
584 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
585 // Get the original loop tripcount.
586 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
587
588 // If this part of the active lane mask is scalar, generate the CMP directly
589 // to avoid unnecessary extracts.
590 if (State.VF.isScalar())
591 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
592 Name);
593
594 ElementCount EC = State.VF.multiplyCoefficientBy(
595 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
596 auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC);
597 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
598 Types: {PredTy, ScalarTC->getType()},
599 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
600 }
601 case VPInstruction::FirstOrderRecurrenceSplice: {
602 // Generate code to combine the previous and current values in vector v3.
603 //
604 // vector.ph:
605 // v_init = vector(..., ..., ..., a[-1])
606 // br vector.body
607 //
608 // vector.body
609 // i = phi [0, vector.ph], [i+4, vector.body]
610 // v1 = phi [v_init, vector.ph], [v2, vector.body]
611 // v2 = a[i, i+1, i+2, i+3];
612 // v3 = vector(v1(3), v2(0, 1, 2))
613
614 auto *V1 = State.get(Def: getOperand(N: 0));
615 if (!V1->getType()->isVectorTy())
616 return V1;
617 Value *V2 = State.get(Def: getOperand(N: 1));
618 return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name);
619 }
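  // CalculateTripCountMinusVF computes the saturating subtraction
  // max(TC - VF * UF, 0), i.e. select (TC >u VF*UF), TC - VF*UF, 0.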
620 case VPInstruction::CalculateTripCountMinusVF: {
621 unsigned UF = getParent()->getPlan()->getUF();
622 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
623 Value *Step = createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: UF);
624 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step);
625 Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step);
626 Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType());
627 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
628 }
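  // ExplicitVectorLength lowers to the get.vector.length intrinsic. A sketch
  // of the emitted IR, assuming an i64 AVL and a scalable VF of vscale x 4:
  //   %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl,
  //                                                            i32 4, i1 true)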
629 case VPInstruction::ExplicitVectorLength: {
630 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
631 // be outside of the main loop.
632 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
633 // Compute EVL
634 assert(AVL->getType()->isIntegerTy() &&
635 "Requested vector length should be an integer.");
636
637 assert(State.VF.isScalable() && "Expected scalable vector factor.");
638 Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue());
639
640 Value *EVL = Builder.CreateIntrinsic(
641 RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
642 Args: {AVL, VFArg, Builder.getTrue()});
643 return EVL;
644 }
645 case VPInstruction::CanonicalIVIncrementForPart: {
646 unsigned Part = getUnrollPart(U: *this);
647 auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
648 assert(Part != 0 && "Must have a positive part");
649 // The canonical IV is incremented by the vectorization factor (num of
650 // SIMD elements) times the unroll part.
651 Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part);
652 return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(),
653 HasNSW: hasNoSignedWrap());
654 }
655 case VPInstruction::BranchOnCond: {
656 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
657 // Replace the temporary unreachable terminator with a new conditional
658 // branch, hooking it up to backward destination for latch blocks now, and
659 // to forward destination(s) later when they are created.
660 // Second successor may be backwards - iff it is already in VPBB2IRBB.
661 VPBasicBlock *SecondVPSucc =
662 cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]);
663 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
664 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
665 auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
666 // First successor is always forward, reset it to nullptr.
667 Br->setSuccessor(idx: 0, NewSucc: nullptr);
668 IRBB->getTerminator()->eraseFromParent();
669 applyMetadata(I&: *Br);
670 return Br;
671 }
672 case VPInstruction::Broadcast: {
673 return Builder.CreateVectorSplat(
674 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
675 }
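  // BuildStructVector widens a struct of scalars into a struct of vectors;
  // e.g. for VF = 4, operands of type {i32, float} produce a
  // {<4 x i32>, <4 x float>} result, with operand I filling lane I of every
  // member vector.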
676 case VPInstruction::BuildStructVector: {
677 // For struct types, we need to build a new 'wide' struct type, where each
678 // element is widened, i.e., we create a struct of vectors.
679 auto *StructTy =
680 cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)));
681 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
682 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
683 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
684 FieldIndex++) {
685 Value *ScalarValue =
686 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
687 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
688 VectorValue =
689 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
690 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
691 }
692 }
693 return Res;
694 }
695 case VPInstruction::BuildVector: {
696 auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
697 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
698 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
699 for (const auto &[Idx, Op] : enumerate(First: operands()))
700 Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
701 Idx: Builder.getInt32(C: Idx));
702 return Res;
703 }
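  // ReductionStartVector splats the neutral value (operand 1) and inserts the
  // start value (operand 0) into lane 0; e.g. for a float add reduction with
  // start %s and VF 4 this yields <%s, -0.0, -0.0, -0.0>, assuming -0.0 is
  // the neutral value provided by operand 1.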
704 case VPInstruction::ReductionStartVector: {
705 if (State.VF.isScalar())
706 return State.get(Def: getOperand(N: 0), IsScalar: true);
707 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
708 Builder.setFastMathFlags(getFastMathFlags());
709 // If this start vector is scaled then it should produce a vector with fewer
710 // elements than the VF.
711 ElementCount VF = State.VF.divideCoefficientBy(
712 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
713 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
714 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
715 Idx: Builder.getInt32(C: 0));
716 }
717 case VPInstruction::ComputeAnyOfResult: {
718 Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
719 Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
720 Value *ReducedResult = State.get(Def: getOperand(N: 2));
721 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
722 ReducedResult =
723 Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)),
724 RHS: ReducedResult, Name: "bin.rdx");
725 // If any predicate is true it means that we want to select the new value.
726 if (ReducedResult->getType()->isVectorTy())
727 ReducedResult = Builder.CreateOrReduce(Src: ReducedResult);
728 // The compares in the loop may yield poison, which propagates through the
729 // bitwise ORs. Freeze it here before the condition is used.
730 ReducedResult = Builder.CreateFreeze(V: ReducedResult);
731 return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select");
732 }
733 case VPInstruction::ComputeReductionResult: {
734 RecurKind RK = getRecurKind();
735 bool IsOrdered = isReductionOrdered();
736 bool IsInLoop = isReductionInLoop();
737 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
738 "FindIV should use min/max reduction kinds");
739
740 // The recipe may have multiple operands to be reduced together.
741 unsigned NumOperandsToReduce = getNumOperands();
742 VectorParts RdxParts(NumOperandsToReduce);
743 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
744 RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop);
745
746 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
747 if (hasFastMathFlags())
748 Builder.setFastMathFlags(getFastMathFlags());
749
750 // Reduce multiple operands into one.
751 Value *ReducedPartRdx = RdxParts[0];
752 if (IsOrdered) {
753 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
754 } else {
755 // Floating-point operations should have some FMF to enable the reduction.
756 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
757 Value *RdxPart = RdxParts[Part];
758 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
759 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
760 else {
          // For sub-recurrences, each part's reduction variable is already
          // negative, so we need to do: reduce.add(-acc_uf0 + -acc_uf1).
763 Instruction::BinaryOps Opcode =
764 RK == RecurKind::Sub
765 ? Instruction::Add
766 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK);
767 ReducedPartRdx =
768 Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
769 }
770 }
771 }
772
773 // Create the reduction after the loop. Note that inloop reductions create
774 // the target reduction in the loop using a Reduction recipe.
775 if (State.VF.isVector() && !IsInLoop) {
776 // TODO: Support in-order reductions based on the recurrence descriptor.
777 // All ops in the reduction inherit fast-math-flags from the recurrence
778 // descriptor.
779 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
780 }
781
782 return ReducedPartRdx;
783 }
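  // ExtractLastLane extracts lane VF - 1 and ExtractPenultimateElement lane
  // VF - 2, i.e. offsets 1 and 2 from the end of the vector.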
784 case VPInstruction::ExtractLastLane:
785 case VPInstruction::ExtractPenultimateElement: {
786 unsigned Offset =
787 getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
788 Value *Res;
789 if (State.VF.isVector()) {
790 assert(Offset <= State.VF.getKnownMinValue() &&
791 "invalid offset to extract from");
792 // Extract lane VF - Offset from the operand.
793 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
794 } else {
795 // TODO: Remove ExtractLastLane for scalar VFs.
796 assert(Offset <= 1 && "invalid offset to extract from");
797 Res = State.get(Def: getOperand(N: 0));
798 }
799 if (isa<ExtractElementInst>(Val: Res))
800 Res->setName(Name);
801 return Res;
802 }
803 case VPInstruction::LogicalAnd: {
804 Value *A = State.get(Def: getOperand(N: 0));
805 Value *B = State.get(Def: getOperand(N: 1));
806 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
807 }
808 case VPInstruction::PtrAdd: {
809 assert(vputils::onlyFirstLaneUsed(this) &&
810 "can only generate first lane for PtrAdd");
811 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
812 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
813 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
814 }
815 case VPInstruction::WidePtrAdd: {
816 Value *Ptr =
817 State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0)));
818 Value *Addend = State.get(Def: getOperand(N: 1));
819 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
820 }
821 case VPInstruction::AnyOf: {
822 Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0)));
823 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
824 Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op)));
825 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res);
826 }
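  // ExtractLane extracts a dynamic lane from multiple per-part vector
  // operands: a chain of selects picks the part containing the requested
  // lane, with the lane index rebased to the start of that part.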
827 case VPInstruction::ExtractLane: {
828 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
829 "simplified to ExtractElement.");
830 Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true);
831 Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
832 Value *Res = nullptr;
833 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
834
835 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
836 Value *VectorStart =
837 Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1));
838 Value *VectorIdx = Idx == 1
839 ? LaneToExtract
840 : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart);
841 Value *Ext = State.VF.isScalar()
842 ? State.get(Def: getOperand(N: Idx))
843 : Builder.CreateExtractElement(
844 Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx);
845 if (Res) {
846 Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart);
847 Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res);
848 } else {
849 Res = Ext;
850 }
851 }
852 return Res;
853 }
854 case VPInstruction::FirstActiveLane: {
855 if (getNumOperands() == 1) {
856 Value *Mask = State.get(Def: getOperand(N: 0));
857 return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask,
858 /*ZeroIsPoison=*/false, Name);
859 }
860 // If there are multiple operands, create a chain of selects to pick the
861 // first operand with an active lane and add the number of lanes of the
862 // preceding operands.
863 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt64Ty(), VF: State.VF);
864 unsigned LastOpIdx = getNumOperands() - 1;
865 Value *Res = nullptr;
866 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
867 Value *TrailingZeros =
868 State.VF.isScalar()
869 ? Builder.CreateZExt(
870 V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)),
871 RHS: Builder.getFalse()),
872 DestTy: Builder.getInt64Ty())
873 : Builder.CreateCountTrailingZeroElems(
874 ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)),
875 /*ZeroIsPoison=*/false, Name);
876 Value *Current = Builder.CreateAdd(
877 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros);
878 if (Res) {
879 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
880 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
881 } else {
882 Res = Current;
883 }
884 }
885
886 return Res;
887 }
888 case VPInstruction::ResumeForEpilogue:
889 return State.get(Def: getOperand(N: 0), IsScalar: true);
890 case VPInstruction::Reverse:
891 return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse");
892 case VPInstruction::ExtractLastActive: {
893 Value *Data = State.get(Def: getOperand(N: 0));
894 Value *Mask = State.get(Def: getOperand(N: 1));
895 Value *Default = State.get(Def: getOperand(N: 2), /*IsScalar=*/true);
896 Type *VTy = Data->getType();
897 return Builder.CreateIntrinsic(
898 ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy},
899 Args: {Data, Mask, Default});
900 }
901 default:
902 llvm_unreachable("Unsupported opcode for instruction");
903 }
904}
905
906InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
907 unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
908 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
909 Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy;
910 switch (Opcode) {
911 case Instruction::FNeg:
912 return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind);
913 case Instruction::UDiv:
914 case Instruction::SDiv:
915 case Instruction::SRem:
916 case Instruction::URem:
917 case Instruction::Add:
918 case Instruction::FAdd:
919 case Instruction::Sub:
920 case Instruction::FSub:
921 case Instruction::Mul:
922 case Instruction::FMul:
923 case Instruction::FDiv:
924 case Instruction::FRem:
925 case Instruction::Shl:
926 case Instruction::LShr:
927 case Instruction::AShr:
928 case Instruction::And:
929 case Instruction::Or:
930 case Instruction::Xor: {
931 TargetTransformInfo::OperandValueInfo RHSInfo = {
932 .Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None};
933
934 if (VF.isVector()) {
935 // Certain instructions can be cheaper to vectorize if they have a
936 // constant second vector operand. One example of this are shifts on x86.
937 VPValue *RHS = getOperand(N: 1);
938 RHSInfo = Ctx.getOperandInfo(V: RHS);
939
940 if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
941 getOperand(N: 1)->isDefinedOutsideLoopRegions())
942 RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
943 }
944
945 Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
946 SmallVector<const Value *, 4> Operands;
947 if (CtxI)
948 Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
949 return Ctx.TTI.getArithmeticInstrCost(
950 Opcode, Ty: ResultTy, CostKind: Ctx.CostKind,
951 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
952 Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
953 }
954 case Instruction::Freeze:
955 // This opcode is unknown. Assume that it is the same as 'mul'.
956 return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy,
957 CostKind: Ctx.CostKind);
958 case Instruction::ExtractValue:
959 return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
960 CostKind: Ctx.CostKind);
961 case Instruction::ICmp:
962 case Instruction::FCmp: {
963 Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
964 Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy;
965 Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
966 return Ctx.TTI.getCmpSelInstrCost(
967 Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(),
968 CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
969 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
970 }
971 case Instruction::BitCast: {
972 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
973 if (ScalarTy->isPointerTy())
974 return 0;
975 [[fallthrough]];
976 }
977 case Instruction::SExt:
978 case Instruction::ZExt:
979 case Instruction::FPToUI:
980 case Instruction::FPToSI:
981 case Instruction::FPExt:
982 case Instruction::PtrToInt:
983 case Instruction::PtrToAddr:
984 case Instruction::IntToPtr:
985 case Instruction::SIToFP:
986 case Instruction::UIToFP:
987 case Instruction::Trunc:
988 case Instruction::FPTrunc:
989 case Instruction::AddrSpaceCast: {
990 // Computes the CastContextHint from a recipe that may access memory.
991 auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
992 if (isa<VPInterleaveBase>(Val: R))
993 return TTI::CastContextHint::Interleave;
994 if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) {
995 // Only compute CCH for memory operations, matching the legacy model
996 // which only considers loads/stores for cast context hints.
997 auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue());
998 if (!isa<LoadInst, StoreInst>(Val: UI))
999 return TTI::CastContextHint::None;
1000 return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
1001 : TTI::CastContextHint::Normal;
1002 }
1003 const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
1004 if (WidenMemoryRecipe == nullptr)
1005 return TTI::CastContextHint::None;
1006 if (VF.isScalar())
1007 return TTI::CastContextHint::Normal;
1008 if (!WidenMemoryRecipe->isConsecutive())
1009 return TTI::CastContextHint::GatherScatter;
1010 if (WidenMemoryRecipe->isReverse())
1011 return TTI::CastContextHint::Reversed;
1012 if (WidenMemoryRecipe->isMasked())
1013 return TTI::CastContextHint::Masked;
1014 return TTI::CastContextHint::Normal;
1015 };
1016
1017 VPValue *Operand = getOperand(N: 0);
1018 TTI::CastContextHint CCH = TTI::CastContextHint::None;
1019 // For Trunc/FPTrunc, get the context from the only user.
1020 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
1021 auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
1022 if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
1023 return nullptr;
1024 return dyn_cast<VPRecipeBase>(Val: *R->user_begin());
1025 };
1026 if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
1027 if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue())))
1028 Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe));
1029 if (Recipe)
1030 CCH = ComputeCCH(Recipe);
1031 }
1032 }
1033 // For Z/Sext, get the context from the operand.
1034 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
1035 Opcode == Instruction::FPExt) {
1036 if (auto *Recipe = Operand->getDefiningRecipe()) {
1037 VPValue *ReverseOp;
1038 if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp))))
1039 Recipe = ReverseOp->getDefiningRecipe();
1040 if (Recipe)
1041 CCH = ComputeCCH(Recipe);
1042 }
1043 }
1044
1045 auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand);
1046 Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy;
1047 // Arm TTI will use the underlying instruction to determine the cost.
1048 return Ctx.TTI.getCastInstrCost(
1049 Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
1050 I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
1051 }
1052 case Instruction::Select: {
1053 SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue());
1054 bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
1055 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1056
1057 VPValue *Op0, *Op1;
1058 bool IsLogicalAnd =
1059 match(V: this, P: m_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
1060 bool IsLogicalOr = match(V: this, P: m_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
1061 // Also match the inverted forms:
1062 // select x, false, y --> !x & y (still AND)
1063 // select x, y, true --> !x | y (still OR)
1064 IsLogicalAnd |=
1065 match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1)));
1066 IsLogicalOr |=
1067 match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True()));
1068
1069 if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
1070 (IsLogicalAnd || IsLogicalOr)) {
1071 // select x, y, false --> x & y
1072 // select x, true, y --> x | y
1073 const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
1074 const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);
1075
1076 SmallVector<const Value *, 2> Operands;
1077 if (SI && all_of(Range: operands(),
1078 P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
1079 append_range(C&: Operands, R: SI->operands());
1080 return Ctx.TTI.getArithmeticInstrCost(
1081 Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy,
1082 CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
1083 }
1084
1085 Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1086 if (!IsScalarCond)
1087 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
1088
1089 llvm::CmpPredicate Pred;
1090 if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())))
1091 if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0)))
1092 if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue()))
1093 Pred = Cmp->getPredicate();
1094 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1095 return Ctx.TTI.getCmpSelInstrCost(
1096 Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
1097 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
1098 }
1099 }
1100 llvm_unreachable("called for unsupported opcode");
1101}
1102
1103InstructionCost VPInstruction::computeCost(ElementCount VF,
1104 VPCostContext &Ctx) const {
1105 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
1106 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1107 // TODO: Compute cost for VPInstructions without underlying values once
1108 // the legacy cost model has been retired.
1109 return 0;
1110 }
1111
1112 assert(!doesGeneratePerAllLanes() &&
1113 "Should only generate a vector value or single scalar, not scalars "
1114 "for all lanes.");
1115 return getCostForRecipeWithOpcode(
1116 Opcode: getOpcode(),
1117 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1118 }
1119
1120 switch (getOpcode()) {
1121 case Instruction::Select: {
1122 llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1123 match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()));
1124 auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1125 auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1));
1126 if (!vputils::onlyFirstLaneUsed(Def: this)) {
1127 CondTy = toVectorTy(Scalar: CondTy, EC: VF);
1128 VecTy = toVectorTy(Scalar: VecTy, EC: VF);
1129 }
1130 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred,
1131 CostKind: Ctx.CostKind);
1132 }
1133 case Instruction::ExtractElement:
1134 case VPInstruction::ExtractLane: {
1135 if (VF.isScalar()) {
1136 // ExtractLane with VF=1 takes care of handling extracting across multiple
1137 // parts.
1138 return 0;
1139 }
1140
1141 // Add on the cost of extracting the element.
1142 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1143 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
1144 CostKind: Ctx.CostKind);
1145 }
1146 case VPInstruction::AnyOf: {
1147 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1148 return Ctx.TTI.getArithmeticReductionCost(
1149 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
1150 }
1151 case VPInstruction::FirstActiveLane: {
1152 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1153 if (VF.isScalar())
1154 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1155 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1156 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1157 // Calculate the cost of determining the lane index.
1158 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1159 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1160 Type::getInt64Ty(C&: Ctx.LLVMCtx),
1161 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1162 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1163 }
1164 case VPInstruction::LastActiveLane: {
1165 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1166 if (VF.isScalar())
1167 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1168 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1169 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1170 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1171 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1172 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1173 Type::getInt64Ty(C&: Ctx.LLVMCtx),
1174 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1175 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1176 // Add cost of NOT operation on the predicate.
1177 Cost += Ctx.TTI.getArithmeticInstrCost(
1178 Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind,
1179 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1180 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
1181 .Properties: TargetTransformInfo::OP_None});
1182 // Add cost of SUB operation on the index.
1183 Cost += Ctx.TTI.getArithmeticInstrCost(
1184 Opcode: Instruction::Sub, Ty: Type::getInt64Ty(C&: Ctx.LLVMCtx), CostKind: Ctx.CostKind);
1185 return Cost;
1186 }
1187 case VPInstruction::ExtractLastActive: {
1188 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1189 Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1190 Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1191 IntrinsicCostAttributes ICA(
1192 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1193 {VecTy, MaskTy, ScalarTy});
1194 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind);
1195 }
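  // FirstOrderRecurrenceSplice is costed as a splice shuffle; e.g. for VF = 4
  // the mask is <3, 4, 5, 6>, taking the last lane of the first vector
  // followed by the first three lanes of the second.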
1196 case VPInstruction::FirstOrderRecurrenceSplice: {
1197 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1198 SmallVector<int> Mask(VF.getKnownMinValue());
1199 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
1200 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1201
1202 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
1203 DstTy: cast<VectorType>(Val: VectorTy),
1204 SrcTy: cast<VectorType>(Val: VectorTy), Mask,
1205 CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1);
1206 }
1207 case VPInstruction::ActiveLaneMask: {
1208 Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1209 unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue();
1210 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier);
1211 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1212 {ArgTy, ArgTy});
1213 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1214 }
1215 case VPInstruction::ExplicitVectorLength: {
1216 Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1217 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
1218 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
1219 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1220 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1221 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1222 }
1223 case VPInstruction::Reverse: {
1224 assert(VF.isVector() && "Reverse operation must be vector type");
1225 auto *VectorTy = cast<VectorType>(
1226 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
1227 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
1228 SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind,
1229 /*Index=*/0);
1230 }
1231 case VPInstruction::ExtractLastLane: {
1232 // Add on the cost of extracting the element.
1233 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1234 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
1235 Val: VecTy, CostKind: Ctx.CostKind, Index: 0);
1236 }
1237 case VPInstruction::ExtractPenultimateElement:
1238 if (VF == ElementCount::getScalable(MinVal: 1))
1239 return InstructionCost::getInvalid();
1240 [[fallthrough]];
1241 default:
    // TODO: Compute cost for other VPInstructions once the legacy cost model
    // has been retired.
    assert(!getUnderlyingValue() &&
           "unexpected VPInstruction with underlying value");
1246 return 0;
1247 }
1248}
1249
1250bool VPInstruction::isVectorToScalar() const {
1251 return getOpcode() == VPInstruction::ExtractLastLane ||
1252 getOpcode() == VPInstruction::ExtractPenultimateElement ||
1253 getOpcode() == Instruction::ExtractElement ||
1254 getOpcode() == VPInstruction::ExtractLane ||
1255 getOpcode() == VPInstruction::FirstActiveLane ||
1256 getOpcode() == VPInstruction::LastActiveLane ||
1257 getOpcode() == VPInstruction::ComputeAnyOfResult ||
1258 getOpcode() == VPInstruction::ExtractLastActive ||
1259 getOpcode() == VPInstruction::ComputeReductionResult ||
1260 getOpcode() == VPInstruction::AnyOf;
1261}
1262
1263bool VPInstruction::isSingleScalar() const {
1264 switch (getOpcode()) {
1265 case Instruction::PHI:
1266 case VPInstruction::ExplicitVectorLength:
1267 case VPInstruction::ResumeForEpilogue:
1268 case VPInstruction::VScale:
1269 return true;
1270 default:
1271 return isScalarCast();
1272 }
1273}
1274
1275void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Lane && "VPInstruction executing a Lane");
1277 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
1278 assert(flagsValidForOpcode(getOpcode()) &&
1279 "Set flags not supported for the provided opcode");
1280 assert(hasRequiredFlagsForOpcode(getOpcode()) &&
1281 "Opcode requires specific flags to be set");
1282 if (hasFastMathFlags())
1283 State.Builder.setFastMathFlags(getFastMathFlags());
1284 Value *GeneratedValue = generate(State);
1285 if (!hasResult())
1286 return;
1287 assert(GeneratedValue && "generate must produce a value");
1288 bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
1289 (vputils::onlyFirstLaneUsed(Def: this) ||
1290 isVectorToScalar() || isSingleScalar());
1291 assert((((GeneratedValue->getType()->isVectorTy() ||
1292 GeneratedValue->getType()->isStructTy()) ==
1293 !GeneratesPerFirstLaneOnly) ||
1294 State.VF.isScalar()) &&
1295 "scalar value but not only first lane defined");
1296 State.set(Def: this, V: GeneratedValue,
1297 /*IsScalar*/ GeneratesPerFirstLaneOnly);
1298}
1299
1300bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
1301 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1302 return false;
1303 switch (getOpcode()) {
1304 case Instruction::GetElementPtr:
1305 case Instruction::ExtractElement:
1306 case Instruction::Freeze:
1307 case Instruction::FCmp:
1308 case Instruction::ICmp:
1309 case Instruction::Select:
1310 case Instruction::PHI:
1311 case VPInstruction::AnyOf:
1312 case VPInstruction::BranchOnCond:
1313 case VPInstruction::BranchOnTwoConds:
1314 case VPInstruction::BranchOnCount:
1315 case VPInstruction::Broadcast:
1316 case VPInstruction::BuildStructVector:
1317 case VPInstruction::BuildVector:
1318 case VPInstruction::CalculateTripCountMinusVF:
1319 case VPInstruction::CanonicalIVIncrementForPart:
1320 case VPInstruction::ExtractLane:
1321 case VPInstruction::ExtractLastLane:
1322 case VPInstruction::ExtractLastPart:
1323 case VPInstruction::ExtractPenultimateElement:
1324 case VPInstruction::ActiveLaneMask:
1325 case VPInstruction::ExplicitVectorLength:
1326 case VPInstruction::FirstActiveLane:
1327 case VPInstruction::LastActiveLane:
1328 case VPInstruction::ExtractLastActive:
1329 case VPInstruction::FirstOrderRecurrenceSplice:
1330 case VPInstruction::LogicalAnd:
1331 case VPInstruction::Not:
1332 case VPInstruction::PtrAdd:
1333 case VPInstruction::WideIVStep:
1334 case VPInstruction::WidePtrAdd:
1335 case VPInstruction::StepVector:
1336 case VPInstruction::ReductionStartVector:
1337 case VPInstruction::Reverse:
1338 case VPInstruction::VScale:
1339 case VPInstruction::Unpack:
1340 return false;
1341 default:
1342 return true;
1343 }
1344}
1345
1346bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
1347 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1348 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1349 return vputils::onlyFirstLaneUsed(Def: this);
1350
1351 switch (getOpcode()) {
1352 default:
1353 return false;
1354 case Instruction::ExtractElement:
1355 return Op == getOperand(N: 1);
1356 case Instruction::PHI:
1357 return true;
1358 case Instruction::FCmp:
1359 case Instruction::ICmp:
1360 case Instruction::Select:
1361 case Instruction::Or:
1362 case Instruction::Freeze:
1363 case VPInstruction::Not:
1364 // TODO: Cover additional opcodes.
1365 return vputils::onlyFirstLaneUsed(Def: this);
1366 case VPInstruction::ActiveLaneMask:
1367 case VPInstruction::ExplicitVectorLength:
1368 case VPInstruction::CalculateTripCountMinusVF:
1369 case VPInstruction::CanonicalIVIncrementForPart:
1370 case VPInstruction::BranchOnCount:
1371 case VPInstruction::BranchOnCond:
1372 case VPInstruction::Broadcast:
1373 case VPInstruction::ReductionStartVector:
1374 return true;
1375 case VPInstruction::BuildStructVector:
1376 case VPInstruction::BuildVector:
    // Before replicating by VF, Build(Struct)Vector has a single operand and
    // uses all of its lanes; after its operands have been replicated, only
    // the first lane of each operand is used.
1380 return getNumOperands() > 1;
1381 case VPInstruction::PtrAdd:
1382 return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
1383 case VPInstruction::WidePtrAdd:
1384 // WidePtrAdd supports scalar and vector base addresses.
1385 return false;
1386 case VPInstruction::ComputeAnyOfResult:
1387 return Op == getOperand(N: 0) || Op == getOperand(N: 1);
1388 case VPInstruction::ExtractLane:
1389 return Op == getOperand(N: 0);
1390  }
1391 llvm_unreachable("switch should return");
1392}
1393
1394bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
1395 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1396 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1397 return vputils::onlyFirstPartUsed(Def: this);
1398
1399 switch (getOpcode()) {
1400 default:
1401 return false;
1402 case Instruction::FCmp:
1403 case Instruction::ICmp:
1404 case Instruction::Select:
1405 return vputils::onlyFirstPartUsed(Def: this);
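  // These recipes only ever use the first unrolled part of their operands.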
1406 case VPInstruction::BranchOnCount:
1407 case VPInstruction::BranchOnCond:
1408 case VPInstruction::BranchOnTwoConds:
1409 case VPInstruction::CanonicalIVIncrementForPart:
1410 return true;
1411  }
1412 llvm_unreachable("switch should return");
1413}
1414
1415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1416void VPInstruction::dump() const {
1417 VPSlotTracker SlotTracker(getParent()->getPlan());
1418 printRecipe(dbgs(), "", SlotTracker);
1419}
1420
1421void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
1422 VPSlotTracker &SlotTracker) const {
1423 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1424
1425 if (hasResult()) {
1426 printAsOperand(O, SlotTracker);
1427 O << " = ";
1428 }
1429
1430 switch (getOpcode()) {
1431 case VPInstruction::Not:
1432 O << "not";
1433 break;
1434 case VPInstruction::SLPLoad:
1435 O << "combined load";
1436 break;
1437 case VPInstruction::SLPStore:
1438 O << "combined store";
1439 break;
1440 case VPInstruction::ActiveLaneMask:
1441 O << "active lane mask";
1442 break;
1443 case VPInstruction::ExplicitVectorLength:
1444 O << "EXPLICIT-VECTOR-LENGTH";
1445 break;
1446 case VPInstruction::FirstOrderRecurrenceSplice:
1447 O << "first-order splice";
1448 break;
1449 case VPInstruction::BranchOnCond:
1450 O << "branch-on-cond";
1451 break;
1452 case VPInstruction::BranchOnTwoConds:
1453 O << "branch-on-two-conds";
1454 break;
1455 case VPInstruction::CalculateTripCountMinusVF:
1456 O << "TC > VF ? TC - VF : 0";
1457 break;
1458 case VPInstruction::CanonicalIVIncrementForPart:
1459 O << "VF * Part +";
1460 break;
1461 case VPInstruction::BranchOnCount:
1462 O << "branch-on-count";
1463 break;
1464 case VPInstruction::Broadcast:
1465 O << "broadcast";
1466 break;
1467 case VPInstruction::BuildStructVector:
1468 O << "buildstructvector";
1469 break;
1470 case VPInstruction::BuildVector:
1471 O << "buildvector";
1472 break;
1473 case VPInstruction::ExtractLane:
1474 O << "extract-lane";
1475 break;
1476 case VPInstruction::ExtractLastLane:
1477 O << "extract-last-lane";
1478 break;
1479 case VPInstruction::ExtractLastPart:
1480 O << "extract-last-part";
1481 break;
1482 case VPInstruction::ExtractPenultimateElement:
1483 O << "extract-penultimate-element";
1484 break;
1485 case VPInstruction::ComputeAnyOfResult:
1486 O << "compute-anyof-result";
1487 break;
1488 case VPInstruction::ComputeReductionResult:
1489 O << "compute-reduction-result";
1490 break;
1491 case VPInstruction::LogicalAnd:
1492 O << "logical-and";
1493 break;
1494 case VPInstruction::PtrAdd:
1495 O << "ptradd";
1496 break;
1497 case VPInstruction::WidePtrAdd:
1498 O << "wide-ptradd";
1499 break;
1500 case VPInstruction::AnyOf:
1501 O << "any-of";
1502 break;
1503 case VPInstruction::FirstActiveLane:
1504 O << "first-active-lane";
1505 break;
1506 case VPInstruction::LastActiveLane:
1507 O << "last-active-lane";
1508 break;
1509 case VPInstruction::ReductionStartVector:
1510 O << "reduction-start-vector";
1511 break;
1512 case VPInstruction::ResumeForEpilogue:
1513 O << "resume-for-epilogue";
1514 break;
1515 case VPInstruction::Reverse:
1516 O << "reverse";
1517 break;
1518 case VPInstruction::Unpack:
1519 O << "unpack";
1520 break;
1521 case VPInstruction::ExtractLastActive:
1522 O << "extract-last-active";
1523 break;
1524 default:
1525 O << Instruction::getOpcodeName(getOpcode());
1526 }
1527
1528 printFlags(O);
1529 printOperands(O, SlotTracker);
1530}
1531#endif
1532
1533void VPInstructionWithType::execute(VPTransformState &State) {
1534 State.setDebugLocFrom(getDebugLoc());
1535 if (isScalarCast()) {
1536 Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
1537 Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
1538 V: Op, DestTy: ResultTy);
1539 State.set(Def: this, V: Cast, Lane: VPLane(0));
1540 return;
1541 }
1542 switch (getOpcode()) {
1543 case VPInstruction::StepVector: {
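    // Materialize the sequence <0, 1, ..., VF-1> of type ResultTy, e.g.
    // <i32 0, i32 1, i32 2, i32 3> for a fixed VF of 4.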
1544 Value *StepVector =
1545 State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
1546 State.set(Def: this, V: StepVector);
1547 break;
1548 }
1549 case VPInstruction::VScale: {
1550 Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
1551 State.set(Def: this, V: VScale, IsScalar: true);
1552 break;
1553 }
1554
1555 default:
1556 llvm_unreachable("opcode not implemented yet");
1557 }
1558}
1559
1560#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1561void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
1562 VPSlotTracker &SlotTracker) const {
1563 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1564 printAsOperand(O, SlotTracker);
1565 O << " = ";
1566
1567 switch (getOpcode()) {
1568 case VPInstruction::WideIVStep:
1569 O << "wide-iv-step ";
1570 printOperands(O, SlotTracker);
1571 break;
1572 case VPInstruction::StepVector:
1573 O << "step-vector " << *ResultTy;
1574 break;
1575 case VPInstruction::VScale:
1576 O << "vscale " << *ResultTy;
1577 break;
1578 default:
1579 assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
1580 O << Instruction::getOpcodeName(getOpcode()) << " ";
1581 printOperands(O, SlotTracker);
1582 O << " to " << *ResultTy;
1583 }
1584}
1585#endif
1586
1587void VPPhi::execute(VPTransformState &State) {
1588 State.setDebugLocFrom(getDebugLoc());
1589 PHINode *NewPhi = State.Builder.CreatePHI(
1590 Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
1591 unsigned NumIncoming = getNumIncoming();
1592 if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
1593 // TODO: Fixup all incoming values of header phis once recipes defining them
1594 // are introduced.
1595 NumIncoming = 1;
1596 }
1597 for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
1598 Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
1599 BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
1600 NewPhi->addIncoming(V: IncV, BB: PredBB);
1601 }
1602 State.set(Def: this, V: NewPhi, Lane: VPLane(0));
1603}
1604
1605#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1606void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1607 VPSlotTracker &SlotTracker) const {
1608 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1609 printAsOperand(O, SlotTracker);
1610 O << " = phi ";
1611 printPhiOperands(O, SlotTracker);
1612}
1613#endif
1614
1615VPIRInstruction *VPIRInstruction::create(Instruction &I) {
1616 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1617 return new VPIRPhi(*Phi);
1618 return new VPIRInstruction(I);
1619}
1620
1621void VPIRInstruction::execute(VPTransformState &State) {
1622 assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
1623 "PHINodes must be handled by VPIRPhi");
1624 // Advance the insert point after the wrapped IR instruction. This allows
1625 // interleaving VPIRInstructions and other recipes.
1626 State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
1627}
1628
1629InstructionCost VPIRInstruction::computeCost(ElementCount VF,
1630 VPCostContext &Ctx) const {
1631 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1632 // hence it does not contribute to the cost-modeling for the VPlan.
1633 return 0;
1634}
1635
1636void VPIRInstruction::extractLastLaneOfLastPartOfFirstOperand(
1637 VPBuilder &Builder) {
1638 assert(isa<PHINode>(getInstruction()) &&
1639 "can only update exiting operands to phi nodes");
1640 assert(getNumOperands() > 0 && "must have at least one operand");
1641 VPValue *Exiting = getOperand(N: 0);
1642 if (isa<VPIRValue>(Val: Exiting))
1643 return;
1644
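  // The scalar live-out feeding the phi is produced by the final vector
  // iteration: narrow to the last unrolled part, then take its last lane.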
1645 Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastPart, Operands: Exiting);
1646 Exiting = Builder.createNaryOp(Opcode: VPInstruction::ExtractLastLane, Operands: Exiting);
1647 setOperand(I: 0, New: Exiting);
1648}
1649
1650#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1651void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
1652 VPSlotTracker &SlotTracker) const {
1653 O << Indent << "IR " << I;
1654}
1655#endif
1656
1657void VPIRPhi::execute(VPTransformState &State) {
1658 PHINode *Phi = &getIRPhi();
1659 for (const auto &[Idx, Op] : enumerate(First: operands())) {
1660 VPValue *ExitValue = Op;
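    // Uniform (single-scalar) values are identical across lanes, so lane 0
    // suffices; otherwise the live-out is the value of the last vector lane.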
1661 auto Lane = vputils::isSingleScalar(VPV: ExitValue)
1662 ? VPLane::getFirstLane()
1663 : VPLane::getLastLaneForVF(VF: State.VF);
1664 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1665 auto *PredVPBB = Pred->getExitingBasicBlock();
1666 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1667 // Set insertion point in PredBB in case an extract needs to be generated.
1668 // TODO: Model extracts explicitly.
1669 State.Builder.SetInsertPoint(TheBB: PredBB, IP: PredBB->getFirstNonPHIIt());
1670 Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
1671 // If there is no existing block for PredBB in the phi, add a new incoming
1672 // value. Otherwise update the existing incoming value for PredBB.
1673 if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
1674 Phi->addIncoming(V, BB: PredBB);
1675 else
1676 Phi->setIncomingValueForBlock(BB: PredBB, V);
1677 }
1678
1679 // Advance the insert point after the wrapped IR instruction. This allows
1680 // interleaving VPIRInstructions and other recipes.
1681 State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
1682}
1683
1684void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
1685 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1686 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1687 "Number of phi operands must match number of predecessors");
1688 unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
1689 R->removeOperand(Idx: Position);
1690}
1691
1692#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1693void VPPhiAccessors::printPhiOperands(raw_ostream &O,
1694 VPSlotTracker &SlotTracker) const {
1695 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1696 [this, &O, &SlotTracker](auto Op) {
1697 O << "[ ";
1698 Op.value()->printAsOperand(O, SlotTracker);
1699 O << ", ";
1700 getIncomingBlock(Op.index())->printAsOperand(O);
1701 O << " ]";
1702 });
1703}
1704#endif
1705
1706#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1707void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1708 VPSlotTracker &SlotTracker) const {
1709 VPIRInstruction::printRecipe(O, Indent, SlotTracker);
1710
1711 if (getNumOperands() != 0) {
1712 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1713 interleaveComma(incoming_values_and_blocks(), O,
1714 [&O, &SlotTracker](auto Op) {
1715 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1716 O << " from ";
1717 std::get<1>(Op)->printAsOperand(O);
1718 });
1719 O << ")";
1720 }
1721}
1722#endif
1723
1724void VPIRMetadata::applyMetadata(Instruction &I) const {
1725 for (const auto &[Kind, Node] : Metadata)
1726 I.setMetadata(KindID: Kind, Node);
1727}
1728
1729void VPIRMetadata::intersect(const VPIRMetadata &Other) {
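  // Keep only the (kind, node) pairs present in both sets; any metadata that
  // differs between the two recipes is dropped to stay conservative.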
1730 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1731 for (const auto &[KindA, MDA] : Metadata) {
1732 for (const auto &[KindB, MDB] : Other.Metadata) {
1733 if (KindA == KindB && MDA == MDB) {
1734 MetadataIntersection.emplace_back(Args: KindA, Args: MDA);
1735 break;
1736 }
1737 }
1738 }
1739 Metadata = std::move(MetadataIntersection);
1740}
1741
1742#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1743void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
1744 const Module *M = SlotTracker.getModule();
1745 if (Metadata.empty() || !M)
1746 return;
1747
1748 ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
1749 O << " (";
1750 interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
1751 auto [Kind, Node] = KindNodePair;
1752 assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
1753 "Unexpected unnamed metadata kind");
1754 O << "!" << MDNames[Kind] << " ";
1755 Node->printAsOperand(O, M);
1756 });
1757 O << ")";
1758}
1759#endif
1760
1761void VPWidenCallRecipe::execute(VPTransformState &State) {
1762 assert(State.VF.isVector() && "not widening");
1763 assert(Variant != nullptr && "Can't create vector function.");
1764
1765 FunctionType *VFTy = Variant->getFunctionType();
1766  // Collect the call arguments, widened or scalar as the variant requires.
1767 SmallVector<Value *, 4> Args;
1768 for (const auto &I : enumerate(First: args())) {
1769 Value *Arg;
1770 // Some vectorized function variants may also take a scalar argument,
1771 // e.g. linear parameters for pointers. This needs to be the scalar value
1772 // from the start of the respective part when interleaving.
1773 if (!VFTy->getParamType(i: I.index())->isVectorTy())
1774 Arg = State.get(Def: I.value(), Lane: VPLane(0));
1775 else
1776 Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
1777 Args.push_back(Elt: Arg);
1778 }
1779
1780 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
1781 SmallVector<OperandBundleDef, 1> OpBundles;
1782 if (CI)
1783 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
1784
1785 CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
1786 applyFlags(I&: *V);
1787 applyMetadata(I&: *V);
1788 V->setCallingConv(Variant->getCallingConv());
1789
1790 if (!V->getType()->isVoidTy())
1791 State.set(Def: this, V);
1792}
1793
1794InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1795 VPCostContext &Ctx) const {
1796 return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
1797 Tys: Variant->getFunctionType()->params(),
1798 CostKind: Ctx.CostKind);
1799}
1800
1801#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1802void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
1803 VPSlotTracker &SlotTracker) const {
1804 O << Indent << "WIDEN-CALL ";
1805
1806 Function *CalledFn = getCalledScalarFunction();
1807 if (CalledFn->getReturnType()->isVoidTy())
1808 O << "void ";
1809 else {
1810 printAsOperand(O, SlotTracker);
1811 O << " = ";
1812 }
1813
1814 O << "call";
1815 printFlags(O);
1816 O << " @" << CalledFn->getName() << "(";
1817 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1818 Op->printAsOperand(O, SlotTracker);
1819 });
1820 O << ")";
1821
1822 O << " (using library function";
1823 if (Variant->hasName())
1824 O << ": " << Variant->getName();
1825 O << ")";
1826}
1827#endif
1828
1829void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
1830 assert(State.VF.isVector() && "not widening");
1831
1832 SmallVector<Type *, 2> TysForDecl;
1833 // Add return type if intrinsic is overloaded on it.
1834 if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
1835 TTI: State.TTI)) {
1836 Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF);
1837 ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
1838 for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
1839 if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
1840 RetIdx: Idx, TTI: State.TTI))
1841 TysForDecl.push_back(Elt: Ty);
1842 }
1843 }
1844 SmallVector<Value *, 4> Args;
1845 for (const auto &I : enumerate(First: operands())) {
1846 // Some intrinsics have a scalar argument - don't replace it with a
1847 // vector.
1848 Value *Arg;
1849 if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
1850 TTI: State.TTI))
1851 Arg = State.get(Def: I.value(), Lane: VPLane(0));
1852 else
1853 Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
1854 if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
1855 TTI: State.TTI))
1856 TysForDecl.push_back(Elt: Arg->getType());
1857 Args.push_back(Elt: Arg);
1858 }
1859
1860 // Use vector version of the intrinsic.
1861 Module *M = State.Builder.GetInsertBlock()->getModule();
1862 Function *VectorF =
1863 Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
1864 assert(VectorF &&
1865         "Can't retrieve vector intrinsic or vector-predication intrinsic.");
1866
1867 auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
1868 SmallVector<OperandBundleDef, 1> OpBundles;
1869 if (CI)
1870 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
1871
1872 CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);
1873
1874 applyFlags(I&: *V);
1875 applyMetadata(I&: *V);
1876
1877 if (!V->getType()->isVoidTy())
1878 State.set(Def: this, V);
1879}
1880
1881/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
1882static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
1883 ArrayRef<const VPValue *> Operands,
1884 const VPRecipeWithIRFlags &R,
1885 ElementCount VF,
1886 VPCostContext &Ctx) {
1887 // Some backends analyze intrinsic arguments to determine cost. Use the
1888 // underlying value for the operand if it has one. Otherwise try to use the
1889 // operand of the underlying call instruction, if there is one. Otherwise
1890 // clear Arguments.
1891 // TODO: Rework TTI interface to be independent of concrete IR values.
1892 SmallVector<const Value *> Arguments;
1893 for (const auto &[Idx, Op] : enumerate(First&: Operands)) {
1894 auto *V = Op->getUnderlyingValue();
1895 if (!V) {
1896 if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) {
1897 Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
1898 continue;
1899 }
1900 Arguments.clear();
1901 break;
1902 }
1903 Arguments.push_back(Elt: V);
1904 }
1905
1906 Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R);
1907 Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy;
1908 SmallVector<Type *> ParamTys;
1909 for (const VPValue *Op : Operands) {
1910 ParamTys.push_back(Elt: VF.isVector()
1911 ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF)
1912 : Ctx.Types.inferScalarType(V: Op));
1913 }
1914
1915 // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
1916 FastMathFlags FMF =
1917 R.hasFastMathFlags() ? R.getFastMathFlags() : FastMathFlags();
1918 IntrinsicCostAttributes CostAttrs(
1919 ID, RetTy, Arguments, ParamTys, FMF,
1920 dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()),
1921 InstructionCost::getInvalid(), &Ctx.TLI);
1922 return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
1923}
1924
1925InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1926 VPCostContext &Ctx) const {
1927 SmallVector<const VPValue *> ArgOps(operands());
1928 return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx);
1929}
1930
1931StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1932 return Intrinsic::getBaseName(id: VectorIntrinsicID);
1933}
1934
1935bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
1936 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
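  // Op only uses its first lane if every operand position it occupies is a
  // scalar argument of the intrinsic.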
1937 return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
1938 auto [Idx, V] = X;
1939 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1940 Idx, nullptr);
1941 });
1942}
1943
1944#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1945void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
1946 VPSlotTracker &SlotTracker) const {
1947 O << Indent << "WIDEN-INTRINSIC ";
1948 if (ResultTy->isVoidTy()) {
1949 O << "void ";
1950 } else {
1951 printAsOperand(O, SlotTracker);
1952 O << " = ";
1953 }
1954
1955 O << "call";
1956 printFlags(O);
1957 O << getIntrinsicName() << "(";
1958
1959 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
1960 Op->printAsOperand(O, SlotTracker);
1961 });
1962 O << ")";
1963}
1964#endif
1965
1966void VPHistogramRecipe::execute(VPTransformState &State) {
1967 IRBuilderBase &Builder = State.Builder;
1968
1969 Value *Address = State.get(Def: getOperand(N: 0));
1970 Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
1971 VectorType *VTy = cast<VectorType>(Val: Address->getType());
1972
1973 // The histogram intrinsic requires a mask even if the recipe doesn't;
1974 // if the mask operand was omitted then all lanes should be executed and
1975 // we just need to synthesize an all-true mask.
1976 Value *Mask = nullptr;
1977 if (VPValue *VPMask = getMask())
1978 Mask = State.get(Def: VPMask);
1979 else
1980 Mask =
1981 Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));
1982
1983 // If this is a subtract, we want to invert the increment amount. We may
1984  // add a separate intrinsic in the future, but for now we'll try this.
1985 if (Opcode == Instruction::Sub)
1986 IncAmt = Builder.CreateNeg(V: IncAmt);
1987 else
1988 assert(Opcode == Instruction::Add && "only add or sub supported for now");
1989
1990 State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
1991 Types: {VTy, IncAmt->getType()},
1992 Args: {Address, IncAmt, Mask});
1993}
1994
1995InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
1996 VPCostContext &Ctx) const {
1997 // FIXME: Take the gather and scatter into account as well. For now we're
1998 // generating the same cost as the fallback path, but we'll likely
1999 // need to create a new TTI method for determining the cost, including
2000 // whether we can use base + vec-of-smaller-indices or just
2001 // vec-of-pointers.
2002 assert(VF.isVector() && "Invalid VF for histogram cost");
2003 Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
2004 VPValue *IncAmt = getOperand(N: 1);
2005 Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
2006 VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);
2007
2008 // Assume that a non-constant update value (or a constant != 1) requires
2009 // a multiply, and add that into the cost.
2010 InstructionCost MulCost =
2011 Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
2012 if (auto *CI = dyn_cast<VPConstantInt>(Val: IncAmt))
2013 if (CI->isOne())
2014 MulCost = TTI::TCC_Free;
2015
2016 // Find the cost of the histogram operation itself.
2017 Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
2018 Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
2019 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
2020 Type::getVoidTy(C&: Ctx.LLVMCtx),
2021 {PtrTy, IncTy, MaskTy});
2022
2023 // Add the costs together with the add/sub operation.
2024 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
2025 Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
2026}
2027
2028#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2029void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2030 VPSlotTracker &SlotTracker) const {
2031 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2032 getOperand(0)->printAsOperand(O, SlotTracker);
2033
2034 if (Opcode == Instruction::Sub)
2035 O << ", dec: ";
2036 else {
2037 assert(Opcode == Instruction::Add);
2038 O << ", inc: ";
2039 }
2040 getOperand(1)->printAsOperand(O, SlotTracker);
2041
2042 if (VPValue *Mask = getMask()) {
2043 O << ", mask: ";
2044 Mask->printAsOperand(O, SlotTracker);
2045 }
2046}
2047#endif
2048
2049VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
2050 AllowReassoc = FMF.allowReassoc();
2051 NoNaNs = FMF.noNaNs();
2052 NoInfs = FMF.noInfs();
2053 NoSignedZeros = FMF.noSignedZeros();
2054 AllowReciprocal = FMF.allowReciprocal();
2055 AllowContract = FMF.allowContract();
2056 ApproxFunc = FMF.approxFunc();
2057}
2058
2059VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
2060 switch (Opcode) {
2061 case Instruction::Add:
2062 case Instruction::Sub:
2063 case Instruction::Mul:
2064 case Instruction::Shl:
2065 case VPInstruction::CanonicalIVIncrementForPart:
2066 return WrapFlagsTy(false, false);
2067 case Instruction::Trunc:
2068 return TruncFlagsTy(false, false);
2069 case Instruction::Or:
2070 return DisjointFlagsTy(false);
2071 case Instruction::AShr:
2072 case Instruction::LShr:
2073 case Instruction::UDiv:
2074 case Instruction::SDiv:
2075 return ExactFlagsTy(false);
2076 case Instruction::GetElementPtr:
2077 case VPInstruction::PtrAdd:
2078 case VPInstruction::WidePtrAdd:
2079 return GEPNoWrapFlags::none();
2080 case Instruction::ZExt:
2081 case Instruction::UIToFP:
2082 return NonNegFlagsTy(false);
2083 case Instruction::FAdd:
2084 case Instruction::FSub:
2085 case Instruction::FMul:
2086 case Instruction::FDiv:
2087 case Instruction::FRem:
2088 case Instruction::FNeg:
2089 case Instruction::FPExt:
2090 case Instruction::FPTrunc:
2091 return FastMathFlags();
2092 case Instruction::ICmp:
2093 case Instruction::FCmp:
2094 case VPInstruction::ComputeReductionResult:
2095 llvm_unreachable("opcode requires explicit flags");
2096 default:
2097 return VPIRFlags();
2098 }
2099}
2100
2101#if !defined(NDEBUG)
2102bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2103 switch (OpType) {
2104 case OperationType::OverflowingBinOp:
2105 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2106 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2107           Opcode == VPInstruction::CanonicalIVIncrementForPart;
2108 case OperationType::Trunc:
2109 return Opcode == Instruction::Trunc;
2110 case OperationType::DisjointOp:
2111 return Opcode == Instruction::Or;
2112 case OperationType::PossiblyExactOp:
2113 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2114 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2115 case OperationType::GEPOp:
2116 return Opcode == Instruction::GetElementPtr ||
2117 Opcode == VPInstruction::PtrAdd ||
2118 Opcode == VPInstruction::WidePtrAdd;
2119 case OperationType::FPMathOp:
2120 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2121 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2122 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2123 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2124 Opcode == Instruction::FPTrunc || Opcode == Instruction::Select ||
2125 Opcode == VPInstruction::WideIVStep ||
2126 Opcode == VPInstruction::ReductionStartVector;
2127 case OperationType::FCmp:
2128 return Opcode == Instruction::FCmp;
2129 case OperationType::NonNegOp:
2130 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2131 case OperationType::Cmp:
2132 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2133 case OperationType::ReductionOp:
2134 return Opcode == VPInstruction::ComputeReductionResult;
2135 case OperationType::Other:
2136 return true;
2137 }
2138 llvm_unreachable("Unknown OperationType enum");
2139}
2140
2141bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2142 // Handle opcodes without default flags.
2143 if (Opcode == Instruction::ICmp)
2144 return OpType == OperationType::Cmp;
2145 if (Opcode == Instruction::FCmp)
2146 return OpType == OperationType::FCmp;
2147 if (Opcode == VPInstruction::ComputeReductionResult)
2148 return OpType == OperationType::ReductionOp;
2149
2150 OperationType Required = getDefaultFlags(Opcode).OpType;
2151 return Required == OperationType::Other || Required == OpType;
2152}
2153#endif
2154
2155#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2156void VPIRFlags::printFlags(raw_ostream &O) const {
2157 switch (OpType) {
2158 case OperationType::Cmp:
2159 O << " " << CmpInst::getPredicateName(getPredicate());
2160 break;
2161 case OperationType::FCmp:
2162 O << " " << CmpInst::getPredicateName(getPredicate());
2163 getFastMathFlags().print(O);
2164 break;
2165 case OperationType::DisjointOp:
2166 if (DisjointFlags.IsDisjoint)
2167 O << " disjoint";
2168 break;
2169 case OperationType::PossiblyExactOp:
2170 if (ExactFlags.IsExact)
2171 O << " exact";
2172 break;
2173 case OperationType::OverflowingBinOp:
2174 if (WrapFlags.HasNUW)
2175 O << " nuw";
2176 if (WrapFlags.HasNSW)
2177 O << " nsw";
2178 break;
2179 case OperationType::Trunc:
2180 if (TruncFlags.HasNUW)
2181 O << " nuw";
2182 if (TruncFlags.HasNSW)
2183 O << " nsw";
2184 break;
2185 case OperationType::FPMathOp:
2186 getFastMathFlags().print(O);
2187 break;
2188 case OperationType::GEPOp:
2189 if (GEPFlags.isInBounds())
2190 O << " inbounds";
2191 else if (GEPFlags.hasNoUnsignedSignedWrap())
2192 O << " nusw";
2193 if (GEPFlags.hasNoUnsignedWrap())
2194 O << " nuw";
2195 break;
2196 case OperationType::NonNegOp:
2197 if (NonNegFlags.NonNeg)
2198 O << " nneg";
2199 break;
2200 case OperationType::ReductionOp: {
2201 RecurKind RK = getRecurKind();
2202 O << " (";
2203 switch (RK) {
2204 case RecurKind::AnyOf:
2205 O << "any-of";
2206 break;
2207 case RecurKind::SMax:
2208 O << "smax";
2209 break;
2210 case RecurKind::SMin:
2211 O << "smin";
2212 break;
2213 case RecurKind::UMax:
2214 O << "umax";
2215 break;
2216 case RecurKind::UMin:
2217 O << "umin";
2218 break;
2219 case RecurKind::FMinNum:
2220 O << "fminnum";
2221 break;
2222 case RecurKind::FMaxNum:
2223 O << "fmaxnum";
2224 break;
2225 case RecurKind::FMinimum:
2226 O << "fminimum";
2227 break;
2228 case RecurKind::FMaximum:
2229 O << "fmaximum";
2230 break;
2231 case RecurKind::FMinimumNum:
2232 O << "fminimumnum";
2233 break;
2234 case RecurKind::FMaximumNum:
2235 O << "fmaximumnum";
2236 break;
2237 default:
2238 O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
2239 break;
2240 }
2241 if (isReductionInLoop())
2242 O << ", in-loop";
2243 if (isReductionOrdered())
2244 O << ", ordered";
2245 O << ")";
2246 getFastMathFlags().print(O);
2247 break;
2248 }
2249 case OperationType::Other:
2250 break;
2251 }
2252 O << " ";
2253}
2254#endif
2255
2256void VPWidenRecipe::execute(VPTransformState &State) {
2257 auto &Builder = State.Builder;
2258 switch (Opcode) {
2259 case Instruction::Call:
2260 case Instruction::Br:
2261 case Instruction::PHI:
2262 case Instruction::GetElementPtr:
2263 llvm_unreachable("This instruction is handled by a different recipe.");
2264 case Instruction::UDiv:
2265 case Instruction::SDiv:
2266 case Instruction::SRem:
2267 case Instruction::URem:
2268 case Instruction::Add:
2269 case Instruction::FAdd:
2270 case Instruction::Sub:
2271 case Instruction::FSub:
2272 case Instruction::FNeg:
2273 case Instruction::Mul:
2274 case Instruction::FMul:
2275 case Instruction::FDiv:
2276 case Instruction::FRem:
2277 case Instruction::Shl:
2278 case Instruction::LShr:
2279 case Instruction::AShr:
2280 case Instruction::And:
2281 case Instruction::Or:
2282 case Instruction::Xor: {
2283 // Just widen unops and binops.
2284 SmallVector<Value *, 2> Ops;
2285 for (VPValue *VPOp : operands())
2286 Ops.push_back(Elt: State.get(Def: VPOp));
2287
2288 Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);
2289
2290 if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
2291 applyFlags(I&: *VecOp);
2292 applyMetadata(I&: *VecOp);
2293 }
2294
2295 // Use this vector value for all users of the original instruction.
2296 State.set(Def: this, V);
2297 break;
2298 }
2299 case Instruction::ExtractValue: {
2300 assert(getNumOperands() == 2 && "expected single level extractvalue");
2301 Value *Op = State.get(Def: getOperand(N: 0));
2302 Value *Extract = Builder.CreateExtractValue(
2303 Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue());
2304 State.set(Def: this, V: Extract);
2305 break;
2306 }
2307 case Instruction::Freeze: {
2308 Value *Op = State.get(Def: getOperand(N: 0));
2309 Value *Freeze = Builder.CreateFreeze(V: Op);
2310 State.set(Def: this, V: Freeze);
2311 break;
2312 }
2313 case Instruction::ICmp:
2314 case Instruction::FCmp: {
2315 // Widen compares. Generate vector compares.
2316 bool FCmp = Opcode == Instruction::FCmp;
2317 Value *A = State.get(Def: getOperand(N: 0));
2318 Value *B = State.get(Def: getOperand(N: 1));
2319 Value *C = nullptr;
2320 if (FCmp) {
2321 C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
2322 } else {
2323 C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
2324 }
2325 if (auto *I = dyn_cast<Instruction>(Val: C)) {
2326 applyFlags(I&: *I);
2327 applyMetadata(I&: *I);
2328 }
2329 State.set(Def: this, V: C);
2330 break;
2331 }
2332 case Instruction::Select: {
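    // A single-scalar condition selects between whole vectors; otherwise the
    // condition is a per-lane mask and a vector select is generated.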
2333 VPValue *CondOp = getOperand(N: 0);
2334 Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp));
2335 Value *Op0 = State.get(Def: getOperand(N: 1));
2336 Value *Op1 = State.get(Def: getOperand(N: 2));
2337 Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
2338 State.set(Def: this, V: Sel);
2339 if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
2340 if (isa<FPMathOperator>(Val: I))
2341 applyFlags(I&: *I);
2342 applyMetadata(I&: *I);
2343 }
2344 break;
2345 }
2346 default:
2347 // This instruction is not vectorized by simple widening.
2348    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode: "
2349 << Instruction::getOpcodeName(Opcode));
2350 llvm_unreachable("Unhandled instruction!");
2351 } // end of switch.
2352
2353#if !defined(NDEBUG)
2354 // Verify that VPlan type inference results agree with the type of the
2355 // generated values.
2356 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
2357 State.get(this)->getType() &&
2358 "inferred type and type from generated instructions do not match");
2359#endif
2360}
2361
2362InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
2363 VPCostContext &Ctx) const {
2364 switch (Opcode) {
2365 case Instruction::UDiv:
2366 case Instruction::SDiv:
2367 case Instruction::SRem:
2368 case Instruction::URem:
2369 // If the div/rem operation isn't safe to speculate and requires
2370 // predication, then the only way we can even create a vplan is to insert
2371 // a select on the second input operand to ensure we use the value of 1
2372 // for the inactive lanes. The select will be costed separately.
2373 case Instruction::FNeg:
2374 case Instruction::Add:
2375 case Instruction::FAdd:
2376 case Instruction::Sub:
2377 case Instruction::FSub:
2378 case Instruction::Mul:
2379 case Instruction::FMul:
2380 case Instruction::FDiv:
2381 case Instruction::FRem:
2382 case Instruction::Shl:
2383 case Instruction::LShr:
2384 case Instruction::AShr:
2385 case Instruction::And:
2386 case Instruction::Or:
2387 case Instruction::Xor:
2388 case Instruction::Freeze:
2389 case Instruction::ExtractValue:
2390 case Instruction::ICmp:
2391 case Instruction::FCmp:
2392 case Instruction::Select:
2393 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2394 default:
2395 llvm_unreachable("Unsupported opcode for instruction");
2396 }
2397}
2398
2399#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2400void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2401 VPSlotTracker &SlotTracker) const {
2402 O << Indent << "WIDEN ";
2403 printAsOperand(O, SlotTracker);
2404 O << " = " << Instruction::getOpcodeName(Opcode);
2405 printFlags(O);
2406 printOperands(O, SlotTracker);
2407}
2408#endif
2409
2410void VPWidenCastRecipe::execute(VPTransformState &State) {
2411 auto &Builder = State.Builder;
2412  // Vectorize casts.
2413 assert(State.VF.isVector() && "Not vectorizing?");
2414 Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
2415 VPValue *Op = getOperand(N: 0);
2416 Value *A = State.get(Def: Op);
2417 Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
2418 State.set(Def: this, V: Cast);
2419 if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
2420 applyFlags(I&: *CastOp);
2421 applyMetadata(I&: *CastOp);
2422 }
2423}
2424
2425InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2426 VPCostContext &Ctx) const {
2427 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2428 // the legacy cost model, including truncates/extends when evaluating a
2429 // reduction in a smaller type.
2430 if (!getUnderlyingValue())
2431 return 0;
2432 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2433}
2434
2435#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2436void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2437 VPSlotTracker &SlotTracker) const {
2438 O << Indent << "WIDEN-CAST ";
2439 printAsOperand(O, SlotTracker);
2440 O << " = " << Instruction::getOpcodeName(Opcode);
2441 printFlags(O);
2442 printOperands(O, SlotTracker);
2443 O << " to " << *getResultType();
2444}
2445#endif
2446
2447InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
2448 VPCostContext &Ctx) const {
2449 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
2450}
2451
2452#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2453void VPWidenIntOrFpInductionRecipe::printRecipe(
2454 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2455 O << Indent;
2456 printAsOperand(O, SlotTracker);
2457 O << " = WIDEN-INDUCTION";
2458 printFlags(O);
2459 printOperands(O, SlotTracker);
2460
2461 if (auto *TI = getTruncInst())
2462 O << " (truncated to " << *TI->getType() << ")";
2463}
2464#endif
2465
2466bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
2467 // The step may be defined by a recipe in the preheader (e.g. if it requires
2468 // SCEV expansion), but for the canonical induction the step is required to be
2469  // 1, which is represented as a live-in.
2470 auto *StepC = dyn_cast<VPConstantInt>(Val: getStepValue());
2471 auto *StartC = dyn_cast<VPConstantInt>(Val: getStartValue());
2472 return StartC && StartC->isZero() && StepC && StepC->isOne() &&
2473 getScalarType() == getRegion()->getCanonicalIVType();
2474}
2475
2476void VPDerivedIVRecipe::execute(VPTransformState &State) {
2477 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
2478
2479 // Fast-math-flags propagate from the original induction instruction.
2480 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2481 if (FPBinOp)
2482 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
2483
2484 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
2485 Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
2486 Value *DerivedIV = emitTransformedIndex(
2487 B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
2488 InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
2489 DerivedIV->setName(Name);
2490 State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
2491}
2492
2493#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2494void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2495 VPSlotTracker &SlotTracker) const {
2496 O << Indent;
2497 printAsOperand(O, SlotTracker);
2498 O << " = DERIVED-IV ";
2499 getStartValue()->printAsOperand(O, SlotTracker);
2500 O << " + ";
2501 getOperand(1)->printAsOperand(O, SlotTracker);
2502 O << " * ";
2503 getStepValue()->printAsOperand(O, SlotTracker);
2504}
2505#endif
2506
2507void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
2508 // Fast-math-flags propagate from the original induction instruction.
2509 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
2510 if (hasFastMathFlags())
2511 State.Builder.setFastMathFlags(getFastMathFlags());
2512
2513  // Compute scalar induction steps. BaseIV is the scalar induction variable
2514  // on which to base the steps, and Step is the size of each step.
2515
2516 Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
2517 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
2518 IRBuilderBase &Builder = State.Builder;
2519
2520 // Ensure step has the same type as that of scalar IV.
2521 Type *BaseIVTy = BaseIV->getType()->getScalarType();
2522 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
2523
2524 // We build scalar steps for both integer and floating-point induction
2525 // variables. Here, we determine the kind of arithmetic we will perform.
2526 Instruction::BinaryOps AddOp;
2527 Instruction::BinaryOps MulOp;
2528 if (BaseIVTy->isIntegerTy()) {
2529 AddOp = Instruction::Add;
2530 MulOp = Instruction::Mul;
2531 } else {
2532 AddOp = InductionOpcode;
2533 MulOp = Instruction::FMul;
2534 }
2535
2536 // Determine the number of scalars we need to generate.
2537 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
2538 // Compute the scalar steps and save the results in State.
2539
2540 unsigned StartLane = 0;
2541 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2542 if (State.Lane) {
2543 StartLane = State.Lane->getKnownLane();
2544 EndLane = StartLane + 1;
2545 }
2546 Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
2547 : Constant::getNullValue(Ty: BaseIVTy);
2548
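  // For each requested lane L the scalar step is
  //   BaseIV + (StartIdx0 + L) * Step
  // computed with integer or FP arithmetic depending on the induction type.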
2549 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
2550 // It is okay if the induction variable type cannot hold the lane number,
2551 // we expect truncation in this case.
2552 Constant *LaneValue =
2553 BaseIVTy->isIntegerTy()
2554 ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
2555 /*ImplicitTrunc=*/true)
2556 : ConstantFP::get(Ty: BaseIVTy, V: Lane);
2557 Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
2558 // The step returned by `createStepForVF` is a runtime-evaluated value
2559 // when VF is scalable. Otherwise, it should be folded into a Constant.
2560 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2561 "Expected StartIdx to be folded to a constant when VF is not "
2562 "scalable");
2563 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
2564 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
2565 State.set(Def: this, V: Add, Lane: VPLane(Lane));
2566 }
2567}
2568
2569#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2570void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2571 VPSlotTracker &SlotTracker) const {
2572 O << Indent;
2573 printAsOperand(O, SlotTracker);
2574 O << " = SCALAR-STEPS ";
2575 printOperands(O, SlotTracker);
2576}
2577#endif
2578
2579bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
2580 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
2581 return vputils::isSingleScalar(VPV: Op);
2582}
2583
2584void VPWidenGEPRecipe::execute(VPTransformState &State) {
2585 assert(State.VF.isVector() && "not widening");
2586 // Construct a vector GEP by widening the operands of the scalar GEP as
2587 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
2588 // results in a vector of pointers when at least one operand of the GEP
2589 // is vector-typed. Thus, to keep the representation compact, we only use
2590 // vector-typed operands for loop-varying values.
2591
2592 bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) {
2593 return Op->isDefinedOutsideLoopRegions();
2594 });
2595 if (AllOperandsAreInvariant) {
2596 // If we are vectorizing, but the GEP has only loop-invariant operands,
2597 // the GEP we build (by only using vector-typed operands for
2598 // loop-varying values) would be a scalar pointer. Thus, to ensure we
2599 // produce a vector of pointers, we need to either arbitrarily pick an
2600 // operand to broadcast, or broadcast a clone of the original GEP.
2601 // Here, we broadcast a clone of the original.
2602
2603 SmallVector<Value *> Ops;
2604 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
2605 Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));
2606
2607 auto *NewGEP =
2608 State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
2609 Name: "", NW: getGEPNoWrapFlags());
2610 Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
2611 State.set(Def: this, V: Splat);
2612 return;
2613 }
2614
2615 // If the GEP has at least one loop-varying operand, we are sure to
2616 // produce a vector of pointers unless VF is scalar.
2617 // The pointer operand of the new GEP. If it's loop-invariant, we
2618 // won't broadcast it.
2619 auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant());
2620
2621 // Collect all the indices for the new GEP. If any index is
2622 // loop-invariant, we won't broadcast it.
2623 SmallVector<Value *, 4> Indices;
2624 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
2625 VPValue *Operand = getOperand(N: I);
2626 Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1)));
2627 }
2628
2629 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
2630 // but it should be a vector, otherwise.
2631 auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices,
2632 Name: "", NW: getGEPNoWrapFlags());
2633 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
2634 "NewGEP is not a pointer vector");
2635 State.set(Def: this, V: NewGEP);
2636}
2637
2638#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2639void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2640 VPSlotTracker &SlotTracker) const {
2641 O << Indent << "WIDEN-GEP ";
2642 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2643 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2644 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2645
2646 O << " ";
2647 printAsOperand(O, SlotTracker);
2648 O << " = getelementptr";
2649 printFlags(O);
2650 printOperands(O, SlotTracker);
2651}
2652#endif
2653
2654void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
2655 auto &Builder = State.Builder;
2656 unsigned CurrentPart = getUnrollPart(U: *this);
2657 const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2658 Type *IndexTy = DL.getIndexType(PtrTy: State.TypeAnalysis.inferScalarType(V: this));
2659
2660 // The wide store needs to start at the last vector element.
2661 Value *RunTimeVF = State.get(Def: getVFValue(), Lane: VPLane(0));
2662 if (IndexTy != RunTimeVF->getType())
2663 RunTimeVF = Builder.CreateZExtOrTrunc(V: RunTimeVF, DestTy: IndexTy);
2664 // NumElt = Stride * CurrentPart * RunTimeVF
2665 Value *NumElt = Builder.CreateMul(
2666 LHS: ConstantInt::getSigned(Ty: IndexTy, V: Stride * (int64_t)CurrentPart),
2667 RHS: RunTimeVF);
2668 // LastLane = Stride * (RunTimeVF - 1)
2669 Value *LastLane = Builder.CreateSub(LHS: RunTimeVF, RHS: ConstantInt::get(Ty: IndexTy, V: 1));
2670 if (Stride != 1)
2671 LastLane =
2672 Builder.CreateMul(LHS: ConstantInt::getSigned(Ty: IndexTy, V: Stride), RHS: LastLane);
2673 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
2674 Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: NumElt, Name: "",
2675 NW: getGEPNoWrapFlags());
2676 ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr: ResultPtr, IdxList: LastLane, Name: "",
2677 NW: getGEPNoWrapFlags());
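  // E.g. for a reverse access with Stride == -1 and CurrentPart == 0 this
  // computes Ptr - (RunTimeVF - 1), so the wide access spans the RunTimeVF
  // elements ending at Ptr.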
2678
2679 State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
2680}
2681
2682#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2683void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2684 VPSlotTracker &SlotTracker) const {
2685 O << Indent;
2686 printAsOperand(O, SlotTracker);
2687 O << " = vector-end-pointer";
2688 printFlags(O);
2689 printOperands(O, SlotTracker);
2690}
2691#endif
2692
2693void VPVectorPointerRecipe::execute(VPTransformState &State) {
2694 auto &Builder = State.Builder;
2695 assert(getOffset() &&
2696 "Expected prior simplification of recipe without offset");
2697 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
2698 Value *Offset = State.get(Def: getOffset(), IsScalar: true);
2699 Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
2700 NW: getGEPNoWrapFlags());
2701 State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
2702}
2703
2704#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2705void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2706 VPSlotTracker &SlotTracker) const {
2707 O << Indent;
2708 printAsOperand(O, SlotTracker);
2709 O << " = vector-pointer";
2710 printFlags(O);
2711 printOperands(O, SlotTracker);
2712}
2713#endif
2714
2715InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
2716 VPCostContext &Ctx) const {
2717 // A blend will be expanded to a select VPInstruction, which will generate a
2718 // scalar select if only the first lane is used.
2719 if (vputils::onlyFirstLaneUsed(Def: this))
2720 VF = ElementCount::getFixed(MinVal: 1);
2721
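  // A blend over N incoming values lowers to a chain of N - 1 selects, so
  // charge one select per additional incoming value.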
2722 Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
2723 Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF);
2724 return (getNumIncomingValues() - 1) *
2725 Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
2726 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
2727}
2728
2729#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2730void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2731 VPSlotTracker &SlotTracker) const {
2732 O << Indent << "BLEND ";
2733 printAsOperand(O, SlotTracker);
2734 O << " =";
2735 if (getNumIncomingValues() == 1) {
2736 // Not a User of any mask: not really blending, this is a
2737 // single-predecessor phi.
2738 O << " ";
2739 getIncomingValue(0)->printAsOperand(O, SlotTracker);
2740 } else {
2741 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
2742 O << " ";
2743 getIncomingValue(I)->printAsOperand(O, SlotTracker);
2744 if (I == 0)
2745 continue;
2746 O << "/";
2747 getMask(I)->printAsOperand(O, SlotTracker);
2748 }
2749 }
2750}
2751#endif
2752
2753void VPReductionRecipe::execute(VPTransformState &State) {
2754 assert(!State.Lane && "Reduction being replicated.");
2755 RecurKind Kind = getRecurrenceKind();
2756 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2757 "In-loop AnyOf reductions aren't currently supported");
2758 // Propagate the fast-math flags carried by the underlying instruction.
2759 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2760 State.Builder.setFastMathFlags(getFastMathFlags());
2761 Value *NewVecOp = State.get(Def: getVecOp());
2762 if (VPValue *Cond = getCondOp()) {
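    // Replace masked-off lanes with the reduction's identity value so they do
    // not contribute to the reduced result.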
2763 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
2764 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
2765 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2766
2767 Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
2768 if (State.VF.isVector())
2769 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
2770
2771 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
2772 NewVecOp = Select;
2773 }
2774 Value *NewRed;
2775 Value *NextInChain;
2776 if (isOrdered()) {
2777 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2778 if (State.VF.isVector())
2779 NewRed =
2780 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
2781 else
2782 NewRed = State.Builder.CreateBinOp(
2783 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2784 LHS: PrevInChain, RHS: NewVecOp);
2785 PrevInChain = NewRed;
2786 NextInChain = NewRed;
2787 } else if (isPartialReduction()) {
2788 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2789 "Unexpected partial reduction kind");
2790 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
2791 NewRed = State.Builder.CreateIntrinsic(
2792 RetTy: PrevInChain->getType(),
2793 ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2794 : Intrinsic::vector_partial_reduce_fadd,
2795 Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
2796 Name: "partial.reduce");
2797 PrevInChain = NewRed;
2798 NextInChain = NewRed;
2799 } else {
2800 assert(isInLoop() &&
2801 "The reduction must either be ordered, partial or in-loop");
2802 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2803 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
2804 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2805 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
2806 else
2807 NextInChain = State.Builder.CreateBinOp(
2808 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2809 LHS: PrevInChain, RHS: NewRed);
2810 }
2811 State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
2812}
2813
2814void VPReductionEVLRecipe::execute(VPTransformState &State) {
2815 assert(!State.Lane && "Reduction being replicated.");
2816
2817 auto &Builder = State.Builder;
2818 // Propagate the fast-math flags carried by the underlying instruction.
2819 IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
2820 Builder.setFastMathFlags(getFastMathFlags());
2821
2822 RecurKind Kind = getRecurrenceKind();
2823 Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
2824 Value *VecOp = State.get(Def: getVecOp());
2825 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
2826
2827 Value *Mask;
2828 if (VPValue *CondOp = getCondOp())
2829 Mask = State.get(Def: CondOp);
2830 else
2831 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
2832
2833 Value *NewRed;
2834 if (isOrdered()) {
2835 NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
2836 } else {
2837 NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
2838 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2839 NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
2840 else
2841 NewRed = Builder.CreateBinOp(
2842 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
2843 RHS: Prev);
2844 }
2845 State.set(Def: this, V: NewRed, /*IsScalar*/ true);
2846}
2847
2848InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
2849 VPCostContext &Ctx) const {
2850 RecurKind RdxKind = getRecurrenceKind();
2851 Type *ElementTy = Ctx.Types.inferScalarType(V: this);
2852 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
2853 unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
2854 FastMathFlags FMFs = getFastMathFlags();
2855 std::optional<FastMathFlags> OptionalFMF =
2856 ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;
2857
2858 if (isPartialReduction()) {
2859 InstructionCost CondCost = 0;
2860 if (isConditional()) {
2861 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
2862 auto *CondTy = cast<VectorType>(
2863 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF));
2864 CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
2865 CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
2866 }
2867 return CondCost + Ctx.TTI.getPartialReductionCost(
2868 Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
2869 OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
2870 FMF: OptionalFMF);
2871 }
2872
2873 // TODO: Support any-of reductions.
2874 assert(
2875 (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
2876 ForceTargetInstructionCost.getNumOccurrences() > 0) &&
2877 "Any-of reduction not implemented in VPlan-based cost model currently.");
2878
2879  // Note that TTI should model the cost of moving the result to a scalar
2880  // register and the BinOp cost in getMinMaxReductionCost().
2881 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
2882 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
2883 return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
2884 }
2885
2886  // Note that TTI should model the cost of moving the result to a scalar
2887  // register and the BinOp cost in getArithmeticReductionCost().
2888 return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
2889 CostKind: Ctx.CostKind);
2890}
2891
2892VPExpressionRecipe::VPExpressionRecipe(
2893 ExpressionTypes ExpressionType,
2894 ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
2895 : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
2896 ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
2897 assert(!ExpressionRecipes.empty() && "Nothing to combine?");
2898 assert(
2899 none_of(ExpressionRecipes,
2900 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
2901 "expression cannot contain recipes with side-effects");
2902
2903 // Maintain a copy of the expression recipes as a set of users.
2904 SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
2905 for (auto *R : ExpressionRecipes)
2906 ExpressionRecipesAsSetOfUsers.insert(Ptr: R);
2907
2908 // Recipes in the expression, except the last one, must only be used by
2909 // (other) recipes inside the expression. If there are other users, external
2910 // to the expression, use a clone of the recipe for external users.
2911 for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
2912 if (R != ExpressionRecipes.back() &&
2913 any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
2914 return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
2915 })) {
      // There are users outside of the expression. Clone the recipe and use
      // the clone for those external users.
2918 VPSingleDefRecipe *CopyForExtUsers = R->clone();
2919 R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
2920 VPUser &U, unsigned) {
2921 return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
2922 });
2923 CopyForExtUsers->insertBefore(InsertPos: R);
2924 }
2925 if (R->getParent())
2926 R->removeFromParent();
2927 }
2928
2929 // Internalize all external operands to the expression recipes. To do so,
2930 // create new temporary VPValues for all operands defined by a recipe outside
2931 // the expression. The original operands are added as operands of the
2932 // VPExpressionRecipe itself.
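  // E.g. (illustrative), when bundling (ext, mul, reduce) where the ext reads
  // a value %a defined outside the expression, %a is added as an operand of
  // the VPExpressionRecipe and the ext is rewired to read a placeholder
  // VPValue instead.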
2933 for (auto *R : ExpressionRecipes) {
2934 for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
2935 auto *Def = Op->getDefiningRecipe();
2936 if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
2937 continue;
2938 addOperand(Operand: Op);
2939 LiveInPlaceholders.push_back(Elt: new VPSymbolicValue());
2940 }
2941 }
2942
  // Inside the expression recipes, replace each external operand with the
  // first placeholder created for it in LiveInPlaceholders.
2945 for (auto *R : ExpressionRecipes)
2946 for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
2947 R->replaceUsesOfWith(From: LiveIn, To: Tmp);
2948}
2949
2950void VPExpressionRecipe::decompose() {
2951 for (auto *R : ExpressionRecipes)
2952 // Since the list could contain duplicates, make sure the recipe hasn't
2953 // already been inserted.
2954 if (!R->getParent())
2955 R->insertBefore(InsertPos: this);
2956
2957 for (const auto &[Idx, Op] : enumerate(First: operands()))
2958 LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);
2959
2960 replaceAllUsesWith(New: ExpressionRecipes.back());
2961 ExpressionRecipes.clear();
2962}
2963
2964InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
2965 VPCostContext &Ctx) const {
2966 Type *RedTy = Ctx.Types.inferScalarType(V: this);
2967 auto *SrcVecTy = cast<VectorType>(
2968 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
2969 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2970 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind());
2971 switch (ExpressionType) {
2972 case ExpressionTypes::ExtendedReduction: {
2973 unsigned Opcode = RecurrenceDescriptor::getOpcode(
2974 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind());
2975 auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
2976 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
2977
2978 if (RedR->isPartialReduction())
2979 return Ctx.TTI.getPartialReductionCost(
2980 Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF,
2981 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()),
2982 OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
2983 FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
2984 : std::nullopt);
2985 else if (!RedTy->isFloatingPointTy())
2986 // TTI::getExtendedReductionCost only supports integer types.
2987 return Ctx.TTI.getExtendedReductionCost(
2988 Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy,
2989 FMF: std::nullopt, CostKind: Ctx.CostKind);
2990 else
2991 return InstructionCost::getInvalid();
2992 }
2993 case ExpressionTypes::MulAccReduction:
2994 return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy,
2995 CostKind: Ctx.CostKind);
2996
2997 case ExpressionTypes::ExtNegatedMulAccReduction:
2998 assert(Opcode == Instruction::Add && "Unexpected opcode");
2999 Opcode = Instruction::Sub;
3000 [[fallthrough]];
3001 case ExpressionTypes::ExtMulAccReduction: {
3002 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3003 if (RedR->isPartialReduction()) {
3004 auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3005 auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]);
3006 auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]);
3007 return Ctx.TTI.getPartialReductionCost(
3008 Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)),
3009 InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF,
3010 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(
3011 CastOpc: Ext0R->getOpcode()),
3012 OpBExtend: TargetTransformInfo::getPartialReductionExtendKind(
3013 CastOpc: Ext1R->getOpcode()),
3014 BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind,
3015 FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3016 : std::nullopt);
3017 }
3018 return Ctx.TTI.getMulAccReductionCost(
3019 IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
3020 Instruction::ZExt,
3021 RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
3022 }
3023 }
3024 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3025}
3026
3027bool VPExpressionRecipe::mayReadOrWriteMemory() const {
3028 return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
3029 return R->mayReadFromMemory() || R->mayWriteToMemory();
3030 });
3031}
3032
3033bool VPExpressionRecipe::mayHaveSideEffects() const {
3034 assert(
3035 none_of(ExpressionRecipes,
3036 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3037 "expression cannot contain recipes with side-effects");
3038 return false;
3039}
3040
3041bool VPExpressionRecipe::isSingleScalar() const {
3042 // Cannot use vputils::isSingleScalar(), because all external operands
3043 // of the expression will be live-ins while bundled.
3044 auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3045 return RR && !RR->isPartialReduction();
3046}
3047
3048#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3049
3050void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3051 VPSlotTracker &SlotTracker) const {
3052 O << Indent << "EXPRESSION ";
3053 printAsOperand(O, SlotTracker);
3054 O << " = ";
3055 auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
3056 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
3057
3058 switch (ExpressionType) {
3059 case ExpressionTypes::ExtendedReduction: {
3060 getOperand(1)->printAsOperand(O, SlotTracker);
3061 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3062 O << Instruction::getOpcodeName(Opcode) << " (";
3063 getOperand(0)->printAsOperand(O, SlotTracker);
3064 Red->printFlags(O);
3065
3066 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3067 O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3068 << *Ext0->getResultType();
3069 if (Red->isConditional()) {
3070 O << ", ";
3071 Red->getCondOp()->printAsOperand(O, SlotTracker);
3072 }
3073 O << ")";
3074 break;
3075 }
3076 case ExpressionTypes::ExtNegatedMulAccReduction: {
3077 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
3078 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3079 O << Instruction::getOpcodeName(
3080 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3081 << " (sub (0, mul";
3082 auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
3083 Mul->printFlags(O);
3084 O << "(";
3085 getOperand(0)->printAsOperand(O, SlotTracker);
3086 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3087 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3088 << *Ext0->getResultType() << "), (";
3089 getOperand(1)->printAsOperand(O, SlotTracker);
3090 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3091 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3092 << *Ext1->getResultType() << ")";
3093 if (Red->isConditional()) {
3094 O << ", ";
3095 Red->getCondOp()->printAsOperand(O, SlotTracker);
3096 }
3097 O << "))";
3098 break;
3099 }
3100 case ExpressionTypes::MulAccReduction:
3101 case ExpressionTypes::ExtMulAccReduction: {
3102 getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
3103 O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
3104 O << Instruction::getOpcodeName(
3105 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
3106 << " (";
3107 O << "mul";
3108 bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
3109 auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
3110 : ExpressionRecipes[0]);
3111 Mul->printFlags(O);
3112 if (IsExtended)
3113 O << "(";
3114 getOperand(0)->printAsOperand(O, SlotTracker);
3115 if (IsExtended) {
3116 auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
3117 O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
3118 << *Ext0->getResultType() << "), (";
3119 } else {
3120 O << ", ";
3121 }
3122 getOperand(1)->printAsOperand(O, SlotTracker);
3123 if (IsExtended) {
3124 auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
3125 O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
3126 << *Ext1->getResultType() << ")";
3127 }
3128 if (Red->isConditional()) {
3129 O << ", ";
3130 Red->getCondOp()->printAsOperand(O, SlotTracker);
3131 }
3132 O << ")";
3133 break;
3134 }
3135 }
3136}
3137
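/// Example VPlan dump output for a reduction recipe (illustrative):
///   REDUCE ir<%add> = ir<%rdx> + reduce.add (ir<%mul>)
///   PARTIAL-REDUCE ir<%add> = ir<%rdx> + reduce.add (ir<%mul>)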
3138void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3139 VPSlotTracker &SlotTracker) const {
3140 if (isPartialReduction())
3141 O << Indent << "PARTIAL-REDUCE ";
3142 else
3143 O << Indent << "REDUCE ";
3144 printAsOperand(O, SlotTracker);
3145 O << " = ";
3146 getChainOp()->printAsOperand(O, SlotTracker);
3147 O << " +";
3148 printFlags(O);
3149 O << " reduce."
3150 << Instruction::getOpcodeName(
3151 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
3152 << " (";
3153 getVecOp()->printAsOperand(O, SlotTracker);
3154 if (isConditional()) {
3155 O << ", ";
3156 getCondOp()->printAsOperand(O, SlotTracker);
3157 }
3158 O << ")";
3159}
3160
3161void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3162 VPSlotTracker &SlotTracker) const {
3163 O << Indent << "REDUCE ";
3164 printAsOperand(O, SlotTracker);
3165 O << " = ";
3166 getChainOp()->printAsOperand(O, SlotTracker);
3167 O << " +";
3168 printFlags(O);
3169 O << " vp.reduce."
3170 << Instruction::getOpcodeName(
3171 RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
3172 << " (";
3173 getVecOp()->printAsOperand(O, SlotTracker);
3174 O << ", ";
3175 getEVL()->printAsOperand(O, SlotTracker);
3176 if (isConditional()) {
3177 O << ", ";
3178 getCondOp()->printAsOperand(O, SlotTracker);
3179 }
3180 O << ")";
3181}
3182
3183#endif
3184
3185/// A helper function to scalarize a single Instruction in the innermost loop.
3186/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3187/// operands from \p RepRecipe instead of \p Instr's operands.
3188static void scalarizeInstruction(const Instruction *Instr,
3189 VPReplicateRecipe *RepRecipe,
3190 const VPLane &Lane, VPTransformState &State) {
3191 assert((!Instr->getType()->isAggregateType() ||
3192 canVectorizeTy(Instr->getType())) &&
3193 "Expected vectorizable or non-aggregate type.");
3194
  // Does this instruction return a value?
3196 bool IsVoidRetTy = Instr->getType()->isVoidTy();
3197
3198 Instruction *Cloned = Instr->clone();
3199 if (!IsVoidRetTy) {
3200 Cloned->setName(Instr->getName() + ".cloned");
3201 Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe);
3202 // The operands of the replicate recipe may have been narrowed, resulting in
3203 // a narrower result type. Update the type of the cloned instruction to the
3204 // correct type.
3205 if (ResultTy != Cloned->getType())
3206 Cloned->mutateType(Ty: ResultTy);
3207 }
3208
3209 RepRecipe->applyFlags(I&: *Cloned);
3210 RepRecipe->applyMetadata(I&: *Cloned);
3211
3212 if (RepRecipe->hasPredicate())
3213 cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate());
3214
3215 if (auto DL = RepRecipe->getDebugLoc())
3216 State.setDebugLocFrom(DL);
3217
3218 // Replace the operands of the cloned instructions with their scalar
3219 // equivalents in the new loop.
3220 for (const auto &I : enumerate(First: RepRecipe->operands())) {
3221 auto InputLane = Lane;
3222 VPValue *Operand = I.value();
3223 if (vputils::isSingleScalar(VPV: Operand))
3224 InputLane = VPLane::getFirstLane();
3225 Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
3226 }
3227
3228 // Place the cloned scalar in the new loop.
3229 State.Builder.Insert(I: Cloned);
3230
3231 State.set(Def: RepRecipe, V: Cloned, Lane);
3232
  // If we just cloned a new assumption, add it to the assumption cache.
3234 if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
3235 State.AC->registerAssumption(CI: II);
3236
3237 assert(
3238 (RepRecipe->getRegion() ||
3239 !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
3240 all_of(RepRecipe->operands(),
3241 [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
3242 "Expected a recipe is either within a region or all of its operands "
3243 "are defined outside the vectorized region.");
3244}
3245
3246void VPReplicateRecipe::execute(VPTransformState &State) {
3247 Instruction *UI = getUnderlyingInstr();
3248
3249 if (!State.Lane) {
3250 assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
3251 "must have already been unrolled");
3252 scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
3253 return;
3254 }
3255
3256 assert((State.VF.isScalar() || !isSingleScalar()) &&
3257 "uniform recipe shouldn't be predicated");
3258 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
3259 scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
3260 // Insert scalar instance packing it into a vector.
3261 if (State.VF.isVector() && shouldPack()) {
3262 Value *WideValue =
3263 State.Lane->isFirstLane()
3264 ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF))
3265 : State.get(Def: this);
3266 State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
3267 Lane: *State.Lane));
3268 }
3269}
3270
3271bool VPReplicateRecipe::shouldPack() const {
3272 // Find if the recipe is used by a widened recipe via an intervening
3273 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3274 return any_of(Range: users(), P: [](const VPUser *U) {
3275 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
3276 return !vputils::onlyScalarValuesUsed(Def: PredR);
3277 return false;
3278 });
3279}
3280
3281/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3282/// which the legacy cost model computes a SCEV expression when computing the
3283/// address cost. Computing SCEVs for VPValues is incomplete and returns
/// SCEVCouldNotCompute in cases where the legacy cost model can compute SCEVs.
/// In those cases we fall back to the legacy cost model. Otherwise return nullptr.
3286static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3287 PredicatedScalarEvolution &PSE,
3288 const Loop *L) {
3289 const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
3290 if (isa<SCEVCouldNotCompute>(Val: Addr))
3291 return Addr;
3292
3293 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr;
3294}
3295
3296/// Returns true if \p V is used as part of the address of another load or
3297/// store.
3298static bool isUsedByLoadStoreAddress(const VPUser *V) {
3299 SmallPtrSet<const VPUser *, 4> Seen;
3300 SmallVector<const VPUser *> WorkList = {V};
3301
3302 while (!WorkList.empty()) {
3303 auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val());
3304 if (!Cur || !Seen.insert(Ptr: Cur).second)
3305 continue;
3306
3307 auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur);
    // Skip blends that use V only through a compare, by checking whether any
    // incoming value has already been visited.
3310 if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()),
3311 P: [&](unsigned I) {
3312 return Seen.contains(
3313 Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe());
3314 }))
3315 continue;
3316
3317 for (VPUser *U : Cur->users()) {
3318 if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U))
3319 if (InterleaveR->getAddr() == Cur)
3320 return true;
3321 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) {
3322 if (RepR->getOpcode() == Instruction::Load &&
3323 RepR->getOperand(N: 0) == Cur)
3324 return true;
3325 if (RepR->getOpcode() == Instruction::Store &&
3326 RepR->getOperand(N: 1) == Cur)
3327 return true;
3328 }
3329 if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) {
3330 if (MemR->getAddr() == Cur && MemR->isConsecutive())
3331 return true;
3332 }
3333 }
3334
    // The legacy cost model only supports scalarized loads/stores with phi
    // addresses if the phi is directly used as the load/store address. Don't
    // traverse further for Blends.
3338 if (Blend)
3339 continue;
3340
3341 append_range(C&: WorkList, R: Cur->users());
3342 }
3343 return false;
3344}
3345
3346InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
3347 VPCostContext &Ctx) const {
3348 Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
  // VPReplicateRecipes may be cloned as part of an existing VPlan-to-VPlan
  // transform; avoid computing their cost multiple times for now.
3351 Ctx.SkipCostComputation.insert(Ptr: UI);
3352
3353 if (VF.isScalable() && !isSingleScalar())
3354 return InstructionCost::getInvalid();
3355
3356 switch (UI->getOpcode()) {
3357 case Instruction::Alloca:
3358 if (VF.isScalable())
3359 return InstructionCost::getInvalid();
3360 return Ctx.TTI.getArithmeticInstrCost(
3361 Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind);
3362 case Instruction::GetElementPtr:
3363 // We mark this instruction as zero-cost because the cost of GEPs in
3364 // vectorized code depends on whether the corresponding memory instruction
3365 // is scalarized or not. Therefore, we handle GEPs with the memory
3366 // instruction cost.
3367 return 0;
3368 case Instruction::Call: {
3369 auto *CalledFn =
3370 cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue());
3371
3372 SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands()));
3373 SmallVector<Type *, 4> Tys;
3374 for (const VPValue *ArgOp : ArgOps)
3375 Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp));
3376
3377 if (CalledFn->isIntrinsic())
3378 // Various pseudo-intrinsics with costs of 0 are scalarized instead of
3379 // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
3380 switch (CalledFn->getIntrinsicID()) {
3381 case Intrinsic::assume:
3382 case Intrinsic::lifetime_end:
3383 case Intrinsic::lifetime_start:
3384 case Intrinsic::sideeffect:
3385 case Intrinsic::pseudoprobe:
3386 case Intrinsic::experimental_noalias_scope_decl: {
3387 assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
3388 ElementCount::getFixed(1), Ctx) == 0 &&
3389 "scalarizing intrinsic should be free");
3390 return InstructionCost(0);
3391 }
3392 default:
3393 break;
3394 }
3395
3396 Type *ResultTy = Ctx.Types.inferScalarType(V: this);
3397 InstructionCost ScalarCallCost =
3398 Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind);
3399 if (isSingleScalar()) {
3400 if (CalledFn->isIntrinsic())
3401 ScalarCallCost = std::min(
3402 a: ScalarCallCost,
3403 b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this,
3404 VF: ElementCount::getFixed(MinVal: 1), Ctx));
3405 return ScalarCallCost;
3406 }
3407
3408 return ScalarCallCost * VF.getFixedValue() +
3409 Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF);
3410 }
3411 case Instruction::Add:
3412 case Instruction::Sub:
3413 case Instruction::FAdd:
3414 case Instruction::FSub:
3415 case Instruction::Mul:
3416 case Instruction::FMul:
3417 case Instruction::FDiv:
3418 case Instruction::FRem:
3419 case Instruction::Shl:
3420 case Instruction::LShr:
3421 case Instruction::AShr:
3422 case Instruction::And:
3423 case Instruction::Or:
3424 case Instruction::Xor:
3425 case Instruction::ICmp:
3426 case Instruction::FCmp:
3427 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
3428 Ctx) *
3429 (isSingleScalar() ? 1 : VF.getFixedValue());
3430 case Instruction::SDiv:
3431 case Instruction::UDiv:
3432 case Instruction::SRem:
3433 case Instruction::URem: {
3434 InstructionCost ScalarCost =
3435 getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx);
3436 if (isSingleScalar())
3437 return ScalarCost;
3438
    // If any of the operands is from a different replicate region and has its
    // cost skipped, it may have been forced to scalar. Fall back to the legacy
    // cost model to avoid a cost mismatch.
3442 if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) {
3443 auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op);
3444 if (!PredR)
3445 return false;
3446 return Ctx.skipCostComputation(
3447 UI: dyn_cast_or_null<Instruction>(
3448 Val: PredR->getOperand(N: 0)->getUnderlyingValue()),
3449 IsVector: VF.isVector());
3450 }))
3451 break;
3452
3453 ScalarCost = ScalarCost * VF.getFixedValue() +
3454 Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this),
3455 Operands: to_vector(Range: operands()), VF);
3456 // If the recipe is not predicated (i.e. not in a replicate region), return
3457 // the scalar cost. Otherwise handle predicated cost.
3458 if (!getRegion()->isReplicator())
3459 return ScalarCost;
3460
3461 // Account for the phi nodes that we will create.
3462 ScalarCost += VF.getFixedValue() *
3463 Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
3464 // Scale the cost by the probability of executing the predicated blocks.
3465 // This assumes the predicated block for each vector lane is equally
3466 // likely.
3467 ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
3468 return ScalarCost;
3469 }
3470 case Instruction::Load:
3471 case Instruction::Store: {
3472 // TODO: See getMemInstScalarizationCost for how to handle replicating and
3473 // predicated cases.
3474 const VPRegionBlock *ParentRegion = getRegion();
3475 if (ParentRegion && ParentRegion->isReplicator())
3476 break;
3477
3478 bool IsLoad = UI->getOpcode() == Instruction::Load;
3479 const VPValue *PtrOp = getOperand(N: !IsLoad);
3480 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L);
3481 if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV))
3482 break;
3483
3484 Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0));
3485 Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp);
3486 const Align Alignment = getLoadStoreAlignment(I: UI);
3487 unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace();
3488 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0));
3489 InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
3490 Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo);
3491
3492 Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF);
3493 bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
3494 bool UsedByLoadStoreAddress =
3495 !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this);
3496 InstructionCost ScalarCost =
3497 ScalarMemOpCost +
3498 Ctx.TTI.getAddressComputationCost(
3499 PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV,
3500 CostKind: Ctx.CostKind);
3501 if (isSingleScalar())
3502 return ScalarCost;
3503
3504 SmallVector<const VPValue *> OpsToScalarize;
3505 Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext());
    // Set ResultTy and OpsToScalarize if scalarization is needed. Currently we
    // don't assign scalarization overhead in general if the target prefers
    // vectorized addressing or if the loaded value is used as part of the
    // address of another load or store.
3510 if (!UsedByLoadStoreAddress) {
3511 bool EfficientVectorLoadStore =
3512 Ctx.TTI.supportsEfficientVectorElementLoadStore();
3513 if (!(IsLoad && !PreferVectorizedAddressing) &&
3514 !(!IsLoad && EfficientVectorLoadStore))
3515 append_range(C&: OpsToScalarize, R: operands());
3516
3517 if (!EfficientVectorLoadStore)
3518 ResultTy = Ctx.Types.inferScalarType(V: this);
3519 }
3520
3521 TTI::VectorInstrContext VIC =
3522 IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
3523 return (ScalarCost * VF.getFixedValue()) +
3524 Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC,
3525 AlwaysIncludeReplicatingR: true);
3526 }
3527 case Instruction::SExt:
3528 case Instruction::ZExt:
3529 case Instruction::FPToUI:
3530 case Instruction::FPToSI:
3531 case Instruction::FPExt:
3532 case Instruction::PtrToInt:
3533 case Instruction::PtrToAddr:
3534 case Instruction::IntToPtr:
3535 case Instruction::SIToFP:
3536 case Instruction::UIToFP:
3537 case Instruction::Trunc:
3538 case Instruction::FPTrunc:
3539 case Instruction::AddrSpaceCast: {
3540 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
3541 Ctx) *
3542 (isSingleScalar() ? 1 : VF.getFixedValue());
3543 }
3544 case Instruction::ExtractValue:
3545 case Instruction::InsertValue:
3546 return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind);
3547 }
3548
3549 return Ctx.getLegacyCost(UI, VF);
3550}
3551
3552#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
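/// Example VPlan dump output for replicate recipes (illustrative):
///   REPLICATE ir<%d> = sdiv ir<%a>, ir<%b> (S->V)
///   CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%iv>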
3553void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3554 VPSlotTracker &SlotTracker) const {
3555 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3556
3557 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3558 printAsOperand(O, SlotTracker);
3559 O << " = ";
3560 }
3561 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3562 O << "call";
3563 printFlags(O);
3564 O << "@" << CB->getCalledFunction()->getName() << "(";
3565 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
3566 O, [&O, &SlotTracker](VPValue *Op) {
3567 Op->printAsOperand(O, SlotTracker);
3568 });
3569 O << ")";
3570 } else {
3571 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
3572 printFlags(O);
3573 printOperands(O, SlotTracker);
3574 }
3575
3576 if (shouldPack())
3577 O << " (S->V)";
3578}
3579#endif
3580
3581void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
3582 assert(State.Lane && "Branch on Mask works only on single instance.");
3583
3584 VPValue *BlockInMask = getOperand(N: 0);
3585 Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);
3586
3587 // Replace the temporary unreachable terminator with a new conditional branch,
3588 // whose two destinations will be set later when they are created.
3589 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
3590 assert(isa<UnreachableInst>(CurrentTerminator) &&
3591 "Expected to replace unreachable terminator with conditional branch.");
3592 auto CondBr =
3593 State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
3594 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
3595 CurrentTerminator->eraseFromParent();
3596}
3597
3598InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3599 VPCostContext &Ctx) const {
3600 // The legacy cost model doesn't assign costs to branches for individual
3601 // replicate regions. Match the current behavior in the VPlan cost model for
3602 // now.
3603 return 0;
3604}
3605
3606void VPPredInstPHIRecipe::execute(VPTransformState &State) {
3607 assert(State.Lane && "Predicated instruction PHI works per instance.");
3608 Instruction *ScalarPredInst =
3609 cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
3610 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
3611 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
3612 assert(PredicatingBB && "Predicated block has no single predecessor.");
3613 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
3614 "operand must be VPReplicateRecipe");
3615
3616 // By current pack/unpack logic we need to generate only a single phi node: if
3617 // a vector value for the predicated instruction exists at this point it means
3618 // the instruction has vector users only, and a phi for the vector value is
3619 // needed. In this case the recipe of the predicated instruction is marked to
3620 // also do that packing, thereby "hoisting" the insert-element sequence.
3621 // Otherwise, a phi node for the scalar value is needed.
3622 if (State.hasVectorValue(Def: getOperand(N: 0))) {
3623 auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0)));
3624 assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
3625 "Packed operands must generate an insertelement or insertvalue");
3626
    // If VecI is a struct, it will be a sequence like:
    // %1 = insertvalue %unmodified, %x, 0
    // %2 = insertvalue %1, %y, 1
    // %VecI = insertvalue %2, %z, 2
3631 // To get the unmodified vector we need to look through the chain.
3632 if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType()))
3633 for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
3634 VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0));
3635
3636 PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2);
3637 VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector.
3638 VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element.
3639 if (State.hasVectorValue(Def: this))
3640 State.reset(Def: this, V: VPhi);
3641 else
3642 State.set(Def: this, V: VPhi);
3643 // NOTE: Currently we need to update the value of the operand, so the next
3644 // predicated iteration inserts its generated value in the correct vector.
3645 State.reset(Def: getOperand(N: 0), V: VPhi);
3646 } else {
3647 if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
3648 return;
3649
3650 Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
3651 PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
3652 Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
3653 BB: PredicatingBB);
3654 Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
3655 if (State.hasScalarValue(Def: this, Lane: *State.Lane))
3656 State.reset(Def: this, V: Phi, Lane: *State.Lane);
3657 else
3658 State.set(Def: this, V: Phi, Lane: *State.Lane);
3659 // NOTE: Currently we need to update the value of the operand, so the next
3660 // predicated iteration inserts its generated value in the correct vector.
3661 State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
3662 }
3663}
3664
3665#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3666void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3667 VPSlotTracker &SlotTracker) const {
3668 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
3669 printAsOperand(O, SlotTracker);
3670 O << " = ";
3671 printOperands(O, SlotTracker);
3672}
3673#endif
3674
3675InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
3676 VPCostContext &Ctx) const {
3677 Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
3678 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
3679 ->getAddressSpace();
3680 unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
3681 ? Instruction::Load
3682 : Instruction::Store;
3683
3684 if (!Consecutive) {
3685 // TODO: Using the original IR may not be accurate.
3686 // Currently, ARM will use the underlying IR to calculate gather/scatter
3687 // instruction cost.
3688 assert(!Reverse &&
3689 "Inconsecutive memory access should not have the order.");
3690
3691 const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
3692 Type *PtrTy = Ptr->getType();
3693
    // If the address value is uniform across all lanes, then the address can
    // be calculated with a scalar type and broadcast.
3696 if (!vputils::isSingleScalar(VPV: getAddr()))
3697 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
3698
3699 unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather
3700 : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter
3701 : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather
3702 : Intrinsic::vp_scatter;
3703 return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
3704 CostKind: Ctx.CostKind) +
3705 Ctx.TTI.getMemIntrinsicInstrCost(
3706 MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
3707 &Ingredient),
3708 CostKind: Ctx.CostKind);
3709 }
3710
3711 InstructionCost Cost = 0;
3712 if (IsMasked) {
3713 unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load
3714 : Intrinsic::masked_store;
3715 Cost += Ctx.TTI.getMemIntrinsicInstrCost(
3716 MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
3717 } else {
3718 TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
3719 V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
3720 : getOperand(N: 1));
3721 Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
3722 OpdInfo: OpInfo, I: &Ingredient);
3723 }
3724 return Cost;
3725}
3726
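/// Note on codegen (illustrative, assuming VF = 4, i32 elements, and a masked
/// consecutive access): this emits a single wide load such as
///   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(
///       ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> poison)
/// or a plain `load <4 x i32>, ptr %addr` when no mask is required.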
3727void VPWidenLoadRecipe::execute(VPTransformState &State) {
3728 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3729 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3730 bool CreateGather = !isConsecutive();
3731
3732 auto &Builder = State.Builder;
3733 Value *Mask = nullptr;
3734 if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for explicit (non-null) masks, as the
    // reverse of an absent (all-one) mask is again an all-one mask.
3737 Mask = State.get(Def: VPMask);
3738 if (isReverse())
3739 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3740 }
3741
3742 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
3743 Value *NewLI;
3744 if (CreateGather) {
3745 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
3746 Name: "wide.masked.gather");
3747 } else if (Mask) {
3748 NewLI =
3749 Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
3750 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
3751 } else {
3752 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
3753 }
3754 applyMetadata(I&: *cast<Instruction>(Val: NewLI));
3755 State.set(Def: this, V: NewLI);
3756}
3757
3758#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3759void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3760 VPSlotTracker &SlotTracker) const {
3761 O << Indent << "WIDEN ";
3762 printAsOperand(O, SlotTracker);
3763 O << " = load ";
3764 printOperands(O, SlotTracker);
3765}
3766#endif
3767
/// Use an all-true mask for the reverse rather than the actual mask, as this
/// avoids a dependence without affecting the result.
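/// E.g. (illustrative, assuming a <vscale x 4 x i32> operand):
///   %rev = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(
///       <vscale x 4 x i32> %v, <vscale x 4 x i1> %alltrue, i32 %evl)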
3770static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3771 Value *EVL, const Twine &Name) {
3772 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
3773 Value *AllTrueMask =
3774 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
3775 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
3776 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
3777}
3778
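/// Note on codegen (illustrative, assuming a consecutive <vscale x 4 x i32>
/// access): the emitted load is length-controlled by the EVL, e.g.
///   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
///       ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)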
3779void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3780 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3781 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3782 bool CreateGather = !isConsecutive();
3783
3784 auto &Builder = State.Builder;
3785 CallInst *NewLI;
3786 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3787 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
3788 Value *Mask = nullptr;
3789 if (VPValue *VPMask = getMask()) {
3790 Mask = State.get(Def: VPMask);
3791 if (isReverse())
3792 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3793 } else {
3794 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3795 }
3796
3797 if (CreateGather) {
3798 NewLI =
3799 Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
3800 FMFSource: nullptr, Name: "wide.masked.gather");
3801 } else {
3802 NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load,
3803 Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
3804 }
3805 NewLI->addParamAttr(
3806 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
3807 applyMetadata(I&: *NewLI);
3808 Instruction *Res = NewLI;
3809 State.set(Def: this, V: Res);
3810}
3811
3812InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
3813 VPCostContext &Ctx) const {
3814 if (!Consecutive || IsMasked)
3815 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3816
  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use the EVL to replace the tail mask, while
  // the legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
3822 Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
3823 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
3824 ->getAddressSpace();
3825 return Ctx.TTI.getMemIntrinsicInstrCost(
3826 MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
3827 CostKind: Ctx.CostKind);
3828}
3829
3830#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3831void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3832 VPSlotTracker &SlotTracker) const {
3833 O << Indent << "WIDEN ";
3834 printAsOperand(O, SlotTracker);
3835 O << " = vp.load ";
3836 printOperands(O, SlotTracker);
3837}
3838#endif
3839
3840void VPWidenStoreRecipe::execute(VPTransformState &State) {
3841 VPValue *StoredVPValue = getStoredValue();
3842 bool CreateScatter = !isConsecutive();
3843
3844 auto &Builder = State.Builder;
3845
3846 Value *Mask = nullptr;
3847 if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for explicit (non-null) masks, as the
    // reverse of an absent (all-one) mask is again an all-one mask.
3850 Mask = State.get(Def: VPMask);
3851 if (isReverse())
3852 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3853 }
3854
3855 Value *StoredVal = State.get(Def: StoredVPValue);
3856 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
3857 Instruction *NewSI = nullptr;
3858 if (CreateScatter)
3859 NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
3860 else if (Mask)
3861 NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
3862 else
3863 NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
3864 applyMetadata(I&: *NewSI);
3865}
3866
3867#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3868void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3869 VPSlotTracker &SlotTracker) const {
3870 O << Indent << "WIDEN store ";
3871 printOperands(O, SlotTracker);
3872}
3873#endif
3874
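/// Note on codegen (illustrative, assuming a consecutive <vscale x 4 x i32>
/// access): the emitted store is length-controlled by the EVL, e.g.
///   call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> %val, ptr %addr,
///                                       <vscale x 4 x i1> %mask, i32 %evl)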
3875void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
3876 VPValue *StoredValue = getStoredValue();
3877 bool CreateScatter = !isConsecutive();
3878
3879 auto &Builder = State.Builder;
3880
3881 CallInst *NewSI = nullptr;
3882 Value *StoredVal = State.get(Def: StoredValue);
3883 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3884 Value *Mask = nullptr;
3885 if (VPValue *VPMask = getMask()) {
3886 Mask = State.get(Def: VPMask);
3887 if (isReverse())
3888 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3889 } else {
3890 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3891 }
3892 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
3893 if (CreateScatter) {
3894 NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
3895 ID: Intrinsic::vp_scatter,
3896 Args: {StoredVal, Addr, Mask, EVL});
3897 } else {
3898 NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
3899 ID: Intrinsic::vp_store,
3900 Args: {StoredVal, Addr, Mask, EVL});
3901 }
3902 NewSI->addParamAttr(
3903 ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
3904 applyMetadata(I&: *NewSI);
3905}
3906
3907InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
3908 VPCostContext &Ctx) const {
3909 if (!Consecutive || IsMasked)
3910 return VPWidenMemoryRecipe::computeCost(VF, Ctx);
3911
  // We need to use getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes use the EVL to replace the tail mask, while
  // the legacy model always accounts for the cost of the mask.
  // TODO: Use getMemoryOpCost() instead of getMemIntrinsicInstrCost() once we
  // no longer need to compare against the legacy cost model.
3917 Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
3918 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
3919 ->getAddressSpace();
3920 return Ctx.TTI.getMemIntrinsicInstrCost(
3921 MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
3922 CostKind: Ctx.CostKind);
3923}
3924
3925#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3926void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3927 VPSlotTracker &SlotTracker) const {
3928 O << Indent << "WIDEN vp.store ";
3929 printOperands(O, SlotTracker);
3930}
3931#endif
3932
3933static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
3934 VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
3936 auto VF = DstVTy->getElementCount();
3937 auto *SrcVecTy = cast<VectorType>(Val: V->getType());
3938 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
3939 Type *SrcElemTy = SrcVecTy->getElementType();
3940 Type *DstElemTy = DstVTy->getElementType();
3941 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3942 "Vector elements must have same size");
3943
3944 // Do a direct cast if element types are castable.
3945 if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
3946 return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
3947 }
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
3952 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3953 "Only one type should be a pointer type");
3954 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3955 "Only one type should be a floating point type");
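  // E.g. (illustrative, assuming 64-bit pointers): casting <4 x double> to
  // <4 x ptr> is done as a bitcast to <4 x i64> followed by an inttoptr.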
3956 Type *IntTy =
3957 IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
3958 auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
3959 Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
3960 return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
3961}
3962
3963/// Return a vector containing interleaved elements from multiple
3964/// smaller input vectors.
3965static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
3966 const Twine &Name) {
3967 unsigned Factor = Vals.size();
3968 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
3969
3970 VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
3971#ifndef NDEBUG
3972 for (Value *Val : Vals)
3973 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
3974#endif
3975
3976 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
3977 // must use intrinsics to interleave.
3978 if (VecTy->isScalableTy()) {
3979 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
3980 return Builder.CreateVectorInterleave(Ops: Vals, Name);
3981 }
3982
3983 // Fixed length. Start by concatenating all vectors into a wide vector.
3984 Value *WideVec = concatenateVectors(Builder, Vecs: Vals);
3985
3986 // Interleave the elements into the wide vector.
3987 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
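  // E.g. (illustrative) for Factor = 2 and NumElts = 4 the interleave mask is
  // <0, 4, 1, 5, 2, 6, 3, 7>, alternating elements of the two concatenated
  // inputs.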
3988 return Builder.CreateShuffleVector(
3989 V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
3990}
3991
3992// Try to vectorize the interleave group that \p Instr belongs to.
3993//
3994// E.g. Translate following interleaved load group (factor = 3):
3995// for (i = 0; i < N; i+=3) {
3996// R = Pic[i]; // Member of index 0
3997// G = Pic[i+1]; // Member of index 1
3998// B = Pic[i+2]; // Member of index 2
3999// ... // do something to R, G, B
4000// }
4001// To:
4002// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4003// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4004// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4005// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4006//
4007// Or translate following interleaved store group (factor = 3):
4008// for (i = 0; i < N; i+=3) {
4009// ... do something to R, G, B
4010// Pic[i] = R; // Member of index 0
4011// Pic[i+1] = G; // Member of index 1
4012// Pic[i+2] = B; // Member of index 2
4013// }
4014// To:
4015// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4016// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4017// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4018// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4019// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
4020void VPInterleaveRecipe::execute(VPTransformState &State) {
4021 assert(!State.Lane && "Interleave group being replicated.");
4022 assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
4023 "Masking gaps for scalable vectors is not yet supported.");
4024 const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
4025 Instruction *Instr = Group->getInsertPos();
4026
4027 // Prepare for the vector type of the interleaved load/store.
4028 Type *ScalarTy = getLoadStoreType(I: Instr);
4029 unsigned InterleaveFactor = Group->getFactor();
4030 auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);
4031
4032 VPValue *BlockInMask = getMask();
4033 VPValue *Addr = getAddr();
4034 Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
4035
4036 auto CreateGroupMask = [&BlockInMask, &State,
4037 &InterleaveFactor](Value *MaskForGaps) -> Value * {
4038 if (State.VF.isScalable()) {
4039 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
4040 assert(InterleaveFactor <= 8 &&
4041 "Unsupported deinterleave factor for scalable vectors");
4042 auto *ResBlockInMask = State.get(Def: BlockInMask);
4043 SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
4044 return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
4045 }
4046
4047 if (!BlockInMask)
4048 return MaskForGaps;
4049
4050 Value *ResBlockInMask = State.get(Def: BlockInMask);
4051 Value *ShuffledMask = State.Builder.CreateShuffleVector(
4052 V: ResBlockInMask,
4053 Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
4054 Name: "interleaved.mask");
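    // E.g. (illustrative) for InterleaveFactor = 3 and VF = 4 the replicated
    // mask is <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>, so each lane's mask bit is
    // applied to all members of that lane's tuple.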
4055 return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
4056 LHS: ShuffledMask, RHS: MaskForGaps)
4057 : ShuffledMask;
4058 };
4059
4060 const DataLayout &DL = Instr->getDataLayout();
4061 // Vectorize the interleaved load group.
4062 if (isa<LoadInst>(Val: Instr)) {
4063 Value *MaskForGaps = nullptr;
4064 if (needsMaskForGaps()) {
4065 MaskForGaps =
4066 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
4067 assert(MaskForGaps && "Mask for Gaps is required but it is null");
4068 }
4069
4070 Instruction *NewLoad;
4071 if (BlockInMask || MaskForGaps) {
4072 Value *GroupMask = CreateGroupMask(MaskForGaps);
4073 Value *PoisonVec = PoisonValue::get(T: VecTy);
4074 NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
4075 Alignment: Group->getAlign(), Mask: GroupMask,
4076 PassThru: PoisonVec, Name: "wide.masked.vec");
4077 } else
4078 NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
4079 Align: Group->getAlign(), Name: "wide.vec");
4080 applyMetadata(I&: *NewLoad);
4081 // TODO: Also manage existing metadata using VPIRMetadata.
4082 Group->addMetadata(NewInst: NewLoad);
4083
4084 ArrayRef<VPRecipeValue *> VPDefs = definedValues();
4085 if (VecTy->isScalableTy()) {
4086 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
4087 // so must use intrinsics to deinterleave.
4088 assert(InterleaveFactor <= 8 &&
4089 "Unsupported deinterleave factor for scalable vectors");
4090 NewLoad = State.Builder.CreateIntrinsic(
4091 ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
4092 Types: NewLoad->getType(), Args: NewLoad,
4093 /*FMFSource=*/nullptr, Name: "strided.vec");
4094 }
4095
4096 auto CreateStridedVector = [&InterleaveFactor, &State,
4097 &NewLoad](unsigned Index) -> Value * {
4098 assert(Index < InterleaveFactor && "Illegal group index");
4099 if (State.VF.isScalable())
4100 return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index);
4101
4102 // For fixed length VF, use shuffle to extract the sub-vectors from the
4103 // wide load.
4104 auto StrideMask =
4105 createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
4106 return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask,
4107 Name: "strided.vec");
4108 };
4109
4110 for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
4111 Instruction *Member = Group->getMember(Index: I);
4112
4113 // Skip the gaps in the group.
4114 if (!Member)
4115 continue;
4116
4117 Value *StridedVec = CreateStridedVector(I);
4118
      // If this member has a different type, cast the result to that type.
4120 if (Member->getType() != ScalarTy) {
4121 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
4122 StridedVec =
4123 createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
4124 }
4125
4126 if (Group->isReverse())
4127 StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
4128
4129 State.set(Def: VPDefs[J], V: StridedVec);
4130 ++J;
4131 }
4132 return;
4133 }
4134
  // The sub-vector type for the current instruction.
4136 auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
4137
4138 // Vectorize the interleaved store group.
4139 Value *MaskForGaps =
4140 createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
4141 assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
4142 "Mismatch between NeedsMaskForGaps and MaskForGaps");
4143 ArrayRef<VPValue *> StoredValues = getStoredValues();
4144 // Collect the stored vector from each member.
4145 SmallVector<Value *, 4> StoredVecs;
4146 unsigned StoredIdx = 0;
4147 for (unsigned i = 0; i < InterleaveFactor; i++) {
4148 assert((Group->getMember(i) || MaskForGaps) &&
4149 "Fail to get a member from an interleaved store group");
4150 Instruction *Member = Group->getMember(Index: i);
4151
4152 // Skip the gaps in the group.
4153 if (!Member) {
4154 Value *Undef = PoisonValue::get(T: SubVT);
4155 StoredVecs.push_back(Elt: Undef);
4156 continue;
4157 }
4158
4159 Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
4160 ++StoredIdx;
4161
4162 if (Group->isReverse())
4163 StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");
4164
    // If this member has a different type, cast it to a unified type.
4166
4167 if (StoredVec->getType() != SubVT)
4168 StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);
4169
4170 StoredVecs.push_back(Elt: StoredVec);
4171 }
4172
4173 // Interleave all the smaller vectors into one wider vector.
4174 Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
4175 Instruction *NewStoreInstr;
4176 if (BlockInMask || MaskForGaps) {
4177 Value *GroupMask = CreateGroupMask(MaskForGaps);
4178 NewStoreInstr = State.Builder.CreateMaskedStore(
4179 Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
4180 } else
4181 NewStoreInstr =
4182 State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());
4183
4184 applyMetadata(I&: *NewStoreInstr);
4185 // TODO: Also manage existing metadata using VPIRMetadata.
4186 Group->addMetadata(NewInst: NewStoreInstr);
4187}
4188
4189#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4190void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4191 VPSlotTracker &SlotTracker) const {
4192 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4193 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4194 IG->getInsertPos()->printAsOperand(O, false);
4195 O << ", ";
4196 getAddr()->printAsOperand(O, SlotTracker);
4197 VPValue *Mask = getMask();
4198 if (Mask) {
4199 O << ", ";
4200 Mask->printAsOperand(O, SlotTracker);
4201 }
4202
4203 unsigned OpIdx = 0;
4204 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4205 if (!IG->getMember(i))
4206 continue;
4207 if (getNumStoreOperands() > 0) {
4208 O << "\n" << Indent << " store ";
4209 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
4210 O << " to index " << i;
4211 } else {
4212 O << "\n" << Indent << " ";
4213 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4214 O << " = load from index " << i;
4215 }
4216 ++OpIdx;
4217 }
4218}
4219#endif
4220
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert(State.VF.isScalable() &&
         "Only support scalable VF for EVL tail-folding.");
  assert(!needsMaskForGaps() &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(InterleaveFactor <= 8 &&
         "Unsupported deinterleave/interleave factor for scalable vectors");
  ElementCount WideVF = State.VF * InterleaveFactor;
  auto *VecTy = VectorType::get(ScalarTy, WideVF);

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Addr, VPLane(0));
  Value *EVL = State.get(getEVL(), VPLane(0));
  Value *InterleaveEVL = State.Builder.CreateMul(
      EVL, ConstantInt::get(EVL->getType(), InterleaveFactor),
      "interleave.evl", /*NUW=*/true, /*NSW=*/true);
  LLVMContext &Ctx = State.Builder.getContext();

  Value *GroupMask = nullptr;
  if (VPValue *BlockInMask = getMask()) {
    SmallVector<Value *> Ops(InterleaveFactor, State.get(BlockInMask));
    GroupMask = interleaveVectors(State.Builder, Ops, "interleaved.mask");
  } else {
    GroupMask =
        State.Builder.CreateVectorSplat(WideVF, State.Builder.getTrue());
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    CallInst *NewLoad = State.Builder.CreateIntrinsic(
        VecTy, Intrinsic::vp_load, {ResAddr, GroupMask, InterleaveEVL}, nullptr,
        "wide.vp.load");
    NewLoad->addParamAttr(0,
                          Attribute::getWithAlignment(Ctx, Group->getAlign()));

    applyMetadata(*NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewLoad);

    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
    // so must use intrinsics to deinterleave.
    NewLoad = State.Builder.CreateIntrinsic(
        Intrinsic::getDeinterleaveIntrinsicID(InterleaveFactor),
        NewLoad->getType(), NewLoad,
        /*FMFSource=*/nullptr, "strided.vec");

    const DataLayout &DL = Instr->getDataLayout();
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);
      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = State.Builder.CreateExtractValue(NewLoad, I);
      // If this member has a different type, cast the result to that type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
        StridedVec =
            createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
      }

      State.set(getVPValue(J), StridedVec);
      ++J;
    }
    return;
  } // End of interleaved load handling.

  // The sub-vector type for the current instruction.
  auto *SubVT = VectorType::get(ScalarTy, State.VF);
  // Vectorize the interleaved store group.
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  const DataLayout &DL = Instr->getDataLayout();
  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
    Instruction *Member = Group->getMember(I);
    // Skip the gaps in the group.
    if (!Member) {
      StoredVecs.push_back(PoisonValue::get(SubVT));
      continue;
    }

    Value *StoredVec = State.get(StoredValues[StoredIdx]);
    // If this member has a different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(State.Builder, StoredVec, SubVT, DL);

    StoredVecs.push_back(StoredVec);
    ++StoredIdx;
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(State.Builder, StoredVecs, "interleaved.vec");
  CallInst *NewStore =
      State.Builder.CreateIntrinsic(Type::getVoidTy(Ctx), Intrinsic::vp_store,
                                    {IVec, ResAddr, GroupMask, InterleaveEVL});
  NewStore->addParamAttr(1,
                         Attribute::getWithAlignment(Ctx, Group->getAlign()));

  applyMetadata(*NewStore);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewStore);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (VPValue *Mask = getMask()) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      O << "\n" << Indent << " vp.store ";
      getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = vp.load from index " << i;
    }
    ++OpIdx;
  }
}
#endif

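// Cost the whole interleave group as a single wide memory operation, plus one
// reverse shuffle per member when the group accesses memory in reverse order.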
InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  Instruction *InsertPos = getInsertPos();
  // Find the VPValue index of the interleave group. We need to skip gaps.
  unsigned InsertPosIdx = 0;
  for (unsigned Idx = 0; Idx < IG->getFactor(); ++Idx)
    if (auto *Member = IG->getMember(Idx)) {
      if (Member == InsertPos)
        break;
      InsertPosIdx++;
    }
  Type *ValTy = Ctx.Types.inferScalarType(
      getNumDefinedValues() > 0 ? getVPValue(InsertPosIdx)
                                : getStoredValues()[InsertPosIdx]);
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                    ->getAddressSpace();

  unsigned InterleaveFactor = IG->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (IG->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group.
  InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
      InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
      IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);

  if (!IG->isReverse())
    return Cost;

  return Cost + IG->getNumMembers() *
                    Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                           VectorTy, VectorTy, {}, Ctx.CostKind,
                                           0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

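// Return true if only scalar values of this pointer induction will be
// generated; for scalable vectors this additionally requires that only the
// first lane is used.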
bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return vputils::onlyScalarValuesUsed(this) &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
         "unexpected number of operands");
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 5) {
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(4)->printAsOperand(O, SlotTracker);
  }
}

void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

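// Materialize the widened canonical IV: splat the scalar canonical IV and add
// a per-lane step vector, offset by the unroll part this recipe belongs to.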
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
  Value *VStep = createStepForVF(Builder, STy, VF, getUnrollPart(*this));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(VF, VStep);
    VStep =
        Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
  State.set(this, CanonicalVectorIV);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

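// Create the header PHI for a first-order recurrence. For vector VFs, the
// start value is inserted into the last lane of a poison vector in the
// preheader block before being fed into the PHI.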
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(VecTy, 2, "vector.recur");
  Phi->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(VectorInit, VectorPH);
  State.set(this, Phi);
}

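// The recurrence PHI itself is free once vectorized; only the scalar case
// pays the cost of a PHI instruction.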
InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  if (VF.isScalar())
    return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);

  return 0;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

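// Create the header PHI for a reduction. The PHI is scalar for in-loop
// reductions and for scalar VFs, and vector otherwise; the start value comes
// in from the vector preheader.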
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop-invariant value.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll
  // use this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(StartVPV, ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(VecTy, 2, "vec.phi");
  Phi->insertBefore(HeaderBB->getFirstInsertionPt());
  State.set(this, Phi, isInLoop());

  Phi->addIncoming(StartV, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
#endif

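// Create a widened PHI with the type of its first operand; incoming values
// are expected to be filled in later, once all predecessors have been
// generated.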
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  Value *Op0 = State.get(getOperand(0));
  Type *VecTy = Op0->getType();
  Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
  State.set(this, VecPhi);
}

InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
#endif

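// Create the header PHI for the active lane mask, seeded with the mask value
// computed in the vector preheader; the backedge incoming value is added
// later.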
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
  Value *StartMask = State.get(getOperand(0));
  PHINode *Phi =
      State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
  Phi->addIncoming(StartMask, VectorPH);
  State.set(this, Phi);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
