1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/IVDescriptors.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
34#include "llvm/Support/Casting.h"
35#include "llvm/Support/CommandLine.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Transforms/Utils/BasicBlockUtils.h"
39#include "llvm/Transforms/Utils/LoopUtils.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
45using VectorParts = SmallVector<Value *, 2>;
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
/// Returns true if this recipe may write to memory. Decided per concrete
/// recipe kind; unhandled recipe IDs conservatively return true.
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // An expression bundles multiple recipes; delegate to their combined
    // memory behavior.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(Val: this);
    // Loads read from memory but don't write to memory.
    if (VPI->getOpcode() == Instruction::Load)
      return false;
    return VPI->opcodeMayReadOrWriteFromMemory();
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    // Interleave groups write iff they contain at least one store member.
    return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes execute the underlying scalar instruction per lane;
    // ask the IR instruction directly.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
  case VPActiveLaneMaskPHISC:
  case VPCanonicalIVPHISC:
  case VPCurrentIterationPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipe kinds never write; in debug builds cross-check against the
    // underlying IR instruction, if one is attached.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
110
/// Returns true if this recipe may read from memory. Mirrors
/// mayWriteToMemory above; unhandled recipe IDs conservatively return true.
bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes execute the underlying scalar instruction per lane;
    // ask the IR instruction directly.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    // Stores only write.
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipe kinds never read; in debug builds cross-check against the
    // underlying IR instruction, if one is attached.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // FIXME: Return false if the recipe represents an interleaved store.
    return true;
  }
}
161
162bool VPRecipeBase::mayHaveSideEffects() const {
163 switch (getVPRecipeID()) {
164 case VPExpressionSC:
165 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
166 case VPActiveLaneMaskPHISC:
167 case VPDerivedIVSC:
168 case VPFirstOrderRecurrencePHISC:
169 case VPReductionPHISC:
170 case VPPredInstPHISC:
171 case VPVectorEndPointerSC:
172 return false;
173 case VPInstructionSC: {
174 auto *VPI = cast<VPInstruction>(Val: this);
175 return mayWriteToMemory() ||
176 VPI->getOpcode() == VPInstruction::BranchOnCount ||
177 VPI->getOpcode() == VPInstruction::BranchOnCond ||
178 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
179 }
180 case VPWidenCallSC: {
181 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
182 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
183 }
184 case VPWidenIntrinsicSC:
185 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
186 case VPBlendSC:
187 case VPReductionEVLSC:
188 case VPReductionSC:
189 case VPScalarIVStepsSC:
190 case VPVectorPointerSC:
191 case VPWidenCanonicalIVSC:
192 case VPWidenCastSC:
193 case VPWidenGEPSC:
194 case VPWidenIntOrFpInductionSC:
195 case VPWidenPHISC:
196 case VPWidenPointerInductionSC:
197 case VPWidenSC: {
198 const Instruction *I =
199 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
200 (void)I;
201 assert((!I || !I->mayHaveSideEffects()) &&
202 "underlying instruction has side-effects");
203 return false;
204 }
205 case VPInterleaveEVLSC:
206 case VPInterleaveSC:
207 return mayWriteToMemory();
208 case VPWidenLoadEVLSC:
209 case VPWidenLoadSC:
210 case VPWidenStoreEVLSC:
211 case VPWidenStoreSC:
212 assert(
213 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
214 mayWriteToMemory() &&
215 "mayHaveSideffects result for ingredient differs from this "
216 "implementation");
217 return mayWriteToMemory();
218 case VPReplicateSC: {
219 auto *R = cast<VPReplicateRecipe>(Val: this);
220 return R->getUnderlyingInstr()->mayHaveSideEffects();
221 }
222 default:
223 return true;
224 }
225}
226
227void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
228 assert(!Parent && "Recipe already in some VPBasicBlock");
229 assert(InsertPos->getParent() &&
230 "Insertion position not in any VPBasicBlock");
231 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
232}
233
234void VPRecipeBase::insertBefore(VPBasicBlock &BB,
235 iplist<VPRecipeBase>::iterator I) {
236 assert(!Parent && "Recipe already in some VPBasicBlock");
237 assert(I == BB.end() || I->getParent() == &BB);
238 BB.insert(Recipe: this, InsertPt: I);
239}
240
241void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
242 assert(!Parent && "Recipe already in some VPBasicBlock");
243 assert(InsertPos->getParent() &&
244 "Insertion position not in any VPBasicBlock");
245 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
246}
247
248void VPRecipeBase::removeFromParent() {
249 assert(getParent() && "Recipe not in any VPBasicBlock");
250 getParent()->getRecipeList().remove(IT: getIterator());
251 Parent = nullptr;
252}
253
254iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
255 assert(getParent() && "Recipe not in any VPBasicBlock");
256 return getParent()->getRecipeList().erase(where: getIterator());
257}
258
259void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
260 removeFromParent();
261 insertAfter(InsertPos);
262}
263
264void VPRecipeBase::moveBefore(VPBasicBlock &BB,
265 iplist<VPRecipeBase>::iterator I) {
266 removeFromParent();
267 insertBefore(BB, I);
268}
269
/// Compute the cost of this recipe for vectorization factor \p VF, honoring
/// the cost context's skip set and the -force-target-instruction-cost option.
InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
  // Get the underlying instruction for the recipe, if there is one. It is
  // used to
  //  * decide if cost computation should be skipped for this recipe,
  //  * apply forced target instruction cost.
  Instruction *UI = nullptr;
  if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
    UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
  else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
    UI = IG->getInsertPos();
  else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
    UI = &WidenMem->getIngredient();

  InstructionCost RecipeCost;
  if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
    // The context already accounted for this instruction elsewhere.
    RecipeCost = 0;
  } else {
    RecipeCost = computeCost(VF, Ctx);
    // When a forced cost is requested on the command line, override valid
    // costs: recipes with an underlying instruction get the forced value,
    // synthetic recipes are treated as free.
    if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
        RecipeCost.isValid()) {
      if (UI)
        RecipeCost = InstructionCost(ForceTargetInstructionCost);
      else
        RecipeCost = InstructionCost(0);
    }
  }

  LLVM_DEBUG({
    dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
    dump();
  });
  return RecipeCost;
}
303
304InstructionCost VPRecipeBase::computeCost(ElementCount VF,
305 VPCostContext &Ctx) const {
306 llvm_unreachable("subclasses should implement computeCost");
307}
308
309bool VPRecipeBase::isPhi() const {
310 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
311 isa<VPPhi, VPIRPhi>(Val: this);
312}
313
314bool VPRecipeBase::isScalarCast() const {
315 auto *VPI = dyn_cast<VPInstruction>(Val: this);
316 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
317}
318
/// Intersect this recipe's IR flags with \p Other's: keep only flags set in
/// both. Both operands must have the same OperationType; flags that cannot be
/// merged (predicates, reduction kind) are asserted to be equal instead.
void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
  assert(OpType == Other.OpType && "OpType must match");
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
    break;
  case OperationType::Trunc:
    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
    break;
  case OperationType::DisjointOp:
    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
    break;
  case OperationType::PossiblyExactOp:
    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
    break;
  case OperationType::GEPOp:
    GEPFlags &= Other.GEPFlags;
    break;
  case OperationType::FPMathOp:
  case OperationType::FCmp:
    // FCmp predicates cannot be merged; only the value-affecting FMF bits
    // NoNaNs/NoInfs are intersected.
    assert((OpType != OperationType::FCmp ||
            FCmpFlags.Pred == Other.FCmpFlags.Pred) &&
           "Cannot drop CmpPredicate");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::NonNegOp:
    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
    break;
  case OperationType::Cmp:
    assert(CmpPredicate == Other.CmpPredicate && "Cannot drop CmpPredicate");
    break;
  case OperationType::ReductionOp:
    // Reduction kind/ordering/in-loop-ness are structural and must agree;
    // only the FMF bits are intersected.
    assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
           "Cannot change RecurKind");
    assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
           "Cannot change IsOrdered");
    assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
           "Cannot change IsInLoop");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::Other:
    assert(AllFlags == Other.AllFlags && "Cannot drop other flags");
    break;
  }
}
368
369FastMathFlags VPIRFlags::getFastMathFlags() const {
370 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
371 OpType == OperationType::ReductionOp ||
372 OpType == OperationType::Other) &&
373 "recipe doesn't have fast math flags");
374 if (OpType == OperationType::Other)
375 return FastMathFlags();
376 const FastMathFlagsTy &F = getFMFsRef();
377 FastMathFlags Res;
378 Res.setAllowReassoc(F.AllowReassoc);
379 Res.setNoNaNs(F.NoNaNs);
380 Res.setNoInfs(F.NoInfs);
381 Res.setNoSignedZeros(F.NoSignedZeros);
382 Res.setAllowReciprocal(F.AllowReciprocal);
383 Res.setAllowContract(F.AllowContract);
384 Res.setApproxFunc(F.ApproxFunc);
385 return Res;
386}
387
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Forward dump to the common VPRecipeBase implementation.
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }

/// Print this recipe: the recipe-specific text, followed by its debug
/// location (if any) and attached IR metadata (if the recipe carries any).
void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
                         VPSlotTracker &SlotTracker) const {
  printRecipe(O, Indent, SlotTracker);
  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }

  if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
    Metadata->print(O, SlotTracker);
}
#endif
403
404template <unsigned PartOpIdx>
405VPValue *
406VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
407 if (U.getNumOperands() == PartOpIdx + 1)
408 return U.getOperand(N: PartOpIdx);
409 return nullptr;
410}
411
412template <unsigned PartOpIdx>
413unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
414 if (auto *UnrollPartOp = getUnrollPartOperand(U))
415 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
416 return 0;
417}
418
namespace llvm {
// Explicit instantiations for the unroll-part operand positions used by the
// recipes in this file.
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
}
424
/// Construct a VPInstruction with the given opcode, operands, IR flags,
/// metadata, debug location and name. In debug builds, verifies that the
/// flags and operand count are consistent with the opcode.
VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, const VPIRMetadata &MD,
                             DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  // -1u means the operand count cannot be derived from the opcode alone.
  assert((getNumOperandsForOpcode() == -1u ||
          getNumOperandsForOpcode() == getNumOperands()) &&
         "number of operands does not match opcode");
}
438
/// Return the number of operands determined by this VPInstruction's opcode,
/// or -1u when the opcode permits a variable number of operands.
unsigned VPInstruction::getNumOperandsForOpcode() const {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  case VPInstruction::StepVector:
  case VPInstruction::VScale:
    return 0;
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::Not:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::Reverse:
  case VPInstruction::Unpack:
    return 1;
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::ExtractElement:
  case Instruction::Store:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::CalculateTripCountMinusVF:
    return 2;
  case Instruction::Select:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::ExtractLastActive:
    return 3;
  case Instruction::Call: {
    // For unmasked calls, the last argument will be the called function. Use
    // that to compute the number of operands without the mask.
    VPValue *LastOp = getOperand(N: getNumOperands() - 1);
    if (isa<VPIRValue>(Val: LastOp) && isa<Function>(Val: LastOp->getLiveInIRValue()))
      return getNumOperands();
    // Otherwise the trailing operand is a mask; exclude it from the count.
    return getNumOperands() - 1;
  }
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::SLPLoad:
  case VPInstruction::SLPStore:
  case VPInstruction::ExtractLane:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
512
513bool VPInstruction::doesGeneratePerAllLanes() const {
514 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
515}
516
/// Returns true if this VPInstruction can produce a scalar value for just the
/// first lane, either because the opcode supports it or because the recipe is
/// inherently single-scalar / vector-to-scalar.
bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return true;
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
  case VPInstruction::Not:
    return true;
  default:
    // All remaining opcodes require full vector values.
    return false;
  }
}
541
542Value *VPInstruction::generate(VPTransformState &State) {
543 IRBuilderBase &Builder = State.Builder;
544
545 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
546 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
547 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
548 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
549 auto *Res =
550 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
551 if (auto *I = dyn_cast<Instruction>(Val: Res))
552 applyFlags(I&: *I);
553 return Res;
554 }
555
556 switch (getOpcode()) {
557 case VPInstruction::Not: {
558 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
559 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
560 return Builder.CreateNot(V: A, Name);
561 }
562 case Instruction::ExtractElement: {
563 assert(State.VF.isVector() && "Only extract elements from vectors");
564 if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1)))
565 return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue()));
566 Value *Vec = State.get(Def: getOperand(N: 0));
567 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
568 return Builder.CreateExtractElement(Vec, Idx, Name);
569 }
570 case Instruction::Freeze: {
571 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
572 return Builder.CreateFreeze(V: Op, Name);
573 }
574 case Instruction::FCmp:
575 case Instruction::ICmp: {
576 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
577 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
578 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
579 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
580 }
581 case Instruction::PHI: {
582 llvm_unreachable("should be handled by VPPhi::execute");
583 }
584 case Instruction::Select: {
585 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
586 Value *Cond =
587 State.get(Def: getOperand(N: 0),
588 IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0)));
589 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
590 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
591 return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlags(), Name);
592 }
593 case VPInstruction::ActiveLaneMask: {
594 // Get first lane of vector induction variable.
595 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
596 // Get the original loop tripcount.
597 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
598
599 // If this part of the active lane mask is scalar, generate the CMP directly
600 // to avoid unnecessary extracts.
601 if (State.VF.isScalar())
602 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
603 Name);
604
605 ElementCount EC = State.VF.multiplyCoefficientBy(
606 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
607 auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC);
608 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
609 Types: {PredTy, ScalarTC->getType()},
610 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
611 }
612 case VPInstruction::FirstOrderRecurrenceSplice: {
613 // Generate code to combine the previous and current values in vector v3.
614 //
615 // vector.ph:
616 // v_init = vector(..., ..., ..., a[-1])
617 // br vector.body
618 //
619 // vector.body
620 // i = phi [0, vector.ph], [i+4, vector.body]
621 // v1 = phi [v_init, vector.ph], [v2, vector.body]
622 // v2 = a[i, i+1, i+2, i+3];
623 // v3 = vector(v1(3), v2(0, 1, 2))
624
625 auto *V1 = State.get(Def: getOperand(N: 0));
626 if (!V1->getType()->isVectorTy())
627 return V1;
628 Value *V2 = State.get(Def: getOperand(N: 1));
629 return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name);
630 }
631 case VPInstruction::CalculateTripCountMinusVF: {
632 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
633 Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
634 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF);
635 Value *Cmp =
636 Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF);
637 Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType());
638 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
639 }
640 case VPInstruction::ExplicitVectorLength: {
641 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
642 // be outside of the main loop.
643 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
644 // Compute EVL
645 assert(AVL->getType()->isIntegerTy() &&
646 "Requested vector length should be an integer.");
647
648 assert(State.VF.isScalable() && "Expected scalable vector factor.");
649 Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue());
650
651 Value *EVL = Builder.CreateIntrinsic(
652 RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
653 Args: {AVL, VFArg, Builder.getTrue()});
654 return EVL;
655 }
656 case VPInstruction::CanonicalIVIncrementForPart: {
657 auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
658 auto *VFxPart = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
659 // The canonical IV is incremented by the vectorization factor (num of
660 // SIMD elements) times the unroll part.
661 return Builder.CreateAdd(LHS: IV, RHS: VFxPart, Name, HasNUW: hasNoUnsignedWrap(),
662 HasNSW: hasNoSignedWrap());
663 }
664 case VPInstruction::BranchOnCond: {
665 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
666 // Replace the temporary unreachable terminator with a new conditional
667 // branch, hooking it up to backward destination for latch blocks now, and
668 // to forward destination(s) later when they are created.
669 // Second successor may be backwards - iff it is already in VPBB2IRBB.
670 VPBasicBlock *SecondVPSucc =
671 cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]);
672 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
673 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
674 auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
675 // First successor is always forward, reset it to nullptr.
676 Br->setSuccessor(idx: 0, NewSucc: nullptr);
677 IRBB->getTerminator()->eraseFromParent();
678 applyMetadata(I&: *Br);
679 return Br;
680 }
681 case VPInstruction::Broadcast: {
682 return Builder.CreateVectorSplat(
683 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
684 }
685 case VPInstruction::BuildStructVector: {
686 // For struct types, we need to build a new 'wide' struct type, where each
687 // element is widened, i.e., we create a struct of vectors.
688 auto *StructTy =
689 cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)));
690 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
691 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
692 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
693 FieldIndex++) {
694 Value *ScalarValue =
695 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
696 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
697 VectorValue =
698 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
699 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
700 }
701 }
702 return Res;
703 }
704 case VPInstruction::BuildVector: {
705 auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
706 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
707 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
708 for (const auto &[Idx, Op] : enumerate(First: operands()))
709 Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
710 Idx: Builder.getInt32(C: Idx));
711 return Res;
712 }
713 case VPInstruction::ReductionStartVector: {
714 if (State.VF.isScalar())
715 return State.get(Def: getOperand(N: 0), IsScalar: true);
716 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
717 Builder.setFastMathFlags(getFastMathFlags());
718 // If this start vector is scaled then it should produce a vector with fewer
719 // elements than the VF.
720 ElementCount VF = State.VF.divideCoefficientBy(
721 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
722 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
723 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
724 Idx: Builder.getInt32(C: 0));
725 }
726 case VPInstruction::ComputeAnyOfResult: {
727 Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
728 Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
729 Value *ReducedResult = State.get(Def: getOperand(N: 2));
730 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
731 ReducedResult =
732 Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)),
733 RHS: ReducedResult, Name: "bin.rdx");
734 // If any predicate is true it means that we want to select the new value.
735 if (ReducedResult->getType()->isVectorTy())
736 ReducedResult = Builder.CreateOrReduce(Src: ReducedResult);
737 // The compares in the loop may yield poison, which propagates through the
738 // bitwise ORs. Freeze it here before the condition is used.
739 ReducedResult = Builder.CreateFreeze(V: ReducedResult);
740 return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select");
741 }
742 case VPInstruction::ComputeReductionResult: {
743 RecurKind RK = getRecurKind();
744 bool IsOrdered = isReductionOrdered();
745 bool IsInLoop = isReductionInLoop();
746 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
747 "FindIV should use min/max reduction kinds");
748
749 // The recipe may have multiple operands to be reduced together.
750 unsigned NumOperandsToReduce = getNumOperands();
751 VectorParts RdxParts(NumOperandsToReduce);
752 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
753 RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop);
754
755 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
756 Builder.setFastMathFlags(getFastMathFlags());
757
758 // Reduce multiple operands into one.
759 Value *ReducedPartRdx = RdxParts[0];
760 if (IsOrdered) {
761 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
762 } else {
763 // Floating-point operations should have some FMF to enable the reduction.
764 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
765 Value *RdxPart = RdxParts[Part];
766 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
767 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
768 else {
769 // For sub-recurrences, each part's reduction variable is already
770 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
771 Instruction::BinaryOps Opcode =
772 RK == RecurKind::Sub
773 ? Instruction::Add
774 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK);
775 ReducedPartRdx =
776 Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
777 }
778 }
779 }
780
781 // Create the reduction after the loop. Note that inloop reductions create
782 // the target reduction in the loop using a Reduction recipe.
783 if (State.VF.isVector() && !IsInLoop) {
784 // TODO: Support in-order reductions based on the recurrence descriptor.
785 // All ops in the reduction inherit fast-math-flags from the recurrence
786 // descriptor.
787 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
788 }
789
790 return ReducedPartRdx;
791 }
792 case VPInstruction::ExtractLastLane:
793 case VPInstruction::ExtractPenultimateElement: {
794 unsigned Offset =
795 getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
796 Value *Res;
797 if (State.VF.isVector()) {
798 assert(Offset <= State.VF.getKnownMinValue() &&
799 "invalid offset to extract from");
800 // Extract lane VF - Offset from the operand.
801 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
802 } else {
803 // TODO: Remove ExtractLastLane for scalar VFs.
804 assert(Offset <= 1 && "invalid offset to extract from");
805 Res = State.get(Def: getOperand(N: 0));
806 }
807 if (isa<ExtractElementInst>(Val: Res))
808 Res->setName(Name);
809 return Res;
810 }
811 case VPInstruction::LogicalAnd: {
812 Value *A = State.get(Def: getOperand(N: 0));
813 Value *B = State.get(Def: getOperand(N: 1));
814 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
815 }
816 case VPInstruction::LogicalOr: {
817 Value *A = State.get(Def: getOperand(N: 0));
818 Value *B = State.get(Def: getOperand(N: 1));
819 return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name);
820 }
821 case VPInstruction::PtrAdd: {
822 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
823 "can only generate first lane for PtrAdd");
824 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
825 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
826 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
827 }
828 case VPInstruction::WidePtrAdd: {
829 Value *Ptr =
830 State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0)));
831 Value *Addend = State.get(Def: getOperand(N: 1));
832 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
833 }
834 case VPInstruction::AnyOf: {
835 Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0)));
836 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
837 Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op)));
838 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res);
839 }
840 case VPInstruction::ExtractLane: {
841 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
842 "simplified to ExtractElement.");
843 Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true);
844 Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
845 Value *Res = nullptr;
846 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
847
848 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
849 Value *VectorStart =
850 Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1));
851 Value *VectorIdx = Idx == 1
852 ? LaneToExtract
853 : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart);
854 Value *Ext = State.VF.isScalar()
855 ? State.get(Def: getOperand(N: Idx))
856 : Builder.CreateExtractElement(
857 Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx);
858 if (Res) {
859 Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart);
860 Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res);
861 } else {
862 Res = Ext;
863 }
864 }
865 return Res;
866 }
867 case VPInstruction::FirstActiveLane: {
868 if (getNumOperands() == 1) {
869 Value *Mask = State.get(Def: getOperand(N: 0));
870 return Builder.CreateCountTrailingZeroElems(ResTy: Builder.getInt64Ty(), Mask,
871 /*ZeroIsPoison=*/false, Name);
872 }
873 // If there are multiple operands, create a chain of selects to pick the
874 // first operand with an active lane and add the number of lanes of the
875 // preceding operands.
876 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt64Ty(), VF: State.VF);
877 unsigned LastOpIdx = getNumOperands() - 1;
878 Value *Res = nullptr;
879 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
880 Value *TrailingZeros =
881 State.VF.isScalar()
882 ? Builder.CreateZExt(
883 V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)),
884 RHS: Builder.getFalse()),
885 DestTy: Builder.getInt64Ty())
886 : Builder.CreateCountTrailingZeroElems(
887 ResTy: Builder.getInt64Ty(), Mask: State.get(Def: getOperand(N: Idx)),
888 /*ZeroIsPoison=*/false, Name);
889 Value *Current = Builder.CreateAdd(
890 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: Builder.getInt64(C: Idx)), RHS: TrailingZeros);
891 if (Res) {
892 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
893 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
894 } else {
895 Res = Current;
896 }
897 }
898
899 return Res;
900 }
901 case VPInstruction::ResumeForEpilogue:
902 return State.get(Def: getOperand(N: 0), IsScalar: true);
903 case VPInstruction::Reverse:
904 return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse");
905 case VPInstruction::ExtractLastActive: {
906 Value *Data = State.get(Def: getOperand(N: 0));
907 Value *Mask = State.get(Def: getOperand(N: 1));
908 Value *Default = State.get(Def: getOperand(N: 2), /*IsScalar=*/true);
909 Type *VTy = Data->getType();
910 return Builder.CreateIntrinsic(
911 ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy},
912 Args: {Data, Mask, Default});
913 }
914 default:
915 llvm_unreachable("Unsupported opcode for instruction");
916 }
917}
918
/// Compute the TTI cost of materializing a plain IR instruction with
/// \p Opcode for this recipe at vectorization factor \p VF, using the cost
/// context \p Ctx. Only the opcodes handled in the switch below are
/// supported; reaching the end is a programming error.
InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
  // Cost queries are made on the widened result type when vectorizing.
  Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
  Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy;
  switch (Opcode) {
  case Instruction::FNeg:
    return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Start with no information about the second operand; refined below when
    // vectorizing.
    TargetTransformInfo::OperandValueInfo RHSInfo = {
        .Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None};

    if (VF.isVector()) {
      // Certain instructions can be cheaper to vectorize if they have a
      // constant second vector operand. One example of this are shifts on x86.
      VPValue *RHS = getOperand(N: 1);
      RHSInfo = Ctx.getOperandInfo(V: RHS);

      // An RHS defined outside any loop region is the same for every lane and
      // iteration, so it can be treated as uniform.
      if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
          getOperand(N: 1)->isDefinedOutsideLoopRegions())
        RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
    }

    // Forward the underlying IR instruction and its operand list (when
    // available) so TTI implementations can refine the cost.
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, Ty: ResultTy, CostKind: Ctx.CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
  }
  case Instruction::Freeze:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy,
                                          CostKind: Ctx.CostKind);
  case Instruction::ExtractValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
                                             CostKind: Ctx.CostKind);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares are costed on the (widened) operand type, not the i1 result.
    Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy;
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(),
        CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
        Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
  }
  case Instruction::BitCast: {
    // Pointer-to-pointer bitcasts are free; other bitcasts are costed like
    // the cast opcodes below via the fallthrough.
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
    if (ScalarTy->isPointerTy())
      return 0;
    [[fallthrough]];
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Computes the CastContextHint from a recipe that may access memory.
    auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
      if (isa<VPInterleaveBase>(Val: R))
        return TTI::CastContextHint::Interleave;
      if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) {
        // Only compute CCH for memory operations, matching the legacy model
        // which only considers loads/stores for cast context hints.
        auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue());
        if (!isa<LoadInst, StoreInst>(Val: UI))
          return TTI::CastContextHint::None;
        return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                               : TTI::CastContextHint::Normal;
      }
      const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
      if (WidenMemoryRecipe == nullptr)
        return TTI::CastContextHint::None;
      if (VF.isScalar())
        return TTI::CastContextHint::Normal;
      if (!WidenMemoryRecipe->isConsecutive())
        return TTI::CastContextHint::GatherScatter;
      if (WidenMemoryRecipe->isReverse())
        return TTI::CastContextHint::Reversed;
      if (WidenMemoryRecipe->isMasked())
        return TTI::CastContextHint::Masked;
      return TTI::CastContextHint::Normal;
    };

    VPValue *Operand = getOperand(N: 0);
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, get the context from the only user.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
        if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
          return nullptr;
        return dyn_cast<VPRecipeBase>(Val: *R->user_begin());
      };
      // Look through an intervening reverse when searching for the memory
      // user that provides the cast context.
      if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue())))
          Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe));
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    // For Z/Sext, get the context from the operand.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (auto *Recipe = Operand->getDefiningRecipe()) {
        VPValue *ReverseOp;
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp))))
          Recipe = ReverseOp->getDefiningRecipe();
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }

    auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand);
    Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy;
    // Arm TTI will use the underlying instruction to determine the cost.
    return Ctx.TTI.getCastInstrCost(
        Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
        I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
  }
  case Instruction::Select: {
    SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue());
    bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);

    // Detect selects that are really logical and/or of i1 values; those are
    // costed as the corresponding bitwise operation below.
    VPValue *Op0, *Op1;
    bool IsLogicalAnd =
        match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    bool IsLogicalOr =
        match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    // Also match the inverted forms:
    // select x, false, y --> !x & y (still AND)
    // select x, y, true --> !x | y (still OR)
    IsLogicalAnd |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1)));
    IsLogicalOr |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True()));

    if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
        (IsLogicalAnd || IsLogicalOr)) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);

      // Only forward IR operands when every VPValue has an underlying value.
      SmallVector<const Value *, 2> Operands;
      if (SI && all_of(Range: operands(),
                       P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
        append_range(C&: Operands, R: SI->operands());
      return Ctx.TTI.getArithmeticInstrCost(
          Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy,
          CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
    }

    Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    if (!IsScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    // Recover the feeding compare predicate, if any, so TTI can refine the
    // select cost.
    llvm::CmpPredicate Pred;
    if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())))
      if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0)))
        if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue()))
          Pred = Cmp->getPredicate();
    Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
  }
  }
  llvm_unreachable("called for unsupported opcode");
}
1116
1117InstructionCost VPInstruction::computeCost(ElementCount VF,
1118 VPCostContext &Ctx) const {
1119 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
1120 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1121 // TODO: Compute cost for VPInstructions without underlying values once
1122 // the legacy cost model has been retired.
1123 return 0;
1124 }
1125
1126 assert(!doesGeneratePerAllLanes() &&
1127 "Should only generate a vector value or single scalar, not scalars "
1128 "for all lanes.");
1129 return getCostForRecipeWithOpcode(
1130 Opcode: getOpcode(),
1131 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1132 }
1133
1134 switch (getOpcode()) {
1135 case Instruction::Select: {
1136 llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1137 match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()));
1138 auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1139 auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1));
1140 if (!vputils::onlyFirstLaneUsed(Def: this)) {
1141 CondTy = toVectorTy(Scalar: CondTy, EC: VF);
1142 VecTy = toVectorTy(Scalar: VecTy, EC: VF);
1143 }
1144 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred,
1145 CostKind: Ctx.CostKind);
1146 }
1147 case Instruction::ExtractElement:
1148 case VPInstruction::ExtractLane: {
1149 if (VF.isScalar()) {
1150 // ExtractLane with VF=1 takes care of handling extracting across multiple
1151 // parts.
1152 return 0;
1153 }
1154
1155 // Add on the cost of extracting the element.
1156 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1157 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
1158 CostKind: Ctx.CostKind);
1159 }
1160 case VPInstruction::AnyOf: {
1161 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1162 return Ctx.TTI.getArithmeticReductionCost(
1163 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
1164 }
1165 case VPInstruction::FirstActiveLane: {
1166 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1167 if (VF.isScalar())
1168 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1169 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1170 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1171 // Calculate the cost of determining the lane index.
1172 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1173 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1174 Type::getInt64Ty(C&: Ctx.LLVMCtx),
1175 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1176 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1177 }
1178 case VPInstruction::LastActiveLane: {
1179 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1180 if (VF.isScalar())
1181 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1182 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1183 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1184 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1185 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1186 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
1187 Type::getInt64Ty(C&: Ctx.LLVMCtx),
1188 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1189 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1190 // Add cost of NOT operation on the predicate.
1191 Cost += Ctx.TTI.getArithmeticInstrCost(
1192 Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind,
1193 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1194 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
1195 .Properties: TargetTransformInfo::OP_None});
1196 // Add cost of SUB operation on the index.
1197 Cost += Ctx.TTI.getArithmeticInstrCost(
1198 Opcode: Instruction::Sub, Ty: Type::getInt64Ty(C&: Ctx.LLVMCtx), CostKind: Ctx.CostKind);
1199 return Cost;
1200 }
1201 case VPInstruction::ExtractLastActive: {
1202 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1203 Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1204 Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1205 IntrinsicCostAttributes ICA(
1206 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1207 {VecTy, MaskTy, ScalarTy});
1208 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind);
1209 }
1210 case VPInstruction::FirstOrderRecurrenceSplice: {
1211 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1212 SmallVector<int> Mask(VF.getKnownMinValue());
1213 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
1214 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1215
1216 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
1217 DstTy: cast<VectorType>(Val: VectorTy),
1218 SrcTy: cast<VectorType>(Val: VectorTy), Mask,
1219 CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1);
1220 }
1221 case VPInstruction::ActiveLaneMask: {
1222 Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1223 unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue();
1224 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier);
1225 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1226 {ArgTy, ArgTy});
1227 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1228 }
1229 case VPInstruction::ExplicitVectorLength: {
1230 Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1231 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
1232 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
1233 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1234 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1235 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1236 }
1237 case VPInstruction::Reverse: {
1238 assert(VF.isVector() && "Reverse operation must be vector type");
1239 auto *VectorTy = cast<VectorType>(
1240 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
1241 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
1242 SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind,
1243 /*Index=*/0);
1244 }
1245 case VPInstruction::ExtractLastLane: {
1246 // Add on the cost of extracting the element.
1247 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1248 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
1249 Val: VecTy, CostKind: Ctx.CostKind, Index: 0);
1250 }
1251 case VPInstruction::ExtractPenultimateElement:
1252 if (VF == ElementCount::getScalable(MinVal: 1))
1253 return InstructionCost::getInvalid();
1254 [[fallthrough]];
1255 default:
1256 // TODO: Compute cost other VPInstructions once the legacy cost model has
1257 // been retired.
1258 assert(!getUnderlyingValue() &&
1259 "unexpected VPInstruction witht underlying value");
1260 return 0;
1261 }
1262}
1263
1264bool VPInstruction::isVectorToScalar() const {
1265 return getOpcode() == VPInstruction::ExtractLastLane ||
1266 getOpcode() == VPInstruction::ExtractPenultimateElement ||
1267 getOpcode() == Instruction::ExtractElement ||
1268 getOpcode() == VPInstruction::ExtractLane ||
1269 getOpcode() == VPInstruction::FirstActiveLane ||
1270 getOpcode() == VPInstruction::LastActiveLane ||
1271 getOpcode() == VPInstruction::ComputeAnyOfResult ||
1272 getOpcode() == VPInstruction::ExtractLastActive ||
1273 getOpcode() == VPInstruction::ComputeReductionResult ||
1274 getOpcode() == VPInstruction::AnyOf;
1275}
1276
1277bool VPInstruction::isSingleScalar() const {
1278 switch (getOpcode()) {
1279 case Instruction::PHI:
1280 case VPInstruction::ExplicitVectorLength:
1281 case VPInstruction::ResumeForEpilogue:
1282 case VPInstruction::VScale:
1283 return true;
1284 default:
1285 return isScalarCast();
1286 }
1287}
1288
void VPInstruction::execute(VPTransformState &State) {
  assert(!isMasked() && "cannot execute masked VPInstruction");
  assert(!State.Lane && "VPInstruction executing an Lane");
  // Scope fast-math-flag changes on the builder to this recipe's codegen.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  Value *GeneratedValue = generate(State);
  // Opcodes without a result (e.g. branches) produce nothing to record.
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  // The value is recorded as a scalar when the opcode can emit a first-lane
  // scalar and either only the first lane is demanded or the opcode is
  // inherently scalar-producing.
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(Def: this) ||
                                    isVectorToScalar() || isSingleScalar());
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(Def: this, V: GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
}
1314
/// Return true if this VPInstruction's opcode may read from or write to
/// memory. Conservatively answers true for any opcode not explicitly listed
/// as side-effect free below.
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  // Binary ops and casts never touch memory.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  default:
    // Unknown opcodes are conservatively assumed to access memory.
    return true;
  }
}
1362
/// Return true if only the first lane of operand \p Op is used by this
/// VPInstruction. Conservatively returns false for unhandled opcodes.
bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  // Binary ops and casts use only the first lane of their operands exactly
  // when only their own first lane is demanded.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return vputils::onlyFirstLaneUsed(Def: this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ExtractElement:
    // Only the extraction index (operand 1) is scalar.
    return Op == getOperand(N: 1);
  case Instruction::PHI:
    return true;
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::Or:
  case Instruction::Freeze:
  case VPInstruction::Not:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(Def: this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ReductionStartVector:
    // These opcodes consume only scalar (first-lane) operands.
    return true;
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
    // Before replicating by VF, Build(Struct)Vector uses all lanes of the
    // operand, after replicating its operands only the first lane is used.
    // Before replicating, it will have only a single operand.
    return getNumOperands() > 1;
  case VPInstruction::PtrAdd:
    // The base pointer (operand 0) is always scalar.
    return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
  case VPInstruction::WidePtrAdd:
    // WidePtrAdd supports scalar and vector base addresses.
    return false;
  case VPInstruction::ComputeAnyOfResult:
    return Op == getOperand(N: 0) || Op == getOperand(N: 1);
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExtractLane:
    return Op == getOperand(N: 0);
  };
  llvm_unreachable("switch should return");
}
1411
1412bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
1413 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1414 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1415 return vputils::onlyFirstPartUsed(Def: this);
1416
1417 switch (getOpcode()) {
1418 default:
1419 return false;
1420 case Instruction::FCmp:
1421 case Instruction::ICmp:
1422 case Instruction::Select:
1423 return vputils::onlyFirstPartUsed(Def: this);
1424 case VPInstruction::BranchOnCount:
1425 case VPInstruction::BranchOnCond:
1426 case VPInstruction::BranchOnTwoConds:
1427 case VPInstruction::CanonicalIVIncrementForPart:
1428 return true;
1429 };
1430 llvm_unreachable("switch should return");
1431}
1432
1433#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1434void VPInstruction::dump() const {
1435 VPSlotTracker SlotTracker(getParent()->getPlan());
1436 printRecipe(dbgs(), "", SlotTracker);
1437}
1438
/// Print a textual form of this VPInstruction: "EMIT[-SCALAR] [result =]
/// <opcode-name> <flags> <operands>". VPInstruction-specific opcodes get a
/// dedicated mnemonic; plain IR opcodes fall back to their IR name.
void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    // Plain IR opcodes print their standard IR mnemonic.
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
1555#endif
1556
void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // Scalar casts emit a single cast of the first lane to ResultTy.
  if (isScalarCast()) {
    Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
    Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
                                           V: Op, DestTy: ResultTy);
    State.set(Def: this, V: Cast, Lane: VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    // Materialize <0, 1, 2, ...> with ResultTy elements at the current VF.
    Value *StepVector =
        State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
    State.set(Def: this, V: StepVector);
    break;
  }
  case VPInstruction::VScale: {
    // vscale is a single scalar value.
    Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
    State.set(Def: this, V: VScale, IsScalar: true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
1583
1584#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print a textual form of this typed VPInstruction, including its result
/// type. Cast opcodes are printed in IR style ("<op> <operands> to <type>").
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  case VPInstruction::VScale:
    O << "vscale " << *ResultTy;
    break;
  default:
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
1609#endif
1610
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // Create the IR phi with the inferred scalar type; incoming values are
  // taken from the first lane of each operand.
  PHINode *NewPhi = State.Builder.CreatePHI(
      Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
  unsigned NumIncoming = getNumIncoming();
  if (getParent() != getParent()->getPlan()->getScalarPreheader()) {
    // TODO: Fixup all incoming values of header phis once recipes defining them
    // are introduced.
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
    NewPhi->addIncoming(V: IncV, BB: PredBB);
  }
  // VPPhi always produces a single scalar value.
  State.set(Def: this, V: NewPhi, Lane: VPLane(0));
}
1628
1629#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                        VPSlotTracker &SlotTracker) const {
  // Printed as "EMIT[-SCALAR] <res> = phi<flags> [ <val>, <block> ], ...".
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printPhiOperands(O, SlotTracker);
}
1638#endif
1639
1640VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1641 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1642 return new VPIRPhi(*Phi);
1643 return new VPIRInstruction(I);
1644}
1645
// Wrapped IR instructions already exist in the IR; executing the recipe only
// repositions the builder, it generates no new code.
void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
}
1653
1654InstructionCost VPIRInstruction::computeCost(ElementCount VF,
1655 VPCostContext &Ctx) const {
1656 // The recipe wraps an existing IR instruction on the border of VPlan's scope,
1657 // hence it does not contribute to the cost-modeling for the VPlan.
1658 return 0;
1659}
1660
1661#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  // Wrapped IR instructions print verbatim, prefixed with "IR ".
  O << Indent << "IR " << I;
}
1666#endif
1667
1668void VPIRPhi::execute(VPTransformState &State) {
1669 PHINode *Phi = &getIRPhi();
1670 for (const auto &[Idx, Op] : enumerate(First: operands())) {
1671 VPValue *ExitValue = Op;
1672 auto Lane = vputils::isSingleScalar(VPV: ExitValue)
1673 ? VPLane::getFirstLane()
1674 : VPLane::getLastLaneForVF(VF: State.VF);
1675 VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
1676 auto *PredVPBB = Pred->getExitingBasicBlock();
1677 BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
1678 // Set insertion point in PredBB in case an extract needs to be generated.
1679 // TODO: Model extracts explicitly.
1680 State.Builder.SetInsertPoint(PredBB->getTerminator());
1681 Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
1682 // If there is no existing block for PredBB in the phi, add a new incoming
1683 // value. Otherwise update the existing incoming value for PredBB.
1684 if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
1685 Phi->addIncoming(V, BB: PredBB);
1686 else
1687 Phi->setIncomingValueForBlock(BB: PredBB, V);
1688 }
1689
1690 // Advance the insert point after the wrapped IR instruction. This allows
1691 // interleaving VPIRInstructions and other recipes.
1692 State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
1693}
1694
// Remove the phi operand corresponding to predecessor \p IncomingBlock.
void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
  // const_cast: the accessor interface is const, but removing an operand
  // mutates the underlying recipe.
  VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
  assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
         "Number of phi operands must match number of predecessors");
  // Operand order mirrors the parent block's predecessor order.
  unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
  R->removeOperand(Idx: Position);
}
1702
1703VPValue *
1704VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
1705 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1706 return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB));
1707}
1708
1709void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
1710 VPValue *V) const {
1711 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1712 R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V);
1713}
1714
1715#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPhiAccessors::printPhiOperands(raw_ostream &O,
                                      VPSlotTracker &SlotTracker) const {
  // Print "[ <value>, <block> ]" pairs, comma-separated, in operand order.
  interleaveComma(enumerate(getAsRecipe()->operands()), O,
                  [this, &O, &SlotTracker](auto Op) {
                    O << "[ ";
                    Op.value()->printAsOperand(O, SlotTracker);
                    O << ", ";
                    getIncomingBlock(Op.index())->printAsOperand(O);
                    O << " ]";
                  });
}
1727#endif
1728
1729#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  // Print the wrapped IR phi first, then any VPlan-provided incoming values.
  VPIRInstruction::printRecipe(O, Indent, SlotTracker);

  if (getNumOperands() != 0) {
    O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
    interleaveComma(incoming_values_and_blocks(), O,
                    [&O, &SlotTracker](auto Op) {
                      std::get<0>(Op)->printAsOperand(O, SlotTracker);
                      O << " from ";
                      std::get<1>(Op)->printAsOperand(O);
                    });
    O << ")";
  }
}
1745#endif
1746
// Attach every recorded (kind, node) metadata entry to \p I.
void VPIRMetadata::applyMetadata(Instruction &I) const {
  for (const auto &[Kind, Node] : Metadata)
    I.setMetadata(KindID: Kind, Node);
}
1751
1752void VPIRMetadata::intersect(const VPIRMetadata &Other) {
1753 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1754 for (const auto &[KindA, MDA] : Metadata) {
1755 for (const auto &[KindB, MDB] : Other.Metadata) {
1756 if (KindA == KindB && MDA == MDB) {
1757 MetadataIntersection.emplace_back(Args: KindA, Args: MDA);
1758 break;
1759 }
1760 }
1761 }
1762 Metadata = std::move(MetadataIntersection);
1763}
1764
1765#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  // Printing metadata by name requires access to the module; bail out if it
  // is unavailable or there is nothing to print.
  const Module *M = SlotTracker.getModule();
  if (Metadata.empty() || !M)
    return;

  ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
  O << " (";
  interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
    auto [Kind, Node] = KindNodePair;
    assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
           "Unexpected unnamed metadata kind");
    O << "!" << MDNames[Kind] << " ";
    Node->printAsOperand(O, M);
  });
  O << ")";
}
1782#endif
1783
// Emit a call to the pre-selected vector function variant, widening vector
// parameters and passing scalar (lane-0) values for scalar parameters.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: args())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(i: I.index())->isVectorTy())
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    Args.push_back(Elt: Arg);
  }

  // Carry over operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
  applyFlags(I&: *V);
  applyMetadata(I&: *V);
  V->setCallingConv(Variant->getCallingConv());

  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1816
1817InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1818 VPCostContext &Ctx) const {
1819 return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
1820 Tys: Variant->getFunctionType()->params(),
1821 CostKind: Ctx.CostKind);
1822}
1823
1824#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  // void calls define no result value, so skip the "<res> = " part.
  Function *CalledFn = getCalledScalarFunction();
  if (CalledFn->getReturnType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << " @" << CalledFn->getName() << "(";
  interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";

  // Show the vector library function this call will be lowered to.
  O << " (using library function";
  if (Variant->hasName())
    O << ": " << Variant->getName();
  O << ")";
}
1850#endif
1851
// Emit a call to the vector form of the intrinsic, collecting the overload
// types required to mangle the declaration and keeping scalar operands scalar.
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");

  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
                                             TTI: State.TTI)) {
    Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF);
    ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
    // For struct returns, only the fields the intrinsic is overloaded on
    // participate in name mangling.
    for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
      if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
                                                           RetIdx: Idx, TTI: State.TTI))
        TysForDecl.push_back(Elt: Ty);
    }
  }
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
                                           TTI: State.TTI))
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
                                               TTI: State.TTI))
      TysForDecl.push_back(Elt: Arg->getType());
    Args.push_back(Elt: Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  // Carry over operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);

  applyFlags(I&: *V);
  applyMetadata(I&: *V);

  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1903
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
/// \p VF selects scalar vs. vectorized argument/return types; \p Ctx supplies
/// the type-inference, TTI and cost-kind context.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
                                            ArrayRef<const VPValue *> Operands,
                                            const VPRecipeWithIRFlags &R,
                                            ElementCount VF,
                                            VPCostContext &Ctx) {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(First&: Operands)) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) {
        Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
        continue;
      }
      // No IR value available for this operand: drop all arguments rather
      // than pass a partial list to TTI.
      Arguments.clear();
      break;
    }
    Arguments.push_back(Elt: V);
  }

  // Widen return and parameter types according to VF.
  Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R);
  Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy;
  SmallVector<Type *> ParamTys;
  for (const VPValue *Op : Operands) {
    ParamTys.push_back(Elt: VF.isVector()
                           ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF)
                           : Ctx.Types.inferScalarType(V: Op));
  }

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
      dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()),
      InstructionCost::getInvalid(), &Ctx.TLI);
  return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
}
1945
1946InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
1947 VPCostContext &Ctx) const {
1948 SmallVector<const VPValue *> ArgOps(operands());
1949 return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx);
1950}
1951
1952StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
1953 return Intrinsic::getBaseName(id: VectorIntrinsicID);
1954}
1955
1956bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
1957 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1958 return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
1959 auto [Idx, V] = X;
1960 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1961 Idx, nullptr);
1962 });
1963}
1964
1965#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INTRINSIC ";
  // void intrinsics define no result value, so skip the "<res> = " part.
  if (ResultTy->isVoidTy()) {
    O << "void ";
  } else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call";
  printFlags(O);
  O << getIntrinsicName() << "(";

  interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
    Op->printAsOperand(O, SlotTracker);
  });
  O << ")";
}
1985#endif
1986
// Lower the histogram update to the experimental_vector_histogram_add
// intrinsic; operand 0 is the vector of bucket addresses, operand 1 the
// scalar increment amount, with an optional mask operand.
void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(Def: getOperand(N: 0));
  Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Val: Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(Def: VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(V: IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
                                Types: {VTy, IncAmt->getType()},
                                Args: {Address, IncAmt, Mask});
}
2015
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
  VPValue *IncAmt = getOperand(N: 1);
  Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
  VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
  if (auto *CI = dyn_cast<VPConstantInt>(Val: IncAmt))
    if (CI->isOne())
      MulCost = TTI::TCC_Free;

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
  Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(C&: Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
}
2048
2049#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-HISTOGRAM buckets: ";
  getOperand(0)->printAsOperand(O, SlotTracker);

  // Label the update operand as increment or decrement depending on opcode.
  if (Opcode == Instruction::Sub)
    O << ", dec: ";
  else {
    assert(Opcode == Instruction::Add);
    O << ", inc: ";
  }
  getOperand(1)->printAsOperand(O, SlotTracker);

  if (VPValue *Mask = getMask()) {
    O << ", mask: ";
    Mask->printAsOperand(O, SlotTracker);
  }
}
2068#endif
2069
// Copy each fast-math flag from the IR-level FastMathFlags into the compact
// VPlan-side representation.
VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
2079
// Return the conservative ("all flags off") VPIRFlags variant appropriate for
// \p Opcode, i.e. the flag category the opcode carries with every optional
// flag cleared.
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case VPInstruction::CanonicalIVIncrementForPart:
    return WrapFlagsTy(false, false);
  case Instruction::Trunc:
    return TruncFlagsTy(false, false);
  case Instruction::Or:
    return DisjointFlagsTy(false);
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return ExactFlagsTy(false);
  case Instruction::GetElementPtr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
    return GEPNoWrapFlags::none();
  case Instruction::ZExt:
  case Instruction::UIToFP:
    return NonNegFlagsTy(false);
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::FNeg:
  case Instruction::FPExt:
  case Instruction::FPTrunc:
    return FastMathFlags();
  case Instruction::ICmp:
  case Instruction::FCmp:
  case VPInstruction::ComputeReductionResult:
    // Compares need a predicate and reductions a recurrence kind; there is no
    // meaningful default for either.
    llvm_unreachable("opcode requires explicit flags");
  default:
    return VPIRFlags();
  }
}
2121
2122#if !defined(NDEBUG)
2123bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2124 switch (OpType) {
2125 case OperationType::OverflowingBinOp:
2126 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2127 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2128 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2129 case OperationType::Trunc:
2130 return Opcode == Instruction::Trunc;
2131 case OperationType::DisjointOp:
2132 return Opcode == Instruction::Or;
2133 case OperationType::PossiblyExactOp:
2134 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2135 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2136 case OperationType::GEPOp:
2137 return Opcode == Instruction::GetElementPtr ||
2138 Opcode == VPInstruction::PtrAdd ||
2139 Opcode == VPInstruction::WidePtrAdd;
2140 case OperationType::FPMathOp:
2141 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2142 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2143 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2144 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2145 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2146 Opcode == Instruction::Select ||
2147 Opcode == VPInstruction::WideIVStep ||
2148 Opcode == VPInstruction::ReductionStartVector;
2149 case OperationType::FCmp:
2150 return Opcode == Instruction::FCmp;
2151 case OperationType::NonNegOp:
2152 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2153 case OperationType::Cmp:
2154 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2155 case OperationType::ReductionOp:
2156 return Opcode == VPInstruction::ComputeReductionResult;
2157 case OperationType::Other:
2158 return true;
2159 }
2160 llvm_unreachable("Unknown OperationType enum");
2161}
2162
// Return true if this holds the flag category \p Opcode mandates: compares
// and reduction results require their specific categories; all other opcodes
// must either carry their default category or need none at all.
bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
  // Handle opcodes without default flags.
  if (Opcode == Instruction::ICmp)
    return OpType == OperationType::Cmp;
  if (Opcode == Instruction::FCmp)
    return OpType == OperationType::FCmp;
  if (Opcode == VPInstruction::ComputeReductionResult)
    return OpType == OperationType::ReductionOp;

  OperationType Required = getDefaultFlags(Opcode).OpType;
  return Required == OperationType::Other || Required == OpType;
}
2175#endif
2176
2177#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Print the flags carried by this recipe in IR-like spelling (e.g. " nuw",
// " exact", fast-math flags), followed by a trailing space.
void VPIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::FCmp:
    // FCmp prints its predicate plus fast-math flags.
    O << " " << CmpInst::getPredicateName(getPredicate());
    getFastMathFlags().print(O);
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::Trunc:
    if (TruncFlags.HasNUW)
      O << " nuw";
    if (TruncFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    // "inbounds" implies "nusw", so only one of the two is printed.
    if (GEPFlags.isInBounds())
      O << " inbounds";
    else if (GEPFlags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (GEPFlags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::ReductionOp: {
    // Reductions print "(<kind>[, in-loop][, ordered])" plus fast-math flags.
    RecurKind RK = getRecurKind();
    O << " (";
    switch (RK) {
    case RecurKind::AnyOf:
      O << "any-of";
      break;
    case RecurKind::SMax:
      O << "smax";
      break;
    case RecurKind::SMin:
      O << "smin";
      break;
    case RecurKind::UMax:
      O << "umax";
      break;
    case RecurKind::UMin:
      O << "umin";
      break;
    case RecurKind::FMinNum:
      O << "fminnum";
      break;
    case RecurKind::FMaxNum:
      O << "fmaxnum";
      break;
    case RecurKind::FMinimum:
      O << "fminimum";
      break;
    case RecurKind::FMaximum:
      O << "fmaximum";
      break;
    case RecurKind::FMinimumNum:
      O << "fminimumnum";
      break;
    case RecurKind::FMaximumNum:
      O << "fmaximumnum";
      break;
    default:
      // Remaining kinds map directly to an instruction opcode name.
      O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
      break;
    }
    if (isReductionInLoop())
      O << ", in-loop";
    if (isReductionOrdered())
      O << ", ordered";
    O << ")";
    getFastMathFlags().print(O);
    break;
  }
  case OperationType::Other:
    break;
  }
  O << " ";
}
2276#endif
2277
// Generate the widened (vector) form of a simple instruction: unops, binops,
// compares, selects, freeze and single-level extractvalue.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(Elt: State.get(Def: VPOp));

    Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);

    // Constant folding may yield a non-Instruction, hence the dyn_cast.
    if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
      applyFlags(I&: *VecOp);
      applyMetadata(I&: *VecOp);
    }

    // Use this vector value for all users of the original instruction.
    State.set(Def: this, V);
    break;
  }
  case Instruction::ExtractValue: {
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(Def: getOperand(N: 0));
    // Operand 1 is the constant extract index.
    Value *Extract = Builder.CreateExtractValue(
        Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue());
    State.set(Def: this, V: Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Freeze = Builder.CreateFreeze(V: Op);
    State.set(Def: this, V: Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(Def: getOperand(N: 0));
    Value *B = State.get(Def: getOperand(N: 1));
    Value *C = nullptr;
    if (FCmp) {
      C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
    } else {
      C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
    }
    if (auto *I = dyn_cast<Instruction>(Val: C)) {
      applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    State.set(Def: this, V: C);
    break;
  }
  case Instruction::Select: {
    // A single-scalar condition selects whole vectors; otherwise it is a
    // per-lane (vector) select.
    VPValue *CondOp = getOperand(N: 0);
    Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp));
    Value *Op0 = State.get(Def: getOperand(N: 1));
    Value *Op1 = State.get(Def: getOperand(N: 2));
    Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
    State.set(Def: this, V: Sel);
    if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
      // Only FP selects carry fast-math flags.
      if (isa<FPMathOperator>(Val: I))
        applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}
2383
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // All supported opcodes are costed via the generic per-opcode helper; the
  // switch only whitelists what execute() can actually widen.
  switch (Opcode) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If the div/rem operation isn't safe to speculate and requires
    // predication, then the only way we can even create a vplan is to insert
    // a select on the second input operand to ensure we use the value of 1
    // for the inactive lanes. The select will be costed separately.
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
  case Instruction::ExtractValue:
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Select:
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
2420
2421#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  // Printed as "WIDEN <res> = <opcode><flags> <operands>".
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
2430#endif
2431
void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  // The destination type is the scalar result type widened by VF.
  Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
  VPValue *Op = getOperand(N: 0);
  Value *A = State.get(Def: Op);
  Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
  State.set(Def: this, V: Cast);
  // Constant folding may yield a non-Instruction, hence the dyn_cast.
  if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
    applyFlags(I&: *CastOp);
    applyMetadata(I&: *CastOp);
  }
}
2446
2447InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2448 VPCostContext &Ctx) const {
2449 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2450 // the legacy cost model, including truncates/extends when evaluating a
2451 // reduction in a smaller type.
2452 if (!getUnderlyingValue())
2453 return 0;
2454 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2455}
2456
2457#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Printed as "WIDEN-CAST <res> = <cast><flags> <operands> to <type>".
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
2467#endif
2468
// Header phis are costed as plain control-flow PHI instructions.
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
2473
2474#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-INDUCTION";
  printFlags(O);
  printOperands(O, SlotTracker);

  // Note a truncation of the induction to a narrower type, if present.
  if (auto *TI = getTruncInst())
    O << " (truncated to " << *TI->getType() << ")";
}
2486#endif
2487
// An induction is canonical iff it starts at 0, steps by 1 and has the same
// scalar type as the region's canonical IV.
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  auto *StepC = dyn_cast<VPConstantInt>(Val: getStepValue());
  auto *StartC = dyn_cast<VPConstantInt>(Val: getStartValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne() &&
         getScalarType() == getRegion()->getCanonicalIVType();
}
2497
// Compute the scalar derived IV value: start + index * step, transformed
// according to the induction kind.
void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
  Value *DerivedIV = emitTransformedIndex(
      B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
      InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
  DerivedIV->setName(Name);
  // The derived IV is a single scalar, stored at lane 0.
  State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
}
2514
2515#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Prints as: "<result> = DERIVED-IV <start> + <index> * <step>".
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
2527#endif
2528
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    // For FP inductions, the add opcode comes from the original induction
    // (e.g. FAdd or FSub), while scaling is always an FMul.
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
  // Compute the scalar steps and save the results in State.

  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  // When replicating for a specific lane, generate only that single lane.
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  // Base offset to add to each lane number; defaults to zero when no explicit
  // start index operand is present.
  Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
                                     : Constant::getNullValue(Ty: BaseIVTy);

  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number,
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(Ty: BaseIVTy, V: Lane);
    Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
    // The step returned by `createStepForVF` is a runtime-evaluated value
    // when VF is scalable. Otherwise, it should be folded into a Constant.
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    // Per-lane value: BaseIV + (StartIdx0 + Lane) * Step.
    auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
    auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
    State.set(Def: this, V: Add, Lane: VPLane(Lane));
  }
}
2589
2590#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  // Prints as: "<result> = SCALAR-STEPS <operands>".
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
2598#endif
2599
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  // Single-scalar operands are not widened, so only their first lane is read.
  return vputils::isSingleScalar(VPV: Op);
}
2604
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
  if (AllOperandsAreInvariant) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.

    // Gather the lane-0 value of every operand to build the scalar GEP.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));

    auto *NewGEP =
        State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
                                Name: "", NW: getGEPNoWrapFlags());
    // Broadcast the scalar GEP result to all vector lanes.
    Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
    State.set(Def: this, V: Splat);
    return;
  }

  // If the GEP has at least one loop-varying operand, we are sure to
  // produce a vector of pointers unless VF is scalar.
  // The pointer operand of the new GEP. If it's loop-invariant, we
  // won't broadcast it.
  auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant());

  // Collect all the indices for the new GEP. If any index is
  // loop-invariant, we won't broadcast it.
  SmallVector<Value *, 4> Indices;
  for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
    VPValue *Operand = getOperand(N: I);
    Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1)));
  }

  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
  // but it should be a vector, otherwise.
  auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices,
                                         Name: "", NW: getGEPNoWrapFlags());
  assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
         "NewGEP is not a pointer vector");
  State.set(Def: this, V: NewGEP);
}
2658
2659#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  // Prints the invariance of the pointer operand ("Inv"/"Var") followed by
  // one bracketed marker per index operand, then the GEP itself.
  O << Indent << "WIDEN-GEP ";
  O << (isPointerLoopInvariant() ? "Inv" : "Var");
  for (size_t I = 0; I < getNumOperands() - 1; ++I)
    O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";

  O << " ";
  printAsOperand(O, SlotTracker);
  O << " = getelementptr";
  printFlags(O);
  printOperands(O, SlotTracker);
}
2673#endif
2674
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
  // Build and append the offset operand for unroll part \p Part:
  //   Offset = Stride * (VF - 1) + Part * Stride * VF
  // computed in the index type of the pointer operand.
  assert(!getOffset() && "Unexpected offset operand");
  VPBuilder Builder(this);
  VPlan &Plan = *getParent()->getPlan();
  VPValue *VFVal = getVFValue();
  VPTypeAnalysis TypeInfo(Plan);
  const DataLayout &DL =
      Plan.getScalarHeader()->getIRBasicBlock()->getDataLayout();
  Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: getPointer()));
  VPValue *Stride =
      Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true);
  Type *VFTy = TypeInfo.inferScalarType(V: VFVal);
  // VF may have a different width than the index type; adjust it to match.
  VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy,
                                                DL: DebugLoc::getUnknown());

  // Offset for Part0 = Offset0 = Stride * (VF - 1).
  VPInstruction *VFMinusOne =
      Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u),
                        DL: DebugLoc::getUnknown(), Name: "", WrapFlags: {true, true});
  VPInstruction *Offset0 =
      Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride});

  // Offset for PartN = Offset0 + Part * Stride * VF.
  VPValue *PartxStride =
      Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true);
  VPValue *Offset = Builder.createAdd(
      LHS: Offset0,
      RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF}));
  addOperand(Operand: Offset);
}
2705
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() && "Expected prior materialization of offset");
  // Emit a single scalar GEP: Pointer + Offset (offset was materialized
  // earlier by materializeOffset).
  Value *Ptr = State.get(Def: getPointer(), IsScalar: true);
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2715
2716#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  // Prints as: "<result> = vector-end-pointer<flags> <operands>".
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
2725#endif
2726
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() &&
         "Expected prior simplification of recipe without offset");
  // Emit a single scalar GEP: Pointer (lane 0) + Offset.
  Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2737
2738#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  // Prints as: "<result> = vector-pointer<flags> <operands>".
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
2747#endif
2748
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // A blend will be expanded to a select VPInstruction, which will generate a
  // scalar select if only the first lane is used.
  if (vputils::onlyFirstLaneUsed(Def: this))
    VF = ElementCount::getFixed(MinVal: 1);

  // N incoming values require N-1 selects; cost each select via TTI.
  Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
  Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF);
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
                                    VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
}
2762
2763#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  // Prints each incoming value, optionally followed by "/<mask>".
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  printFlags(O);
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      if (I != 0)
        O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      // A normalized blend carries no mask for the first incoming value.
      if (I == 0 && isNormalized())
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
2786#endif
2787
2788void VPReductionRecipe::execute(VPTransformState &State) {
2789 assert(!State.Lane && "Reduction being replicated.");
2790 RecurKind Kind = getRecurrenceKind();
2791 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2792 "In-loop AnyOf reductions aren't currently supported");
2793 // Propagate the fast-math flags carried by the underlying instruction.
2794 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2795 State.Builder.setFastMathFlags(getFastMathFlags());
2796 Value *NewVecOp = State.get(Def: getVecOp());
2797 if (VPValue *Cond = getCondOp()) {
2798 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
2799 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
2800 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2801
2802 Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
2803 if (State.VF.isVector())
2804 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
2805
2806 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
2807 NewVecOp = Select;
2808 }
2809 Value *NewRed;
2810 Value *NextInChain;
2811 if (isOrdered()) {
2812 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2813 if (State.VF.isVector())
2814 NewRed =
2815 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
2816 else
2817 NewRed = State.Builder.CreateBinOp(
2818 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2819 LHS: PrevInChain, RHS: NewVecOp);
2820 PrevInChain = NewRed;
2821 NextInChain = NewRed;
2822 } else if (isPartialReduction()) {
2823 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2824 "Unexpected partial reduction kind");
2825 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
2826 NewRed = State.Builder.CreateIntrinsic(
2827 RetTy: PrevInChain->getType(),
2828 ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2829 : Intrinsic::vector_partial_reduce_fadd,
2830 Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
2831 Name: "partial.reduce");
2832 PrevInChain = NewRed;
2833 NextInChain = NewRed;
2834 } else {
2835 assert(isInLoop() &&
2836 "The reduction must either be ordered, partial or in-loop");
2837 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2838 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
2839 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2840 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
2841 else
2842 NextInChain = State.Builder.CreateBinOp(
2843 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2844 LHS: PrevInChain, RHS: NewRed);
2845 }
2846 State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
2847}
2848
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(Def: getVecOp());
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));

  // The EVL-based reduction helpers always take a mask; use an all-true
  // splat when the recipe carries no condition.
  Value *Mask;
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(Def: CondOp);
  else
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
  } else {
    // Unordered: reduce the vector, then fold the previous chain value in
    // with a min/max or binary op.
    NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
    else
      NewRed = Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
          RHS: Prev);
  }
  State.set(Def: this, V: NewRed, /*IsScalar*/ true);
}
2882
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // Cost the reduction via TTI, distinguishing partial reductions, min/max
  // reductions and plain arithmetic reductions.
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(V: this);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
  FastMathFlags FMFs = getFastMathFlags();
  // Fast-math flags are only meaningful for FP reductions.
  std::optional<FastMathFlags> OptionalFMF =
      ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;

  if (isPartialReduction()) {
    // Conditional partial reductions additionally pay for the select that
    // blends masked-out lanes.
    InstructionCost CondCost = 0;
    if (isConditional()) {
      CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
      auto *CondTy = cast<VectorType>(
          Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF));
      CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
                                            CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
    }
    return CondCost + Ctx.TTI.getPartialReductionCost(
                          Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
                          OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
                          FMF: OptionalFMF);
  }

  // TODO: Support any-of reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getMinMaxReductionCost().
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
                                            CostKind: Ctx.CostKind);
}
2926
// Bundle \p ExpressionRecipes into a single VPExpressionRecipe: recipes are
// detached from their parent blocks, their external operands become operands
// of the expression recipe (replaced internally by placeholder values), and
// external users of intermediate results get clones of the producing recipes.
VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
      ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?");
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(Ptr: R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
    if (R != ExpressionRecipes.back() &&
        any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
        })) {
      // There are users outside of the expression. Clone the recipe and use the
      // clone those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
                                                   VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
      });
      CopyForExtUsers->insertBefore(InsertPos: R);
    }
    // Detach the recipe from its block; it now lives inside the expression.
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
  for (auto *R : ExpressionRecipes) {
    for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
      auto *Def = Op->getDefiningRecipe();
      if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
        continue;
      addOperand(Operand: Op);
      LiveInPlaceholders.push_back(Elt: new VPSymbolicValue());
    }
  }

  // Replace each external operand with the first one created for it in
  // LiveInPlaceholders.
  for (auto *R : ExpressionRecipes)
    for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
      R->replaceUsesOfWith(From: LiveIn, To: Tmp);
}
2984
// Undo the bundling performed by the constructor: re-insert the bundled
// recipes before this expression, reconnect their placeholder operands to the
// real external operands, and forward this recipe's uses to the final recipe.
void VPExpressionRecipe::decompose() {
  for (auto *R : ExpressionRecipes)
    // Since the list could contain duplicates, make sure the recipe hasn't
    // already been inserted.
    if (!R->getParent())
      R->insertBefore(InsertPos: this);

  // Swap placeholder values back to the original external operands.
  for (const auto &[Idx, Op] : enumerate(First: operands()))
    LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);

  replaceAllUsesWith(New: ExpressionRecipes.back());
  ExpressionRecipes.clear();
}
2998
// Cost the whole expression as a single target operation (extended reduction,
// mul-accumulate reduction, or partial reduction), rather than summing the
// costs of the bundled recipes.
InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
  Type *RedTy = Ctx.Types.inferScalarType(V: this);
  auto *SrcVecTy = cast<VectorType>(
      Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(
      Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind());
  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // NOTE(review): this shadows the outer Opcode, reading the kind from
    // ExpressionRecipes[1] instead of back() — presumably identical for this
    // expression type (ext + reduction); confirm.
    unsigned Opcode = RecurrenceDescriptor::getOpcode(
        Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind());
    auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
    auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());

    if (RedR->isPartialReduction())
      return Ctx.TTI.getPartialReductionCost(
          Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF,
          OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()),
          OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
          FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                      : std::nullopt);
    else if (!RedTy->isFloatingPointTy())
      // TTI::getExtendedReductionCost only supports integer types.
      return Ctx.TTI.getExtendedReductionCost(
          Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy,
          FMF: std::nullopt, CostKind: Ctx.CostKind);
    else
      return InstructionCost::getInvalid();
  }
  case ExpressionTypes::MulAccReduction:
    return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy,
                                          CostKind: Ctx.CostKind);

  case ExpressionTypes::ExtNegatedMulAccReduction:
    // The negated form is costed as a Sub-based mul-accumulate.
    assert(Opcode == Instruction::Add && "Unexpected opcode");
    Opcode = Instruction::Sub;
    [[fallthrough]];
  case ExpressionTypes::ExtMulAccReduction: {
    auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
    if (RedR->isPartialReduction()) {
      // Recipes are laid out as [ext0, ext1, mul, ..., reduction].
      auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
      auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]);
      auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]);
      return Ctx.TTI.getPartialReductionCost(
          Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)),
          InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF,
          OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(
              CastOpc: Ext0R->getOpcode()),
          OpBExtend: TargetTransformInfo::getPartialReductionExtendKind(
              CastOpc: Ext1R->getOpcode()),
          BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind,
          FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
                                      : std::nullopt);
    }
    return Ctx.TTI.getMulAccReductionCost(
        IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
            Instruction::ZExt,
        RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
  }
  }
  llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
}
3061
3062bool VPExpressionRecipe::mayReadOrWriteMemory() const {
3063 return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
3064 return R->mayReadFromMemory() || R->mayWriteToMemory();
3065 });
3066}
3067
3068bool VPExpressionRecipe::mayHaveSideEffects() const {
3069 assert(
3070 none_of(ExpressionRecipes,
3071 [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
3072 "expression cannot contain recipes with side-effects");
3073 return false;
3074}
3075
3076bool VPExpressionRecipe::isSingleScalar() const {
3077 // Cannot use vputils::isSingleScalar(), because all external operands
3078 // of the expression will be live-ins while bundled.
3079 auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3080 return RR && !RR->isPartialReduction();
3081}
3082
3083#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3084
// Print the bundled expression in a readable nested form, e.g.
// "EXPRESSION <res> = <chain> + reduce.add (mul (...), (...))".
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPRESSION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // "<chain> + reduce.<op> (<vec> <ext> to <ty>[, <cond>])".
    getOperand(1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(Opcode) << " (";
    getOperand(0)->printAsOperand(O, SlotTracker);
    Red->printFlags(O);

    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType();
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  case ExpressionTypes::ExtNegatedMulAccReduction: {
    // "<chain> + reduce.<op> (sub (0, mul (<a> ext to T), (<b> ext to T)))".
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (sub (0, mul";
    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
    Mul->printFlags(O);
    O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType() << "), (";
    getOperand(1)->printAsOperand(O, SlotTracker);
    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
      << *Ext1->getResultType() << ")";
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << "))";
    break;
  }
  case ExpressionTypes::MulAccReduction:
  case ExpressionTypes::ExtMulAccReduction: {
    // "<chain> + reduce.<op> (mul <a>, <b>)", with per-operand extends for
    // the extended variant.
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (";
    O << "mul";
    bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
    auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
                                               : ExpressionRecipes[0]);
    Mul->printFlags(O);
    if (IsExtended)
      O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
        << *Ext0->getResultType() << "), (";
    } else {
      O << ", ";
    }
    getOperand(1)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
        << *Ext1->getResultType() << ")";
    }
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  }
}
3172
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Prints as: "REDUCE <res> = <chain> +<flags> reduce.<op> (<vec>[, <cond>])"
  // with a "PARTIAL-REDUCE" tag for partial reductions.
  if (isPartialReduction())
    O << Indent << "PARTIAL-REDUCE ";
  else
    O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
3195
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  // Same shape as VPReductionRecipe printing, but with "vp.reduce." and the
  // explicit vector length as an extra argument.
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " vp.reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
3217
3218#endif
3219
3220/// A helper function to scalarize a single Instruction in the innermost loop.
3221/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
3222/// operands from \p RepRecipe instead of \p Instr's operands.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  // Clone the original instruction; operands are rewritten below.
  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
    Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting in
    // a narrower result type. Update the type of the cloned instruction to the
    // correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(Ty: ResultTy);
  }

  // The recipe, not the original instruction, owns flags/metadata/predicate.
  RepRecipe->applyFlags(I&: *Cloned);
  RepRecipe->applyMetadata(I&: *Cloned);

  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    // Single-scalar operands only have a lane-0 value.
    if (vputils::isSingleScalar(VPV: Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  State.set(Def: RepRecipe, V: Cloned, Lane);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    State.AC->registerAssumption(CI: II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
}
3280
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  // Without a concrete lane in State, only a single scalar instance may be
  // generated; per-lane replicates must have been expanded earlier.
  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    // Lane 0 starts from a poison vector; later lanes insert into the vector
    // produced by the previous lanes.
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF))
            : State.get(Def: this);
    State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
                                                          Lane: *State.Lane));
  }
}
3305
3306bool VPReplicateRecipe::shouldPack() const {
3307 // Find if the recipe is used by a widened recipe via an intervening
3308 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3309 return any_of(Range: users(), P: [](const VPUser *U) {
3310 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
3311 return !vputils::onlyScalarValuesUsed(Def: PredR);
3312 return false;
3313 });
3314}
3315
3316/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3317/// which the legacy cost model computes a SCEV expression when computing the
3318/// address cost. Computing SCEVs for VPValues is incomplete and returns
3319/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3320/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3321static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3322 PredicatedScalarEvolution &PSE,
3323 const Loop *L) {
3324 const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
3325 if (isa<SCEVCouldNotCompute>(Val: Addr))
3326 return Addr;
3327
3328 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr;
3329}
3330
/// Returns true if \p V is used as part of the address of another load or
/// store.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  // Worklist-based traversal over the transitive users of \p V.
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  while (!WorkList.empty()) {
    auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Ptr: Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur);
    // Skip blends that use V only through a compare by checking if any incoming
    // value was already visited.
    if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()),
                         P: [&](unsigned I) {
                           return Seen.contains(
                               Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe());
                         }))
      continue;

    // Check whether any direct user consumes Cur as a load/store address.
    for (VPUser *U : Cur->users()) {
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) {
        // Operand 0 of a replicated load, operand 1 of a replicated store,
        // is the address.
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(N: 0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(N: 1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarization loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(C&: WorkList, R: Cur->users());
  }
  return false;
}
3380
/// Compute the cost of replicating this recipe \p VF times, mirroring the
/// legacy cost model. Opcodes not handled below (and cases that `break`)
/// fall back to the legacy cost model at the end of the function.
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform, avoid computing their cost multiple times for now.
  Ctx.SkipCostComputation.insert(Ptr: UI);

  // Replicating per lane requires a known, fixed number of lanes unless a
  // single scalar suffices.
  if (VF.isScalable() && !isSingleScalar())
    return InstructionCost::getInvalid();

  switch (UI->getOpcode()) {
  case Instruction::Alloca:
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind);
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Call: {
    // The callee is the last operand of the recipe.
    auto *CalledFn =
        cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue());

    SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands()));
    SmallVector<Type *, 4> Tys;
    for (const VPValue *ArgOp : ArgOps)
      Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp));

    if (CalledFn->isIntrinsic())
      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
      switch (CalledFn->getIntrinsicID()) {
      case Intrinsic::assume:
      case Intrinsic::lifetime_end:
      case Intrinsic::lifetime_start:
      case Intrinsic::sideeffect:
      case Intrinsic::pseudoprobe:
      case Intrinsic::experimental_noalias_scope_decl: {
        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
                                    ElementCount::getFixed(1), Ctx) == 0 &&
               "scalarizing intrinsic should be free");
        return InstructionCost(0);
      }
      default:
        break;
      }

    Type *ResultTy = Ctx.Types.inferScalarType(V: this);
    InstructionCost ScalarCallCost =
        Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind);
    if (isSingleScalar()) {
      // An intrinsic may be cheaper when cost-modeled as an intrinsic than as
      // a plain call; take the minimum.
      if (CalledFn->isIntrinsic())
        ScalarCallCost = std::min(
            a: ScalarCallCost,
            b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this,
                                 VF: ElementCount::getFixed(MinVal: 1), Ctx));
      return ScalarCallCost;
    }

    return ScalarCallCost * VF.getFixedValue() +
           Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF);
  }
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
  case Instruction::FCmp:
    // Simple arithmetic/compares: scalar cost times the number of lanes.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    InstructionCost ScalarCost =
        getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx);
    if (isSingleScalar())
      return ScalarCost;

    // If any of the operands is from a different replicate region and has its
    // cost skipped, it may have been forced to scalar. Fall back to legacy cost
    // model to avoid cost mis-match.
    if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) {
          auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op);
          if (!PredR)
            return false;
          return Ctx.skipCostComputation(
              UI: dyn_cast_or_null<Instruction>(
                  Val: PredR->getOperand(N: 0)->getUnderlyingValue()),
              IsVector: VF.isVector());
        }))
      break;

    ScalarCost = ScalarCost * VF.getFixedValue() +
                 Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this),
                                              Operands: to_vector(Range: operands()), VF);
    // If the recipe is not predicated (i.e. not in a replicate region), return
    // the scalar cost. Otherwise handle predicated cost.
    if (!getRegion()->isReplicator())
      return ScalarCost;

    // Account for the phi nodes that we will create.
    ScalarCost += VF.getFixedValue() *
                  Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
    return ScalarCost;
  }
  case Instruction::Load:
  case Instruction::Store: {
    bool IsLoad = UI->getOpcode() == Instruction::Load;
    // For stores, the address is operand 1; for loads it is operand 0.
    const VPValue *PtrOp = getOperand(N: !IsLoad);
    const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L);
    if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV))
      break;

    Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0));
    Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp);
    const Align Alignment = getLoadStoreAlignment(I: UI);
    unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace();
    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0));
    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
    bool UsedByLoadStoreAddress =
        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this);
    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
        Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo,
        I: UsedByLoadStoreAddress ? UI : nullptr);

    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF);
    InstructionCost ScalarCost =
        ScalarMemOpCost +
        Ctx.TTI.getAddressComputationCost(
            PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV,
            CostKind: Ctx.CostKind);
    if (isSingleScalar())
      return ScalarCost;

    SmallVector<const VPValue *> OpsToScalarize;
    Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext());
    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
    // don't assign scalarization overhead in general, if the target prefers
    // vectorized addressing or the loaded value is used as part of an address
    // of another load or store.
    if (!UsedByLoadStoreAddress) {
      bool EfficientVectorLoadStore =
          Ctx.TTI.supportsEfficientVectorElementLoadStore();
      if (!(IsLoad && !PreferVectorizedAddressing) &&
          !(!IsLoad && EfficientVectorLoadStore))
        append_range(C&: OpsToScalarize, R: operands());

      if (!EfficientVectorLoadStore)
        ResultTy = Ctx.Types.inferScalarType(V: this);
    }

    TTI::VectorInstrContext VIC =
        IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
    InstructionCost Cost =
        (ScalarCost * VF.getFixedValue()) +
        Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true);

    const VPRegionBlock *ParentRegion = getRegion();
    if (ParentRegion && ParentRegion->isReplicator()) {
      // TODO: Handle loop-invariant pointers in predicated blocks. For now,
      // fall back to the legacy cost model.
      if (!PtrSCEV || Ctx.PSE.getSE()->isLoopInvariant(S: PtrSCEV, L: Ctx.L))
        break;
      // Scale by block probability, then add branch and mask-extract costs.
      Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
      Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind: Ctx.CostKind);

      auto *VecI1Ty = VectorType::get(
          ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF);
      Cost += Ctx.TTI.getScalarizationOverhead(
          Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
          /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind);

      if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) {
        // Artificially setting to a high enough value to practically disable
        // vectorization with such operations.
        return 3000000;
      }
    }
    return Cost;
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Casts: scalar cost times the number of lanes.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind);
  }

  // Fallback for unhandled opcodes and the `break` cases above.
  return Ctx.getLegacyCost(UI, VF);
}
3604
3605#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print as "CLONE" (single scalar) or "REPLICATE" (per-lane), with calls
/// printed in call syntax and "(S->V)" appended when packing is required.
void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    // Skip the last operand, which is the callee.
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
3632#endif
3633
/// Materialize a conditional branch on the current lane's mask value. Both
/// successors are left unset here and wired up later during CFG construction.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  VPValue *BlockInMask = getOperand(N: 0);
  Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  // PrevBB is passed as a placeholder successor so the branch can be created;
  // it is immediately cleared below.
  auto CondBr =
      State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
  CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
  CurrentTerminator->eraseFromParent();
}
3650
3651InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3652 VPCostContext &Ctx) const {
3653 // The legacy cost model doesn't assign costs to branches for individual
3654 // replicate regions. Match the current behavior in the VPlan cost model for
3655 // now.
3656 return 0;
3657}
3658
/// Create the phi node that merges the value produced in a predicated block
/// with the value flowing around it, for the current lane.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(Def: getOperand(N: 0))) {
    auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0)));
    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
           "Packed operands must generate an insertelement or insertvalue");

    // If VectorI is a struct, it will be a sequence like:
    // %1 = insertvalue %unmodified, %x, 0
    // %2 = insertvalue %1, %y, 1
    // %VectorI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
    if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType()))
      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
        VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0));

    PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2);
    VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(Def: this))
      State.reset(Def: this, V: VPhi);
    else
      State.set(Def: this, V: VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: VPhi);
  } else {
    // Scalar case: only generate the phi for the first lane when only the
    // first lane is demanded.
    if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
      return;

    Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
    PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
    // The value is poison on the path that skips the predicated block.
    Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
                     BB: PredicatingBB);
    Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
    if (State.hasScalarValue(Def: this, Lane: *State.Lane))
      State.reset(Def: this, V: Phi, Lane: *State.Lane);
    else
      State.set(Def: this, V: Phi, Lane: *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
  }
}
3717
3718#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print as "PHI-PREDICATED-INSTRUCTION <res> = <operands>".
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
3726#endif
3727
/// Compute the cost of a widened memory access: gather/scatter for
/// non-consecutive accesses, masked or plain wide load/store otherwise.
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
                        ? Instruction::Load
                        : Instruction::Store;

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    assert(!Reverse &&
           "Inconsecutive memory access should not have the order.");

    const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
    Type *PtrTy = Ptr->getType();

    // If the address value is uniform across all lanes, then the address can be
    // calculated with scalar type and broadcast.
    if (!vputils::isSingleScalar(VPV: getAddr()))
      PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);

    // Pick the intrinsic matching the recipe kind (EVL recipes use the VP
    // variants).
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather
                   : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter
                   : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather
                                                       : Intrinsic::vp_scatter;
    return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
                                             CostKind: Ctx.CostKind) +
           Ctx.TTI.getMemIntrinsicInstrCost(
               MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
                                            &Ingredient),
               CostKind: Ctx.CostKind);
  }

  InstructionCost Cost = 0;
  if (IsMasked) {
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load
                                                  : Intrinsic::masked_store;
    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
  } else {
    // Operand 0 is the stored/loaded value for loads, operand 1 for stores.
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
        V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
                                                              : getOperand(N: 1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
                                    OpdInfo: OpInfo, I: &Ingredient);
  }
  return Cost;
}
3779
3780void VPWidenLoadRecipe::execute(VPTransformState &State) {
3781 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3782 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3783 bool CreateGather = !isConsecutive();
3784
3785 auto &Builder = State.Builder;
3786 Value *Mask = nullptr;
3787 if (auto *VPMask = getMask()) {
3788 // Mask reversal is only needed for non-all-one (null) masks, as reverse
3789 // of a null all-one mask is a null mask.
3790 Mask = State.get(Def: VPMask);
3791 if (isReverse())
3792 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
3793 }
3794
3795 Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
3796 Value *NewLI;
3797 if (CreateGather) {
3798 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
3799 Name: "wide.masked.gather");
3800 } else if (Mask) {
3801 NewLI =
3802 Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
3803 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
3804 } else {
3805 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
3806 }
3807 applyMetadata(I&: *cast<Instruction>(Val: NewLI));
3808 State.set(Def: this, V: NewLI);
3809}
3810
3811#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print as "WIDEN <res> = load <operands>".
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
3819#endif
3820
3821/// Use all-true mask for reverse rather than actual mask, as it avoids a
3822/// dependence w/o affecting the result.
3823static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3824 Value *EVL, const Twine &Name) {
3825 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
3826 Value *AllTrueMask =
3827 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
3828 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
3829 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
3830}
3831
3832void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3833 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3834 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3835 bool CreateGather = !isConsecutive();
3836
3837 auto &Builder = State.Builder;
3838 CallInst *NewLI;
3839 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3840 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
3841 Value *Mask = nullptr;
3842 if (VPValue *VPMask = getMask()) {
3843 Mask = State.get(Def: VPMask);
3844 if (isReverse())
3845 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3846 } else {
3847 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3848 }
3849
3850 if (CreateGather) {
3851 NewLI =
3852 Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
3853 FMFSource: nullptr, Name: "wide.masked.gather");
3854 } else {
3855 NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load,
3856 Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
3857 }
3858 NewLI->addParamAttr(
3859 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
3860 applyMetadata(I&: *NewLI);
3861 Instruction *Res = NewLI;
3862 State.set(Def: this, V: Res);
3863}
3864
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  // Non-consecutive or masked accesses are costed by the base class.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
3882
3883#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print as "WIDEN <res> = vp.load <operands>".
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
3891#endif
3892
/// Generate a widened store: scatter for non-consecutive accesses, masked or
/// plain wide store otherwise.
void VPWidenStoreRecipe::execute(VPTransformState &State) {
  VPValue *StoredVPValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  Value *StoredVal = State.get(Def: StoredVPValue);
  // Scatters take a vector of pointers; consecutive stores a scalar pointer.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
  applyMetadata(I&: *NewSI);
}
3919
3920#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print as "WIDEN store <operands>".
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
3926#endif
3927
/// Generate a VP (EVL-predicated) store: vp.scatter for non-consecutive
/// accesses, vp.store otherwise.
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue);
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
  } else {
    // VP intrinsics require an explicit mask operand; supply an all-true one.
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  }
  Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_scatter,
                                    Args: {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_store,
                                    Args: {StoredVal, Addr, Mask, EVL});
  }
  // The pointer is the second argument of both vp.scatter and vp.store;
  // attach the alignment there.
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  applyMetadata(I&: *NewSI);
}
3959
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  // Non-consecutive or masked accesses are costed by the base class.
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
3977
3978#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print as "WIDEN vp.store <operands>".
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
3984#endif
3985
/// Cast vector \p V to \p DstVTy, where both have the same element count and
/// element bit width. Uses a single bit-or-pointer cast when possible;
/// otherwise bridges via an integer vector (Ptr <-> Int <-> Float).
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(Val: V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
  auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
  return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
}
4015
4016/// Return a vector containing interleaved elements from multiple
4017/// smaller input vectors.
4018static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
4019 const Twine &Name) {
4020 unsigned Factor = Vals.size();
4021 assert(Factor > 1 && "Tried to interleave invalid number of vectors");
4022
4023 VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
4024#ifndef NDEBUG
4025 for (Value *Val : Vals)
4026 assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
4027#endif
4028
4029 // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
4030 // must use intrinsics to interleave.
4031 if (VecTy->isScalableTy()) {
4032 assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
4033 return Builder.CreateVectorInterleave(Ops: Vals, Name);
4034 }
4035
4036 // Fixed length. Start by concatenating all vectors into a wide vector.
4037 Value *WideVec = concatenateVectors(Builder, Vecs: Vals);
4038
4039 // Interleave the elements into the wide vector.
4040 const unsigned NumElts = VecTy->getElementCount().getFixedValue();
4041 return Builder.CreateShuffleVector(
4042 V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
4043}
4044
4045// Try to vectorize the interleave group that \p Instr belongs to.
4046//
4047// E.g. Translate following interleaved load group (factor = 3):
4048// for (i = 0; i < N; i+=3) {
4049// R = Pic[i]; // Member of index 0
4050// G = Pic[i+1]; // Member of index 1
4051// B = Pic[i+2]; // Member of index 2
4052// ... // do something to R, G, B
4053// }
4054// To:
4055// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4056// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4057// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4058// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4059//
4060// Or translate following interleaved store group (factor = 3):
4061// for (i = 0; i < N; i+=3) {
4062// ... do something to R, G, B
4063// Pic[i] = R; // Member of index 0
4064// Pic[i+1] = G; // Member of index 1
4065// Pic[i+2] = B; // Member of index 2
4066// }
4067// To:
4068// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4069// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4070// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4071// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4072// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);

  VPValue *BlockInMask = getMask();
  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));

  // Builds the mask for the whole interleaved access: for scalable VFs the
  // block mask interleaved with itself Factor times; for fixed VFs the block
  // mask replicated per member, ANDed with the gap mask if one is needed.
  auto CreateGroupMask = [&BlockInMask, &State,
                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *ResBlockInMask = State.get(Def: BlockInMask);
      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
      return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
    }

    // No block mask: the gap mask (possibly null) is the whole mask.
    if (!BlockInMask)
      return MaskForGaps;

    Value *ResBlockInMask = State.get(Def: BlockInMask);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        V: ResBlockInMask,
        Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
        Name: "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
                                                   LHS: ShuffledMask, RHS: MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    Value *MaskForGaps = nullptr;
    if (needsMaskForGaps()) {
      MaskForGaps =
          createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // Emit a masked load when the block is predicated or the group has gaps;
    // otherwise a plain aligned wide load suffices.
    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(MaskForGaps);
      Value *PoisonVec = PoisonValue::get(T: VecTy);
      NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
                                               Alignment: Group->getAlign(), Mask: GroupMask,
                                               PassThru: PoisonVec, Name: "wide.masked.vec");
    } else
      NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
                                                Align: Group->getAlign(), Name: "wide.vec");
    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    ArrayRef<VPRecipeValue *> VPDefs = definedValues();
    if (VecTy->isScalableTy()) {
      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so must use intrinsics to deinterleave.
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      NewLoad = State.Builder.CreateIntrinsic(
          ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
          Types: NewLoad->getType(), Args: NewLoad,
          /*FMFSource=*/nullptr, Name: "strided.vec");
    }

    // Extracts member \p Index's de-interleaved sub-vector: an extractvalue
    // from the deinterleave intrinsic's result for scalable VFs, a strided
    // shuffle of the wide load for fixed VFs.
    auto CreateStridedVector = [&InterleaveFactor, &State,
                                &NewLoad](unsigned Index) -> Value * {
      assert(Index < InterleaveFactor && "Illegal group index");
      if (State.VF.isScalable())
        return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index);

      // For fixed length VF, use shuffle to extract the sub-vectors from the
      // wide load.
      auto StrideMask =
          createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
      return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask,
                                               Name: "strided.vec");
    };

    // Hand each (non-gap) member its sub-vector. J indexes the defined
    // VPValues, which exist only for present members.
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = CreateStridedVector(I);

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");

      State.set(Def: VPDefs[J], V: StridedVec);
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
         "Mismatch between NeedsMaskForGaps and MaskForGaps");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member. Gap slots are filled with
  // poison; StoredIdx advances only for present members since the recipe
  // carries one stored value per member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Fail to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(Index: i);

    // Skip the gaps in the group.
    if (!Member) {
      Value *Undef = PoisonValue::get(T: SubVT);
      StoredVecs.push_back(Elt: Undef);
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");

    // If this member has different type, cast it to a unified type.

    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  // Emit a masked store when the block is predicated or the group has gaps;
  // otherwise a plain aligned wide store suffices.
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());

  applyMetadata(I&: *NewStoreInstr);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStoreInstr);
}
4241
4242#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4243void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4244 VPSlotTracker &SlotTracker) const {
4245 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4246 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4247 IG->getInsertPos()->printAsOperand(O, false);
4248 O << ", ";
4249 getAddr()->printAsOperand(O, SlotTracker);
4250 VPValue *Mask = getMask();
4251 if (Mask) {
4252 O << ", ";
4253 Mask->printAsOperand(O, SlotTracker);
4254 }
4255
4256 unsigned OpIdx = 0;
4257 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4258 if (!IG->getMember(i))
4259 continue;
4260 if (getNumStoreOperands() > 0) {
4261 O << "\n" << Indent << " store ";
4262 getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
4263 O << " to index " << i;
4264 } else {
4265 O << "\n" << Indent << " ";
4266 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4267 O << " = load from index " << i;
4268 }
4269 ++OpIdx;
4270 }
4271}
4272#endif
4273
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert(State.VF.isScalable() &&
         "Only support scalable VF for EVL tail-folding.");
  assert(!needsMaskForGaps() &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(InterleaveFactor <= 8 &&
         "Unsupported deinterleave/interleave factor for scalable vectors");
  ElementCount WideVF = State.VF * InterleaveFactor;
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF);

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  // The wide access covers InterleaveFactor elements per scalar iteration,
  // so the EVL for the vp intrinsics is scaled by the factor (NUW/NSW set).
  Value *InterleaveEVL = State.Builder.CreateMul(
      LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl",
      /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true);
  LLVMContext &Ctx = State.Builder.getContext();

  // Build the group mask: the block mask interleaved with itself Factor
  // times, or an all-true splat when the group is unmasked.
  Value *GroupMask = nullptr;
  if (VPValue *BlockInMask = getMask()) {
    SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask));
    GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
  } else {
    GroupMask =
        State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue());
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    CallInst *NewLoad = State.Builder.CreateIntrinsic(
        RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr,
        Name: "wide.vp.load");
    // Attach the group's alignment to the pointer argument of vp.load.
    NewLoad->addParamAttr(ArgNo: 0,
                          Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
    // so must use intrinsics to deinterleave.
    NewLoad = State.Builder.CreateIntrinsic(
        ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
        Types: NewLoad->getType(), Args: NewLoad,
        /*FMFSource=*/nullptr, Name: "strided.vec");

    // Hand each present member its de-interleaved sub-vector. J indexes the
    // defined VPValues, which exist only for non-gap members.
    const DataLayout &DL = Instr->getDataLayout();
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);
      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I);
      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      State.set(Def: getVPValue(I: J), V: StridedVec);
      ++J;
    }
    return;
  } // End for interleaved load.

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
  // Vectorize the interleaved store group.
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member. Gap slots are filled with
  // poison; StoredIdx advances only for present members.
  SmallVector<Value *, 4> StoredVecs;
  const DataLayout &DL = Instr->getDataLayout();
  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
    Instruction *Member = Group->getMember(Index: I);
    // Skip the gaps in the group.
    if (!Member) {
      StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT));
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    // If this member has different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
    ++StoredIdx;
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  CallInst *NewStore =
      State.Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store,
                                    Args: {IVec, ResAddr, GroupMask, InterleaveEVL});
  // Attach the group's alignment to the pointer argument of vp.store.
  NewStore->addParamAttr(ArgNo: 1,
                         Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

  applyMetadata(I&: *NewStore);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStore);
}
4384
4385#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4386void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
4387 VPSlotTracker &SlotTracker) const {
4388 const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
4389 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
4390 IG->getInsertPos()->printAsOperand(O, false);
4391 O << ", ";
4392 getAddr()->printAsOperand(O, SlotTracker);
4393 O << ", ";
4394 getEVL()->printAsOperand(O, SlotTracker);
4395 if (VPValue *Mask = getMask()) {
4396 O << ", ";
4397 Mask->printAsOperand(O, SlotTracker);
4398 }
4399
4400 unsigned OpIdx = 0;
4401 for (unsigned i = 0; i < IG->getFactor(); ++i) {
4402 if (!IG->getMember(i))
4403 continue;
4404 if (getNumStoreOperands() > 0) {
4405 O << "\n" << Indent << " vp.store ";
4406 getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
4407 O << " to index " << i;
4408 } else {
4409 O << "\n" << Indent << " ";
4410 getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
4411 O << " = vp.load from index " << i;
4412 }
4413 ++OpIdx;
4414 }
4415}
4416#endif
4417
4418InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
4419 VPCostContext &Ctx) const {
4420 Instruction *InsertPos = getInsertPos();
4421 // Find the VPValue index of the interleave group. We need to skip gaps.
4422 unsigned InsertPosIdx = 0;
4423 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4424 if (auto *Member = IG->getMember(Index: Idx)) {
4425 if (Member == InsertPos)
4426 break;
4427 InsertPosIdx++;
4428 }
4429 Type *ValTy = Ctx.Types.inferScalarType(
4430 V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx)
4431 : getStoredValues()[InsertPosIdx]);
4432 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4433 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
4434 ->getAddressSpace();
4435
4436 unsigned InterleaveFactor = IG->getFactor();
4437 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
4438
4439 // Holds the indices of existing members in the interleaved group.
4440 SmallVector<unsigned, 4> Indices;
4441 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4442 if (IG->getMember(Index: IF))
4443 Indices.push_back(Elt: IF);
4444
4445 // Calculate the cost of the whole interleaved group.
4446 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4447 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices,
4448 Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps);
4449
4450 if (!IG->isReverse())
4451 return Cost;
4452
4453 return Cost + IG->getNumMembers() *
4454 Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse,
4455 DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind,
4456 Index: 0);
4457}
4458
4459#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  // Debug dump: "EMIT <result> = CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4467#endif
4468
4469bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
4470 return vputils::onlyScalarValuesUsed(Def: this) &&
4471 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
4472}
4473
4474#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4475void VPWidenPointerInductionRecipe::printRecipe(
4476 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
4477 assert((getNumOperands() == 3 || getNumOperands() == 5) &&
4478 "unexpected number of operands");
4479 O << Indent << "EMIT ";
4480 printAsOperand(O, SlotTracker);
4481 O << " = WIDEN-POINTER-INDUCTION ";
4482 getStartValue()->printAsOperand(O, SlotTracker);
4483 O << ", ";
4484 getStepValue()->printAsOperand(O, SlotTracker);
4485 O << ", ";
4486 getOperand(2)->printAsOperand(O, SlotTracker);
4487 if (getNumOperands() == 5) {
4488 O << ", ";
4489 getOperand(3)->printAsOperand(O, SlotTracker);
4490 O << ", ";
4491 getOperand(4)->printAsOperand(O, SlotTracker);
4492 }
4493}
4494
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Debug dump: "EMIT <result> = EXPAND SCEV <expression>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
4501#endif
4502
4503void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
4504 Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
4505 Type *STy = CanonicalIV->getType();
4506 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
4507 ElementCount VF = State.VF;
4508 Value *VStart = VF.isScalar()
4509 ? CanonicalIV
4510 : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast");
4511 Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: getUnrollPart(U: *this));
4512 if (VF.isVector()) {
4513 VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
4514 VStep =
4515 Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
4516 }
4517 Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv");
4518 State.set(Def: this, V: CanonicalVectorIV);
4519}
4520
4521#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  // Debug dump: "EMIT <result> = WIDEN-CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4529#endif
4530
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  // For VF = 1 the recurrence stays scalar; otherwise it is carried as a
  // vector of VF lanes.
  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  if (State.VF.isVector()) {
    // Insert the start value into the last lane of a poison vector. The
    // insert is emitted in the preheader (guarded insert point) so it
    // dominates the phi created below.
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    // Last lane index is RuntimeVF - 1 (VF may be scalable, hence runtime).
    auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
    auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
    VectorInit = Builder.CreateInsertElement(
        Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
  Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(V: VectorInit, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
4559
4560InstructionCost
4561VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
4562 VPCostContext &Ctx) const {
4563 if (VF.isScalar())
4564 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4565
4566 return 0;
4567}
4568
4569#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  // Debug dump: "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4577#endif
4578
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  // In-loop reductions (and VF = 1) keep the reduction value scalar.
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
  Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
  State.set(Def: this, V: Phi, IsScalar: isInLoop());

  // Only the preheader incoming value is added here; the back-edge value is
  // filled in later (stage #2 of phi vectorization).
  Phi->addIncoming(V: StartV, BB: VectorPH);
}
4603
4604#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  // Debug dump: "WIDEN-REDUCTION-PHI <result> = phi<flags> <operands>".
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printOperands(O, SlotTracker);
  // A scale factor > 1 means the phi operates at a fraction of the VF.
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
4616#endif
4617
4618void VPWidenPHIRecipe::execute(VPTransformState &State) {
4619 Value *Op0 = State.get(Def: getOperand(N: 0));
4620 Type *VecTy = Op0->getType();
4621 Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name);
4622 State.set(Def: this, V: VecPhi);
4623}
4624
InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
                                              VPCostContext &Ctx) const {
  // A widened phi lowers to a single IR phi; cost it as such.
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
4629
4630#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  // Debug dump: "WIDEN-PHI <result> = phi <incoming values>".
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
4639#endif
4640
4641void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
4642 BasicBlock *VectorPH =
4643 State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
4644 Value *StartMask = State.get(Def: getOperand(N: 0));
4645 PHINode *Phi =
4646 State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
4647 Phi->addIncoming(V: StartMask, BB: VectorPH);
4648 State.set(Def: this, V: Phi);
4649}
4650
4651#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  // Debug dump: "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4660#endif
4661
4662#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  // Debug dump: "CURRENT-ITERATION-PHI <result> = phi <operands>".
  O << Indent << "CURRENT-ITERATION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4671#endif
4672