1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "LoopVectorizationPlanner.h"
15#include "VPlan.h"
16#include "VPlanAnalysis.h"
17#include "VPlanHelpers.h"
18#include "VPlanPatternMatch.h"
19#include "VPlanUtils.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/Twine.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/IVDescriptors.h"
25#include "llvm/Analysis/LoopInfo.h"
26#include "llvm/Analysis/ScalarEvolutionExpressions.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/Instruction.h"
30#include "llvm/IR/Instructions.h"
31#include "llvm/IR/Intrinsics.h"
32#include "llvm/IR/Type.h"
33#include "llvm/IR/Value.h"
34#include "llvm/Support/Casting.h"
35#include "llvm/Support/CommandLine.h"
36#include "llvm/Support/Debug.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Transforms/Utils/BasicBlockUtils.h"
39#include "llvm/Transforms/Utils/LoopUtils.h"
40#include <cassert>
41
42using namespace llvm;
43using namespace llvm::VPlanPatternMatch;
44
45using VectorParts = SmallVector<Value *, 2>;
46
47#define LV_NAME "loop-vectorize"
48#define DEBUG_TYPE LV_NAME
49
bool VPRecipeBase::mayWriteToMemory() const {
  // Conservatively determine whether executing this recipe may write to
  // memory, dispatching on the recipe kind. Unknown recipe kinds fall through
  // to the default case and are treated as possibly writing.
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // Expression recipes only expose a combined read-or-write query.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC: {
    auto *VPI = cast<VPInstruction>(Val: this);
    // Loads read from memory but don't write to memory.
    if (VPI->getOpcode() == Instruction::Load)
      return false;
    return VPI->opcodeMayReadOrWriteFromMemory();
  }
  case VPInterleaveEVLSC:
  case VPInterleaveSC:
    // An interleave group writes iff it contains at least one store member.
    return cast<VPInterleaveBase>(Val: this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes behave like their underlying IR instruction.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPWidenCallSC:
    // A widened call may write unless the callee is known read-only.
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyReadsMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayWriteToMemory();
  // The recipes below never write to memory.
  case VPActiveLaneMaskPHISC:
  case VPCanonicalIVPHISC:
  case VPCurrentIterationPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipes are known not to write; assert that any underlying IR
    // instruction agrees with that classification.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}
110
bool VPRecipeBase::mayReadFromMemory() const {
  // Conservatively determine whether executing this recipe may read from
  // memory, dispatching on the recipe kind. Unknown recipe kinds fall through
  // to the default case and are treated as possibly reading.
  switch (getVPRecipeID()) {
  case VPExpressionSC:
    // Expression recipes only expose a combined read-or-write query.
    return cast<VPExpressionRecipe>(Val: this)->mayReadOrWriteMemory();
  case VPInstructionSC:
    return cast<VPInstruction>(Val: this)->opcodeMayReadOrWriteFromMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
    // Replicated recipes behave like their underlying IR instruction.
    return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPWidenCallSC:
    // A widened call may read unless the callee is known write-only.
    return !cast<VPWidenCallRecipe>(Val: this)
                ->getCalledScalarFunction()
                ->onlyWritesMemory();
  case VPWidenIntrinsicSC:
    return cast<VPWidenIntrinsicRecipe>(Val: this)->mayReadFromMemory();
  // The recipes below never read from memory (stores only write).
  case VPCanonicalIVPHISC:
  case VPBranchOnMaskSC:
  case VPDerivedIVSC:
  case VPCurrentIterationPHISC:
  case VPFirstOrderRecurrencePHISC:
  case VPReductionPHISC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionEVLSC:
  case VPReductionSC:
  case VPVectorPointerSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC: {
    // These recipes are known not to read; assert that any underlying IR
    // instruction agrees with that classification.
    const Instruction *I =
        dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    // FIXME: Return false if the recipe represents an interleaved store.
    return true;
  }
}
163
164bool VPRecipeBase::mayHaveSideEffects() const {
165 switch (getVPRecipeID()) {
166 case VPExpressionSC:
167 return cast<VPExpressionRecipe>(Val: this)->mayHaveSideEffects();
168 case VPActiveLaneMaskPHISC:
169 case VPDerivedIVSC:
170 case VPCurrentIterationPHISC:
171 case VPFirstOrderRecurrencePHISC:
172 case VPReductionPHISC:
173 case VPPredInstPHISC:
174 case VPVectorEndPointerSC:
175 return false;
176 case VPInstructionSC: {
177 auto *VPI = cast<VPInstruction>(Val: this);
178 return mayWriteToMemory() ||
179 VPI->getOpcode() == VPInstruction::BranchOnCount ||
180 VPI->getOpcode() == VPInstruction::BranchOnCond ||
181 VPI->getOpcode() == VPInstruction::BranchOnTwoConds;
182 }
183 case VPWidenCallSC: {
184 Function *Fn = cast<VPWidenCallRecipe>(Val: this)->getCalledScalarFunction();
185 return mayWriteToMemory() || !Fn->doesNotThrow() || !Fn->willReturn();
186 }
187 case VPWidenIntrinsicSC:
188 return cast<VPWidenIntrinsicRecipe>(Val: this)->mayHaveSideEffects();
189 case VPBlendSC:
190 case VPReductionEVLSC:
191 case VPReductionSC:
192 case VPScalarIVStepsSC:
193 case VPVectorPointerSC:
194 case VPWidenCanonicalIVSC:
195 case VPWidenCastSC:
196 case VPWidenGEPSC:
197 case VPWidenIntOrFpInductionSC:
198 case VPWidenPHISC:
199 case VPWidenPointerInductionSC:
200 case VPWidenSC: {
201 const Instruction *I =
202 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
203 (void)I;
204 assert((!I || !I->mayHaveSideEffects()) &&
205 "underlying instruction has side-effects");
206 return false;
207 }
208 case VPInterleaveEVLSC:
209 case VPInterleaveSC:
210 return mayWriteToMemory();
211 case VPWidenLoadEVLSC:
212 case VPWidenLoadSC:
213 case VPWidenStoreEVLSC:
214 case VPWidenStoreSC:
215 assert(
216 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
217 mayWriteToMemory() &&
218 "mayHaveSideffects result for ingredient differs from this "
219 "implementation");
220 return mayWriteToMemory();
221 case VPReplicateSC: {
222 auto *R = cast<VPReplicateRecipe>(Val: this);
223 return R->getUnderlyingInstr()->mayHaveSideEffects();
224 }
225 default:
226 return true;
227 }
228}
229
230void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
231 assert(!Parent && "Recipe already in some VPBasicBlock");
232 assert(InsertPos->getParent() &&
233 "Insertion position not in any VPBasicBlock");
234 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
235}
236
237void VPRecipeBase::insertBefore(VPBasicBlock &BB,
238 iplist<VPRecipeBase>::iterator I) {
239 assert(!Parent && "Recipe already in some VPBasicBlock");
240 assert(I == BB.end() || I->getParent() == &BB);
241 BB.insert(Recipe: this, InsertPt: I);
242}
243
244void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
245 assert(!Parent && "Recipe already in some VPBasicBlock");
246 assert(InsertPos->getParent() &&
247 "Insertion position not in any VPBasicBlock");
248 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
249}
250
251void VPRecipeBase::removeFromParent() {
252 assert(getParent() && "Recipe not in any VPBasicBlock");
253 getParent()->getRecipeList().remove(IT: getIterator());
254 Parent = nullptr;
255}
256
257iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
258 assert(getParent() && "Recipe not in any VPBasicBlock");
259 return getParent()->getRecipeList().erase(where: getIterator());
260}
261
262void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
263 removeFromParent();
264 insertAfter(InsertPos);
265}
266
267void VPRecipeBase::moveBefore(VPBasicBlock &BB,
268 iplist<VPRecipeBase>::iterator I) {
269 removeFromParent();
270 insertBefore(BB, I);
271}
272
273InstructionCost VPRecipeBase::cost(ElementCount VF, VPCostContext &Ctx) {
274 // Get the underlying instruction for the recipe, if there is one. It is used
275 // to
276 // * decide if cost computation should be skipped for this recipe,
277 // * apply forced target instruction cost.
278 Instruction *UI = nullptr;
279 if (auto *S = dyn_cast<VPSingleDefRecipe>(Val: this))
280 UI = dyn_cast_or_null<Instruction>(Val: S->getUnderlyingValue());
281 else if (auto *IG = dyn_cast<VPInterleaveBase>(Val: this))
282 UI = IG->getInsertPos();
283 else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(Val: this))
284 UI = &WidenMem->getIngredient();
285
286 InstructionCost RecipeCost;
287 if (UI && Ctx.skipCostComputation(UI, IsVector: VF.isVector())) {
288 RecipeCost = 0;
289 } else {
290 RecipeCost = computeCost(VF, Ctx);
291 if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
292 RecipeCost.isValid()) {
293 if (UI)
294 RecipeCost = InstructionCost(ForceTargetInstructionCost);
295 else
296 RecipeCost = InstructionCost(0);
297 }
298 }
299
300 LLVM_DEBUG({
301 dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
302 dump();
303 });
304 return RecipeCost;
305}
306
// Base implementation is never called directly; every concrete recipe that
// participates in cost modeling must override computeCost.
InstructionCost VPRecipeBase::computeCost(ElementCount VF,
                                          VPCostContext &Ctx) const {
  llvm_unreachable("subclasses should implement computeCost");
}
311
312bool VPRecipeBase::isPhi() const {
313 return (getVPRecipeID() >= VPFirstPHISC && getVPRecipeID() <= VPLastPHISC) ||
314 isa<VPPhi, VPIRPhi>(Val: this);
315}
316
317bool VPRecipeBase::isScalarCast() const {
318 auto *VPI = dyn_cast<VPInstruction>(Val: this);
319 return VPI && Instruction::isCast(Opcode: VPI->getOpcode());
320}
321
void VPIRFlags::intersectFlags(const VPIRFlags &Other) {
  // Intersect (logical AND) the poison-generating/FP flags of this recipe
  // with those of \p Other, keeping only flags present on both. Properties
  // that cannot be dropped (compare predicates, reduction kind/ordering) are
  // asserted to match instead of being intersected.
  assert(OpType == Other.OpType && "OpType must match");
  switch (OpType) {
  case OperationType::OverflowingBinOp:
    WrapFlags.HasNUW &= Other.WrapFlags.HasNUW;
    WrapFlags.HasNSW &= Other.WrapFlags.HasNSW;
    break;
  case OperationType::Trunc:
    TruncFlags.HasNUW &= Other.TruncFlags.HasNUW;
    TruncFlags.HasNSW &= Other.TruncFlags.HasNSW;
    break;
  case OperationType::DisjointOp:
    DisjointFlags.IsDisjoint &= Other.DisjointFlags.IsDisjoint;
    break;
  case OperationType::PossiblyExactOp:
    ExactFlags.IsExact &= Other.ExactFlags.IsExact;
    break;
  case OperationType::GEPOp:
    // GEP no-wrap flags are stored as a bit mask; intersect bitwise.
    GEPFlagsStorage &= Other.GEPFlagsStorage;
    break;
  case OperationType::FPMathOp:
  case OperationType::FCmp:
    // For FCmp the predicate itself must agree; only the NaN/Inf fast-math
    // flags participate in the intersection here.
    assert((OpType != OperationType::FCmp ||
            FCmpFlags.CmpPredStorage == Other.FCmpFlags.CmpPredStorage) &&
           "Cannot drop CmpPredicate");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::NonNegOp:
    NonNegFlags.NonNeg &= Other.NonNegFlags.NonNeg;
    break;
  case OperationType::Cmp:
    // Integer compares carry no droppable flags; the predicate must match.
    assert(CmpPredStorage == Other.CmpPredStorage &&
           "Cannot drop CmpPredicate");
    break;
  case OperationType::ReductionOp:
    // Reduction kind and ordering are structural and must be identical; only
    // the NaN/Inf fast-math flags are intersected.
    assert(ReductionFlags.Kind == Other.ReductionFlags.Kind &&
           "Cannot change RecurKind");
    assert(ReductionFlags.IsOrdered == Other.ReductionFlags.IsOrdered &&
           "Cannot change IsOrdered");
    assert(ReductionFlags.IsInLoop == Other.ReductionFlags.IsInLoop &&
           "Cannot change IsInLoop");
    getFMFsRef().NoNaNs &= Other.getFMFsRef().NoNaNs;
    getFMFsRef().NoInfs &= Other.getFMFsRef().NoInfs;
    break;
  case OperationType::Other:
    // No flags to intersect.
    break;
  }
}
371
372FastMathFlags VPIRFlags::getFastMathFlags() const {
373 assert((OpType == OperationType::FPMathOp || OpType == OperationType::FCmp ||
374 OpType == OperationType::ReductionOp ||
375 OpType == OperationType::Other) &&
376 "recipe doesn't have fast math flags");
377 if (OpType == OperationType::Other)
378 return FastMathFlags();
379 const FastMathFlagsTy &F = getFMFsRef();
380 FastMathFlags Res;
381 Res.setAllowReassoc(F.AllowReassoc);
382 Res.setNoNaNs(F.NoNaNs);
383 Res.setNoInfs(F.NoInfs);
384 Res.setNoSignedZeros(F.NoSignedZeros);
385 Res.setAllowReciprocal(F.AllowReciprocal);
386 Res.setAllowContract(F.AllowContract);
387 Res.setApproxFunc(F.ApproxFunc);
388 return Res;
389}
390
391#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug dump; forwards to the VPRecipeBase implementation.
void VPSingleDefRecipe::dump() const { VPRecipeBase::dump(); }
393
394void VPRecipeBase::print(raw_ostream &O, const Twine &Indent,
395 VPSlotTracker &SlotTracker) const {
396 printRecipe(O, Indent, SlotTracker);
397 if (auto DL = getDebugLoc()) {
398 O << ", !dbg ";
399 DL.print(O);
400 }
401
402 if (auto *Metadata = dyn_cast<VPIRMetadata>(this))
403 Metadata->print(O, SlotTracker);
404}
405#endif
406
407template <unsigned PartOpIdx>
408VPValue *
409VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(const VPUser &U) const {
410 if (U.getNumOperands() == PartOpIdx + 1)
411 return U.getOperand(N: PartOpIdx);
412 return nullptr;
413}
414
415template <unsigned PartOpIdx>
416unsigned VPUnrollPartAccessor<PartOpIdx>::getUnrollPart(const VPUser &U) const {
417 if (auto *UnrollPartOp = getUnrollPartOperand(U))
418 return cast<VPConstantInt>(UnrollPartOp)->getZExtValue();
419 return 0;
420}
421
namespace llvm {
// Explicitly instantiate the accessor for the unroll-part operand positions
// used by recipes in this file.
template class VPUnrollPartAccessor<1>;
template class VPUnrollPartAccessor<2>;
template class VPUnrollPartAccessor<3>;
} // namespace llvm
427
// Construct a VPInstruction with the given opcode, operands, IR flags,
// metadata, debug location and name, validating in assertion builds that the
// flags and operand count are consistent with the opcode.
VPInstruction::VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands,
                             const VPIRFlags &Flags, const VPIRMetadata &MD,
                             DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPRecipeBase::VPInstructionSC, Operands, Flags, DL),
      VPIRMetadata(MD), Opcode(Opcode), Name(Name.str()) {
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  // -1u means the operand count cannot be derived from the opcode alone; a
  // masked instruction carries one extra (mask) operand.
  assert((getNumOperandsForOpcode() == -1u ||
          getNumOperandsForOpcode() == getNumOperands() ||
          (isMasked() && getNumOperandsForOpcode() + 1 == getNumOperands())) &&
         "number of operands does not match opcode");
}
442
// Return the number of operands determined by this VPInstruction's opcode,
// or -1u if the opcode admits a variable number of operands. Used by the
// constructor assertion to sanity-check operand counts.
unsigned VPInstruction::getNumOperandsForOpcode() const {
  if (Instruction::isUnaryOp(Opcode) || Instruction::isCast(Opcode))
    return 1;

  if (Instruction::isBinaryOp(Opcode))
    return 2;

  switch (Opcode) {
  // Nullary opcodes.
  case VPInstruction::StepVector:
  case VPInstruction::VScale:
    return 0;
  // Unary opcodes.
  case Instruction::Alloca:
  case Instruction::ExtractValue:
  case Instruction::Freeze:
  case Instruction::Load:
  case VPInstruction::BranchOnCond:
  case VPInstruction::Broadcast:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::ResumeForEpilogue:
  case VPInstruction::Reverse:
  case VPInstruction::Unpack:
    return 1;
  // Binary opcodes.
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::ExtractElement:
  case Instruction::Store:
  case VPInstruction::BranchOnCount:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::CalculateTripCountMinusVF:
    return 2;
  // Ternary opcodes.
  case Instruction::Select:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ReductionStartVector:
    return 3;
  case Instruction::Call: {
    // For unmasked calls, the last argument will the called function. Use that
    // to compute the number of operands without the mask.
    VPValue *LastOp = getOperand(N: getNumOperands() - 1);
    if (isa<VPIRValue>(Val: LastOp) && isa<Function>(Val: LastOp->getLiveInIRValue()))
      return getNumOperands();
    return getNumOperands() - 1;
  }
  // Variadic opcodes.
  case Instruction::GetElementPtr:
  case Instruction::PHI:
  case Instruction::Switch:
  case VPInstruction::AnyOf:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeAnyOfResult:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::SLPLoad:
  case VPInstruction::SLPStore:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastActive:
    // Cannot determine the number of operands from the opcode.
    return -1u;
  }
  llvm_unreachable("all cases should be handled above");
}
517
518bool VPInstruction::doesGeneratePerAllLanes() const {
519 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
520}
521
// Returns true if this VPInstruction is able to produce a scalar value for
// the first lane only, instead of generating a full vector.
bool VPInstruction::canGenerateScalarForFirstLane() const {
  // All binary ops and casts can be evaluated lane-wise.
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return true;
  // Recipes that are inherently scalar-producing trivially qualify.
  if (isSingleScalar() || isVectorToScalar())
    return true;
  switch (Opcode) {
  case Instruction::Freeze:
  case Instruction::ICmp:
  case Instruction::PHI:
  case Instruction::Select:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::AnyOf:
  case VPInstruction::Not:
    return true;
  default:
    return false;
  }
}
546
547Value *VPInstruction::generate(VPTransformState &State) {
548 IRBuilderBase &Builder = State.Builder;
549
550 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
551 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
552 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
553 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
554 auto *Res =
555 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
556 if (auto *I = dyn_cast<Instruction>(Val: Res))
557 applyFlags(I&: *I);
558 return Res;
559 }
560
561 switch (getOpcode()) {
562 case VPInstruction::Not: {
563 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
564 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
565 return Builder.CreateNot(V: A, Name);
566 }
567 case Instruction::ExtractElement: {
568 assert(State.VF.isVector() && "Only extract elements from vectors");
569 if (auto *Idx = dyn_cast<VPConstantInt>(Val: getOperand(N: 1)))
570 return State.get(Def: getOperand(N: 0), Lane: VPLane(Idx->getZExtValue()));
571 Value *Vec = State.get(Def: getOperand(N: 0));
572 Value *Idx = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
573 return Builder.CreateExtractElement(Vec, Idx, Name);
574 }
575 case Instruction::Freeze: {
576 Value *Op = State.get(Def: getOperand(N: 0), IsScalar: vputils::onlyFirstLaneUsed(Def: this));
577 return Builder.CreateFreeze(V: Op, Name);
578 }
579 case Instruction::FCmp:
580 case Instruction::ICmp: {
581 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
582 Value *A = State.get(Def: getOperand(N: 0), IsScalar: OnlyFirstLaneUsed);
583 Value *B = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
584 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
585 }
586 case Instruction::PHI: {
587 llvm_unreachable("should be handled by VPPhi::execute");
588 }
589 case Instruction::Select: {
590 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
591 Value *Cond =
592 State.get(Def: getOperand(N: 0),
593 IsScalar: OnlyFirstLaneUsed || vputils::isSingleScalar(VPV: getOperand(N: 0)));
594 Value *Op1 = State.get(Def: getOperand(N: 1), IsScalar: OnlyFirstLaneUsed);
595 Value *Op2 = State.get(Def: getOperand(N: 2), IsScalar: OnlyFirstLaneUsed);
596 return Builder.CreateSelectFMF(C: Cond, True: Op1, False: Op2, FMFSource: getFastMathFlags(), Name);
597 }
598 case VPInstruction::ActiveLaneMask: {
599 // Get first lane of vector induction variable.
600 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
601 // Get the original loop tripcount.
602 Value *ScalarTC = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
603
604 // If this part of the active lane mask is scalar, generate the CMP directly
605 // to avoid unnecessary extracts.
606 if (State.VF.isScalar())
607 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
608 Name);
609
610 ElementCount EC = State.VF.multiplyCoefficientBy(
611 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
612 auto *PredTy = VectorType::get(ElementType: Builder.getInt1Ty(), EC);
613 return Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
614 Types: {PredTy, ScalarTC->getType()},
615 Args: {VIVElem0, ScalarTC}, FMFSource: nullptr, Name);
616 }
617 case VPInstruction::FirstOrderRecurrenceSplice: {
618 // Generate code to combine the previous and current values in vector v3.
619 //
620 // vector.ph:
621 // v_init = vector(..., ..., ..., a[-1])
622 // br vector.body
623 //
624 // vector.body
625 // i = phi [0, vector.ph], [i+4, vector.body]
626 // v1 = phi [v_init, vector.ph], [v2, vector.body]
627 // v2 = a[i, i+1, i+2, i+3];
628 // v3 = vector(v1(3), v2(0, 1, 2))
629
630 auto *V1 = State.get(Def: getOperand(N: 0));
631 if (!V1->getType()->isVectorTy())
632 return V1;
633 Value *V2 = State.get(Def: getOperand(N: 1));
634 return Builder.CreateVectorSpliceRight(V1, V2, Offset: 1, Name);
635 }
636 case VPInstruction::CalculateTripCountMinusVF: {
637 Value *ScalarTC = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
638 Value *VFxUF = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
639 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: VFxUF);
640 Value *Cmp =
641 Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: VFxUF);
642 Value *Zero = ConstantInt::getNullValue(Ty: ScalarTC->getType());
643 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
644 }
645 case VPInstruction::ExplicitVectorLength: {
646 // TODO: Restructure this code with an explicit remainder loop, vsetvli can
647 // be outside of the main loop.
648 Value *AVL = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
649 // Compute EVL
650 assert(AVL->getType()->isIntegerTy() &&
651 "Requested vector length should be an integer.");
652
653 assert(State.VF.isScalable() && "Expected scalable vector factor.");
654 Value *VFArg = Builder.getInt32(C: State.VF.getKnownMinValue());
655
656 Value *EVL = Builder.CreateIntrinsic(
657 RetTy: Builder.getInt32Ty(), ID: Intrinsic::experimental_get_vector_length,
658 Args: {AVL, VFArg, Builder.getTrue()});
659 return EVL;
660 }
661 case VPInstruction::CanonicalIVIncrementForPart: {
662 auto *IV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
663 auto *VFxPart = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
664 // The canonical IV is incremented by the vectorization factor (num of
665 // SIMD elements) times the unroll part.
666 return Builder.CreateAdd(LHS: IV, RHS: VFxPart, Name, HasNUW: hasNoUnsignedWrap(),
667 HasNSW: hasNoSignedWrap());
668 }
669 case VPInstruction::BranchOnCond: {
670 Value *Cond = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
671 // Replace the temporary unreachable terminator with a new conditional
672 // branch, hooking it up to backward destination for latch blocks now, and
673 // to forward destination(s) later when they are created.
674 // Second successor may be backwards - iff it is already in VPBB2IRBB.
675 VPBasicBlock *SecondVPSucc =
676 cast<VPBasicBlock>(Val: getParent()->getSuccessors()[1]);
677 BasicBlock *SecondIRSucc = State.CFG.VPBB2IRBB.lookup(Val: SecondVPSucc);
678 BasicBlock *IRBB = State.CFG.VPBB2IRBB[getParent()];
679 auto *Br = Builder.CreateCondBr(Cond, True: IRBB, False: SecondIRSucc);
680 // First successor is always forward, reset it to nullptr.
681 Br->setSuccessor(idx: 0, NewSucc: nullptr);
682 IRBB->getTerminator()->eraseFromParent();
683 applyMetadata(I&: *Br);
684 return Br;
685 }
686 case VPInstruction::Broadcast: {
687 return Builder.CreateVectorSplat(
688 EC: State.VF, V: State.get(Def: getOperand(N: 0), /*IsScalar*/ true), Name: "broadcast");
689 }
690 case VPInstruction::BuildStructVector: {
691 // For struct types, we need to build a new 'wide' struct type, where each
692 // element is widened, i.e., we create a struct of vectors.
693 auto *StructTy =
694 cast<StructType>(Val: State.TypeAnalysis.inferScalarType(V: getOperand(N: 0)));
695 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: StructTy, EC: State.VF));
696 for (const auto &[LaneIndex, Op] : enumerate(First: operands())) {
697 for (unsigned FieldIndex = 0; FieldIndex != StructTy->getNumElements();
698 FieldIndex++) {
699 Value *ScalarValue =
700 Builder.CreateExtractValue(Agg: State.get(Def: Op, IsScalar: true), Idxs: FieldIndex);
701 Value *VectorValue = Builder.CreateExtractValue(Agg: Res, Idxs: FieldIndex);
702 VectorValue =
703 Builder.CreateInsertElement(Vec: VectorValue, NewElt: ScalarValue, Idx: LaneIndex);
704 Res = Builder.CreateInsertValue(Agg: Res, Val: VectorValue, Idxs: FieldIndex);
705 }
706 }
707 return Res;
708 }
709 case VPInstruction::BuildVector: {
710 auto *ScalarTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
711 auto NumOfElements = ElementCount::getFixed(MinVal: getNumOperands());
712 Value *Res = PoisonValue::get(T: toVectorizedTy(Ty: ScalarTy, EC: NumOfElements));
713 for (const auto &[Idx, Op] : enumerate(First: operands()))
714 Res = Builder.CreateInsertElement(Vec: Res, NewElt: State.get(Def: Op, IsScalar: true),
715 Idx: Builder.getInt32(C: Idx));
716 return Res;
717 }
718 case VPInstruction::ReductionStartVector: {
719 if (State.VF.isScalar())
720 return State.get(Def: getOperand(N: 0), IsScalar: true);
721 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
722 Builder.setFastMathFlags(getFastMathFlags());
723 // If this start vector is scaled then it should produce a vector with fewer
724 // elements than the VF.
725 ElementCount VF = State.VF.divideCoefficientBy(
726 RHS: cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue());
727 auto *Iden = Builder.CreateVectorSplat(EC: VF, V: State.get(Def: getOperand(N: 1), IsScalar: true));
728 return Builder.CreateInsertElement(Vec: Iden, NewElt: State.get(Def: getOperand(N: 0), IsScalar: true),
729 Idx: Builder.getInt32(C: 0));
730 }
731 case VPInstruction::ComputeAnyOfResult: {
732 Value *Start = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
733 Value *NewVal = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
734 Value *ReducedResult = State.get(Def: getOperand(N: 2));
735 for (unsigned Idx = 3; Idx < getNumOperands(); ++Idx)
736 ReducedResult =
737 Builder.CreateBinOp(Opc: Instruction::Or, LHS: State.get(Def: getOperand(N: Idx)),
738 RHS: ReducedResult, Name: "bin.rdx");
739 // If any predicate is true it means that we want to select the new value.
740 if (ReducedResult->getType()->isVectorTy())
741 ReducedResult = Builder.CreateOrReduce(Src: ReducedResult);
742 // The compares in the loop may yield poison, which propagates through the
743 // bitwise ORs. Freeze it here before the condition is used.
744 ReducedResult = Builder.CreateFreeze(V: ReducedResult);
745 return Builder.CreateSelect(C: ReducedResult, True: NewVal, False: Start, Name: "rdx.select");
746 }
747 case VPInstruction::ComputeReductionResult: {
748 RecurKind RK = getRecurKind();
749 bool IsOrdered = isReductionOrdered();
750 bool IsInLoop = isReductionInLoop();
751 assert(!RecurrenceDescriptor::isFindIVRecurrenceKind(RK) &&
752 "FindIV should use min/max reduction kinds");
753
754 // The recipe may have multiple operands to be reduced together.
755 unsigned NumOperandsToReduce = getNumOperands();
756 VectorParts RdxParts(NumOperandsToReduce);
757 for (unsigned Part = 0; Part < NumOperandsToReduce; ++Part)
758 RdxParts[Part] = State.get(Def: getOperand(N: Part), IsScalar: IsInLoop);
759
760 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
761 Builder.setFastMathFlags(getFastMathFlags());
762
763 // Reduce multiple operands into one.
764 Value *ReducedPartRdx = RdxParts[0];
765 if (IsOrdered) {
766 ReducedPartRdx = RdxParts[NumOperandsToReduce - 1];
767 } else {
768 // Floating-point operations should have some FMF to enable the reduction.
769 for (unsigned Part = 1; Part < NumOperandsToReduce; ++Part) {
770 Value *RdxPart = RdxParts[Part];
771 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
772 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
773 else {
774 // For sub-recurrences, each part's reduction variable is already
775 // negative, we need to do: reduce.add(-acc_uf0 + -acc_uf1)
776 Instruction::BinaryOps Opcode =
777 RK == RecurKind::Sub
778 ? Instruction::Add
779 : (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK);
780 ReducedPartRdx =
781 Builder.CreateBinOp(Opc: Opcode, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
782 }
783 }
784 }
785
786 // Create the reduction after the loop. Note that inloop reductions create
787 // the target reduction in the loop using a Reduction recipe.
788 if (State.VF.isVector() && !IsInLoop) {
789 // TODO: Support in-order reductions based on the recurrence descriptor.
790 // All ops in the reduction inherit fast-math-flags from the recurrence
791 // descriptor.
792 ReducedPartRdx = createSimpleReduction(B&: Builder, Src: ReducedPartRdx, RdxKind: RK);
793 }
794
795 return ReducedPartRdx;
796 }
797 case VPInstruction::ExtractLastLane:
798 case VPInstruction::ExtractPenultimateElement: {
799 unsigned Offset =
800 getOpcode() == VPInstruction::ExtractPenultimateElement ? 2 : 1;
801 Value *Res;
802 if (State.VF.isVector()) {
803 assert(Offset <= State.VF.getKnownMinValue() &&
804 "invalid offset to extract from");
805 // Extract lane VF - Offset from the operand.
806 Res = State.get(Def: getOperand(N: 0), Lane: VPLane::getLaneFromEnd(VF: State.VF, Offset));
807 } else {
808 // TODO: Remove ExtractLastLane for scalar VFs.
809 assert(Offset <= 1 && "invalid offset to extract from");
810 Res = State.get(Def: getOperand(N: 0));
811 }
812 if (isa<ExtractElementInst>(Val: Res))
813 Res->setName(Name);
814 return Res;
815 }
816 case VPInstruction::LogicalAnd: {
817 Value *A = State.get(Def: getOperand(N: 0));
818 Value *B = State.get(Def: getOperand(N: 1));
819 return Builder.CreateLogicalAnd(Cond1: A, Cond2: B, Name);
820 }
821 case VPInstruction::LogicalOr: {
822 Value *A = State.get(Def: getOperand(N: 0));
823 Value *B = State.get(Def: getOperand(N: 1));
824 return Builder.CreateLogicalOr(Cond1: A, Cond2: B, Name);
825 }
826 case VPInstruction::PtrAdd: {
827 assert((State.VF.isScalar() || vputils::onlyFirstLaneUsed(this)) &&
828 "can only generate first lane for PtrAdd");
829 Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
830 Value *Addend = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
831 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
832 }
833 case VPInstruction::WidePtrAdd: {
834 Value *Ptr =
835 State.get(Def: getOperand(N: 0), IsScalar: vputils::isSingleScalar(VPV: getOperand(N: 0)));
836 Value *Addend = State.get(Def: getOperand(N: 1));
837 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name, NW: getGEPNoWrapFlags());
838 }
839 case VPInstruction::AnyOf: {
840 Value *Res = Builder.CreateFreeze(V: State.get(Def: getOperand(N: 0)));
841 for (VPValue *Op : drop_begin(RangeOrContainer: operands()))
842 Res = Builder.CreateOr(LHS: Res, RHS: Builder.CreateFreeze(V: State.get(Def: Op)));
843 return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Src: Res);
844 }
845 case VPInstruction::ExtractLane: {
846 assert(getNumOperands() != 2 && "ExtractLane from single source should be "
847 "simplified to ExtractElement.");
848 Value *LaneToExtract = State.get(Def: getOperand(N: 0), IsScalar: true);
849 Type *IdxTy = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
850 Value *Res = nullptr;
851 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
852
853 for (unsigned Idx = 1; Idx != getNumOperands(); ++Idx) {
854 Value *VectorStart =
855 Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: Idx - 1));
856 Value *VectorIdx = Idx == 1
857 ? LaneToExtract
858 : Builder.CreateSub(LHS: LaneToExtract, RHS: VectorStart);
859 Value *Ext = State.VF.isScalar()
860 ? State.get(Def: getOperand(N: Idx))
861 : Builder.CreateExtractElement(
862 Vec: State.get(Def: getOperand(N: Idx)), Idx: VectorIdx);
863 if (Res) {
864 Value *Cmp = Builder.CreateICmpUGE(LHS: LaneToExtract, RHS: VectorStart);
865 Res = Builder.CreateSelect(C: Cmp, True: Ext, False: Res);
866 } else {
867 Res = Ext;
868 }
869 }
870 return Res;
871 }
872 case VPInstruction::FirstActiveLane: {
873 Type *Ty = State.TypeAnalysis.inferScalarType(V: this);
874 if (getNumOperands() == 1) {
875 Value *Mask = State.get(Def: getOperand(N: 0));
876 return Builder.CreateCountTrailingZeroElems(ResTy: Ty, Mask,
877 /*ZeroIsPoison=*/false, Name);
878 }
879 // If there are multiple operands, create a chain of selects to pick the
880 // first operand with an active lane and add the number of lanes of the
881 // preceding operands.
882 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty, VF: State.VF);
883 unsigned LastOpIdx = getNumOperands() - 1;
884 Value *Res = nullptr;
885 for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
886 Value *TrailingZeros =
887 State.VF.isScalar()
888 ? Builder.CreateZExt(
889 V: Builder.CreateICmpEQ(LHS: State.get(Def: getOperand(N: Idx)),
890 RHS: Builder.getFalse()),
891 DestTy: Ty)
892 : Builder.CreateCountTrailingZeroElems(
893 ResTy: Ty, Mask: State.get(Def: getOperand(N: Idx)),
894 /*ZeroIsPoison=*/false, Name);
895 Value *Current = Builder.CreateAdd(
896 LHS: Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty, V: Idx)),
897 RHS: TrailingZeros);
898 if (Res) {
899 Value *Cmp = Builder.CreateICmpNE(LHS: TrailingZeros, RHS: RuntimeVF);
900 Res = Builder.CreateSelect(C: Cmp, True: Current, False: Res);
901 } else {
902 Res = Current;
903 }
904 }
905
906 return Res;
907 }
908 case VPInstruction::ResumeForEpilogue:
909 return State.get(Def: getOperand(N: 0), IsScalar: true);
910 case VPInstruction::Reverse:
911 return Builder.CreateVectorReverse(V: State.get(Def: getOperand(N: 0)), Name: "reverse");
912 case VPInstruction::ExtractLastActive: {
913 Value *Result = State.get(Def: getOperand(N: 0), /*IsScalar=*/true);
914 for (unsigned Idx = 1; Idx < getNumOperands(); Idx += 2) {
915 Value *Data = State.get(Def: getOperand(N: Idx));
916 Value *Mask = State.get(Def: getOperand(N: Idx + 1));
917 Type *VTy = Data->getType();
918
919 if (State.VF.isScalar())
920 Result = Builder.CreateSelect(C: Mask, True: Data, False: Result);
921 else
922 Result = Builder.CreateIntrinsic(
923 ID: Intrinsic::experimental_vector_extract_last_active, Types: {VTy},
924 Args: {Data, Mask, Result});
925 }
926
927 return Result;
928 }
929 default:
930 llvm_unreachable("Unsupported opcode for instruction");
931 }
932}
933
/// Compute the TTI cost of applying \p Opcode to this recipe's inferred result
/// type, widened to \p VF when vectorizing. Handles FNeg, binary arithmetic,
/// freeze, extractvalue, compares, casts and selects; any other opcode is
/// unreachable.
InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
    unsigned Opcode, ElementCount VF, VPCostContext &Ctx) const {
  Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
  // Cost against the widened type unless we are costing a scalar (VF = 1) op.
  Type *ResultTy = VF.isVector() ? toVectorTy(Scalar: ScalarTy, EC: VF) : ScalarTy;
  switch (Opcode) {
  case Instruction::FNeg:
    return Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: ResultTy, CostKind: Ctx.CostKind);
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Certain instructions can be cheaper if they have a constant second
    // operand. One example of this are shifts on x86.
    VPValue *RHS = getOperand(N: 1);
    TargetTransformInfo::OperandValueInfo RHSInfo = Ctx.getOperandInfo(V: RHS);

    // An RHS defined outside all loop regions is loop-invariant, i.e. uniform
    // across iterations, which some targets cost more cheaply.
    if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
        getOperand(N: 1)->isDefinedOutsideLoopRegions())
      RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;

    // Pass the underlying IR instruction and its operands (when available) so
    // TTI can refine the cost.
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    SmallVector<const Value *, 4> Operands;
    if (CtxI)
      Operands.append(in_start: CtxI->value_op_begin(), in_end: CtxI->value_op_end());
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode, Ty: ResultTy, CostKind: Ctx.CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: RHSInfo, Args: Operands, CxtI: CtxI, TLibInfo: &Ctx.TLI);
  }
  case Instruction::Freeze:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: ResultTy,
                                          CostKind: Ctx.CostKind);
  case Instruction::ExtractValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: Instruction::ExtractValue,
                                             CostKind: Ctx.CostKind);
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Compares are costed on the (possibly widened) operand type, not on the
    // i1 result type.
    Type *ScalarOpTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    Type *OpTy = VF.isVector() ? toVectorTy(Scalar: ScalarOpTy, EC: VF) : ScalarOpTy;
    Instruction *CtxI = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue());
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode, ValTy: OpTy, CondTy: CmpInst::makeCmpResultType(opnd_type: OpTy), VecPred: getPredicate(),
        CostKind: Ctx.CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
        Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CtxI);
  }
  case Instruction::BitCast: {
    // Pointer bitcasts are free; other bitcasts fall through to the generic
    // cast costing below.
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
    if (ScalarTy->isPointerTy())
      return 0;
    [[fallthrough]];
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::AddrSpaceCast: {
    // Computes the CastContextHint from a recipe that may access memory.
    auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
      if (isa<VPInterleaveBase>(Val: R))
        return TTI::CastContextHint::Interleave;
      if (const auto *ReplicateRecipe = dyn_cast<VPReplicateRecipe>(Val: R)) {
        // Only compute CCH for memory operations, matching the legacy model
        // which only considers loads/stores for cast context hints.
        auto *UI = cast<Instruction>(Val: ReplicateRecipe->getUnderlyingValue());
        if (!isa<LoadInst, StoreInst>(Val: UI))
          return TTI::CastContextHint::None;
        return ReplicateRecipe->isPredicated() ? TTI::CastContextHint::Masked
                                               : TTI::CastContextHint::Normal;
      }
      const auto *WidenMemoryRecipe = dyn_cast<VPWidenMemoryRecipe>(Val: R);
      if (WidenMemoryRecipe == nullptr)
        return TTI::CastContextHint::None;
      if (VF.isScalar())
        return TTI::CastContextHint::Normal;
      if (!WidenMemoryRecipe->isConsecutive())
        return TTI::CastContextHint::GatherScatter;
      if (WidenMemoryRecipe->isReverse())
        return TTI::CastContextHint::Reversed;
      if (WidenMemoryRecipe->isMasked())
        return TTI::CastContextHint::Masked;
      return TTI::CastContextHint::Normal;
    };

    VPValue *Operand = getOperand(N: 0);
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, get the context from the only user.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      auto GetOnlyUser = [](const VPSingleDefRecipe *R) -> VPRecipeBase * {
        if (R->getNumUsers() == 0 || R->hasMoreThanOneUniqueUser())
          return nullptr;
        return dyn_cast<VPRecipeBase>(Val: *R->user_begin());
      };
      if (VPRecipeBase *Recipe = GetOnlyUser(this)) {
        // Look through an intervening reverse to the recipe it feeds.
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue())))
          Recipe = GetOnlyUser(cast<VPInstruction>(Val: Recipe));
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }
    // For Z/Sext, get the context from the operand.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (auto *Recipe = Operand->getDefiningRecipe()) {
        VPValue *ReverseOp;
        if (match(V: Recipe, P: m_Reverse(Op0: m_VPValue(V&: ReverseOp))))
          Recipe = ReverseOp->getDefiningRecipe();
        if (Recipe)
          CCH = ComputeCCH(Recipe);
      }
    }

    auto *ScalarSrcTy = Ctx.Types.inferScalarType(V: Operand);
    Type *SrcTy = VF.isVector() ? toVectorTy(Scalar: ScalarSrcTy, EC: VF) : ScalarSrcTy;
    // Arm TTI will use the underlying instruction to determine the cost.
    return Ctx.TTI.getCastInstrCost(
        Opcode, Dst: ResultTy, Src: SrcTy, CCH, CostKind: Ctx.CostKind,
        I: dyn_cast_if_present<Instruction>(Val: getUnderlyingValue()));
  }
  case Instruction::Select: {
    SelectInst *SI = cast_or_null<SelectInst>(Val: getUnderlyingValue());
    bool IsScalarCond = getOperand(N: 0)->isDefinedOutsideLoopRegions();
    Type *ScalarTy = Ctx.Types.inferScalarType(V: this);

    VPValue *Op0, *Op1;
    bool IsLogicalAnd =
        match(V: this, P: m_c_LogicalAnd(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    bool IsLogicalOr =
        match(V: this, P: m_c_LogicalOr(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1)));
    // Also match the inverted forms:
    //   select x, false, y --> !x & y (still AND)
    //   select x, y, true  --> !x | y (still OR)
    IsLogicalAnd |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_False(), Op2: m_VPValue(V&: Op1)));
    IsLogicalOr |=
        match(V: this, P: m_Select(Op0: m_VPValue(V&: Op0), Op1: m_VPValue(V&: Op1), Op2: m_True()));

    // Selects on i1 vectors that act as logical and/or are costed as the
    // corresponding bitwise op.
    if (!IsScalarCond && ScalarTy->getScalarSizeInBits() == 1 &&
        (IsLogicalAnd || IsLogicalOr)) {
      // select x, y, false --> x & y
      // select x, true, y  --> x | y
      const auto [Op1VK, Op1VP] = Ctx.getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = Ctx.getOperandInfo(V: Op1);

      SmallVector<const Value *, 2> Operands;
      if (SI && all_of(Range: operands(),
                       P: [](VPValue *Op) { return Op->getUnderlyingValue(); }))
        append_range(C&: Operands, R: SI->operands());
      return Ctx.TTI.getArithmeticInstrCost(
          Opcode: IsLogicalOr ? Instruction::Or : Instruction::And, Ty: ResultTy,
          CostKind: Ctx.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: SI);
    }

    Type *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
    if (!IsScalarCond && VF.isVector())
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    // Recover the compare predicate feeding the condition, either from the
    // VPlan pattern or from the underlying IR condition value.
    llvm::CmpPredicate Pred;
    if (!match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue())))
      if (auto *CondIRV = dyn_cast<VPIRValue>(Val: getOperand(N: 0)))
        if (auto *Cmp = dyn_cast<CmpInst>(Val: CondIRV->getValue()))
          Pred = Cmp->getPredicate();
    Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
    return Ctx.TTI.getCmpSelInstrCost(
        Opcode: Instruction::Select, ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Ctx.CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: SI);
  }
  }
  llvm_unreachable("called for unsupported opcode");
}
1126
1127InstructionCost VPInstruction::computeCost(ElementCount VF,
1128 VPCostContext &Ctx) const {
1129 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
1130 if (!getUnderlyingValue() && getOpcode() != Instruction::FMul) {
1131 // TODO: Compute cost for VPInstructions without underlying values once
1132 // the legacy cost model has been retired.
1133 return 0;
1134 }
1135
1136 assert(!doesGeneratePerAllLanes() &&
1137 "Should only generate a vector value or single scalar, not scalars "
1138 "for all lanes.");
1139 return getCostForRecipeWithOpcode(
1140 Opcode: getOpcode(),
1141 VF: vputils::onlyFirstLaneUsed(Def: this) ? ElementCount::getFixed(MinVal: 1) : VF, Ctx);
1142 }
1143
1144 switch (getOpcode()) {
1145 case Instruction::Select: {
1146 llvm::CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
1147 match(V: getOperand(N: 0), P: m_Cmp(Pred, Op0: m_VPValue(), Op1: m_VPValue()));
1148 auto *CondTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1149 auto *VecTy = Ctx.Types.inferScalarType(V: getOperand(N: 1));
1150 if (!vputils::onlyFirstLaneUsed(Def: this)) {
1151 CondTy = toVectorTy(Scalar: CondTy, EC: VF);
1152 VecTy = toVectorTy(Scalar: VecTy, EC: VF);
1153 }
1154 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy, VecPred: Pred,
1155 CostKind: Ctx.CostKind);
1156 }
1157 case Instruction::ExtractElement:
1158 case VPInstruction::ExtractLane: {
1159 if (VF.isScalar()) {
1160 // ExtractLane with VF=1 takes care of handling extracting across multiple
1161 // parts.
1162 return 0;
1163 }
1164
1165 // Add on the cost of extracting the element.
1166 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1167 return Ctx.TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
1168 CostKind: Ctx.CostKind);
1169 }
1170 case VPInstruction::AnyOf: {
1171 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1172 return Ctx.TTI.getArithmeticReductionCost(
1173 Opcode: Instruction::Or, Ty: cast<VectorType>(Val: VecTy), FMF: std::nullopt, CostKind: Ctx.CostKind);
1174 }
1175 case VPInstruction::FirstActiveLane: {
1176 Type *Ty = Ctx.Types.inferScalarType(V: this);
1177 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1178 if (VF.isScalar())
1179 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1180 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1181 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1182 // Calculate the cost of determining the lane index.
1183 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1184 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1185 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1186 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1187 }
1188 case VPInstruction::LastActiveLane: {
1189 Type *Ty = Ctx.Types.inferScalarType(V: this);
1190 Type *ScalarTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1191 if (VF.isScalar())
1192 return Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ScalarTy,
1193 CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy),
1194 VecPred: CmpInst::ICMP_EQ, CostKind: Ctx.CostKind);
1195 // Calculate the cost of determining the lane index: NOT + cttz_elts + SUB.
1196 auto *PredTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1197 IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts, Ty,
1198 {PredTy, Type::getInt1Ty(C&: Ctx.LLVMCtx)});
1199 InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1200 // Add cost of NOT operation on the predicate.
1201 Cost += Ctx.TTI.getArithmeticInstrCost(
1202 Opcode: Instruction::Xor, Ty: PredTy, CostKind: Ctx.CostKind,
1203 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
1204 Opd2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue,
1205 .Properties: TargetTransformInfo::OP_None});
1206 // Add cost of SUB operation on the index.
1207 Cost += Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind: Ctx.CostKind);
1208 return Cost;
1209 }
1210 case VPInstruction::ExtractLastActive: {
1211 Type *ScalarTy = Ctx.Types.inferScalarType(V: this);
1212 Type *VecTy = toVectorTy(Scalar: ScalarTy, EC: VF);
1213 Type *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
1214 IntrinsicCostAttributes ICA(
1215 Intrinsic::experimental_vector_extract_last_active, ScalarTy,
1216 {VecTy, MaskTy, ScalarTy});
1217 return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind);
1218 }
1219 case VPInstruction::FirstOrderRecurrenceSplice: {
1220 assert(VF.isVector() && "Scalar FirstOrderRecurrenceSplice?");
1221 SmallVector<int> Mask(VF.getKnownMinValue());
1222 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
1223 Type *VectorTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
1224
1225 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
1226 DstTy: cast<VectorType>(Val: VectorTy),
1227 SrcTy: cast<VectorType>(Val: VectorTy), Mask,
1228 CostKind: Ctx.CostKind, Index: VF.getKnownMinValue() - 1);
1229 }
1230 case VPInstruction::ActiveLaneMask: {
1231 Type *ArgTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1232 unsigned Multiplier = cast<VPConstantInt>(Val: getOperand(N: 2))->getZExtValue();
1233 Type *RetTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF * Multiplier);
1234 IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
1235 {ArgTy, ArgTy});
1236 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1237 }
1238 case VPInstruction::ExplicitVectorLength: {
1239 Type *Arg0Ty = Ctx.Types.inferScalarType(V: getOperand(N: 0));
1240 Type *I32Ty = Type::getInt32Ty(C&: Ctx.LLVMCtx);
1241 Type *I1Ty = Type::getInt1Ty(C&: Ctx.LLVMCtx);
1242 IntrinsicCostAttributes Attrs(Intrinsic::experimental_get_vector_length,
1243 I32Ty, {Arg0Ty, I32Ty, I1Ty});
1244 return Ctx.TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind: Ctx.CostKind);
1245 }
1246 case VPInstruction::Reverse: {
1247 assert(VF.isVector() && "Reverse operation must be vector type");
1248 auto *VectorTy = cast<VectorType>(
1249 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
1250 return Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
1251 SrcTy: VectorTy, /*Mask=*/{}, CostKind: Ctx.CostKind,
1252 /*Index=*/0);
1253 }
1254 case VPInstruction::ExtractLastLane: {
1255 // Add on the cost of extracting the element.
1256 auto *VecTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF);
1257 return Ctx.TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
1258 Val: VecTy, CostKind: Ctx.CostKind, Index: 0);
1259 }
1260 case VPInstruction::ExtractPenultimateElement:
1261 if (VF == ElementCount::getScalable(MinVal: 1))
1262 return InstructionCost::getInvalid();
1263 [[fallthrough]];
1264 default:
1265 // TODO: Compute cost other VPInstructions once the legacy cost model has
1266 // been retired.
1267 assert(!getUnderlyingValue() &&
1268 "unexpected VPInstruction witht underlying value");
1269 return 0;
1270 }
1271}
1272
1273bool VPInstruction::isVectorToScalar() const {
1274 return getOpcode() == VPInstruction::ExtractLastLane ||
1275 getOpcode() == VPInstruction::ExtractPenultimateElement ||
1276 getOpcode() == Instruction::ExtractElement ||
1277 getOpcode() == VPInstruction::ExtractLane ||
1278 getOpcode() == VPInstruction::FirstActiveLane ||
1279 getOpcode() == VPInstruction::LastActiveLane ||
1280 getOpcode() == VPInstruction::ComputeAnyOfResult ||
1281 getOpcode() == VPInstruction::ExtractLastActive ||
1282 getOpcode() == VPInstruction::ComputeReductionResult ||
1283 getOpcode() == VPInstruction::AnyOf;
1284}
1285
1286bool VPInstruction::isSingleScalar() const {
1287 switch (getOpcode()) {
1288 case Instruction::Load:
1289 case Instruction::PHI:
1290 case VPInstruction::ExplicitVectorLength:
1291 case VPInstruction::ResumeForEpilogue:
1292 case VPInstruction::VScale:
1293 return true;
1294 default:
1295 return isScalarCast();
1296 }
1297}
1298
/// Generate the IR for this VPInstruction via generate() and record the
/// produced value in \p State, marking whether only a first-lane scalar was
/// generated.
void VPInstruction::execute(VPTransformState &State) {
  assert(!isMasked() && "cannot execute masked VPInstruction");
  assert(!State.Lane && "VPInstruction executing an Lane");
  // Restores the builder's fast-math flags when leaving this scope.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert(flagsValidForOpcode(getOpcode()) &&
         "Set flags not supported for the provided opcode");
  assert(hasRequiredFlagsForOpcode(getOpcode()) &&
         "Opcode requires specific flags to be set");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  Value *GeneratedValue = generate(State);
  if (!hasResult())
    return;
  assert(GeneratedValue && "generate must produce a value");
  // A scalar result is recorded when the opcode can produce one and either
  // only the first lane is demanded or the result is inherently scalar.
  bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() &&
                                   (vputils::onlyFirstLaneUsed(Def: this) ||
                                    isVectorToScalar() || isSingleScalar());
  assert((((GeneratedValue->getType()->isVectorTy() ||
            GeneratedValue->getType()->isStructTy()) ==
           !GeneratesPerFirstLaneOnly) ||
          State.VF.isScalar()) &&
         "scalar value but not only first lane defined");
  State.set(Def: this, V: GeneratedValue,
            /*IsScalar*/ GeneratesPerFirstLaneOnly);
  if (getOpcode() == VPInstruction::ResumeForEpilogue) {
    // FIXME: This is a workaround to enable reliable updates of the scalar loop
    // resume phis, when vectorizing the epilogue. Must be removed once epilogue
    // vectorization explicitly connects VPlans.
    setUnderlyingValue(GeneratedValue);
  }
}
1330
/// Conservative memory-effect query: returns false only for opcodes known to
/// neither read nor write memory; any opcode not listed is assumed to access
/// memory.
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
  if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
    return false;
  switch (getOpcode()) {
  case Instruction::GetElementPtr:
  case Instruction::ExtractElement:
  case Instruction::Freeze:
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select:
  case Instruction::PHI:
  case VPInstruction::AnyOf:
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnTwoConds:
  case VPInstruction::BranchOnCount:
  case VPInstruction::Broadcast:
  case VPInstruction::BuildStructVector:
  case VPInstruction::BuildVector:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ExtractLane:
  case VPInstruction::ExtractLastLane:
  case VPInstruction::ExtractLastPart:
  case VPInstruction::ExtractPenultimateElement:
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExitingIVValue:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::FirstActiveLane:
  case VPInstruction::LastActiveLane:
  case VPInstruction::ExtractLastActive:
  case VPInstruction::FirstOrderRecurrenceSplice:
  case VPInstruction::LogicalAnd:
  case VPInstruction::LogicalOr:
  case VPInstruction::MaskedCond:
  case VPInstruction::Not:
  case VPInstruction::PtrAdd:
  case VPInstruction::WideIVStep:
  case VPInstruction::WidePtrAdd:
  case VPInstruction::StepVector:
  case VPInstruction::ReductionStartVector:
  case VPInstruction::Reverse:
  case VPInstruction::VScale:
  case VPInstruction::Unpack:
    return false;
  default:
    // Unknown opcodes conservatively count as memory-accessing.
    return true;
  }
}
1379
1380bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
1381 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1382 if (Instruction::isBinaryOp(Opcode: getOpcode()) || Instruction::isCast(Opcode: getOpcode()))
1383 return vputils::onlyFirstLaneUsed(Def: this);
1384
1385 switch (getOpcode()) {
1386 default:
1387 return false;
1388 case Instruction::ExtractElement:
1389 return Op == getOperand(N: 1);
1390 case Instruction::PHI:
1391 return true;
1392 case Instruction::FCmp:
1393 case Instruction::ICmp:
1394 case Instruction::Select:
1395 case Instruction::Or:
1396 case Instruction::Freeze:
1397 case VPInstruction::Not:
1398 // TODO: Cover additional opcodes.
1399 return vputils::onlyFirstLaneUsed(Def: this);
1400 case Instruction::Load:
1401 case VPInstruction::ActiveLaneMask:
1402 case VPInstruction::ExplicitVectorLength:
1403 case VPInstruction::CalculateTripCountMinusVF:
1404 case VPInstruction::CanonicalIVIncrementForPart:
1405 case VPInstruction::BranchOnCount:
1406 case VPInstruction::BranchOnCond:
1407 case VPInstruction::Broadcast:
1408 case VPInstruction::ReductionStartVector:
1409 return true;
1410 case VPInstruction::BuildStructVector:
1411 case VPInstruction::BuildVector:
1412 // Before replicating by VF, Build(Struct)Vector uses all lanes of the
1413 // operand, after replicating its operands only the first lane is used.
1414 // Before replicating, it will have only a single operand.
1415 return getNumOperands() > 1;
1416 case VPInstruction::PtrAdd:
1417 return Op == getOperand(N: 0) || vputils::onlyFirstLaneUsed(Def: this);
1418 case VPInstruction::WidePtrAdd:
1419 // WidePtrAdd supports scalar and vector base addresses.
1420 return false;
1421 case VPInstruction::ComputeAnyOfResult:
1422 return Op == getOperand(N: 0) || Op == getOperand(N: 1);
1423 case VPInstruction::ExitingIVValue:
1424 case VPInstruction::ExtractLane:
1425 return Op == getOperand(N: 0);
1426 };
1427 llvm_unreachable("switch should return");
1428}
1429
1430bool VPInstruction::usesFirstPartOnly(const VPValue *Op) const {
1431 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1432 if (Instruction::isBinaryOp(Opcode: getOpcode()))
1433 return vputils::onlyFirstPartUsed(Def: this);
1434
1435 switch (getOpcode()) {
1436 default:
1437 return false;
1438 case Instruction::FCmp:
1439 case Instruction::ICmp:
1440 case Instruction::Select:
1441 return vputils::onlyFirstPartUsed(Def: this);
1442 case VPInstruction::BranchOnCount:
1443 case VPInstruction::BranchOnCond:
1444 case VPInstruction::BranchOnTwoConds:
1445 case VPInstruction::CanonicalIVIncrementForPart:
1446 return true;
1447 };
1448 llvm_unreachable("switch should return");
1449}
1450
1451#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPInstruction to dbgs(), using a fresh slot tracker for the
/// enclosing plan to resolve operand names.
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  printRecipe(dbgs(), "", SlotTracker);
}
1456
/// Print this VPInstruction: "EMIT[-SCALAR] <result> = <opcode name>" followed
/// by flags and operands. VPlan-specific opcodes use their own mnemonics;
/// plain LLVM opcodes fall back to Instruction::getOpcodeName.
void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::BranchOnTwoConds:
    O << "branch-on-two-conds";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::Broadcast:
    O << "broadcast";
    break;
  case VPInstruction::BuildStructVector:
    O << "buildstructvector";
    break;
  case VPInstruction::BuildVector:
    O << "buildvector";
    break;
  case VPInstruction::ExitingIVValue:
    O << "exiting-iv-value";
    break;
  case VPInstruction::MaskedCond:
    O << "masked-cond";
    break;
  case VPInstruction::ExtractLane:
    O << "extract-lane";
    break;
  case VPInstruction::ExtractLastLane:
    O << "extract-last-lane";
    break;
  case VPInstruction::ExtractLastPart:
    O << "extract-last-part";
    break;
  case VPInstruction::ExtractPenultimateElement:
    O << "extract-penultimate-element";
    break;
  case VPInstruction::ComputeAnyOfResult:
    O << "compute-anyof-result";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::LogicalAnd:
    O << "logical-and";
    break;
  case VPInstruction::LogicalOr:
    O << "logical-or";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  case VPInstruction::WidePtrAdd:
    O << "wide-ptradd";
    break;
  case VPInstruction::AnyOf:
    O << "any-of";
    break;
  case VPInstruction::FirstActiveLane:
    O << "first-active-lane";
    break;
  case VPInstruction::LastActiveLane:
    O << "last-active-lane";
    break;
  case VPInstruction::ReductionStartVector:
    O << "reduction-start-vector";
    break;
  case VPInstruction::ResumeForEpilogue:
    O << "resume-for-epilogue";
    break;
  case VPInstruction::Reverse:
    O << "reverse";
    break;
  case VPInstruction::Unpack:
    O << "unpack";
    break;
  case VPInstruction::ExtractLastActive:
    O << "extract-last-active";
    break;
  default:
    // Plain LLVM opcodes print their IR mnemonic.
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);
}
1576#endif
1577
/// Generate IR for typed VPInstructions: scalar casts to ResultTy, step-vector
/// and vscale materialization.
void VPInstructionWithType::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  if (isScalarCast()) {
    // Cast only the first lane; the result is recorded as a single scalar.
    Value *Op = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
    Value *Cast = State.Builder.CreateCast(Op: Instruction::CastOps(getOpcode()),
                                           V: Op, DestTy: ResultTy);
    State.set(Def: this, V: Cast, Lane: VPLane(0));
    return;
  }
  switch (getOpcode()) {
  case VPInstruction::StepVector: {
    Value *StepVector =
        State.Builder.CreateStepVector(DstType: VectorType::get(ElementType: ResultTy, EC: State.VF));
    State.set(Def: this, V: StepVector);
    break;
  }
  case VPInstruction::VScale: {
    Value *VScale = State.Builder.CreateVScale(Ty: ResultTy);
    State.set(Def: this, V: VScale, IsScalar: true);
    break;
  }

  default:
    llvm_unreachable("opcode not implemented yet");
  }
}
1604
1605#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this typed VPInstruction, including its result type; casts print as
/// "<opcode> <operands> to <type>".
void VPInstructionWithType::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
  printAsOperand(O, SlotTracker);
  O << " = ";

  switch (getOpcode()) {
  case VPInstruction::WideIVStep:
    O << "wide-iv-step ";
    printOperands(O, SlotTracker);
    break;
  case VPInstruction::StepVector:
    O << "step-vector " << *ResultTy;
    break;
  case VPInstruction::VScale:
    O << "vscale " << *ResultTy;
    break;
  case Instruction::Load:
    O << "load ";
    printOperands(O, SlotTracker);
    break;
  default:
    assert(Instruction::isCast(getOpcode()) && "unhandled opcode");
    O << Instruction::getOpcodeName(getOpcode()) << " ";
    printOperands(O, SlotTracker);
    O << " to " << *ResultTy;
  }
}
1634#endif
1635
// Generate a scalar IR phi for this VPPhi. Incoming values whose defining
// blocks have not been generated yet (header-phi backedges) are skipped here
// and filled in later.
void VPPhi::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  PHINode *NewPhi = State.Builder.CreatePHI(
      Ty: State.TypeAnalysis.inferScalarType(V: this), NumReservedValues: 2, Name: getName());
  unsigned NumIncoming = getNumIncoming();
  // Detect header phis: the parent block dominates its second incoming block
  // (the latch). Those IR incoming values have not been generated yet and need
  // to be added after they have been executed.
  if (NumIncoming == 2 &&
      State.VPDT.dominates(A: getParent(), B: getIncomingBlock(Idx: 1))) {
    NumIncoming = 1;
  }
  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
    // Only lane 0 is needed; VPPhi models a scalar phi.
    Value *IncV = State.get(Def: getIncomingValue(Idx), Lane: VPLane(0));
    BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(Val: getIncomingBlock(Idx));
    NewPhi->addIncoming(V: IncV, BB: PredBB);
  }
  State.set(Def: this, V: NewPhi, Lane: VPLane(0));
}
1655
1656#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1657void VPPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1658 VPSlotTracker &SlotTracker) const {
1659 O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
1660 printAsOperand(O, SlotTracker);
1661 O << " = phi";
1662 printFlags(O);
1663 printPhiOperands(O, SlotTracker);
1664}
1665#endif
1666
1667VPIRInstruction *VPIRInstruction ::create(Instruction &I) {
1668 if (auto *Phi = dyn_cast<PHINode>(Val: &I))
1669 return new VPIRPhi(*Phi);
1670 return new VPIRInstruction(I);
1671}
1672
// The wrapped IR instruction already exists; executing the recipe only moves
// the builder's insert point past it.
void VPIRInstruction::execute(VPTransformState &State) {
  assert(!isa<VPIRPhi>(this) && getNumOperands() == 0 &&
         "PHINodes must be handled by VPIRPhi");
  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: I.getParent(), IP: std::next(x: I.getIterator()));
}
1680
/// Wrapped IR instructions are free as far as the VPlan cost model goes.
InstructionCost VPIRInstruction::computeCost(ElementCount VF,
                                             VPCostContext &Ctx) const {
  // The recipe wraps an existing IR instruction on the border of VPlan's scope,
  // hence it does not contribute to the cost-modeling for the VPlan.
  return 0;
}
1687
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the wrapped IR instruction verbatim, prefixed with "IR".
void VPIRInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "IR " << I;
}
#endif
1694
// Fix up the wrapped IR phi with the values flowing in from the VPlan
// predecessors, extracting the appropriate lane for each incoming value.
void VPIRPhi::execute(VPTransformState &State) {
  PHINode *Phi = &getIRPhi();
  for (const auto &[Idx, Op] : enumerate(First: operands())) {
    VPValue *ExitValue = Op;
    // Single-scalar values can be read from the first lane; otherwise the
    // value leaving the loop lives in the last lane.
    auto Lane = vputils::isSingleScalar(VPV: ExitValue)
                    ? VPLane::getFirstLane()
                    : VPLane::getLastLaneForVF(VF: State.VF);
    VPBlockBase *Pred = getParent()->getPredecessors()[Idx];
    auto *PredVPBB = Pred->getExitingBasicBlock();
    BasicBlock *PredBB = State.CFG.VPBB2IRBB[PredVPBB];
    // Set insertion point in PredBB in case an extract needs to be generated.
    // TODO: Model extracts explicitly.
    State.Builder.SetInsertPoint(PredBB->getTerminator());
    Value *V = State.get(Def: ExitValue, Lane: VPLane(Lane));
    // If there is no existing block for PredBB in the phi, add a new incoming
    // value. Otherwise update the existing incoming value for PredBB.
    if (Phi->getBasicBlockIndex(BB: PredBB) == -1)
      Phi->addIncoming(V, BB: PredBB);
    else
      Phi->setIncomingValueForBlock(BB: PredBB, V);
  }

  // Advance the insert point after the wrapped IR instruction. This allows
  // interleaving VPIRInstructions and other recipes.
  State.Builder.SetInsertPoint(TheBB: Phi->getParent(), IP: std::next(x: Phi->getIterator()));
}
1721
1722void VPPhiAccessors::removeIncomingValueFor(VPBlockBase *IncomingBlock) const {
1723 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1724 assert(R->getNumOperands() == R->getParent()->getNumPredecessors() &&
1725 "Number of phi operands must match number of predecessors");
1726 unsigned Position = R->getParent()->getIndexForPredecessor(Pred: IncomingBlock);
1727 R->removeOperand(Idx: Position);
1728}
1729
1730VPValue *
1731VPPhiAccessors::getIncomingValueForBlock(const VPBasicBlock *VPBB) const {
1732 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1733 return getIncomingValue(Idx: R->getParent()->getIndexForPredecessor(Pred: VPBB));
1734}
1735
1736void VPPhiAccessors::setIncomingValueForBlock(const VPBasicBlock *VPBB,
1737 VPValue *V) const {
1738 VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
1739 R->setOperand(I: R->getParent()->getIndexForPredecessor(Pred: VPBB), New: V);
1740}
1741
1742#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1743void VPPhiAccessors::printPhiOperands(raw_ostream &O,
1744 VPSlotTracker &SlotTracker) const {
1745 interleaveComma(enumerate(getAsRecipe()->operands()), O,
1746 [this, &O, &SlotTracker](auto Op) {
1747 O << "[ ";
1748 Op.value()->printAsOperand(O, SlotTracker);
1749 O << ", ";
1750 getIncomingBlock(Op.index())->printAsOperand(O);
1751 O << " ]";
1752 });
1753}
1754#endif
1755
1756#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1757void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent,
1758 VPSlotTracker &SlotTracker) const {
1759 VPIRInstruction::printRecipe(O, Indent, SlotTracker);
1760
1761 if (getNumOperands() != 0) {
1762 O << " (extra operand" << (getNumOperands() > 1 ? "s" : "") << ": ";
1763 interleaveComma(incoming_values_and_blocks(), O,
1764 [&O, &SlotTracker](auto Op) {
1765 std::get<0>(Op)->printAsOperand(O, SlotTracker);
1766 O << " from ";
1767 std::get<1>(Op)->printAsOperand(O);
1768 });
1769 O << ")";
1770 }
1771}
1772#endif
1773
1774void VPIRMetadata::applyMetadata(Instruction &I) const {
1775 for (const auto &[Kind, Node] : Metadata)
1776 I.setMetadata(KindID: Kind, Node);
1777}
1778
1779void VPIRMetadata::intersect(const VPIRMetadata &Other) {
1780 SmallVector<std::pair<unsigned, MDNode *>> MetadataIntersection;
1781 for (const auto &[KindA, MDA] : Metadata) {
1782 for (const auto &[KindB, MDB] : Other.Metadata) {
1783 if (KindA == KindB && MDA == MDB) {
1784 MetadataIntersection.emplace_back(Args: KindA, Args: MDA);
1785 break;
1786 }
1787 }
1788 }
1789 Metadata = std::move(MetadataIntersection);
1790}
1791
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the tracked metadata as " (!name node, ...)". Requires a module in
/// the slot tracker to resolve metadata kind names; otherwise prints nothing.
void VPIRMetadata::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  const Module *M = SlotTracker.getModule();
  if (Metadata.empty() || !M)
    return;

  ArrayRef<StringRef> MDNames = SlotTracker.getMDNames();
  O << " (";
  interleaveComma(Metadata, O, [&](const auto &KindNodePair) {
    auto [Kind, Node] = KindNodePair;
    assert(Kind < MDNames.size() && !MDNames[Kind].empty() &&
           "Unexpected unnamed metadata kind");
    O << "!" << MDNames[Kind] << " ";
    Node->printAsOperand(O, M);
  });
  O << ")";
}
#endif
1810
// Emit a call to the pre-selected vector function variant, widening vector
// arguments and keeping scalar parameters (e.g. linear pointer params) as
// lane-0 values.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  assert(Variant != nullptr && "Can't create vector function.");

  FunctionType *VFTy = Variant->getFunctionType();
  // Add return type if intrinsic is overloaded on it.
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: args())) {
    Value *Arg;
    // Some vectorized function variants may also take a scalar argument,
    // e.g. linear parameters for pointers. This needs to be the scalar value
    // from the start of the respective part when interleaving.
    if (!VFTy->getParamType(i: I.index())->isVectorTy())
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    Args.push_back(Elt: Arg);
  }

  // Preserve operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: Variant, Args, OpBundles);
  applyFlags(I&: *V);
  applyMetadata(I&: *V);
  V->setCallingConv(Variant->getCallingConv());

  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1843
1844InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
1845 VPCostContext &Ctx) const {
1846 return Ctx.TTI.getCallInstrCost(F: nullptr, RetTy: Variant->getReturnType(),
1847 Tys: Variant->getFunctionType()->params(),
1848 CostKind: Ctx.CostKind);
1849}
1850
1851#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1852void VPWidenCallRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
1853 VPSlotTracker &SlotTracker) const {
1854 O << Indent << "WIDEN-CALL ";
1855
1856 Function *CalledFn = getCalledScalarFunction();
1857 if (CalledFn->getReturnType()->isVoidTy())
1858 O << "void ";
1859 else {
1860 printAsOperand(O, SlotTracker);
1861 O << " = ";
1862 }
1863
1864 O << "call";
1865 printFlags(O);
1866 O << " @" << CalledFn->getName() << "(";
1867 interleaveComma(args(), O, [&O, &SlotTracker](VPValue *Op) {
1868 Op->printAsOperand(O, SlotTracker);
1869 });
1870 O << ")";
1871
1872 O << " (using library function";
1873 if (Variant->hasName())
1874 O << ": " << Variant->getName();
1875 O << ")";
1876}
1877#endif
1878
// Emit a call to the vector version of the intrinsic, collecting the overload
// types needed to look up the correct declaration and keeping scalar
// arguments unwidened.
void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");

  SmallVector<Type *, 2> TysForDecl;
  // Add return type if intrinsic is overloaded on it.
  if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1,
                                             TTI: State.TTI)) {
    // Struct-returning intrinsics may be overloaded on individual fields.
    Type *RetTy = toVectorizedTy(Ty: getResultType(), EC: State.VF);
    ArrayRef<Type *> ContainedTys = getContainedTypes(Ty: RetTy);
    for (auto [Idx, Ty] : enumerate(First&: ContainedTys)) {
      if (isVectorIntrinsicWithStructReturnOverloadAtField(ID: VectorIntrinsicID,
                                                           RetIdx: Idx, TTI: State.TTI))
        TysForDecl.push_back(Elt: Ty);
    }
  }
  SmallVector<Value *, 4> Args;
  for (const auto &I : enumerate(First: operands())) {
    // Some intrinsics have a scalar argument - don't replace it with a
    // vector.
    Value *Arg;
    if (isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index(),
                                           TTI: State.TTI))
      Arg = State.get(Def: I.value(), Lane: VPLane(0));
    else
      Arg = State.get(Def: I.value(), IsScalar: usesFirstLaneOnly(Op: I.value()));
    if (isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index(),
                                               TTI: State.TTI))
      TysForDecl.push_back(Elt: Arg->getType());
    Args.push_back(Elt: Arg);
  }

  // Use vector version of the intrinsic.
  Module *M = State.Builder.GetInsertBlock()->getModule();
  Function *VectorF =
      Intrinsic::getOrInsertDeclaration(M, id: VectorIntrinsicID, OverloadTys: TysForDecl);
  assert(VectorF &&
         "Can't retrieve vector intrinsic or vector-predication intrinsics.");

  // Preserve operand bundles from the original scalar call, if any.
  auto *CI = cast_or_null<CallInst>(Val: getUnderlyingValue());
  SmallVector<OperandBundleDef, 1> OpBundles;
  if (CI)
    CI->getOperandBundlesAsDefs(Defs&: OpBundles);

  CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);

  applyFlags(I&: *V);
  applyMetadata(I&: *V);

  if (!V->getType()->isVoidTy())
    State.set(Def: this, V);
}
1930
/// Compute the cost for the intrinsic \p ID with \p Operands, produced by \p R.
/// Falls back to concrete IR values where available because some TTI
/// implementations inspect the arguments to refine the cost.
static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
                                            ArrayRef<const VPValue *> Operands,
                                            const VPRecipeWithIRFlags &R,
                                            ElementCount VF,
                                            VPCostContext &Ctx) {
  // Some backends analyze intrinsic arguments to determine cost. Use the
  // underlying value for the operand if it has one. Otherwise try to use the
  // operand of the underlying call instruction, if there is one. Otherwise
  // clear Arguments.
  // TODO: Rework TTI interface to be independent of concrete IR values.
  SmallVector<const Value *> Arguments;
  for (const auto &[Idx, Op] : enumerate(First&: Operands)) {
    auto *V = Op->getUnderlyingValue();
    if (!V) {
      if (auto *UI = dyn_cast_or_null<CallBase>(Val: R.getUnderlyingValue())) {
        Arguments.push_back(Elt: UI->getArgOperand(i: Idx));
        continue;
      }
      // No IR value available for this operand; an all-or-nothing argument
      // list is required, so drop everything collected so far.
      Arguments.clear();
      break;
    }
    Arguments.push_back(Elt: V);
  }

  // Vectorize the return and parameter types according to VF.
  Type *ScalarRetTy = Ctx.Types.inferScalarType(V: &R);
  Type *RetTy = VF.isVector() ? toVectorizedTy(Ty: ScalarRetTy, EC: VF) : ScalarRetTy;
  SmallVector<Type *> ParamTys;
  for (const VPValue *Op : Operands) {
    ParamTys.push_back(Elt: VF.isVector()
                               ? toVectorTy(Scalar: Ctx.Types.inferScalarType(V: Op), EC: VF)
                               : Ctx.Types.inferScalarType(V: Op));
  }

  // TODO: Rework TTI interface to avoid reliance on underlying IntrinsicInst.
  IntrinsicCostAttributes CostAttrs(
      ID, RetTy, Arguments, ParamTys, R.getFastMathFlags(),
      dyn_cast_or_null<IntrinsicInst>(Val: R.getUnderlyingValue()),
      InstructionCost::getInvalid());
  return Ctx.TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Ctx.CostKind);
}
1972
/// Delegate to getCostForIntrinsics with this recipe's intrinsic ID/operands.
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
                                                    VPCostContext &Ctx) const {
  SmallVector<const VPValue *> ArgOps(operands());
  return getCostForIntrinsics(ID: VectorIntrinsicID, Operands: ArgOps, R: *this, VF, Ctx);
}
1978
/// Return the base (unsuffixed) name of the vectorized intrinsic.
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
  return Intrinsic::getBaseName(id: VectorIntrinsicID);
}
1982
1983bool VPWidenIntrinsicRecipe::usesFirstLaneOnly(const VPValue *Op) const {
1984 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
1985 return all_of(Range: enumerate(First: operands()), P: [this, &Op](const auto &X) {
1986 auto [Idx, V] = X;
1987 return V != Op || isVectorIntrinsicWithScalarOpAtArg(getVectorIntrinsicID(),
1988 Idx, nullptr);
1989 });
1990}
1991
1992#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1993void VPWidenIntrinsicRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
1994 VPSlotTracker &SlotTracker) const {
1995 O << Indent << "WIDEN-INTRINSIC ";
1996 if (ResultTy->isVoidTy()) {
1997 O << "void ";
1998 } else {
1999 printAsOperand(O, SlotTracker);
2000 O << " = ";
2001 }
2002
2003 O << "call";
2004 printFlags(O);
2005 O << getIntrinsicName() << "(";
2006
2007 interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
2008 Op->printAsOperand(O, SlotTracker);
2009 });
2010 O << ")";
2011}
2012#endif
2013
// Lower the histogram update to the experimental_vector_histogram_add
// intrinsic, synthesizing an all-true mask when none is given and negating
// the increment for subtractions.
void VPHistogramRecipe::execute(VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;

  Value *Address = State.get(Def: getOperand(N: 0));
  Value *IncAmt = State.get(Def: getOperand(N: 1), /*IsScalar=*/true);
  VectorType *VTy = cast<VectorType>(Val: Address->getType());

  // The histogram intrinsic requires a mask even if the recipe doesn't;
  // if the mask operand was omitted then all lanes should be executed and
  // we just need to synthesize an all-true mask.
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask())
    Mask = State.get(Def: VPMask);
  else
    Mask =
        Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Builder.getInt1(V: 1));

  // If this is a subtract, we want to invert the increment amount. We may
  // add a separate intrinsic in future, but for now we'll try this.
  if (Opcode == Instruction::Sub)
    IncAmt = Builder.CreateNeg(V: IncAmt);
  else
    assert(Opcode == Instruction::Add && "only add or sub supported for now");

  State.Builder.CreateIntrinsic(ID: Intrinsic::experimental_vector_histogram_add,
                                Types: {VTy, IncAmt->getType()},
                                Args: {Address, IncAmt, Mask});
}
2042
// Cost model: histogram intrinsic + (possible) multiply of the increment +
// the add/sub itself.
InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  // FIXME: Take the gather and scatter into account as well. For now we're
  //        generating the same cost as the fallback path, but we'll likely
  //        need to create a new TTI method for determining the cost, including
  //        whether we can use base + vec-of-smaller-indices or just
  //        vec-of-pointers.
  assert(VF.isVector() && "Invalid VF for histogram cost");
  Type *AddressTy = Ctx.Types.inferScalarType(V: getOperand(N: 0));
  VPValue *IncAmt = getOperand(N: 1);
  Type *IncTy = Ctx.Types.inferScalarType(V: IncAmt);
  VectorType *VTy = VectorType::get(ElementType: IncTy, EC: VF);

  // Assume that a non-constant update value (or a constant != 1) requires
  // a multiply, and add that into the cost.
  InstructionCost MulCost =
      Ctx.TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VTy, CostKind: Ctx.CostKind);
  if (match(V: IncAmt, P: m_One()))
    MulCost = TTI::TCC_Free;

  // Find the cost of the histogram operation itself.
  Type *PtrTy = VectorType::get(ElementType: AddressTy, EC: VF);
  Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: Ctx.LLVMCtx), EC: VF);
  IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                              Type::getVoidTy(C&: Ctx.LLVMCtx),
                              {PtrTy, IncTy, MaskTy});

  // Add the costs together with the add/sub operation.
  return Ctx.TTI.getIntrinsicInstrCost(ICA, CostKind: Ctx.CostKind) + MulCost +
         Ctx.TTI.getArithmeticInstrCost(Opcode, Ty: VTy, CostKind: Ctx.CostKind);
}
2074
2075#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2076void VPHistogramRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2077 VPSlotTracker &SlotTracker) const {
2078 O << Indent << "WIDEN-HISTOGRAM buckets: ";
2079 getOperand(0)->printAsOperand(O, SlotTracker);
2080
2081 if (Opcode == Instruction::Sub)
2082 O << ", dec: ";
2083 else {
2084 assert(Opcode == Instruction::Add);
2085 O << ", inc: ";
2086 }
2087 getOperand(1)->printAsOperand(O, SlotTracker);
2088
2089 if (VPValue *Mask = getMask()) {
2090 O << ", mask: ";
2091 Mask->printAsOperand(O, SlotTracker);
2092 }
2093}
2094#endif
2095
/// Copy each individual fast-math flag from \p FMF into the packed
/// bitfield representation used by VPIRFlags.
VPIRFlags::FastMathFlagsTy::FastMathFlagsTy(const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}
2105
/// Return conservative (all-unset) flags of the category appropriate for
/// \p Opcode. Opcodes that require explicit flags (compares, reduction
/// results) have no default and trap via llvm_unreachable; all remaining
/// opcodes take no flags at all.
VPIRFlags VPIRFlags::getDefaultFlags(unsigned Opcode) {
  switch (Opcode) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case VPInstruction::CanonicalIVIncrementForPart:
    return WrapFlagsTy(false, false);
  case Instruction::Trunc:
    return TruncFlagsTy(false, false);
  case Instruction::Or:
    return DisjointFlagsTy(false);
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::UDiv:
  case Instruction::SDiv:
    return ExactFlagsTy(false);
  case Instruction::GetElementPtr:
  case VPInstruction::PtrAdd:
  case VPInstruction::WidePtrAdd:
    return GEPNoWrapFlags::none();
  case Instruction::ZExt:
  case Instruction::UIToFP:
    return NonNegFlagsTy(false);
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::FNeg:
  case Instruction::FPExt:
  case Instruction::FPTrunc:
    return FastMathFlags();
  case Instruction::ICmp:
  case Instruction::FCmp:
  case VPInstruction::ComputeReductionResult:
    llvm_unreachable("opcode requires explicit flags");
  default:
    return VPIRFlags();
  }
}
2147
2148#if !defined(NDEBUG)
2149bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
2150 switch (OpType) {
2151 case OperationType::OverflowingBinOp:
2152 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
2153 Opcode == Instruction::Mul || Opcode == Instruction::Shl ||
2154 Opcode == VPInstruction::VPInstruction::CanonicalIVIncrementForPart;
2155 case OperationType::Trunc:
2156 return Opcode == Instruction::Trunc;
2157 case OperationType::DisjointOp:
2158 return Opcode == Instruction::Or;
2159 case OperationType::PossiblyExactOp:
2160 return Opcode == Instruction::AShr || Opcode == Instruction::LShr ||
2161 Opcode == Instruction::UDiv || Opcode == Instruction::SDiv;
2162 case OperationType::GEPOp:
2163 return Opcode == Instruction::GetElementPtr ||
2164 Opcode == VPInstruction::PtrAdd ||
2165 Opcode == VPInstruction::WidePtrAdd;
2166 case OperationType::FPMathOp:
2167 return Opcode == Instruction::Call || Opcode == Instruction::FAdd ||
2168 Opcode == Instruction::FMul || Opcode == Instruction::FSub ||
2169 Opcode == Instruction::FNeg || Opcode == Instruction::FDiv ||
2170 Opcode == Instruction::FRem || Opcode == Instruction::FPExt ||
2171 Opcode == Instruction::FPTrunc || Opcode == Instruction::PHI ||
2172 Opcode == Instruction::Select ||
2173 Opcode == VPInstruction::WideIVStep ||
2174 Opcode == VPInstruction::ReductionStartVector;
2175 case OperationType::FCmp:
2176 return Opcode == Instruction::FCmp;
2177 case OperationType::NonNegOp:
2178 return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
2179 case OperationType::Cmp:
2180 return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
2181 case OperationType::ReductionOp:
2182 return Opcode == VPInstruction::ComputeReductionResult;
2183 case OperationType::Other:
2184 return true;
2185 }
2186 llvm_unreachable("Unknown OperationType enum");
2187}
2188
2189bool VPIRFlags::hasRequiredFlagsForOpcode(unsigned Opcode) const {
2190 // Handle opcodes without default flags.
2191 if (Opcode == Instruction::ICmp)
2192 return OpType == OperationType::Cmp;
2193 if (Opcode == Instruction::FCmp)
2194 return OpType == OperationType::FCmp;
2195 if (Opcode == VPInstruction::ComputeReductionResult)
2196 return OpType == OperationType::ReductionOp;
2197
2198 OperationType Required = getDefaultFlags(Opcode).OpType;
2199 return Required == OperationType::Other || Required == OpType;
2200}
2201#endif
2202
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print a textual rendering of the flags held by this object, dispatching
/// on the stored flag category. Each active flag is emitted with a leading
/// space; a trailing space is always appended.
void VPIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::FCmp:
    // Floating-point compares print the predicate plus fast-math flags.
    O << " " << CmpInst::getPredicateName(getPredicate());
    getFastMathFlags().print(O);
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::Trunc:
    if (TruncFlags.HasNUW)
      O << " nuw";
    if (TruncFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp: {
    // inbounds implies nusw, so only one of the two is printed.
    GEPNoWrapFlags Flags = getGEPNoWrapFlags();
    if (Flags.isInBounds())
      O << " inbounds";
    else if (Flags.hasNoUnsignedSignedWrap())
      O << " nusw";
    if (Flags.hasNoUnsignedWrap())
      O << " nuw";
    break;
  }
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::ReductionOp: {
    // Reductions print "(kind[, in-loop][, ordered])" plus fast-math flags.
    RecurKind RK = getRecurKind();
    O << " (";
    switch (RK) {
    case RecurKind::AnyOf:
      O << "any-of";
      break;
    case RecurKind::FindLast:
      O << "find-last";
      break;
    case RecurKind::SMax:
      O << "smax";
      break;
    case RecurKind::SMin:
      O << "smin";
      break;
    case RecurKind::UMax:
      O << "umax";
      break;
    case RecurKind::UMin:
      O << "umin";
      break;
    case RecurKind::FMinNum:
      O << "fminnum";
      break;
    case RecurKind::FMaxNum:
      O << "fmaxnum";
      break;
    case RecurKind::FMinimum:
      O << "fminimum";
      break;
    case RecurKind::FMaximum:
      O << "fmaximum";
      break;
    case RecurKind::FMinimumNum:
      O << "fminimumnum";
      break;
    case RecurKind::FMaximumNum:
      O << "fmaximumnum";
      break;
    default:
      // Arithmetic reductions print their IR opcode name.
      O << Instruction::getOpcodeName(RecurrenceDescriptor::getOpcode(RK));
      break;
    }
    if (isReductionInLoop())
      O << ", in-loop";
    if (isReductionOrdered())
      O << ", ordered";
    O << ")";
    getFastMathFlags().print(O);
    break;
  }
  case OperationType::Other:
    break;
  }
  O << " ";
}
#endif
2308
// Widen a scalar instruction into a single vector instruction covering all
// lanes, dispatching on the opcode.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::UncondBr:
  case Instruction::CondBr:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    SmallVector<Value *, 2> Ops;
    for (VPValue *VPOp : operands())
      Ops.push_back(Elt: State.get(Def: VPOp));

    Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);

    // The builder may constant-fold, so flags/metadata only apply when an
    // actual instruction was created.
    if (auto *VecOp = dyn_cast<Instruction>(Val: V)) {
      applyFlags(I&: *VecOp);
      applyMetadata(I&: *VecOp);
    }

    // Use this vector value for all users of the original instruction.
    State.set(Def: this, V);
    break;
  }
  case Instruction::ExtractValue: {
    assert(getNumOperands() == 2 && "expected single level extractvalue");
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Extract = Builder.CreateExtractValue(
        Agg: Op, Idxs: cast<VPConstantInt>(Val: getOperand(N: 1))->getZExtValue());
    State.set(Def: this, V: Extract);
    break;
  }
  case Instruction::Freeze: {
    Value *Op = State.get(Def: getOperand(N: 0));
    Value *Freeze = Builder.CreateFreeze(V: Op);
    State.set(Def: this, V: Freeze);
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    Value *A = State.get(Def: getOperand(N: 0));
    Value *B = State.get(Def: getOperand(N: 1));
    Value *C = nullptr;
    if (FCmp) {
      C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
    } else {
      C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
    }
    if (auto *I = dyn_cast<Instruction>(Val: C)) {
      applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    State.set(Def: this, V: C);
    break;
  }
  case Instruction::Select: {
    // A single-scalar condition selects whole vectors; otherwise per-lane.
    VPValue *CondOp = getOperand(N: 0);
    Value *Cond = State.get(Def: CondOp, IsScalar: vputils::isSingleScalar(VPV: CondOp));
    Value *Op0 = State.get(Def: getOperand(N: 1));
    Value *Op1 = State.get(Def: getOperand(N: 2));
    Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
    State.set(Def: this, V: Sel);
    if (auto *I = dyn_cast<Instruction>(Val: Sel)) {
      // Only FP selects carry fast-math flags.
      if (isa<FPMathOperator>(Val: I))
        applyFlags(I&: *I);
      applyMetadata(I&: *I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  assert(VectorType::get(State.TypeAnalysis.inferScalarType(this), State.VF) ==
             State.get(this)->getType() &&
         "inferred type and type from generated instructions do not match");
#endif
}
2415
/// Cost of the widened instruction; all supported opcodes share the generic
/// opcode-based costing helper.
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  switch (Opcode) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If the div/rem operation isn't safe to speculate and requires
    // predication, then the only way we can even create a vplan is to insert
    // a select on the second input operand to ensure we use the value of 1
    // for the inactive lanes. The select will be costed separately.
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
  case Instruction::ExtractValue:
  case Instruction::ICmp:
  case Instruction::FCmp:
  case Instruction::Select:
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}
2452
2453#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2454void VPWidenRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2455 VPSlotTracker &SlotTracker) const {
2456 O << Indent << "WIDEN ";
2457 printAsOperand(O, SlotTracker);
2458 O << " = " << Instruction::getOpcodeName(Opcode);
2459 printFlags(O);
2460 printOperands(O, SlotTracker);
2461}
2462#endif
2463
// Widen a cast: cast the full vector operand to the vectorized destination
// type with a single IR cast instruction.
void VPWidenCastRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
  VPValue *Op = getOperand(N: 0);
  Value *A = State.get(Def: Op);
  Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
  State.set(Def: this, V: Cast);
  // The builder may fold the cast; flags/metadata only apply to real
  // instructions.
  if (auto *CastOp = dyn_cast<Instruction>(Val: Cast)) {
    applyFlags(I&: *CastOp);
    applyMetadata(I&: *CastOp);
  }
}
2478
2479InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
2480 VPCostContext &Ctx) const {
2481 // TODO: In some cases, VPWidenCastRecipes are created but not considered in
2482 // the legacy cost model, including truncates/extends when evaluating a
2483 // reduction in a smaller type.
2484 if (!getUnderlyingValue())
2485 return 0;
2486 return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF, Ctx);
2487}
2488
2489#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2490void VPWidenCastRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2491 VPSlotTracker &SlotTracker) const {
2492 O << Indent << "WIDEN-CAST ";
2493 printAsOperand(O, SlotTracker);
2494 O << " = " << Instruction::getOpcodeName(Opcode);
2495 printFlags(O);
2496 printOperands(O, SlotTracker);
2497 O << " to " << *getResultType();
2498}
2499#endif
2500
/// All header phis share the cost of a single IR phi node.
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
}
2505
2506#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2507void VPWidenIntOrFpInductionRecipe::printRecipe(
2508 raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
2509 O << Indent;
2510 printAsOperand(O, SlotTracker);
2511 O << " = WIDEN-INDUCTION";
2512 printFlags(O);
2513 printOperands(O, SlotTracker);
2514
2515 if (auto *TI = getTruncInst())
2516 O << " (truncated to " << *TI->getType() << ")";
2517}
2518#endif
2519
// An induction is canonical when it starts at 0, steps by 1, and its scalar
// type matches the region's canonical IV type.
bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to be
  // 1, which is represented as live-in.
  return match(V: getStartValue(), P: m_ZeroInt()) &&
         match(V: getStepValue(), P: m_One()) &&
         getScalarType() == getRegion()->getCanonicalIVType();
}
2528
// Materialize the derived IV as a single scalar (lane 0):
// DerivedIV = transform(Start, Index, Step) via emitTransformedIndex, using
// the induction kind's arithmetic.
void VPDerivedIVRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "VPDerivedIVRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (FPBinOp)
    State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());

  // Only the first lane of the step and index are needed.
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
  Value *DerivedIV = emitTransformedIndex(
      B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
      InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
  DerivedIV->setName(Name);
  State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
}
2545
2546#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2547void VPDerivedIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2548 VPSlotTracker &SlotTracker) const {
2549 O << Indent;
2550 printAsOperand(O, SlotTracker);
2551 O << " = DERIVED-IV ";
2552 getStartValue()->printAsOperand(O, SlotTracker);
2553 O << " + ";
2554 getOperand(1)->printAsOperand(O, SlotTracker);
2555 O << " * ";
2556 getStepValue()->printAsOperand(O, SlotTracker);
2557}
2558#endif
2559
// Emit per-lane scalar induction values: for each required lane,
//   result[Lane] = BaseIV + (StartIdx0 + Lane) * Step
// using integer or FP arithmetic depending on the IV type.
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
  // Compute the scalar steps and save the results in State.

  // When replicating for a specific lane, emit only that lane.
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Lane) {
    StartLane = State.Lane->getKnownLane();
    EndLane = StartLane + 1;
  }
  // With no explicit start-index operand, the steps start from zero.
  Value *StartIdx0 = getStartIndex() ? State.get(Def: getStartIndex(), IsScalar: true)
                                     : Constant::getNullValue(Ty: BaseIVTy);

  for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
    // It is okay if the induction variable type cannot hold the lane number,
    // we expect truncation in this case.
    Constant *LaneValue =
        BaseIVTy->isIntegerTy()
            ? ConstantInt::get(Ty: BaseIVTy, V: Lane, /*IsSigned=*/false,
                               /*ImplicitTrunc=*/true)
            : ConstantFP::get(Ty: BaseIVTy, V: Lane);
    Value *StartIdx = Builder.CreateBinOp(Opc: AddOp, LHS: StartIdx0, RHS: LaneValue);
    assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
           "Expected StartIdx to be folded to a constant when VF is not "
           "scalable");
    auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
    auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
    State.set(Def: this, V: Add, Lane: VPLane(Lane));
  }
}
2618
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits: "<res> = SCALAR-STEPS <operands>".
void VPScalarIVStepsRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif
2628
// Only the first lane of \p Op is needed when it is a single (uniform)
// scalar value.
bool VPWidenGEPRecipe::usesFirstLaneOnly(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  return vputils::isSingleScalar(VPV: Op);
}
2633
// Generate a (vector-of-pointers) GEP, keeping loop-invariant operands
// scalar and broadcasting only when all operands are invariant.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  bool AllOperandsAreInvariant = all_of(Range: operands(), P: [](VPValue *Op) {
    return Op->isDefinedOutsideLoopRegions();
  });
  if (AllOperandsAreInvariant) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.

    // Gather the lane-0 value of every operand.
    SmallVector<Value *> Ops;
    for (unsigned I = 0, E = getNumOperands(); I != E; I++)
      Ops.push_back(Elt: State.get(Def: getOperand(N: I), Lane: VPLane(0)));

    auto *NewGEP =
        State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr: Ops[0], IdxList: drop_begin(RangeOrContainer&: Ops),
                                Name: "", NW: getGEPNoWrapFlags());
    Value *Splat = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
    State.set(Def: this, V: Splat);
    return;
  }

  // If the GEP has at least one loop-varying operand, we are sure to
  // produce a vector of pointers unless VF is scalar.
  // The pointer operand of the new GEP. If it's loop-invariant, we
  // won't broadcast it.
  auto *Ptr = State.get(Def: getOperand(N: 0), IsScalar: isPointerLoopInvariant());

  // Collect all the indices for the new GEP. If any index is
  // loop-invariant, we won't broadcast it.
  SmallVector<Value *, 4> Indices;
  for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
    VPValue *Operand = getOperand(N: I);
    Indices.push_back(Elt: State.get(Def: Operand, IsScalar: isIndexLoopInvariant(I: I - 1)));
  }

  // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
  // but it should be a vector, otherwise.
  auto *NewGEP = State.Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Indices,
                                         Name: "", NW: getGEPNoWrapFlags());
  assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
         "NewGEP is not a pointer vector");
  State.set(Def: this, V: NewGEP);
}
2687
2688#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2689void VPWidenGEPRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
2690 VPSlotTracker &SlotTracker) const {
2691 O << Indent << "WIDEN-GEP ";
2692 O << (isPointerLoopInvariant() ? "Inv" : "Var");
2693 for (size_t I = 0; I < getNumOperands() - 1; ++I)
2694 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
2695
2696 O << " ";
2697 printAsOperand(O, SlotTracker);
2698 O << " = getelementptr";
2699 printFlags(O);
2700 printOperands(O, SlotTracker);
2701}
2702#endif
2703
// Materialize the offset operand for unroll part \p Part as VPlan IR inserted
// before this recipe: Offset = Stride * (VF - 1) + Part * Stride * VF, in the
// pointer's index type. The computed offset is appended as a new operand.
void VPVectorEndPointerRecipe::materializeOffset(unsigned Part) {
  assert(!getOffset() && "Unexpected offset operand");
  VPBuilder Builder(this);
  VPlan &Plan = *getParent()->getPlan();
  VPValue *VFVal = getVFValue();
  VPTypeAnalysis TypeInfo(Plan);
  const DataLayout &DL = Plan.getDataLayout();
  Type *IndexTy = DL.getIndexType(PtrTy: TypeInfo.inferScalarType(V: this));
  VPValue *Stride =
      Plan.getConstantInt(Ty: IndexTy, Val: getStride(), /*IsSigned=*/true);
  // VF may not already be in the index type; zext/trunc it as needed.
  Type *VFTy = TypeInfo.inferScalarType(V: VFVal);
  VPValue *VF = Builder.createScalarZExtOrTrunc(Op: VFVal, ResultTy: IndexTy, SrcTy: VFTy,
                                                DL: DebugLoc::getUnknown());

  // Offset for Part0 = Offset0 = Stride * (VF - 1).
  VPInstruction *VFMinusOne =
      Builder.createSub(LHS: VF, RHS: Plan.getConstantInt(Ty: IndexTy, Val: 1u),
                        DL: DebugLoc::getUnknown(), Name: "", WrapFlags: {true, true});
  VPInstruction *Offset0 =
      Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {VFMinusOne, Stride});

  // Offset for PartN = Offset0 + Part * Stride * VF.
  VPValue *PartxStride =
      Plan.getConstantInt(Ty: IndexTy, Val: Part * getStride(), /*IsSigned=*/true);
  VPValue *Offset = Builder.createAdd(
      LHS: Offset0,
      RHS: Builder.createOverflowingOp(Opcode: Instruction::Mul, Operands: {PartxStride, VF}));
  addOperand(Operand: Offset);
}
2733
// Generate the end pointer as a single scalar GEP of the previously
// materialized offset (see materializeOffset) from the base pointer.
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() && "Expected prior materialization of offset");
  Value *Ptr = State.get(Def: getPointer(), IsScalar: true);
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2743
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits: "<res> = vector-end-pointer<flags> <operands>".
void VPVectorEndPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-end-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2754
// Generate the part's pointer as a single scalar GEP of the offset operand
// from the lane-0 base pointer.
void VPVectorPointerRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  assert(getOffset() &&
         "Expected prior simplification of recipe without offset");
  Value *Ptr = State.get(Def: getOperand(N: 0), Lane: VPLane(0));
  Value *Offset = State.get(Def: getOffset(), IsScalar: true);
  Value *ResultPtr = Builder.CreateGEP(Ty: getSourceElementType(), Ptr, IdxList: Offset, Name: "",
                                       NW: getGEPNoWrapFlags());
  State.set(Def: this, V: ResultPtr, /*IsScalar*/ true);
}
2765
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits: "<res> = vector-pointer<flags> <operands>".
void VPVectorPointerRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = vector-pointer";
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif
2776
// Cost a blend as the chain of selects it will be expanded to: N incoming
// values require N-1 selects.
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  // A blend will be expanded to a select VPInstruction, which will generate a
  // scalar select if only the first lane is used.
  if (vputils::onlyFirstLaneUsed(Def: this))
    VF = ElementCount::getFixed(MinVal: 1);

  Type *ResultTy = toVectorTy(Scalar: Ctx.Types.inferScalarType(V: this), EC: VF);
  Type *CmpTy = toVectorTy(Scalar: Type::getInt1Ty(C&: Ctx.Types.getContext()), EC: VF);
  return (getNumIncomingValues() - 1) *
         Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: ResultTy, CondTy: CmpTy,
                                    VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Ctx.CostKind);
}
2790
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Debug printer. Emits "BLEND <res> =<flags>" followed by each incoming
// value, with "/<mask>" appended for masked incoming values.
void VPBlendRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  printFlags(O);
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      if (I != 0)
        O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      // In normalized form the first incoming value carries no mask.
      if (I == 0 && isNormalized())
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif
2815
2816void VPReductionRecipe::execute(VPTransformState &State) {
2817 assert(!State.Lane && "Reduction being replicated.");
2818 RecurKind Kind = getRecurrenceKind();
2819 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
2820 "In-loop AnyOf reductions aren't currently supported");
2821 // Propagate the fast-math flags carried by the underlying instruction.
2822 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
2823 State.Builder.setFastMathFlags(getFastMathFlags());
2824 Value *NewVecOp = State.get(Def: getVecOp());
2825 if (VPValue *Cond = getCondOp()) {
2826 Value *NewCond = State.get(Def: Cond, IsScalar: State.VF.isScalar());
2827 VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType());
2828 Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
2829
2830 Value *Start = getRecurrenceIdentity(K: Kind, Tp: ElementTy, FMF: getFastMathFlags());
2831 if (State.VF.isVector())
2832 Start = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Start);
2833
2834 Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Start);
2835 NewVecOp = Select;
2836 }
2837 Value *NewRed;
2838 Value *NextInChain;
2839 if (isOrdered()) {
2840 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2841 if (State.VF.isVector())
2842 NewRed =
2843 createOrderedReduction(B&: State.Builder, RdxKind: Kind, Src: NewVecOp, Start: PrevInChain);
2844 else
2845 NewRed = State.Builder.CreateBinOp(
2846 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2847 LHS: PrevInChain, RHS: NewVecOp);
2848 PrevInChain = NewRed;
2849 NextInChain = NewRed;
2850 } else if (isPartialReduction()) {
2851 assert((Kind == RecurKind::Add || Kind == RecurKind::FAdd) &&
2852 "Unexpected partial reduction kind");
2853 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ false);
2854 NewRed = State.Builder.CreateIntrinsic(
2855 RetTy: PrevInChain->getType(),
2856 ID: Kind == RecurKind::Add ? Intrinsic::vector_partial_reduce_add
2857 : Intrinsic::vector_partial_reduce_fadd,
2858 Args: {PrevInChain, NewVecOp}, FMFSource: State.Builder.getFastMathFlags(),
2859 Name: "partial.reduce");
2860 PrevInChain = NewRed;
2861 NextInChain = NewRed;
2862 } else {
2863 assert(isInLoop() &&
2864 "The reduction must either be ordered, partial or in-loop");
2865 Value *PrevInChain = State.get(Def: getChainOp(), /*IsScalar*/ true);
2866 NewRed = createSimpleReduction(B&: State.Builder, Src: NewVecOp, RdxKind: Kind);
2867 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
2868 NextInChain = createMinMaxOp(Builder&: State.Builder, RK: Kind, Left: NewRed, Right: PrevInChain);
2869 else
2870 NextInChain = State.Builder.CreateBinOp(
2871 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind),
2872 LHS: PrevInChain, RHS: NewRed);
2873 }
2874 State.set(Def: this, V: NextInChain, /*IsScalar*/ !isPartialReduction());
2875}
2876
// Emit an EVL (explicit-vector-length) predicated reduction: reduce the
// vector operand under Mask up to EVL elements and combine the result with
// the scalar chain value.
void VPReductionEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Reduction being replicated.");

  auto &Builder = State.Builder;
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
  Builder.setFastMathFlags(getFastMathFlags());

  RecurKind Kind = getRecurrenceKind();
  Value *Prev = State.get(Def: getChainOp(), /*IsScalar*/ true);
  Value *VecOp = State.get(Def: getVecOp());
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));

  // Without an explicit condition operand, use an all-true mask.
  Value *Mask;
  if (VPValue *CondOp = getCondOp())
    Mask = State.get(Def: CondOp);
  else
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());

  Value *NewRed;
  if (isOrdered()) {
    NewRed = createOrderedReduction(B&: Builder, RdxKind: Kind, Src: VecOp, Start: Prev, Mask, EVL);
  } else {
    NewRed = createSimpleReduction(B&: Builder, Src: VecOp, RdxKind: Kind, Mask, EVL);
    // Combine the reduced value with the previous chain value.
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
      NewRed = createMinMaxOp(Builder, RK: Kind, Left: NewRed, Right: Prev);
    else
      NewRed = Builder.CreateBinOp(
          Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind), LHS: NewRed,
          RHS: Prev);
  }
  State.set(Def: this, V: NewRed, /*IsScalar*/ true);
}
2910
// Cost this reduction via the matching TTI hook: partial-reduction cost
// (plus a select when conditional), min/max reduction cost, or arithmetic
// reduction cost, depending on the recurrence kind.
InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  RecurKind RdxKind = getRecurrenceKind();
  Type *ElementTy = Ctx.Types.inferScalarType(V: this);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ElementTy, EC: VF));
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
  FastMathFlags FMFs = getFastMathFlags();
  // Fast-math flags are only passed along for floating-point reductions.
  std::optional<FastMathFlags> OptionalFMF =
      ElementTy->isFloatingPointTy() ? std::make_optional(t&: FMFs) : std::nullopt;

  if (isPartialReduction()) {
    // A conditional partial reduction additionally emits a select on the
    // input vector.
    InstructionCost CondCost = 0;
    if (isConditional()) {
      CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
      auto *CondTy = cast<VectorType>(
          Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getCondOp()), EC: VF));
      CondCost = Ctx.TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VectorTy,
                                            CondTy, VecPred: Pred, CostKind: Ctx.CostKind);
    }
    return CondCost + Ctx.TTI.getPartialReductionCost(
                          Opcode, InputTypeA: ElementTy, InputTypeB: ElementTy, AccumType: ElementTy, VF,
                          OpAExtend: TTI::PR_None, OpBExtend: TTI::PR_None, BinOp: {}, CostKind: Ctx.CostKind,
                          FMF: OptionalFMF);
  }

  // TODO: Support any-of reductions.
  assert(
      (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RdxKind) ||
       ForceTargetInstructionCost.getNumOccurrences() > 0) &&
      "Any-of reduction not implemented in VPlan-based cost model currently.");

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getMinMaxReductionCost().
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RdxKind)) {
    Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
    return Ctx.TTI.getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF: FMFs, CostKind: Ctx.CostKind);
  }

  // Note that TTI should model the cost of moving result to the scalar register
  // and the BinOp cost in the getArithmeticReductionCost().
  return Ctx.TTI.getArithmeticReductionCost(Opcode, Ty: VectorTy, FMF: OptionalFMF,
                                            CostKind: Ctx.CostKind);
}
2954
// Bundle a list of recipes into one expression recipe. Contained recipes are
// detached from their parent block; recipes with users outside the
// expression are cloned for those external users; operands defined outside
// the expression become operands of the VPExpressionRecipe itself and are
// replaced inside the bundle by placeholder live-ins (LiveInPlaceholders).
VPExpressionRecipe::VPExpressionRecipe(
    ExpressionTypes ExpressionType,
    ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
    : VPSingleDefRecipe(VPRecipeBase::VPExpressionSC, {}, {}),
      ExpressionRecipes(ExpressionRecipes), ExpressionType(ExpressionType) {
  assert(!ExpressionRecipes.empty() && "Nothing to combine?");
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");

  // Maintain a copy of the expression recipes as a set of users.
  SmallPtrSet<VPUser *, 4> ExpressionRecipesAsSetOfUsers;
  for (auto *R : ExpressionRecipes)
    ExpressionRecipesAsSetOfUsers.insert(Ptr: R);

  // Recipes in the expression, except the last one, must only be used by
  // (other) recipes inside the expression. If there are other users, external
  // to the expression, use a clone of the recipe for external users.
  for (VPSingleDefRecipe *R : reverse(C&: ExpressionRecipes)) {
    if (R != ExpressionRecipes.back() &&
        any_of(Range: R->users(), P: [&ExpressionRecipesAsSetOfUsers](VPUser *U) {
          return !ExpressionRecipesAsSetOfUsers.contains(Ptr: U);
        })) {
      // There are users outside of the expression. Clone the recipe and use the
      // clone those external users.
      VPSingleDefRecipe *CopyForExtUsers = R->clone();
      R->replaceUsesWithIf(New: CopyForExtUsers, ShouldReplace: [&ExpressionRecipesAsSetOfUsers](
                                                  VPUser &U, unsigned) {
        return !ExpressionRecipesAsSetOfUsers.contains(Ptr: &U);
      });
      CopyForExtUsers->insertBefore(InsertPos: R);
    }
    // Detach the recipe from its block; it now lives inside the bundle.
    if (R->getParent())
      R->removeFromParent();
  }

  // Internalize all external operands to the expression recipes. To do so,
  // create new temporary VPValues for all operands defined by a recipe outside
  // the expression. The original operands are added as operands of the
  // VPExpressionRecipe itself.
  for (auto *R : ExpressionRecipes) {
    for (const auto &[Idx, Op] : enumerate(First: R->operands())) {
      auto *Def = Op->getDefiningRecipe();
      if (Def && ExpressionRecipesAsSetOfUsers.contains(Ptr: Def))
        continue;
      addOperand(Operand: Op);
      LiveInPlaceholders.push_back(Elt: new VPSymbolicValue());
    }
  }

  // Replace each external operand with the first one created for it in
  // LiveInPlaceholders.
  for (auto *R : ExpressionRecipes)
    for (auto const &[LiveIn, Tmp] : zip(t: operands(), u&: LiveInPlaceholders))
      R->replaceUsesOfWith(From: LiveIn, To: Tmp);
}
3012
// Undo bundling: re-insert the contained recipes before this recipe, rewire
// the placeholder live-ins back to the real external operands, and forward
// all users of the expression to its final recipe.
void VPExpressionRecipe::decompose() {
  for (auto *R : ExpressionRecipes)
    // Since the list could contain duplicates, make sure the recipe hasn't
    // already been inserted.
    if (!R->getParent())
      R->insertBefore(InsertPos: this);

  // Reconnect placeholder live-ins to the corresponding external operands.
  for (const auto &[Idx, Op] : enumerate(First: operands()))
    LiveInPlaceholders[Idx]->replaceAllUsesWith(New: Op);

  // The last recipe defines the result of the whole expression.
  replaceAllUsesWith(New: ExpressionRecipes.back());
  ExpressionRecipes.clear();
}
3026
3027InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
3028 VPCostContext &Ctx) const {
3029 Type *RedTy = Ctx.Types.inferScalarType(V: this);
3030 auto *SrcVecTy = cast<VectorType>(
3031 Val: toVectorTy(Scalar: Ctx.Types.inferScalarType(V: getOperand(N: 0)), EC: VF));
3032 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3033 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes.back())->getRecurrenceKind());
3034 switch (ExpressionType) {
3035 case ExpressionTypes::ExtendedReduction: {
3036 unsigned Opcode = RecurrenceDescriptor::getOpcode(
3037 Kind: cast<VPReductionRecipe>(Val: ExpressionRecipes[1])->getRecurrenceKind());
3038 auto *ExtR = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3039 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3040
3041 if (RedR->isPartialReduction())
3042 return Ctx.TTI.getPartialReductionCost(
3043 Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)), InputTypeB: nullptr, AccumType: RedTy, VF,
3044 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(CastOpc: ExtR->getOpcode()),
3045 OpBExtend: TargetTransformInfo::PR_None, BinOp: std::nullopt, CostKind: Ctx.CostKind,
3046 FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3047 : std::nullopt);
3048 else if (!RedTy->isFloatingPointTy())
3049 // TTI::getExtendedReductionCost only supports integer types.
3050 return Ctx.TTI.getExtendedReductionCost(
3051 Opcode, IsUnsigned: ExtR->getOpcode() == Instruction::ZExt, ResTy: RedTy, Ty: SrcVecTy,
3052 FMF: std::nullopt, CostKind: Ctx.CostKind);
3053 else
3054 return InstructionCost::getInvalid();
3055 }
3056 case ExpressionTypes::MulAccReduction:
3057 return Ctx.TTI.getMulAccReductionCost(IsUnsigned: false, RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy,
3058 CostKind: Ctx.CostKind);
3059
3060 case ExpressionTypes::ExtNegatedMulAccReduction:
3061 assert(Opcode == Instruction::Add && "Unexpected opcode");
3062 Opcode = Instruction::Sub;
3063 [[fallthrough]];
3064 case ExpressionTypes::ExtMulAccReduction: {
3065 auto *RedR = cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3066 if (RedR->isPartialReduction()) {
3067 auto *Ext0R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[0]);
3068 auto *Ext1R = cast<VPWidenCastRecipe>(Val: ExpressionRecipes[1]);
3069 auto *Mul = cast<VPWidenRecipe>(Val: ExpressionRecipes[2]);
3070 return Ctx.TTI.getPartialReductionCost(
3071 Opcode, InputTypeA: Ctx.Types.inferScalarType(V: getOperand(N: 0)),
3072 InputTypeB: Ctx.Types.inferScalarType(V: getOperand(N: 1)), AccumType: RedTy, VF,
3073 OpAExtend: TargetTransformInfo::getPartialReductionExtendKind(
3074 CastOpc: Ext0R->getOpcode()),
3075 OpBExtend: TargetTransformInfo::getPartialReductionExtendKind(
3076 CastOpc: Ext1R->getOpcode()),
3077 BinOp: Mul->getOpcode(), CostKind: Ctx.CostKind,
3078 FMF: RedTy->isFloatingPointTy() ? std::optional{RedR->getFastMathFlags()}
3079 : std::nullopt);
3080 }
3081 return Ctx.TTI.getMulAccReductionCost(
3082 IsUnsigned: cast<VPWidenCastRecipe>(Val: ExpressionRecipes.front())->getOpcode() ==
3083 Instruction::ZExt,
3084 RedOpcode: Opcode, ResTy: RedTy, Ty: SrcVecTy, CostKind: Ctx.CostKind);
3085 }
3086 }
3087 llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
3088}
3089
3090bool VPExpressionRecipe::mayReadOrWriteMemory() const {
3091 return any_of(Range: ExpressionRecipes, P: [](VPSingleDefRecipe *R) {
3092 return R->mayReadFromMemory() || R->mayWriteToMemory();
3093 });
3094}
3095
// Expression recipes are constructed only from side-effect-free recipes
// (asserted here and in the constructor), so the bundle itself has none.
bool VPExpressionRecipe::mayHaveSideEffects() const {
  assert(
      none_of(ExpressionRecipes,
              [](VPSingleDefRecipe *R) { return R->mayHaveSideEffects(); }) &&
      "expression cannot contain recipes with side-effects");
  return false;
}
3103
3104bool VPExpressionRecipe::isSingleScalar() const {
3105 // Cannot use vputils::isSingleScalar(), because all external operands
3106 // of the expression will be live-ins while bundled.
3107 auto *RR = dyn_cast<VPReductionRecipe>(Val: ExpressionRecipes.back());
3108 return RR && !RR->isPartialReduction();
3109}
3110
3111#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3112
// Debug printer. Emits "EXPRESSION <res> = ..." where the tail spells out the
// bundled reduction in a human-readable form, depending on the expression
// type (extended reduction, (ext-)mul-acc reduction, or its negated variant).
void VPExpressionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPRESSION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  // The last recipe is always the reduction that defines the result.
  auto *Red = cast<VPReductionRecipe>(ExpressionRecipes.back());
  unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());

  switch (ExpressionType) {
  case ExpressionTypes::ExtendedReduction: {
    // "<chain> + [partial.]reduce.<op> (<src> <ext> to <ty>[, <cond>])".
    getOperand(1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(Opcode) << " (";
    getOperand(0)->printAsOperand(O, SlotTracker);
    Red->printFlags(O);

    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType();
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  case ExpressionTypes::ExtNegatedMulAccReduction: {
    // "<chain> + [partial.]reduce.<op> (sub (0, mul(<a> ext, <b> ext)))".
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (sub (0, mul";
    auto *Mul = cast<VPWidenRecipe>(ExpressionRecipes[2]);
    Mul->printFlags(O);
    O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
    O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
      << *Ext0->getResultType() << "), (";
    getOperand(1)->printAsOperand(O, SlotTracker);
    auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
    O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
      << *Ext1->getResultType() << ")";
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << "))";
    break;
  }
  case ExpressionTypes::MulAccReduction:
  case ExpressionTypes::ExtMulAccReduction: {
    // "<chain> + [partial.]reduce.<op> (mul <a>[ ext], <b>[ ext][, <cond>])".
    getOperand(getNumOperands() - 1)->printAsOperand(O, SlotTracker);
    O << " + " << (Red->isPartialReduction() ? "partial." : "") << "reduce.";
    O << Instruction::getOpcodeName(
             RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()))
      << " (";
    O << "mul";
    bool IsExtended = ExpressionType == ExpressionTypes::ExtMulAccReduction;
    // In the extended form, the multiply is the third recipe (after the two
    // extends); otherwise it is the first.
    auto *Mul = cast<VPWidenRecipe>(IsExtended ? ExpressionRecipes[2]
                                               : ExpressionRecipes[0]);
    Mul->printFlags(O);
    if (IsExtended)
      O << "(";
    getOperand(0)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext0 = cast<VPWidenCastRecipe>(ExpressionRecipes[0]);
      O << " " << Instruction::getOpcodeName(Ext0->getOpcode()) << " to "
        << *Ext0->getResultType() << "), (";
    } else {
      O << ", ";
    }
    getOperand(1)->printAsOperand(O, SlotTracker);
    if (IsExtended) {
      auto *Ext1 = cast<VPWidenCastRecipe>(ExpressionRecipes[1]);
      O << " " << Instruction::getOpcodeName(Ext1->getOpcode()) << " to "
        << *Ext1->getResultType() << ")";
    }
    if (Red->isConditional()) {
      O << ", ";
      Red->getCondOp()->printAsOperand(O, SlotTracker);
    }
    O << ")";
    break;
  }
  }
}
3200
// Debug printer. Emits: "REDUCE"/"PARTIAL-REDUCE <res> = <chain> +<flags>
// reduce.<op> (<vec-op>[, <cond>])".
void VPReductionRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  if (isPartialReduction())
    O << Indent << "PARTIAL-REDUCE ";
  else
    O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
3223
// Debug printer. Emits: "REDUCE <res> = <chain> +<flags> vp.reduce.<op>
// (<vec-op>, <evl>[, <cond>])".
void VPReductionEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  printFlags(O);
  O << " vp.reduce."
    << Instruction::getOpcodeName(
           RecurrenceDescriptor::getOpcode(getRecurrenceKind()))
    << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (isConditional()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
}
3245
3246#endif
3247
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for lane \p Lane. Uses the VPValue
/// operands from \p RepRecipe instead of \p Instr's operands, so narrowed or
/// otherwise transformed operands are picked up from the VPlan.
static void scalarizeInstruction(const Instruction *Instr,
                                 VPReplicateRecipe *RepRecipe,
                                 const VPLane &Lane, VPTransformState &State) {
  assert((!Instr->getType()->isAggregateType() ||
          canVectorizeTy(Instr->getType())) &&
         "Expected vectorizable or non-aggregate type.");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  // Clone the underlying IR instruction; its operands are rewritten below.
  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned");
    Type *ResultTy = State.TypeAnalysis.inferScalarType(V: RepRecipe);
    // The operands of the replicate recipe may have been narrowed, resulting in
    // a narrower result type. Update the type of the cloned instruction to the
    // correct type.
    if (ResultTy != Cloned->getType())
      Cloned->mutateType(Ty: ResultTy);
  }

  // Transfer IR flags and metadata tracked on the recipe to the clone.
  RepRecipe->applyFlags(I&: *Cloned);
  RepRecipe->applyMetadata(I&: *Cloned);

  // Compare predicates are tracked on the recipe; re-apply to the clone.
  if (RepRecipe->hasPredicate())
    cast<CmpInst>(Val: Cloned)->setPredicate(RepRecipe->getPredicate());

  if (auto DL = RepRecipe->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputLane = Lane;
    VPValue *Operand = I.value();
    // Single-scalar operands only have a lane-0 value; read that for any lane.
    if (vputils::isSingleScalar(VPV: Operand))
      InputLane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Lane: InputLane));
  }

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  // Record the generated value for this (recipe, lane) pair.
  State.set(Def: RepRecipe, V: Cloned, Lane);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    State.AC->registerAssumption(CI: II);

  assert(
      (RepRecipe->getRegion() ||
       !RepRecipe->getParent()->getPlan()->getVectorLoopRegion() ||
       all_of(RepRecipe->operands(),
              [](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
      "Expected a recipe is either within a region or all of its operands "
      "are defined outside the vectorized region.");
}
3308
// Generate one scalar instance of the underlying instruction, either a single
// unrolled instance (no lane in State) or the instance for the current lane,
// optionally packing the per-lane results into a vector value.
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();

  // No lane set: we are outside a replicate region, so only a single scalar
  // instance (lane 0) is generated.
  if (!State.Lane) {
    assert(IsSingleScalar && "VPReplicateRecipes outside replicate regions "
                             "must have already been unrolled");
    scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: VPLane(0), State);
    return;
  }

  assert((State.VF.isScalar() || !isSingleScalar()) &&
         "uniform recipe shouldn't be predicated");
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
  scalarizeInstruction(Instr: UI, RepRecipe: this, Lane: *State.Lane, State);
  // Insert scalar instance packing it into a vector.
  if (State.VF.isVector() && shouldPack()) {
    // Lane 0 starts from poison; later lanes extend the vector built so far.
    Value *WideValue =
        State.Lane->isFirstLane()
            ? PoisonValue::get(T: toVectorizedTy(Ty: UI->getType(), EC: State.VF))
            : State.get(Def: this);
    State.set(Def: this, V: State.packScalarIntoVectorizedValue(Def: this, WideValue,
                                                      Lane: *State.Lane));
  }
}
3333
3334bool VPReplicateRecipe::shouldPack() const {
3335 // Find if the recipe is used by a widened recipe via an intervening
3336 // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
3337 return any_of(Range: users(), P: [](const VPUser *U) {
3338 if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U))
3339 return !vputils::onlyScalarValuesUsed(Def: PredR);
3340 return false;
3341 });
3342}
3343
3344/// Returns a SCEV expression for \p Ptr if it is a pointer computation for
3345/// which the legacy cost model computes a SCEV expression when computing the
3346/// address cost. Computing SCEVs for VPValues is incomplete and returns
3347/// SCEVCouldNotCompute in cases the legacy cost model can compute SCEVs. In
3348/// those cases we fall back to the legacy cost model. Otherwise return nullptr.
3349static const SCEV *getAddressAccessSCEV(const VPValue *Ptr,
3350 PredicatedScalarEvolution &PSE,
3351 const Loop *L) {
3352 const SCEV *Addr = vputils::getSCEVExprForVPValue(V: Ptr, PSE, L);
3353 if (isa<SCEVCouldNotCompute>(Val: Addr))
3354 return Addr;
3355
3356 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L) ? Addr : nullptr;
3357}
3358
/// Returns true if \p V is used as part of the address of another load or
/// store. Performs a forward worklist traversal over \p V's users, looking for
/// a memory recipe that consumes a visited value as its address operand.
static bool isUsedByLoadStoreAddress(const VPUser *V) {
  SmallPtrSet<const VPUser *, 4> Seen;
  SmallVector<const VPUser *> WorkList = {V};

  while (!WorkList.empty()) {
    // Only single-def recipes can feed an address; skip anything else and
    // anything already visited.
    auto *Cur = dyn_cast<VPSingleDefRecipe>(Val: WorkList.pop_back_val());
    if (!Cur || !Seen.insert(Ptr: Cur).second)
      continue;

    auto *Blend = dyn_cast<VPBlendRecipe>(Val: Cur);
    // Skip blends that use V only through a compare by checking if any incoming
    // value was already visited.
    if (Blend && none_of(Range: seq<unsigned>(Begin: 0, End: Blend->getNumIncomingValues()),
                         P: [&](unsigned I) {
                           return Seen.contains(
                               Ptr: Blend->getIncomingValue(Idx: I)->getDefiningRecipe());
                         }))
      continue;

    // Check whether any direct user consumes Cur as an address operand.
    for (VPUser *U : Cur->users()) {
      if (auto *InterleaveR = dyn_cast<VPInterleaveBase>(Val: U))
        if (InterleaveR->getAddr() == Cur)
          return true;
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: U)) {
        // Replicated loads take the address as operand 0, stores as operand 1.
        if (RepR->getOpcode() == Instruction::Load &&
            RepR->getOperand(N: 0) == Cur)
          return true;
        if (RepR->getOpcode() == Instruction::Store &&
            RepR->getOperand(N: 1) == Cur)
          return true;
      }
      if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(Val: U)) {
        if (MemR->getAddr() == Cur && MemR->isConsecutive())
          return true;
      }
    }

    // The legacy cost model only supports scalarization loads/stores with phi
    // addresses, if the phi is directly used as load/store address. Don't
    // traverse further for Blends.
    if (Blend)
      continue;

    append_range(C&: WorkList, R: Cur->users());
  }
  return false;
}
3408
3409/// Return true if \p R is a predicated load/store with a loop-invariant address
3410/// only masked by the header mask.
3411static bool isPredicatedUniformMemOpAfterTailFolding(const VPReplicateRecipe &R,
3412 const SCEV *PtrSCEV,
3413 VPCostContext &Ctx) {
3414 const VPRegionBlock *ParentRegion = R.getRegion();
3415 if (!ParentRegion || !ParentRegion->isReplicator() || !PtrSCEV ||
3416 !Ctx.PSE.getSE()->isLoopInvariant(S: PtrSCEV, L: Ctx.L))
3417 return false;
3418 auto *BOM =
3419 cast<VPBranchOnMaskRecipe>(Val: &ParentRegion->getEntryBasicBlock()->front());
3420 return vputils::isHeaderMask(V: BOM->getOperand(N: 0), Plan: *ParentRegion->getPlan());
3421}
3422
// Compute the cost of generating this recipe as VF scalar instances (or one
// instance when single-scalar). Falls through the switch to defer to the
// legacy cost model for cases it handles differently.
InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                               VPCostContext &Ctx) const {
  Instruction *UI = cast<Instruction>(Val: getUnderlyingValue());
  // VPReplicateRecipe may be cloned as part of an existing VPlan-to-VPlan
  // transform, avoid computing their cost multiple times for now.
  Ctx.SkipCostComputation.insert(Ptr: UI);

  // Replicating per lane needs a fixed lane count; scalable VFs are only
  // supported for single-scalar recipes.
  if (VF.isScalable() && !isSingleScalar())
    return InstructionCost::getInvalid();

  switch (UI->getOpcode()) {
  case Instruction::Alloca:
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return Ctx.TTI.getArithmeticInstrCost(
        Opcode: Instruction::Mul, Ty: Ctx.Types.inferScalarType(V: this), CostKind: Ctx.CostKind);
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Call: {
    // The callee is the last operand; the preceding operands are arguments.
    auto *CalledFn =
        cast<Function>(Val: getOperand(N: getNumOperands() - 1)->getLiveInIRValue());

    SmallVector<const VPValue *> ArgOps(drop_end(RangeOrContainer: operands()));
    SmallVector<Type *, 4> Tys;
    for (const VPValue *ArgOp : ArgOps)
      Tys.push_back(Elt: Ctx.Types.inferScalarType(V: ArgOp));

    if (CalledFn->isIntrinsic())
      // Various pseudo-intrinsics with costs of 0 are scalarized instead of
      // vectorized via VPWidenIntrinsicRecipe. Return 0 for them early.
      switch (CalledFn->getIntrinsicID()) {
      case Intrinsic::assume:
      case Intrinsic::lifetime_end:
      case Intrinsic::lifetime_start:
      case Intrinsic::sideeffect:
      case Intrinsic::pseudoprobe:
      case Intrinsic::experimental_noalias_scope_decl: {
        assert(getCostForIntrinsics(CalledFn->getIntrinsicID(), ArgOps, *this,
                                    ElementCount::getFixed(1), Ctx) == 0 &&
               "scalarizing intrinsic should be free");
        return InstructionCost(0);
      }
      default:
        break;
      }

    Type *ResultTy = Ctx.Types.inferScalarType(V: this);
    InstructionCost ScalarCallCost =
        Ctx.TTI.getCallInstrCost(F: CalledFn, RetTy: ResultTy, Tys, CostKind: Ctx.CostKind);
    if (isSingleScalar()) {
      // For intrinsics, the dedicated intrinsic cost may be lower than the
      // generic call cost; take the cheaper of the two.
      if (CalledFn->isIntrinsic())
        ScalarCallCost = std::min(
            a: ScalarCallCost,
            b: getCostForIntrinsics(ID: CalledFn->getIntrinsicID(), Operands: ArgOps, R: *this,
                                 VF: ElementCount::getFixed(MinVal: 1), Ctx));
      return ScalarCallCost;
    }

    // Replicated call: one scalar call per lane plus the overhead of moving
    // arguments/results between vectors and scalars.
    return ScalarCallCost * VF.getFixedValue() +
           Ctx.getScalarizationOverhead(ResultTy, Operands: ArgOps, VF);
  }
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
  case Instruction::FCmp:
    // Simple arithmetic/compares: scalar cost times number of instances.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    InstructionCost ScalarCost =
        getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1), Ctx);
    if (isSingleScalar())
      return ScalarCost;

    // If any of the operands is from a different replicate region and has its
    // cost skipped, it may have been forced to scalar. Fall back to legacy cost
    // model to avoid cost mis-match.
    if (any_of(Range: operands(), P: [&Ctx, VF](VPValue *Op) {
          auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: Op);
          if (!PredR)
            return false;
          return Ctx.skipCostComputation(
              UI: dyn_cast_or_null<Instruction>(
                  Val: PredR->getOperand(N: 0)->getUnderlyingValue()),
              IsVector: VF.isVector());
        }))
      break;

    ScalarCost = ScalarCost * VF.getFixedValue() +
                 Ctx.getScalarizationOverhead(ResultTy: Ctx.Types.inferScalarType(V: this),
                                              Operands: to_vector(Range: operands()), VF);
    // If the recipe is not predicated (i.e. not in a replicate region), return
    // the scalar cost. Otherwise handle predicated cost.
    if (!getRegion()->isReplicator())
      return ScalarCost;

    // Account for the phi nodes that we will create.
    ScalarCost += VF.getFixedValue() *
                  Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarCost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
    return ScalarCost;
  }
  case Instruction::Load:
  case Instruction::Store: {
    // For stores, operand 0 is the stored value and operand 1 the address;
    // loads take the address as operand 0.
    bool IsLoad = UI->getOpcode() == Instruction::Load;
    const VPValue *PtrOp = getOperand(N: !IsLoad);
    const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr: PtrOp, PSE&: Ctx.PSE, L: Ctx.L);
    // SCEVCouldNotCompute: defer to the legacy cost model (fall through).
    if (isa_and_nonnull<SCEVCouldNotCompute>(Val: PtrSCEV))
      break;

    Type *ValTy = Ctx.Types.inferScalarType(V: IsLoad ? this : getOperand(N: 0));
    Type *ScalarPtrTy = Ctx.Types.inferScalarType(V: PtrOp);
    const Align Alignment = getLoadStoreAlignment(I: UI);
    unsigned AS = cast<PointerType>(Val: ScalarPtrTy)->getAddressSpace();
    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: UI->getOperand(i: 0));
    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
    bool UsedByLoadStoreAddress =
        !PreferVectorizedAddressing && isUsedByLoadStoreAddress(V: this);
    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
        Opcode: UI->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind, OpdInfo: OpInfo,
        I: UsedByLoadStoreAddress ? UI : nullptr);

    // Check if this is a predicated load/store with a loop-invariant address
    // only masked by the header mask. If so, return the uniform mem op cost.
    if (isPredicatedUniformMemOpAfterTailFolding(R: *this, PtrSCEV, Ctx)) {
      InstructionCost UniformCost =
          ScalarMemOpCost +
          Ctx.TTI.getAddressComputationCost(PtrTy: ScalarPtrTy, /*SE=*/nullptr,
                                            /*Ptr=*/nullptr, CostKind: Ctx.CostKind);
      auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
      if (IsLoad) {
        // A uniform load is a scalar load plus a broadcast of the result.
        return UniformCost +
               Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast,
                                      DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind);
      }

      // A uniform store of a loop-varying value also pays for extracting the
      // last lane from the stored vector.
      VPValue *StoredVal = getOperand(N: 0);
      if (!StoredVal->isDefinedOutsideLoopRegions())
        UniformCost += Ctx.TTI.getIndexedVectorInstrCostFromEnd(
            Opcode: Instruction::ExtractElement, Val: VectorTy, CostKind: Ctx.CostKind, Index: 0);
      return UniformCost;
    }

    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(Scalar: ScalarPtrTy, EC: VF);
    InstructionCost ScalarCost =
        ScalarMemOpCost +
        Ctx.TTI.getAddressComputationCost(
            PtrTy, SE: UsedByLoadStoreAddress ? nullptr : Ctx.PSE.getSE(), Ptr: PtrSCEV,
            CostKind: Ctx.CostKind);
    if (isSingleScalar())
      return ScalarCost;

    SmallVector<const VPValue *> OpsToScalarize;
    Type *ResultTy = Type::getVoidTy(C&: PtrTy->getContext());
    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
    // don't assign scalarization overhead in general, if the target prefers
    // vectorized addressing or the loaded value is used as part of an address
    // of another load or store.
    if (!UsedByLoadStoreAddress) {
      bool EfficientVectorLoadStore =
          Ctx.TTI.supportsEfficientVectorElementLoadStore();
      if (!(IsLoad && !PreferVectorizedAddressing) &&
          !(!IsLoad && EfficientVectorLoadStore))
        append_range(C&: OpsToScalarize, R: operands());

      if (!EfficientVectorLoadStore)
        ResultTy = Ctx.Types.inferScalarType(V: this);
    }

    TTI::VectorInstrContext VIC =
        IsLoad ? TTI::VectorInstrContext::Load : TTI::VectorInstrContext::Store;
    InstructionCost Cost =
        (ScalarCost * VF.getFixedValue()) +
        Ctx.getScalarizationOverhead(ResultTy, Operands: OpsToScalarize, VF, VIC, AlwaysIncludeReplicatingR: true);

    const VPRegionBlock *ParentRegion = getRegion();
    if (ParentRegion && ParentRegion->isReplicator()) {
      if (!PtrSCEV)
        break;
      // Scale by the probability of executing the predicated block and add
      // the branch plus the per-lane mask extraction.
      Cost /= Ctx.getPredBlockCostDivisor(BB: UI->getParent());
      Cost += Ctx.TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Ctx.CostKind);

      auto *VecI1Ty = VectorType::get(
          ElementType: IntegerType::getInt1Ty(C&: Ctx.L->getHeader()->getContext()), EC: VF);
      Cost += Ctx.TTI.getScalarizationOverhead(
          Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
          /*Insert=*/false, /*Extract=*/true, CostKind: Ctx.CostKind);

      if (Ctx.useEmulatedMaskMemRefHack(R: this, VF)) {
        // Artificially setting to a high enough value to practically disable
        // vectorization with such operations.
        return 3000000;
      }
    }
    return Cost;
  }
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::PtrToAddr:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::Select:
  case Instruction::AddrSpaceCast: {
    // Casts and selects: scalar cost times number of instances.
    return getCostForRecipeWithOpcode(Opcode: getOpcode(), VF: ElementCount::getFixed(MinVal: 1),
                                      Ctx) *
           (isSingleScalar() ? 1 : VF.getFixedValue());
  }
  case Instruction::ExtractValue:
  case Instruction::InsertValue:
    return Ctx.TTI.getInsertExtractValueCost(Opcode: getOpcode(), CostKind: Ctx.CostKind);
  }

  // Anything not handled above falls back to the legacy cost model.
  return Ctx.getLegacyCost(UI, VF);
}
3666
3667#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3668void VPReplicateRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
3669 VPSlotTracker &SlotTracker) const {
3670 O << Indent << (IsSingleScalar ? "CLONE " : "REPLICATE ");
3671
3672 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
3673 printAsOperand(O, SlotTracker);
3674 O << " = ";
3675 }
3676 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
3677 O << "call";
3678 printFlags(O);
3679 O << "@" << CB->getCalledFunction()->getName() << "(";
3680 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
3681 O, [&O, &SlotTracker](VPValue *Op) {
3682 Op->printAsOperand(O, SlotTracker);
3683 });
3684 O << ")";
3685 } else {
3686 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
3687 printFlags(O);
3688 printOperands(O, SlotTracker);
3689 }
3690
3691 if (shouldPack())
3692 O << " (S->V)";
3693}
3694#endif
3695
// Emit a conditional branch on the current lane's mask value, replacing the
// placeholder unreachable terminator of the current block. The branch's
// successors are filled in later, once the target blocks exist.
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Branch on Mask works only on single instance.");

  VPValue *BlockInMask = getOperand(N: 0);
  Value *ConditionBit = State.get(Def: BlockInMask, Lane: *State.Lane);

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  // Create the branch with a placeholder successor first (CreateCondBr
  // requires a non-null true-successor), then null it out.
  auto CondBr =
      State.Builder.CreateCondBr(Cond: ConditionBit, True: State.CFG.PrevBB, False: nullptr);
  CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
  CurrentTerminator->eraseFromParent();
}
3712
3713InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF,
3714 VPCostContext &Ctx) const {
3715 // The legacy cost model doesn't assign costs to branches for individual
3716 // replicate regions. Match the current behavior in the VPlan cost model for
3717 // now.
3718 return 0;
3719}
3720
// Create a phi merging the predicated instruction's result with the value on
// the not-taken path, either for the packed vector value or for the current
// lane's scalar value.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Lane && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Lane: *State.Lane));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  if (State.hasVectorValue(Def: getOperand(N: 0))) {
    auto *VecI = cast<Instruction>(Val: State.get(Def: getOperand(N: 0)));
    assert((isa<InsertElementInst, InsertValueInst>(VecI)) &&
           "Packed operands must generate an insertelement or insertvalue");

    // If VectorI is a struct, it will be a sequence like:
    //   %1 = insertvalue %unmodified, %x, 0
    //   %2 = insertvalue %1, %y, 1
    //   %VectorI = insertvalue %2, %z, 2
    // To get the unmodified vector we need to look through the chain.
    if (auto *StructTy = dyn_cast<StructType>(Val: VecI->getType()))
      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
        VecI = cast<InsertValueInst>(Val: VecI->getOperand(i: 0));

    PHINode *VPhi = State.Builder.CreatePHI(Ty: VecI->getType(), NumReservedValues: 2);
    VPhi->addIncoming(V: VecI->getOperand(i: 0), BB: PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(V: VecI, BB: PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(Def: this))
      State.reset(Def: this, V: VPhi);
    else
      State.set(Def: this, V: VPhi);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: VPhi);
  } else {
    // Scalar path: only generate phis for lanes that are actually used.
    if (vputils::onlyFirstLaneUsed(Def: this) && !State.Lane->isFirstLane())
      return;

    // The not-taken edge contributes poison; the value is only meaningful when
    // the predicated block executed.
    Type *PredInstType = State.TypeAnalysis.inferScalarType(V: getOperand(N: 0));
    PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2);
    Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()),
                     BB: PredicatingBB);
    Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB);
    if (State.hasScalarValue(Def: this, Lane: *State.Lane))
      State.reset(Def: this, V: Phi, Lane: *State.Lane);
    else
      State.set(Def: this, V: Phi, Lane: *State.Lane);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(Def: getOperand(N: 0), V: Phi, Lane: *State.Lane);
  }
}
3779
3780#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  // Format: PHI-PREDICATED-INSTRUCTION <def> = <operand>.
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}
3788#endif
3789
// Compute the cost of a widened memory access: gather/scatter cost for
// non-consecutive accesses, masked intrinsic cost when masked, plain
// load/store cost otherwise.
InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
                                                 VPCostContext &Ctx) const {
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this)
                        ? Instruction::Load
                        : Instruction::Store;

  if (!Consecutive) {
    // TODO: Using the original IR may not be accurate.
    // Currently, ARM will use the underlying IR to calculate gather/scatter
    // instruction cost.
    assert(!Reverse &&
           "Inconsecutive memory access should not have the order.");

    const Value *Ptr = getLoadStorePointerOperand(V: &Ingredient);
    Type *PtrTy = Ptr->getType();

    // If the address value is uniform across all lanes, then the address can be
    // calculated with scalar type and broadcast.
    if (!vputils::isSingleScalar(VPV: getAddr()))
      PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);

    // Pick the matching gather/scatter intrinsic (VP variants for EVL recipes).
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_gather
                 : isa<VPWidenStoreRecipe>(Val: this) ? Intrinsic::masked_scatter
                 : isa<VPWidenLoadEVLRecipe>(Val: this) ? Intrinsic::vp_gather
                 : Intrinsic::vp_scatter;
    return Ctx.TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
                                             CostKind: Ctx.CostKind) +
           Ctx.TTI.getMemIntrinsicInstrCost(
               MICA: MemIntrinsicCostAttributes(IID, Ty, Ptr, IsMasked, Alignment,
                                           &Ingredient),
               CostKind: Ctx.CostKind);
  }

  InstructionCost Cost = 0;
  if (IsMasked) {
    unsigned IID = isa<VPWidenLoadRecipe>(Val: this) ? Intrinsic::masked_load
                                                : Intrinsic::masked_store;
    Cost += Ctx.TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(IID, Ty, Alignment, AS), CostKind: Ctx.CostKind);
  } else {
    // Unmasked consecutive access: a regular wide load/store. Operand info of
    // the loaded/stored value can refine the cost.
    TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo(
        V: isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: this) ? getOperand(N: 0)
                                                            : getOperand(N: 1));
    Cost += Ctx.TTI.getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace: AS, CostKind: Ctx.CostKind,
                                    OpdInfo: OpInfo, I: &Ingredient);
  }
  return Cost;
}
3841
// Generate a wide load: a masked gather for non-consecutive accesses, a
// masked load when a mask is present, or a plain aligned load otherwise.
void VPWidenLoadRecipe::execute(VPTransformState &State) {
  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  // Gathers take a vector of pointers; consecutive loads a single scalar one.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateGather);
  Value *NewLI;
  if (CreateGather) {
    NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
                                       Name: "wide.masked.gather");
  } else if (Mask) {
    NewLI =
        Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
                                 PassThru: PoisonValue::get(T: DataTy), Name: "wide.masked.load");
  } else {
    NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
  }
  applyMetadata(I&: *cast<Instruction>(Val: NewLI));
  State.set(Def: this, V: NewLI);
}
3872
3873#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                    VPSlotTracker &SlotTracker) const {
  // Format: WIDEN <def> = load <operands>.
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}
3881#endif
3882
3883/// Use all-true mask for reverse rather than actual mask, as it avoids a
3884/// dependence w/o affecting the result.
3885static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
3886 Value *EVL, const Twine &Name) {
3887 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
3888 Value *AllTrueMask =
3889 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
3890 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
3891 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
3892}
3893
3894void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
3895 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
3896 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
3897 bool CreateGather = !isConsecutive();
3898
3899 auto &Builder = State.Builder;
3900 CallInst *NewLI;
3901 Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
3902 Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateGather);
3903 Value *Mask = nullptr;
3904 if (VPValue *VPMask = getMask()) {
3905 Mask = State.get(Def: VPMask);
3906 if (isReverse())
3907 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
3908 } else {
3909 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
3910 }
3911
3912 if (CreateGather) {
3913 NewLI =
3914 Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
3915 FMFSource: nullptr, Name: "wide.masked.gather");
3916 } else {
3917 NewLI = Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_load,
3918 Args: {Addr, Mask, EVL}, FMFSource: nullptr, Name: "vp.op.load");
3919 }
3920 NewLI->addParamAttr(
3921 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
3922 applyMetadata(I&: *NewLI);
3923 Instruction *Res = NewLI;
3924 State.set(Def: this, V: Res);
3925}
3926
// Cost of an EVL-based wide load; only the unmasked consecutive case is
// special-cased here, everything else defers to the base class.
InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
                                                  VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_load, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
3944
3945#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenLoadEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  // Format: WIDEN <def> = vp.load <operands>.
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}
3953#endif
3954
// Generate a wide store: a masked scatter for non-consecutive accesses, a
// masked store when a mask is present, or a plain aligned store otherwise.
void VPWidenStoreRecipe::execute(VPTransformState &State) {
  VPValue *StoredVPValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  Value *Mask = nullptr;
  if (auto *VPMask = getMask()) {
    // Mask reversal is only needed for non-all-one (null) masks, as reverse
    // of a null all-one mask is a null mask.
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
  }

  Value *StoredVal = State.get(Def: StoredVPValue);
  // Scatters take a vector of pointers; consecutive stores a scalar one.
  Value *Addr = State.get(Def: getAddr(), /*IsScalar*/ !CreateScatter);
  Instruction *NewSI = nullptr;
  if (CreateScatter)
    NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
  else if (Mask)
    NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
  else
    NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
  applyMetadata(I&: *NewSI);
}
3981
3982#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Format: WIDEN store <operands> (no result to print).
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}
3988#endif
3989
// Generate a vp.store (consecutive) or vp.scatter (non-consecutive) whose
// active length is bounded by the explicit vector length (EVL) operand.
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = getStoredValue();
  bool CreateScatter = !isConsecutive();

  auto &Builder = State.Builder;

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue);
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  Value *Mask = nullptr;
  if (VPValue *VPMask = getMask()) {
    Mask = State.get(Def: VPMask);
    if (isReverse())
      Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
  } else {
    // VP intrinsics require an explicit mask operand; use an all-true mask.
    Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  }
  Value *Addr = State.get(Def: getAddr(), IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_scatter,
                                    Args: {StoredVal, Addr, Mask, EVL});
  } else {
    NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
                                    ID: Intrinsic::vp_store,
                                    Args: {StoredVal, Addr, Mask, EVL});
  }
  // VP intrinsics carry alignment as a parameter attribute on the pointer
  // argument (argument 1 for stores).
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  applyMetadata(I&: *NewSI);
}
4021
// Cost of an EVL-based widened store. Consecutive, unmasked stores are costed
// as a vp.store intrinsic; everything else defers to the generic widened
// memory recipe cost.
InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
                                                   VPCostContext &Ctx) const {
  if (!Consecutive || IsMasked)
    return VPWidenMemoryRecipe::computeCost(VF, Ctx);

  // We need to use the getMemIntrinsicInstrCost() instead of getMemoryOpCost()
  // here because the EVL recipes using EVL to replace the tail mask. But in the
  // legacy model, it will always calculate the cost of mask.
  // TODO: Using getMemoryOpCost() instead of getMemIntrinsicInstrCost when we
  // don't need to compare to the legacy cost model.
  Type *Ty = toVectorTy(Scalar: getLoadStoreType(I: &Ingredient), EC: VF);
  unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
                    ->getAddressSpace();
  return Ctx.TTI.getMemIntrinsicInstrCost(
      MICA: MemIntrinsicCostAttributes(Intrinsic::vp_store, Ty, Alignment, AS),
      CostKind: Ctx.CostKind);
}
4039
4040#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStoreEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  // Print the recipe mnemonic followed by all operands (including the EVL).
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
4046#endif
4047
/// Cast vector \p V to \p DstVTy. Both types must have the same element count
/// and bit-identical element sizes. Emits a single bit-or-pointer cast when
/// the element types allow it; otherwise bridges pointer <-> floating point
/// through an intermediate integer vector of the same element width.
static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V,
                                     VectorType *DstVTy, const DataLayout &DL) {
  // Verify that V is a vector type with same number of elements as DstVTy.
  auto VF = DstVTy->getElementCount();
  auto *SrcVecTy = cast<VectorType>(Val: V->getType());
  assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
  Type *SrcElemTy = SrcVecTy->getElementType();
  Type *DstElemTy = DstVTy->getElementType();
  assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
         "Vector elements must have same size");

  // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
    return Builder.CreateBitOrPointerCast(V, DestTy: DstVTy);
  }
  // V cannot be directly casted to desired vector type.
  // May happen when V is a floating point vector but DstVTy is a vector of
  // pointers or vice-versa. Handle this using a two-step bitcast using an
  // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
  assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
         "Only one type should be a pointer type");
  assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
         "Only one type should be a floating point type");
  Type *IntTy =
      IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
  auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
  Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
  return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstVTy);
}
4077
/// Return a vector containing interleaved elements from multiple
/// smaller input vectors. All vectors in \p Vals must have the same type;
/// \p Vals must contain at least two vectors.
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    assert(Factor <= 8 && "Unsupported interleave factor for scalable vectors");
    return Builder.CreateVectorInterleave(Ops: Vals, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vecs: Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name);
}
4106
4107// Try to vectorize the interleave group that \p Instr belongs to.
4108//
4109// E.g. Translate following interleaved load group (factor = 3):
4110// for (i = 0; i < N; i+=3) {
4111// R = Pic[i]; // Member of index 0
4112// G = Pic[i+1]; // Member of index 1
4113// B = Pic[i+2]; // Member of index 2
4114// ... // do something to R, G, B
4115// }
4116// To:
4117// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
4118// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
4119// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
4120// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
4121//
4122// Or translate following interleaved store group (factor = 3):
4123// for (i = 0; i < N; i+=3) {
4124// ... do something to R, G, B
4125// Pic[i] = R; // Member of index 0
4126// Pic[i+1] = G; // Member of index 1
4127// Pic[i+2] = B; // Member of index 2
4128// }
4129// To:
4130// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
4131// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
4132// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
4133// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
4134// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert((!needsMaskForGaps() || !State.VF.isScalable()) &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: State.VF * InterleaveFactor);

  VPValue *BlockInMask = getMask();
  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));

  // Build the mask for the whole interleaved access from the block-in mask
  // and/or the gap mask. For scalable VFs the block mask is interleaved with
  // itself once per member; for fixed VFs it is replicated via shuffle and
  // AND-combined with the gap mask if one is needed.
  auto CreateGroupMask = [&BlockInMask, &State,
                          &InterleaveFactor](Value *MaskForGaps) -> Value * {
    if (State.VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      auto *ResBlockInMask = State.get(Def: BlockInMask);
      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
      return interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *ResBlockInMask = State.get(Def: BlockInMask);
    Value *ShuffledMask = State.Builder.CreateShuffleVector(
        V: ResBlockInMask,
        Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: State.VF.getFixedValue()),
        Name: "interleaved.mask");
    return MaskForGaps ? State.Builder.CreateBinOp(Opc: Instruction::And,
                                                   LHS: ShuffledMask, RHS: MaskForGaps)
                       : ShuffledMask;
  };

  const DataLayout &DL = Instr->getDataLayout();
  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    Value *MaskForGaps = nullptr;
    if (needsMaskForGaps()) {
      MaskForGaps =
          createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getFixedValue(), Group: *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null");
    }

    // Emit one wide (possibly masked) load covering all members.
    Instruction *NewLoad;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(MaskForGaps);
      Value *PoisonVec = PoisonValue::get(T: VecTy);
      NewLoad = State.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: ResAddr,
                                               Alignment: Group->getAlign(), Mask: GroupMask,
                                               PassThru: PoisonVec, Name: "wide.masked.vec");
    } else
      NewLoad = State.Builder.CreateAlignedLoad(Ty: VecTy, Ptr: ResAddr,
                                                Align: Group->getAlign(), Name: "wide.vec");
    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    ArrayRef<VPRecipeValue *> VPDefs = definedValues();
    if (VecTy->isScalableTy()) {
      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
      // so must use intrinsics to deinterleave.
      assert(InterleaveFactor <= 8 &&
             "Unsupported deinterleave factor for scalable vectors");
      NewLoad = State.Builder.CreateIntrinsic(
          ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
          Types: NewLoad->getType(), Args: NewLoad,
          /*FMFSource=*/nullptr, Name: "strided.vec");
    }

    // Extract the per-member sub-vector: extractvalue from the deinterleave
    // result (scalable) or a strided shuffle of the wide load (fixed VF).
    auto CreateStridedVector = [&InterleaveFactor, &State,
                                &NewLoad](unsigned Index) -> Value * {
      assert(Index < InterleaveFactor && "Illegal group index");
      if (State.VF.isScalable())
        return State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: Index);

      // For fixed length VF, use shuffle to extract the sub-vectors from the
      // wide load.
      auto StrideMask =
          createStrideMask(Start: Index, Stride: InterleaveFactor, VF: State.VF.getFixedValue());
      return State.Builder.CreateShuffleVector(V: NewLoad, Mask: StrideMask,
                                               Name: "strided.vec");
    };

    // J indexes defined VPValues and only advances for present members.
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = CreateStridedVector(I);

      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      if (Group->isReverse())
        StridedVec = State.Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");

      State.set(Def: VPDefs[J], V: StridedVec);
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder&: State.Builder, VF: State.VF.getKnownMinValue(), Group: *Group);
  assert(((MaskForGaps != nullptr) == needsMaskForGaps()) &&
         "Mismatch between NeedsMaskForGaps and MaskForGaps");
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  unsigned StoredIdx = 0;
  for (unsigned i = 0; i < InterleaveFactor; i++) {
    assert((Group->getMember(i) || MaskForGaps) &&
           "Fail to get a member from an interleaved store group");
    Instruction *Member = Group->getMember(Index: i);

    // Skip the gaps in the group. Gap lanes are filled with poison and are
    // masked off by MaskForGaps.
    if (!Member) {
      Value *Undef = PoisonValue::get(T: SubVT);
      StoredVecs.push_back(Elt: Undef);
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    ++StoredIdx;

    if (Group->isReverse())
      StoredVec = State.Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");

    // If this member has different type, cast it to a unified type.

    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  Instruction *NewStoreInstr;
  if (BlockInMask || MaskForGaps) {
    Value *GroupMask = CreateGroupMask(MaskForGaps);
    NewStoreInstr = State.Builder.CreateMaskedStore(
        Val: IVec, Ptr: ResAddr, Alignment: Group->getAlign(), Mask: GroupMask);
  } else
    NewStoreInstr =
        State.Builder.CreateAlignedStore(Val: IVec, Ptr: ResAddr, Align: Group->getAlign());

  applyMetadata(I&: *NewStoreInstr);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStoreInstr);
}
4303
4304#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  VPValue *Mask = getMask();
  if (Mask) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  // Print one line per present member; OpIdx tracks only present members,
  // while i is the member's index within the group (gaps included).
  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      // Stored values follow the address operand (hence 1 + OpIdx).
      O << "\n" << Indent << " store ";
      getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = load from index " << i;
    }
    ++OpIdx;
  }
}
4334#endif
4335
// Lower an interleave group under EVL tail-folding: a single vp.load /
// vp.store covers the whole group, with the EVL scaled by the interleave
// factor and the block mask interleaved to group width.
void VPInterleaveEVLRecipe::execute(VPTransformState &State) {
  assert(!State.Lane && "Interleave group being replicated.");
  assert(State.VF.isScalable() &&
         "Only support scalable VF for EVL tail-folding.");
  assert(!needsMaskForGaps() &&
         "Masking gaps for scalable vectors is not yet supported.");
  const InterleaveGroup<Instruction> *Group = getInterleaveGroup();
  Instruction *Instr = Group->getInsertPos();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(InterleaveFactor <= 8 &&
         "Unsupported deinterleave/interleave factor for scalable vectors");
  ElementCount WideVF = State.VF * InterleaveFactor;
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: WideVF);

  VPValue *Addr = getAddr();
  Value *ResAddr = State.get(Def: Addr, Lane: VPLane(0));
  Value *EVL = State.get(Def: getEVL(), Lane: VPLane(0));
  // The wide access covers InterleaveFactor elements per original lane.
  Value *InterleaveEVL = State.Builder.CreateMul(
      LHS: EVL, RHS: ConstantInt::get(Ty: EVL->getType(), V: InterleaveFactor), Name: "interleave.evl",
      /* NUW= */ HasNUW: true, /* NSW= */ HasNSW: true);
  LLVMContext &Ctx = State.Builder.getContext();

  // Interleave the block mask with itself to group width, or use an
  // all-true splat (VP intrinsics require an explicit mask).
  Value *GroupMask = nullptr;
  if (VPValue *BlockInMask = getMask()) {
    SmallVector<Value *> Ops(InterleaveFactor, State.get(Def: BlockInMask));
    GroupMask = interleaveVectors(Builder&: State.Builder, Vals: Ops, Name: "interleaved.mask");
  } else {
    GroupMask =
        State.Builder.CreateVectorSplat(EC: WideVF, V: State.Builder.getTrue());
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    CallInst *NewLoad = State.Builder.CreateIntrinsic(
        RetTy: VecTy, ID: Intrinsic::vp_load, Args: {ResAddr, GroupMask, InterleaveEVL}, FMFSource: nullptr,
        Name: "wide.vp.load");
    NewLoad->addParamAttr(ArgNo: 0,
                          Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

    applyMetadata(I&: *NewLoad);
    // TODO: Also manage existing metadata using VPIRMetadata.
    Group->addMetadata(NewInst: NewLoad);

    // Scalable vectors cannot use arbitrary shufflevectors (only splats),
    // so must use intrinsics to deinterleave.
    NewLoad = State.Builder.CreateIntrinsic(
        ID: Intrinsic::getDeinterleaveIntrinsicID(Factor: InterleaveFactor),
        Types: NewLoad->getType(), Args: NewLoad,
        /*FMFSource=*/nullptr, Name: "strided.vec");

    const DataLayout &DL = Instr->getDataLayout();
    // J indexes defined VPValues and only advances for present members.
    for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);
      // Skip the gaps in the group.
      if (!Member)
        continue;

      Value *StridedVec = State.Builder.CreateExtractValue(Agg: NewLoad, Idxs: I);
      // If this member has different type, cast the result type.
      if (Member->getType() != ScalarTy) {
        VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: State.VF);
        StridedVec =
            createBitOrPointerCast(Builder&: State.Builder, V: StridedVec, DstVTy: OtherVTy, DL);
      }

      State.set(Def: getVPValue(I: J), V: StridedVec);
      ++J;
    }
    return;
  } // End for interleaved load.

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: State.VF);
  // Vectorize the interleaved store group.
  ArrayRef<VPValue *> StoredValues = getStoredValues();
  // Collect the stored vector from each member.
  SmallVector<Value *, 4> StoredVecs;
  const DataLayout &DL = Instr->getDataLayout();
  for (unsigned I = 0, StoredIdx = 0; I < InterleaveFactor; I++) {
    Instruction *Member = Group->getMember(Index: I);
    // Skip the gaps in the group; gap lanes are poison.
    if (!Member) {
      StoredVecs.push_back(Elt: PoisonValue::get(T: SubVT));
      continue;
    }

    Value *StoredVec = State.get(Def: StoredValues[StoredIdx]);
    // If this member has different type, cast it to a unified type.
    if (StoredVec->getType() != SubVT)
      StoredVec = createBitOrPointerCast(Builder&: State.Builder, V: StoredVec, DstVTy: SubVT, DL);

    StoredVecs.push_back(Elt: StoredVec);
    ++StoredIdx;
  }

  // Interleave all the smaller vectors into one wider vector.
  Value *IVec = interleaveVectors(Builder&: State.Builder, Vals: StoredVecs, Name: "interleaved.vec");
  CallInst *NewStore =
      State.Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: Ctx), ID: Intrinsic::vp_store,
                                    Args: {IVec, ResAddr, GroupMask, InterleaveEVL});
  // The pointer is argument index 1 of vp.store.
  NewStore->addParamAttr(ArgNo: 1,
                         Attr: Attribute::getWithAlignment(Context&: Ctx, Alignment: Group->getAlign()));

  applyMetadata(I&: *NewStore);
  // TODO: Also manage existing metadata using VPIRMetadata.
  Group->addMetadata(NewInst: NewStore);
}
4446
4447#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInterleaveEVLRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                        VPSlotTracker &SlotTracker) const {
  const InterleaveGroup<Instruction> *IG = getInterleaveGroup();
  O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
  IG->getInsertPos()->printAsOperand(O, false);
  O << ", ";
  getAddr()->printAsOperand(O, SlotTracker);
  O << ", ";
  getEVL()->printAsOperand(O, SlotTracker);
  if (VPValue *Mask = getMask()) {
    O << ", ";
    Mask->printAsOperand(O, SlotTracker);
  }

  // Print one line per present member; OpIdx tracks only present members,
  // while i is the member's index within the group (gaps included).
  unsigned OpIdx = 0;
  for (unsigned i = 0; i < IG->getFactor(); ++i) {
    if (!IG->getMember(i))
      continue;
    if (getNumStoreOperands() > 0) {
      // Stored values follow the address and EVL operands (hence 2 + OpIdx).
      O << "\n" << Indent << " vp.store ";
      getOperand(2 + OpIdx)->printAsOperand(O, SlotTracker);
      O << " to index " << i;
    } else {
      O << "\n" << Indent << " ";
      getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
      O << " = vp.load from index " << i;
    }
    ++OpIdx;
  }
}
4478#endif
4479
4480InstructionCost VPInterleaveBase::computeCost(ElementCount VF,
4481 VPCostContext &Ctx) const {
4482 Instruction *InsertPos = getInsertPos();
4483 // Find the VPValue index of the interleave group. We need to skip gaps.
4484 unsigned InsertPosIdx = 0;
4485 for (unsigned Idx = 0; IG->getFactor(); ++Idx)
4486 if (auto *Member = IG->getMember(Index: Idx)) {
4487 if (Member == InsertPos)
4488 break;
4489 InsertPosIdx++;
4490 }
4491 Type *ValTy = Ctx.Types.inferScalarType(
4492 V: getNumDefinedValues() > 0 ? getVPValue(I: InsertPosIdx)
4493 : getStoredValues()[InsertPosIdx]);
4494 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4495 unsigned AS = cast<PointerType>(Val: Ctx.Types.inferScalarType(V: getAddr()))
4496 ->getAddressSpace();
4497
4498 unsigned InterleaveFactor = IG->getFactor();
4499 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
4500
4501 // Holds the indices of existing members in the interleaved group.
4502 SmallVector<unsigned, 4> Indices;
4503 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4504 if (IG->getMember(Index: IF))
4505 Indices.push_back(Elt: IF);
4506
4507 // Calculate the cost of the whole interleaved group.
4508 InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
4509 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: IG->getFactor(), Indices,
4510 Alignment: IG->getAlign(), AddressSpace: AS, CostKind: Ctx.CostKind, UseMaskForCond: getMask(), UseMaskForGaps: NeedsMaskForGaps);
4511
4512 if (!IG->isReverse())
4513 return Cost;
4514
4515 return Cost + IG->getNumMembers() *
4516 Ctx.TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse,
4517 DstTy: VectorTy, SrcTy: VectorTy, Mask: {}, CostKind: Ctx.CostKind,
4518 Index: 0);
4519}
4520
4521#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                         VPSlotTracker &SlotTracker) const {
  // Print "EMIT <result> = CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4529#endif
4530
4531bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
4532 return vputils::onlyScalarValuesUsed(Def: this) &&
4533 (!IsScalable || vputils::onlyFirstLaneUsed(Def: this));
4534}
4535
4536#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  assert((getNumOperands() == 3 || getNumOperands() == 5) &&
         "unexpected number of operands");
  // Print start, step and the third operand; the 5-operand form prints the
  // two extra operands as well.
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  if (getNumOperands() == 5) {
    O << ", ";
    getOperand(3)->printAsOperand(O, SlotTracker);
    O << ", ";
    getOperand(4)->printAsOperand(O, SlotTracker);
  }
}
4556
void VPExpandSCEVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  // Print the SCEV expression this recipe expands.
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
4563#endif
4564
// Widen the canonical induction variable: broadcast the scalar IV and add a
// step vector so each lane holds its own iteration index.
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(Def: getOperand(N: 0), /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  // Insert before the terminator so the IV is available to all users in the
  // block.
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast");
  // The step accounts for the unroll part this recipe belongs to.
  Value *VStep = Builder.CreateElementCount(
      Ty: STy, EC: VF.multiplyCoefficientBy(RHS: getUnrollPart(U: *this)));
  if (VF.isVector()) {
    VStep = Builder.CreateVectorSplat(EC: VF, V: VStep);
    VStep =
        Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType()));
  }
  Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv");
  State.set(Def: this, V: CanonicalVectorIV);
}
4583
4584#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
  // Print "EMIT <result> = WIDEN-CANONICAL-INDUCTION <operands>".
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
4592#endif
4593
// Create the header phi for a first-order recurrence. For vector VFs the
// initial value is inserted into the last lane of a poison vector, matching
// the lane the recurrence shuffles from on the next iteration.
void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF);

  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
    // The insert-element must live in the preheader, not the loop body.
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    // Last lane index is runtime-VF - 1 (runtime value for scalable VFs).
    auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF);
    auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
    VectorInit = Builder.CreateInsertElement(
        Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur");
  Phi->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
  Phi->addIncoming(V: VectorInit, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
4622
4623InstructionCost
4624VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
4625 VPCostContext &Ctx) const {
4626 if (VF.isScalar())
4627 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4628
4629 return 0;
4630}
4631
4632#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  // Print "FIRST-ORDER-RECURRENCE-PHI <result> = phi <operands>".
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4640#endif
4641
void VPReductionPHIRecipe::execute(VPTransformState &State) {
  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  // In-loop reductions keep a scalar accumulator; otherwise the phi is wide.
  bool ScalarPHI = State.VF.isScalar() || isInLoop();
  Value *StartV = State.get(Def: StartVPV, IsScalar: ScalarPHI);
  Type *VecTy = StartV->getType();

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  auto *Phi = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi");
  Phi->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt());
  State.set(Def: this, V: Phi, IsScalar: isInLoop());

  // The backedge incoming value is filled in later (stage #2); only the
  // preheader edge is added here.
  Phi->addIncoming(V: StartV, BB: VectorPH);
}
4666
4667#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                       VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi";
  printFlags(O);
  printOperands(O, SlotTracker);
  // Note when the reduction phi operates at a scaled-down VF.
  if (getVFScaleFactor() > 1)
    O << " (VF scaled by 1/" << getVFScaleFactor() << ")";
}
4679#endif
4680
4681void VPWidenPHIRecipe::execute(VPTransformState &State) {
4682 Value *Op0 = State.get(Def: getOperand(N: 0));
4683 Type *VecTy = Op0->getType();
4684 Instruction *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name);
4685 State.set(Def: this, V: VecPhi);
4686}
4687
4688InstructionCost VPWidenPHIRecipe::computeCost(ElementCount VF,
4689 VPCostContext &Ctx) const {
4690 return Ctx.TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Ctx.CostKind);
4691}
4692
4693#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  // Print "WIDEN-PHI <result> = phi <incoming values>".
  O << Indent << "WIDEN-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printPhiOperands(O, SlotTracker);
}
4702#endif
4703
// Create the header phi carrying the active-lane mask; the preheader edge
// gets the start mask, the backedge is wired up later.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH =
      State.CFG.VPBB2IRBB.at(Val: getParent()->getCFGPredecessor(Idx: 0));
  Value *StartMask = State.get(Def: getOperand(N: 0));
  PHINode *Phi =
      State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask");
  Phi->addIncoming(V: StartMask, BB: VectorPH);
  State.set(Def: this, V: Phi);
}
4713
4714#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  // Print "ACTIVE-LANE-MASK-PHI <result> = phi <operands>".
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4723#endif
4724
4725#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCurrentIterationPHIRecipe::printRecipe(
    raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const {
  // Print "CURRENT-ITERATION-PHI <result> = phi <operands>".
  O << Indent << "CURRENT-ITERATION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
4734#endif
4735