1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/Analysis/TargetTransformInfo.h"
13#include "llvm/CodeGen/BasicTTIImpl.h"
14#include "llvm/CodeGen/CostTable.h"
15#include "llvm/CodeGen/TargetLowering.h"
16#include "llvm/CodeGen/ValueTypes.h"
17#include "llvm/IR/Instructions.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
19#include "llvm/IR/PatternMatch.h"
20#include "llvm/Transforms/InstCombine/InstCombiner.h"
21#include <cmath>
22#include <optional>
23using namespace llvm;
24using namespace llvm::PatternMatch;
25
26#define DEBUG_TYPE "riscvtti"
27
28static cl::opt<unsigned> RVVRegisterWidthLMUL(
29 "riscv-v-register-bit-width-lmul",
30 cl::desc(
31 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
32 "by autovectorized code. Fractional LMULs are not supported."),
33 cl::init(Val: 2), cl::Hidden);
34
35static cl::opt<unsigned> SLPMaxVF(
36 "riscv-v-slp-max-vf",
37 cl::desc(
38 "Overrides result used for getMaximumVF query which is used "
39 "exclusively by SLP vectorizer."),
40 cl::Hidden);
41
42static cl::opt<unsigned>
43 RVVMinTripCount("riscv-v-min-trip-count",
44 cl::desc("Set the lower bound of a trip count to decide on "
45 "vectorization while tail-folding."),
46 cl::init(Val: 5), cl::Hidden);
47
48static cl::opt<bool> EnableOrLikeSelectOpt("enable-riscv-or-like-select",
49 cl::init(Val: true), cl::Hidden);
50
51InstructionCost
52RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
53 TTI::TargetCostKind CostKind) const {
54 // Check if the type is valid for all CostKind
55 if (!VT.isVector())
56 return InstructionCost::getInvalid();
57 size_t NumInstr = OpCodes.size();
58 if (CostKind == TTI::TCK_CodeSize)
59 return NumInstr;
60 InstructionCost LMULCost = TLI->getLMULCost(VT);
61 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
62 return LMULCost * NumInstr;
63 InstructionCost Cost = 0;
64 for (auto Op : OpCodes) {
65 switch (Op) {
66 case RISCV::VRGATHER_VI:
67 Cost += TLI->getVRGatherVICost(VT);
68 break;
69 case RISCV::VRGATHER_VV:
70 Cost += TLI->getVRGatherVVCost(VT);
71 break;
72 case RISCV::VSLIDEUP_VI:
73 case RISCV::VSLIDEDOWN_VI:
74 Cost += TLI->getVSlideVICost(VT);
75 break;
76 case RISCV::VSLIDEUP_VX:
77 case RISCV::VSLIDEDOWN_VX:
78 Cost += TLI->getVSlideVXCost(VT);
79 break;
80 case RISCV::VREDMAX_VS:
81 case RISCV::VREDMIN_VS:
82 case RISCV::VREDMAXU_VS:
83 case RISCV::VREDMINU_VS:
84 case RISCV::VREDSUM_VS:
85 case RISCV::VREDAND_VS:
86 case RISCV::VREDOR_VS:
87 case RISCV::VREDXOR_VS:
88 case RISCV::VFREDMAX_VS:
89 case RISCV::VFREDMIN_VS:
90 case RISCV::VFREDUSUM_VS: {
91 unsigned VL = VT.getVectorMinNumElements();
92 if (!VT.isFixedLengthVector())
93 VL *= *getVScaleForTuning();
94 Cost += Log2_32_Ceil(Value: VL);
95 break;
96 }
97 case RISCV::VFREDOSUM_VS: {
98 unsigned VL = VT.getVectorMinNumElements();
99 if (!VT.isFixedLengthVector())
100 VL *= *getVScaleForTuning();
101 Cost += VL;
102 break;
103 }
104 case RISCV::VMV_X_S:
105 case RISCV::VMV_S_X:
106 case RISCV::VFMV_F_S:
107 case RISCV::VFMV_S_F:
108 case RISCV::VMOR_MM:
109 case RISCV::VMXOR_MM:
110 case RISCV::VMAND_MM:
111 case RISCV::VMANDN_MM:
112 case RISCV::VMNAND_MM:
113 case RISCV::VCPOP_M:
114 case RISCV::VFIRST_M:
115 Cost += 1;
116 break;
117 case RISCV::VDIV_VV:
118 case RISCV::VREM_VV:
119 Cost += LMULCost * TTI::TCC_Expensive;
120 break;
121 default:
122 Cost += LMULCost;
123 }
124 }
125 return Cost;
126}
127
128static InstructionCost getIntImmCostImpl(const DataLayout &DL,
129 const RISCVSubtarget *ST,
130 const APInt &Imm, Type *Ty,
131 TTI::TargetCostKind CostKind,
132 bool FreeZeroes) {
133 assert(Ty->isIntegerTy() &&
134 "getIntImmCost can only estimate cost of materialising integers");
135
136 // We have a Zero register, so 0 is always free.
137 if (Imm == 0)
138 return TTI::TCC_Free;
139
140 // Otherwise, we check how many instructions it will take to materialise.
141 return RISCVMatInt::getIntMatCost(Val: Imm, Size: DL.getTypeSizeInBits(Ty), STI: *ST,
142 /*CompressionCost=*/false, FreeZeroes);
143}
144
145InstructionCost
146RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
147 TTI::TargetCostKind CostKind) const {
148 return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind, FreeZeroes: false);
149}
150
151// Look for patterns of shift followed by AND that can be turned into a pair of
152// shifts. We won't need to materialize an immediate for the AND so these can
153// be considered free.
154static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
155 uint64_t Mask = Imm.getZExtValue();
156 auto *BO = dyn_cast<BinaryOperator>(Val: Inst->getOperand(i: 0));
157 if (!BO || !BO->hasOneUse())
158 return false;
159
160 if (BO->getOpcode() != Instruction::Shl)
161 return false;
162
163 if (!isa<ConstantInt>(Val: BO->getOperand(i_nocapture: 1)))
164 return false;
165
166 unsigned ShAmt = cast<ConstantInt>(Val: BO->getOperand(i_nocapture: 1))->getZExtValue();
167 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
168 // is a mask shifted by c2 bits with c3 leading zeros.
169 if (isShiftedMask_64(Value: Mask)) {
170 unsigned Trailing = llvm::countr_zero(Val: Mask);
171 if (ShAmt == Trailing)
172 return true;
173 }
174
175 return false;
176}
177
178// If this is i64 AND is part of (X & -(1 << C1) & 0xffffffff) == C2 << C1),
179// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
180// the type will be split so only the lower 32 bits need to be compared using
181// (srai/srli X, C) == C2.
182static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
183 if (!Inst->hasOneUse())
184 return false;
185
186 // Look for equality comparison.
187 auto *Cmp = dyn_cast<ICmpInst>(Val: *Inst->user_begin());
188 if (!Cmp || !Cmp->isEquality())
189 return false;
190
191 // Right hand side of comparison should be a constant.
192 auto *C = dyn_cast<ConstantInt>(Val: Cmp->getOperand(i_nocapture: 1));
193 if (!C)
194 return false;
195
196 uint64_t Mask = Imm.getZExtValue();
197
198 // Mask should be of the form -(1 << C) in the lower 32 bits.
199 if (!isUInt<32>(x: Mask) || !isPowerOf2_32(Value: -uint32_t(Mask)))
200 return false;
201
202 // Comparison constant should be a subset of Mask.
203 uint64_t CmpC = C->getZExtValue();
204 if ((CmpC & Mask) != CmpC)
205 return false;
206
207 // We'll need to sign extend the comparison constant and shift it right. Make
208 // sure the new constant can use addi/xori+seqz/snez.
209 unsigned ShiftBits = llvm::countr_zero(Val: Mask);
210 int64_t NewCmpC = SignExtend64<32>(x: CmpC) >> ShiftBits;
211 return NewCmpC >= -2048 && NewCmpC <= 2048;
212}
213
214InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
215 const APInt &Imm, Type *Ty,
216 TTI::TargetCostKind CostKind,
217 Instruction *Inst) const {
218 assert(Ty->isIntegerTy() &&
219 "getIntImmCost can only estimate cost of materialising integers");
220
221 // We have a Zero register, so 0 is always free.
222 if (Imm == 0)
223 return TTI::TCC_Free;
224
225 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
226 // commutative, in others the immediate comes from a specific argument index.
227 bool Takes12BitImm = false;
228 unsigned ImmArgIdx = ~0U;
229
230 switch (Opcode) {
231 case Instruction::GetElementPtr:
232 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
233 // split up large offsets in GEP into better parts than ConstantHoisting
234 // can.
235 return TTI::TCC_Free;
236 case Instruction::Store: {
237 // Use the materialization cost regardless of if it's the address or the
238 // value that is constant, except for if the store is misaligned and
239 // misaligned accesses are not legal (experience shows constant hoisting
240 // can sometimes be harmful in such cases).
241 if (Idx == 1 || !Inst)
242 return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind,
243 /*FreeZeroes=*/true);
244
245 StoreInst *ST = cast<StoreInst>(Val: Inst);
246 if (!getTLI()->allowsMemoryAccessForAlignment(
247 Context&: Ty->getContext(), DL, VT: getTLI()->getValueType(DL, Ty),
248 AddrSpace: ST->getPointerAddressSpace(), Alignment: ST->getAlign()))
249 return TTI::TCC_Free;
250
251 return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind,
252 /*FreeZeroes=*/true);
253 }
254 case Instruction::Load:
255 // If the address is a constant, use the materialization cost.
256 return getIntImmCost(Imm, Ty, CostKind);
257 case Instruction::And:
258 // zext.h
259 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
260 return TTI::TCC_Free;
261 // zext.w
262 if (Imm == UINT64_C(0xffffffff) &&
263 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
264 return TTI::TCC_Free;
265 // bclri
266 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
267 return TTI::TCC_Free;
268 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
269 canUseShiftPair(Inst, Imm))
270 return TTI::TCC_Free;
271 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
272 canUseShiftCmp(Inst, Imm))
273 return TTI::TCC_Free;
274 Takes12BitImm = true;
275 break;
276 case Instruction::Add:
277 Takes12BitImm = true;
278 break;
279 case Instruction::Or:
280 case Instruction::Xor:
281 // bseti/binvi
282 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
283 return TTI::TCC_Free;
284 Takes12BitImm = true;
285 break;
286 case Instruction::Mul:
287 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
288 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
289 return TTI::TCC_Free;
290 // One more or less than a power of 2 can use SLLI+ADD/SUB.
291 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
292 return TTI::TCC_Free;
293 // FIXME: There is no MULI instruction.
294 Takes12BitImm = true;
295 break;
296 case Instruction::Sub:
297 case Instruction::Shl:
298 case Instruction::LShr:
299 case Instruction::AShr:
300 Takes12BitImm = true;
301 ImmArgIdx = 1;
302 break;
303 default:
304 break;
305 }
306
307 if (Takes12BitImm) {
308 // Check immediate is the correct argument...
309 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
310 // ... and fits into the 12-bit immediate.
311 if (Imm.getSignificantBits() <= 64 &&
312 getTLI()->isLegalAddImmediate(Imm: Imm.getSExtValue())) {
313 return TTI::TCC_Free;
314 }
315 }
316
317 // Otherwise, use the full materialisation cost.
318 return getIntImmCost(Imm, Ty, CostKind);
319 }
320
321 // By default, prevent hoisting.
322 return TTI::TCC_Free;
323}
324
325InstructionCost
326RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
327 const APInt &Imm, Type *Ty,
328 TTI::TargetCostKind CostKind) const {
329 // Prevent hoisting in unknown cases.
330 return TTI::TCC_Free;
331}
332
333bool RISCVTTIImpl::hasActiveVectorLength() const {
334 return ST->hasVInstructions();
335}
336
337TargetTransformInfo::PopcntSupportKind
338RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
339 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
340 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
341}
342
343InstructionCost RISCVTTIImpl::getPartialReductionCost(
344 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
345 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
346 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
347 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
348 if (Opcode == Instruction::FAdd)
349 return InstructionCost::getInvalid();
350
351 // zve32x is broken for partial_reduce_umla, but let's make sure we
352 // don't generate them.
353 if (!ST->hasStdExtZvdot4a8i() || ST->getELen() < 64 ||
354 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
355 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(BitWidth: 8) ||
356 !AccumType->isIntegerTy(BitWidth: 32) || !VF.isKnownMultipleOf(RHS: 4))
357 return InstructionCost::getInvalid();
358
359 Type *Tp = VectorType::get(ElementType: AccumType, EC: VF.divideCoefficientBy(RHS: 4));
360 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
361 // Note: Asuming all vdot4a* variants are equal cost
362 return LT.first *
363 getRISCVInstructionCost(OpCodes: RISCV::VDOT4A_VV, VT: LT.second, CostKind);
364}
365
366bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
367 // Currently, the ExpandReductions pass can't expand scalable-vector
368 // reductions, but we still request expansion as RVV doesn't support certain
369 // reductions and the SelectionDAG can't legalize them either.
370 switch (II->getIntrinsicID()) {
371 default:
372 return false;
373 // These reductions have no equivalent in RVV
374 case Intrinsic::vector_reduce_mul:
375 case Intrinsic::vector_reduce_fmul:
376 return true;
377 }
378}
379
380std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
381 if (ST->hasVInstructions())
382 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
383 return BaseT::getMaxVScale();
384}
385
386std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
387 if (ST->hasVInstructions())
388 if (unsigned MinVLen = ST->getRealMinVLen();
389 MinVLen >= RISCV::RVVBitsPerBlock)
390 return MinVLen / RISCV::RVVBitsPerBlock;
391 return BaseT::getVScaleForTuning();
392}
393
394TypeSize
395RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
396 unsigned LMUL =
397 llvm::bit_floor(Value: std::clamp<unsigned>(val: RVVRegisterWidthLMUL, lo: 1, hi: 8));
398 switch (K) {
399 case TargetTransformInfo::RGK_Scalar:
400 return TypeSize::getFixed(ExactSize: ST->getXLen());
401 case TargetTransformInfo::RGK_FixedWidthVector:
402 return TypeSize::getFixed(
403 ExactSize: ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
404 case TargetTransformInfo::RGK_ScalableVector:
405 return TypeSize::getScalable(
406 MinimumSize: (ST->hasVInstructions() &&
407 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
408 ? LMUL * RISCV::RVVBitsPerBlock
409 : 0);
410 }
411
412 llvm_unreachable("Unsupported register kind");
413}
414
415InstructionCost RISCVTTIImpl::getStaticDataAddrGenerationCost(
416 const TTI::TargetCostKind CostKind) const {
417 switch (CostKind) {
418 case TTI::TCK_CodeSize:
419 case TTI::TCK_SizeAndLatency:
420 // Always 2 instructions
421 return 2;
422 case TTI::TCK_Latency:
423 case TTI::TCK_RecipThroughput:
424 // Depending on the memory model the address generation will
425 // require AUIPC + ADDI (medany) or LUI + ADDI (medlow). Don't
426 // have a way of getting this information here, so conservatively
427 // require both.
428 // In practice, these are generally implemented together.
429 return (ST->hasAUIPCADDIFusion() && ST->hasLUIADDIFusion()) ? 1 : 2;
430 }
431 llvm_unreachable("Unsupported cost kind");
432}
433
434InstructionCost
435RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
436 TTI::TargetCostKind CostKind) const {
437 // Add a cost of address generation + the cost of the load. The address
438 // is expected to be a PC relative offset to a constant pool entry
439 // using auipc/addi.
440 return getStaticDataAddrGenerationCost(CostKind) +
441 getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment: DL.getABITypeAlign(Ty),
442 /*AddressSpace=*/0, CostKind);
443}
444
445static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
446 unsigned Size = Mask.size();
447 if (!isPowerOf2_32(Value: Size))
448 return false;
449 for (unsigned I = 0; I != Size; ++I) {
450 if (static_cast<unsigned>(Mask[I]) == I)
451 continue;
452 if (Mask[I] != 0)
453 return false;
454 if (Size % I != 0)
455 return false;
456 for (unsigned J = I + 1; J != Size; ++J)
457 // Check the pattern is repeated.
458 if (static_cast<unsigned>(Mask[J]) != J % I)
459 return false;
460 SubVectorSize = I;
461 return true;
462 }
463 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
464 return false;
465}
466
467static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
468 LLVMContext &C) {
469 assert((DataVT.getScalarSizeInBits() != 8 ||
470 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
471 MVT IndexVT = DataVT.changeTypeToInteger();
472 if (IndexVT.getScalarType().bitsGT(VT: ST.getXLenVT()))
473 IndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i16);
474 return cast<VectorType>(Val: EVT(IndexVT).getTypeForEVT(Context&: C));
475}
476
477/// Attempt to approximate the cost of a shuffle which will require splitting
478/// during legalization. Note that processShuffleMasks is not an exact proxy
479/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
480/// reasonably close upperbound.
481static InstructionCost costShuffleViaSplitting(const RISCVTTIImpl &TTI,
482 MVT LegalVT, VectorType *Tp,
483 ArrayRef<int> Mask,
484 TTI::TargetCostKind CostKind) {
485 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
486 "Expected fixed vector type and non-empty mask");
487 unsigned LegalNumElts = LegalVT.getVectorNumElements();
488 // Number of destination vectors after legalization:
489 unsigned NumOfDests = divideCeil(Numerator: Mask.size(), Denominator: LegalNumElts);
490 // We are going to permute multiple sources and the result will be in
491 // multiple destinations. Providing an accurate cost only for splits where
492 // the element type remains the same.
493 if (NumOfDests <= 1 ||
494 LegalVT.getVectorElementType().getSizeInBits() !=
495 Tp->getElementType()->getPrimitiveSizeInBits() ||
496 LegalNumElts >= Tp->getElementCount().getFixedValue())
497 return InstructionCost::getInvalid();
498
499 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Ty: Tp);
500 unsigned LegalVTSize = LegalVT.getStoreSize();
501 // Number of source vectors after legalization:
502 unsigned NumOfSrcs = divideCeil(Numerator: VecTySize, Denominator: LegalVTSize);
503
504 auto *SingleOpTy = FixedVectorType::get(ElementType: Tp->getElementType(), NumElts: LegalNumElts);
505
506 unsigned NormalizedVF = LegalNumElts * std::max(a: NumOfSrcs, b: NumOfDests);
507 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
508 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
509 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
510 assert(NormalizedVF >= Mask.size() &&
511 "Normalized mask expected to be not shorter than original mask.");
512 copy(Range&: Mask, Out: NormalizedMask.begin());
513 InstructionCost Cost = 0;
514 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
515 processShuffleMasks(
516 Mask: NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfUsedRegs: NumOfDestRegs, NoInputAction: []() {},
517 SingleInputAction: [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
518 if (ShuffleVectorInst::isIdentityMask(Mask: RegMask, NumSrcElts: RegMask.size()))
519 return;
520 if (!ReusedSingleSrcShuffles.insert(V: std::make_pair(x&: RegMask, y&: SrcReg))
521 .second)
522 return;
523 Cost += TTI.getShuffleCost(
524 Kind: TTI::SK_PermuteSingleSrc,
525 DstTy: FixedVectorType::get(ElementType: SingleOpTy->getElementType(), NumElts: RegMask.size()),
526 SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr);
527 },
528 ManyInputsAction: [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
529 Cost += TTI.getShuffleCost(
530 Kind: TTI::SK_PermuteTwoSrc,
531 DstTy: FixedVectorType::get(ElementType: SingleOpTy->getElementType(), NumElts: RegMask.size()),
532 SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr);
533 });
534 return Cost;
535}
536
537/// Try to perform better estimation of the permutation.
538/// 1. Split the source/destination vectors into real registers.
539/// 2. Do the mask analysis to identify which real registers are
540/// permuted. If more than 1 source registers are used for the
541/// destination register building, the cost for this destination register
542/// is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one
543/// source register is used, build mask and calculate the cost as a cost
544/// of PermuteSingleSrc.
545/// Also, for the single register permute we try to identify if the
546/// destination register is just a copy of the source register or the
547/// copy of the previous destination register (the cost is
548/// TTI::TCC_Basic). If the source register is just reused, the cost for
549/// this operation is 0.
550static InstructionCost
551costShuffleViaVRegSplitting(const RISCVTTIImpl &TTI, MVT LegalVT,
552 std::optional<unsigned> VLen, VectorType *Tp,
553 ArrayRef<int> Mask, TTI::TargetCostKind CostKind) {
554 assert(LegalVT.isFixedLengthVector());
555 if (!VLen || Mask.empty())
556 return InstructionCost::getInvalid();
557 MVT ElemVT = LegalVT.getVectorElementType();
558 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
559 LegalVT = TTI.getTypeLegalizationCost(
560 Ty: FixedVectorType::get(ElementType: Tp->getElementType(), NumElts: ElemsPerVReg))
561 .second;
562 // Number of destination vectors after legalization:
563 InstructionCost NumOfDests =
564 divideCeil(Numerator: Mask.size(), Denominator: LegalVT.getVectorNumElements());
565 if (NumOfDests <= 1 ||
566 LegalVT.getVectorElementType().getSizeInBits() !=
567 Tp->getElementType()->getPrimitiveSizeInBits() ||
568 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
569 return InstructionCost::getInvalid();
570
571 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Ty: Tp);
572 unsigned LegalVTSize = LegalVT.getStoreSize();
573 // Number of source vectors after legalization:
574 unsigned NumOfSrcs = divideCeil(Numerator: VecTySize, Denominator: LegalVTSize);
575
576 auto *SingleOpTy = FixedVectorType::get(ElementType: Tp->getElementType(),
577 NumElts: LegalVT.getVectorNumElements());
578
579 unsigned E = NumOfDests.getValue();
580 unsigned NormalizedVF =
581 LegalVT.getVectorNumElements() * std::max(a: NumOfSrcs, b: E);
582 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
583 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
584 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
585 assert(NormalizedVF >= Mask.size() &&
586 "Normalized mask expected to be not shorter than original mask.");
587 copy(Range&: Mask, Out: NormalizedMask.begin());
588 InstructionCost Cost = 0;
589 int NumShuffles = 0;
590 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
591 processShuffleMasks(
592 Mask: NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfUsedRegs: NumOfDestRegs, NoInputAction: []() {},
593 SingleInputAction: [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
594 if (ShuffleVectorInst::isIdentityMask(Mask: RegMask, NumSrcElts: RegMask.size()))
595 return;
596 if (!ReusedSingleSrcShuffles.insert(V: std::make_pair(x&: RegMask, y&: SrcReg))
597 .second)
598 return;
599 ++NumShuffles;
600 Cost += TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: SingleOpTy,
601 SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr);
602 },
603 ManyInputsAction: [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
604 Cost += TTI.getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleOpTy,
605 SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr);
606 NumShuffles += 2;
607 });
608 // Note: check that we do not emit too many shuffles here to prevent code
609 // size explosion.
610 // TODO: investigate, if it can be improved by extra analysis of the masks
611 // to check if the code is more profitable.
612 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
613 (NumOfDestRegs <= 2 && NumShuffles < 4))
614 return Cost;
615 return InstructionCost::getInvalid();
616}
617
618InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
619 ArrayRef<int> Mask,
620 TTI::TargetCostKind CostKind) const {
621 // Avoid missing masks and length changing shuffles
622 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
623 return InstructionCost::getInvalid();
624
625 int NumElts = Tp->getNumElements();
626 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
627 // Avoid scalarization cases
628 if (!LT.second.isFixedLengthVector())
629 return InstructionCost::getInvalid();
630
631 // Requires moving elements between parts, which requires additional
632 // unmodeled instructions.
633 if (LT.first != 1)
634 return InstructionCost::getInvalid();
635
636 auto GetSlideOpcode = [&](int SlideAmt) {
637 assert(SlideAmt != 0);
638 bool IsVI = isUInt<5>(x: std::abs(x: SlideAmt));
639 if (SlideAmt < 0)
640 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
641 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
642 };
643
644 std::array<std::pair<int, int>, 2> SrcInfo;
645 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
646 return InstructionCost::getInvalid();
647
648 if (SrcInfo[1].second == 0)
649 std::swap(x&: SrcInfo[0], y&: SrcInfo[1]);
650
651 InstructionCost FirstSlideCost = 0;
652 if (SrcInfo[0].second != 0) {
653 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
654 FirstSlideCost = getRISCVInstructionCost(OpCodes: Opcode, VT: LT.second, CostKind);
655 }
656
657 if (SrcInfo[1].first == -1)
658 return FirstSlideCost;
659
660 InstructionCost SecondSlideCost = 0;
661 if (SrcInfo[1].second != 0) {
662 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
663 SecondSlideCost = getRISCVInstructionCost(OpCodes: Opcode, VT: LT.second, CostKind);
664 } else {
665 SecondSlideCost =
666 getRISCVInstructionCost(OpCodes: RISCV::VMERGE_VVM, VT: LT.second, CostKind);
667 }
668
669 auto EC = Tp->getElementCount();
670 VectorType *MaskTy =
671 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: Tp->getContext()), EC);
672 InstructionCost MaskCost = getConstantPoolLoadCost(Ty: MaskTy, CostKind);
673 return FirstSlideCost + SecondSlideCost + MaskCost;
674}
675
676InstructionCost
677RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
678 VectorType *SrcTy, ArrayRef<int> Mask,
679 TTI::TargetCostKind CostKind, int Index,
680 VectorType *SubTp, ArrayRef<const Value *> Args,
681 const Instruction *CxtI) const {
682 assert((Mask.empty() || DstTy->isScalableTy() ||
683 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
684 "Expected the Mask to match the return size if given");
685 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
686 "Expected the same scalar types");
687
688 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
689
690 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
691 // For now, skip all fixed vector cost analysis when P extension is available
692 // to avoid crashes in getMinRVVVectorSizeInBits()
693 if (ST->hasStdExtP() && isa<FixedVectorType>(Val: SrcTy))
694 return 1;
695
696 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
697
698 // First, handle cases where having a fixed length vector enables us to
699 // give a more accurate cost than falling back to generic scalable codegen.
700 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
701 if (auto *FVTp = dyn_cast<FixedVectorType>(Val: SrcTy);
702 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
703 InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting(
704 TTI: *this, LegalVT: LT.second, VLen: ST->getRealVLen(),
705 Tp: Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
706 if (VRegSplittingCost.isValid())
707 return VRegSplittingCost;
708 switch (Kind) {
709 default:
710 break;
711 case TTI::SK_PermuteSingleSrc: {
712 if (Mask.size() >= 2) {
713 MVT EltTp = LT.second.getVectorElementType();
714 // If the size of the element is < ELEN then shuffles of interleaves and
715 // deinterleaves of 2 vectors can be lowered into the following
716 // sequences
717 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
718 // Example sequence:
719 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
720 // vwaddu.vv v10, v8, v9
721 // li a0, -1 (ignored)
722 // vwmaccu.vx v10, a0, v9
723 if (ShuffleVectorInst::isInterleaveMask(Mask, Factor: 2, NumInputElts: Mask.size()))
724 return 2 * LT.first * TLI->getLMULCost(VT: LT.second);
725
726 if (Mask[0] == 0 || Mask[0] == 1) {
727 auto DeinterleaveMask = createStrideMask(Start: Mask[0], Stride: 2, VF: Mask.size());
728 // Example sequence:
729 // vnsrl.wi v10, v8, 0
730 if (equal(LRange&: DeinterleaveMask, RRange&: Mask))
731 return LT.first * getRISCVInstructionCost(OpCodes: RISCV::VNSRL_WI,
732 VT: LT.second, CostKind);
733 }
734 }
735 int SubVectorSize;
736 if (LT.second.getScalarSizeInBits() != 1 &&
737 isRepeatedConcatMask(Mask, SubVectorSize)) {
738 InstructionCost Cost = 0;
739 unsigned NumSlides = Log2_32(Value: Mask.size() / SubVectorSize);
740 // The cost of extraction from a subvector is 0 if the index is 0.
741 for (unsigned I = 0; I != NumSlides; ++I) {
742 unsigned InsertIndex = SubVectorSize * (1 << I);
743 FixedVectorType *SubTp =
744 FixedVectorType::get(ElementType: SrcTy->getElementType(), NumElts: InsertIndex);
745 FixedVectorType *DestTp =
746 FixedVectorType::getDoubleElementsVectorType(VTy: SubTp);
747 std::pair<InstructionCost, MVT> DestLT =
748 getTypeLegalizationCost(Ty: DestTp);
749 // Add the cost of whole vector register move because the
750 // destination vector register group for vslideup cannot overlap the
751 // source.
752 Cost += DestLT.first * TLI->getLMULCost(VT: DestLT.second);
753 Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy: DestTp, SrcTy: DestTp, Mask: {},
754 CostKind, Index: InsertIndex, SubTp);
755 }
756 return Cost;
757 }
758 }
759
760 if (InstructionCost SlideCost = getSlideCost(Tp: FVTp, Mask, CostKind);
761 SlideCost.isValid())
762 return SlideCost;
763
764 // vrgather + cost of generating the mask constant.
765 // We model this for an unknown mask with a single vrgather.
766 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
767 LT.second.getVectorNumElements() <= 256)) {
768 VectorType *IdxTy =
769 getVRGatherIndexType(DataVT: LT.second, ST: *ST, C&: SrcTy->getContext());
770 InstructionCost IndexCost = getConstantPoolLoadCost(Ty: IdxTy, CostKind);
771 return IndexCost +
772 getRISCVInstructionCost(OpCodes: RISCV::VRGATHER_VV, VT: LT.second, CostKind);
773 }
774 break;
775 }
776 case TTI::SK_Transpose:
777 case TTI::SK_PermuteTwoSrc: {
778
779 if (InstructionCost SlideCost = getSlideCost(Tp: FVTp, Mask, CostKind);
780 SlideCost.isValid())
781 return SlideCost;
782
783 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
784 // register for the second vrgather. We model this for an unknown
785 // (shuffle) mask.
786 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
787 LT.second.getVectorNumElements() <= 256)) {
788 auto &C = SrcTy->getContext();
789 auto EC = SrcTy->getElementCount();
790 VectorType *IdxTy = getVRGatherIndexType(DataVT: LT.second, ST: *ST, C);
791 VectorType *MaskTy = VectorType::get(ElementType: IntegerType::getInt1Ty(C), EC);
792 InstructionCost IndexCost = getConstantPoolLoadCost(Ty: IdxTy, CostKind);
793 InstructionCost MaskCost = getConstantPoolLoadCost(Ty: MaskTy, CostKind);
794 return 2 * IndexCost +
795 getRISCVInstructionCost(OpCodes: {RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
796 VT: LT.second, CostKind) +
797 MaskCost;
798 }
799 break;
800 }
801 }
802
803 auto shouldSplit = [](TTI::ShuffleKind Kind) {
804 switch (Kind) {
805 default:
806 return false;
807 case TTI::SK_PermuteSingleSrc:
808 case TTI::SK_Transpose:
809 case TTI::SK_PermuteTwoSrc:
810 return true;
811 }
812 };
813
814 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
815 shouldSplit(Kind)) {
816 InstructionCost SplitCost =
817 costShuffleViaSplitting(TTI: *this, LegalVT: LT.second, Tp: FVTp, Mask, CostKind);
818 if (SplitCost.isValid())
819 return SplitCost;
820 }
821 }
822
823 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
824 switch (Kind) {
825 default:
826 // Fallthrough to generic handling.
827 // TODO: Most of these cases will return getInvalid in generic code, and
828 // must be implemented here.
829 break;
830 case TTI::SK_ExtractSubvector:
831 // Extract at zero is always a subregister extract
832 if (Index == 0)
833 return TTI::TCC_Free;
834
835 // If we're extracting a subvector of at most m1 size at a sub-register
836 // boundary - which unfortunately we need exact vlen to identify - this is
837 // a subregister extract at worst and thus won't require a vslidedown.
838 // TODO: Extend for aligned m2, m4 subvector extracts
839 // TODO: Extend for misaligned (but contained) extracts
840 // TODO: Extend for scalable subvector types
841 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
842 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
843 if (std::optional<unsigned> VLen = ST->getRealVLen();
844 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
845 SubLT.second.getSizeInBits() <= *VLen)
846 return TTI::TCC_Free;
847 }
848
849 // Example sequence:
850 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
851 // vslidedown.vi v8, v9, 2
852 return LT.first *
853 getRISCVInstructionCost(OpCodes: RISCV::VSLIDEDOWN_VI, VT: LT.second, CostKind);
854 case TTI::SK_InsertSubvector:
855 // Example sequence:
856 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
857 // vslideup.vi v8, v9, 2
858 LT = getTypeLegalizationCost(Ty: DstTy);
859 return LT.first *
860 getRISCVInstructionCost(OpCodes: RISCV::VSLIDEUP_VI, VT: LT.second, CostKind);
861 case TTI::SK_Select: {
862 // Example sequence:
863 // li a0, 90
864 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
865 // vmv.s.x v0, a0
866 // vmerge.vvm v8, v9, v8, v0
867 // We use 2 for the cost of the mask materialization as this is the true
868 // cost for small masks and most shuffles are small. At worst, this cost
869 // should be a very small constant for the constant pool load. As such,
870 // we may bias towards large selects slightly more than truly warranted.
871 return LT.first *
872 (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_S_X, RISCV::VMERGE_VVM},
873 VT: LT.second, CostKind));
874 }
875 case TTI::SK_Broadcast: {
876 // Check for broadcast loads, which are synthesized by optimized zero-stride
877 // loads (this is checked in RISCVTTIImpl::isLegalBroadcastLoad).
878 bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]);
879 if (IsLoad && LT.second.isVector() &&
880 isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(),
881 NumElements: LT.second.getVectorElementCount()))
882 return 0;
883
884 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(V: Args[0]) ==
885 Instruction::InsertElement);
886 if (LT.second.getScalarSizeInBits() == 1) {
887 if (HasScalar) {
888 // Example sequence:
889 // andi a0, a0, 1
890 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
891 // vmv.v.x v8, a0
892 // vmsne.vi v0, v8, 0
893 return LT.first *
894 (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI},
895 VT: LT.second, CostKind));
896 }
897 // Example sequence:
898 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
899 // vmv.v.i v8, 0
900 // vmerge.vim v8, v8, 1, v0
901 // vmv.x.s a0, v8
902 // andi a0, a0, 1
903 // vmv.v.x v8, a0
904 // vmsne.vi v0, v8, 0
905
906 return LT.first *
907 (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_I, RISCV::VMERGE_VIM,
908 RISCV::VMV_X_S, RISCV::VMV_V_X,
909 RISCV::VMSNE_VI},
910 VT: LT.second, CostKind));
911 }
912
913 if (HasScalar) {
914 // Example sequence:
915 // vmv.v.x v8, a0
916 return LT.first *
917 getRISCVInstructionCost(OpCodes: RISCV::VMV_V_X, VT: LT.second, CostKind);
918 }
919
920 // Example sequence:
921 // vrgather.vi v9, v8, 0
922 return LT.first *
923 getRISCVInstructionCost(OpCodes: RISCV::VRGATHER_VI, VT: LT.second, CostKind);
924 }
925 case TTI::SK_Splice: {
926 // vslidedown+vslideup.
927 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
928 // of similar code, but I think we expand through memory.
929 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
930 if (Index >= 0 && Index < 32)
931 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
932 else if (Index < 0 && Index > -32)
933 Opcodes[1] = RISCV::VSLIDEUP_VI;
934 return LT.first * getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
935 }
936 case TTI::SK_Reverse: {
937
938 if (!LT.second.isVector())
939 return InstructionCost::getInvalid();
940
941 // TODO: Cases to improve here:
942 // * Illegal vector types
943 // * i64 on RV32
944 if (SrcTy->getElementType()->isIntegerTy(BitWidth: 1)) {
945 VectorType *WideTy =
946 VectorType::get(ElementType: IntegerType::get(C&: SrcTy->getContext(), NumBits: 8),
947 EC: cast<VectorType>(Val: SrcTy)->getElementCount());
948 return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: SrcTy,
949 CCH: TTI::CastContextHint::None, CostKind) +
950 getShuffleCost(Kind: TTI::SK_Reverse, DstTy: WideTy, SrcTy: WideTy, Mask: {}, CostKind, Index: 0,
951 SubTp: nullptr) +
952 getCastInstrCost(Opcode: Instruction::Trunc, Dst: SrcTy, Src: WideTy,
953 CCH: TTI::CastContextHint::None, CostKind);
954 }
955
956 MVT ContainerVT = LT.second;
957 if (LT.second.isFixedLengthVector())
958 ContainerVT = TLI->getContainerForFixedLengthVector(VT: LT.second);
959 MVT M1VT = RISCVTargetLowering::getM1VT(VT: ContainerVT);
960 if (ContainerVT.bitsLE(VT: M1VT)) {
961 // Example sequence:
962 // csrr a0, vlenb
963 // srli a0, a0, 3
964 // addi a0, a0, -1
965 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
966 // vid.v v9
967 // vrsub.vx v10, v9, a0
968 // vrgather.vv v9, v8, v10
969 InstructionCost LenCost = 3;
970 if (LT.second.isFixedLengthVector())
971 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
972 LenCost = isInt<5>(x: LT.second.getVectorNumElements() - 1) ? 0 : 1;
973 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
974 if (LT.second.isFixedLengthVector() &&
975 isInt<5>(x: LT.second.getVectorNumElements() - 1))
976 Opcodes[1] = RISCV::VRSUB_VI;
977 InstructionCost GatherCost =
978 getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
979 return LT.first * (LenCost + GatherCost);
980 }
981
982 // At high LMUL, we split into a series of M1 reverses (see
983 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
984 // the resulting gap at the bottom (for fixed vectors only). The important
985 // bit is that the cost scales linearly, not quadratically with LMUL.
986 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
987 InstructionCost FixedCost =
988 getRISCVInstructionCost(OpCodes: M1Opcodes, VT: M1VT, CostKind) + 3;
989 unsigned Ratio =
990 ContainerVT.getVectorMinNumElements() / M1VT.getVectorMinNumElements();
991 InstructionCost GatherCost =
992 getRISCVInstructionCost(OpCodes: {RISCV::VRGATHER_VV}, VT: M1VT, CostKind) * Ratio;
993 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
994 getRISCVInstructionCost(OpCodes: {RISCV::VSLIDEDOWN_VX}, VT: LT.second, CostKind);
995 return FixedCost + LT.first * (GatherCost + SlideCost);
996 }
997 }
998 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
999 SubTp);
1000}
1001
1002static unsigned isM1OrSmaller(MVT VT) {
1003 RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
1004 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
1005 LMUL == RISCVVType::VLMUL::LMUL_F4 ||
1006 LMUL == RISCVVType::VLMUL::LMUL_F2 ||
1007 LMUL == RISCVVType::VLMUL::LMUL_1);
1008}
1009
1010InstructionCost RISCVTTIImpl::getScalarizationOverhead(
1011 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
1012 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
1013 TTI::VectorInstrContext VIC) const {
1014 if (isa<ScalableVectorType>(Val: Ty))
1015 return InstructionCost::getInvalid();
1016
1017 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1018 // For now, skip all fixed vector cost analysis when P extension is available
1019 // to avoid crashes in getMinRVVVectorSizeInBits()
1020 if (ST->hasStdExtP() && isa<FixedVectorType>(Val: Ty)) {
1021 return 1; // Treat as single instruction cost for now
1022 }
1023
1024 // A build_vector (which is m1 sized or smaller) can be done in no
1025 // worse than one vslide1down.vx per element in the type. We could
1026 // in theory do an explode_vector in the inverse manner, but our
1027 // lowering today does not have a first class node for this pattern.
1028 InstructionCost Cost = BaseT::getScalarizationOverhead(
1029 InTy: Ty, DemandedElts, Insert, Extract, CostKind);
1030 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1031 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
1032 if (Ty->getScalarSizeInBits() == 1) {
1033 auto *WideVecTy = cast<VectorType>(Val: Ty->getWithNewBitWidth(NewBitWidth: 8));
1034 // Note: Implicit scalar anyextend is assumed to be free since the i1
1035 // must be stored in a GPR.
1036 return getScalarizationOverhead(Ty: WideVecTy, DemandedElts, Insert, Extract,
1037 CostKind) +
1038 getCastInstrCost(Opcode: Instruction::Trunc, Dst: Ty, Src: WideVecTy,
1039 CCH: TTI::CastContextHint::None, CostKind, I: nullptr);
1040 }
1041
1042 assert(LT.second.isFixedLengthVector());
1043 MVT ContainerVT = TLI->getContainerForFixedLengthVector(VT: LT.second);
1044 if (isM1OrSmaller(VT: ContainerVT)) {
1045 InstructionCost BV =
1046 cast<FixedVectorType>(Val: Ty)->getNumElements() *
1047 getRISCVInstructionCost(OpCodes: RISCV::VSLIDE1DOWN_VX, VT: LT.second, CostKind);
1048 if (BV < Cost)
1049 Cost = BV;
1050 }
1051 }
1052 return Cost;
1053}
1054
1055InstructionCost
1056RISCVTTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
1057 TTI::TargetCostKind CostKind) const {
1058 Type *DataTy = MICA.getDataType();
1059 Align Alignment = MICA.getAlignment();
1060 switch (MICA.getID()) {
1061 case Intrinsic::vp_load_ff: {
1062 EVT DataTypeVT = TLI->getValueType(DL, Ty: DataTy);
1063 if (!TLI->isLegalFirstFaultLoad(DataType: DataTypeVT, Alignment))
1064 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1065
1066 unsigned AS = MICA.getAddressSpace();
1067 return getMemoryOpCost(Opcode: Instruction::Load, Src: DataTy, Alignment, AddressSpace: AS, CostKind,
1068 OpdInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: nullptr);
1069 }
1070 case Intrinsic::experimental_vp_strided_load:
1071 case Intrinsic::experimental_vp_strided_store:
1072 return getStridedMemoryOpCost(MICA, CostKind);
1073 case Intrinsic::masked_compressstore:
1074 case Intrinsic::masked_expandload:
1075 return getExpandCompressMemoryOpCost(MICA, CostKind);
1076 case Intrinsic::vp_scatter:
1077 case Intrinsic::vp_gather:
1078 case Intrinsic::masked_scatter:
1079 case Intrinsic::masked_gather:
1080 return getGatherScatterOpCost(MICA, CostKind);
1081 case Intrinsic::vp_load:
1082 case Intrinsic::vp_store:
1083 case Intrinsic::masked_load:
1084 case Intrinsic::masked_store:
1085 return getMaskedMemoryOpCost(MICA, CostKind);
1086 }
1087 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1088}
1089
1090InstructionCost
1091RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1092 TTI::TargetCostKind CostKind) const {
1093 unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load
1094 : Instruction::Store;
1095 Type *Src = MICA.getDataType();
1096 Align Alignment = MICA.getAlignment();
1097 unsigned AddressSpace = MICA.getAddressSpace();
1098
1099 if (!isLegalMaskedLoadStore(DataType: Src, Alignment) ||
1100 CostKind != TTI::TCK_RecipThroughput)
1101 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1102
1103 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1104}
1105
1106InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
1107 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1108 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1109 bool UseMaskForCond, bool UseMaskForGaps) const {
1110
1111 // The interleaved memory access pass will lower (de)interleave ops combined
1112 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1113 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1114 // gap).
1115 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1116 auto *VTy = cast<VectorType>(Val: VecTy);
1117 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VTy);
1118 // Need to make sure type has't been scalarized
1119 if (LT.second.isVector()) {
1120 auto *SubVecTy =
1121 VectorType::get(ElementType: VTy->getElementType(),
1122 EC: VTy->getElementCount().divideCoefficientBy(RHS: Factor));
1123 if (VTy->getElementCount().isKnownMultipleOf(RHS: Factor) &&
1124 TLI->isLegalInterleavedAccessType(VTy: SubVecTy, Factor, Alignment,
1125 AddrSpace: AddressSpace, DL)) {
1126
1127 // Some processors optimize segment loads/stores as one wide memory op +
1128 // Factor * LMUL shuffle ops.
1129 if (ST->hasOptimizedSegmentLoadStore(NF: Factor)) {
1130 InstructionCost Cost =
1131 getMemoryOpCost(Opcode, Src: VTy, Alignment, AddressSpace, CostKind);
1132 MVT SubVecVT = getTLI()->getValueType(DL, Ty: SubVecTy).getSimpleVT();
1133 Cost += Factor * TLI->getLMULCost(VT: SubVecVT);
1134 return LT.first * Cost;
1135 }
1136
1137 // Otherwise, the cost is proportional to the number of elements (VL *
1138 // Factor ops).
1139 InstructionCost MemOpCost =
1140 getMemoryOpCost(Opcode, Src: VTy->getElementType(), Alignment, AddressSpace: 0,
1141 CostKind, OpdInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None});
1142 unsigned NumLoads = getEstimatedVLFor(Ty: VTy);
1143 return NumLoads * MemOpCost;
1144 }
1145 }
1146 }
1147
1148 // TODO: Return the cost of interleaved accesses for scalable vector when
1149 // unable to convert to segment accesses instructions.
1150 if (isa<ScalableVectorType>(Val: VecTy))
1151 return InstructionCost::getInvalid();
1152
1153 auto *FVTy = cast<FixedVectorType>(Val: VecTy);
1154 // When gaps are only at the tail, for interleaved load, we can emit a wide
1155 // masked load and shufflevectors. For interleaved store, we can emit
1156 // shufflevectors and a wide masked store. The interleaved memory access pass
1157 // will lower them into vlsseg/vssseg intrinsics.
1158 if (UseMaskForGaps) {
1159 assert(llvm::is_sorted(Indices) && "Indices must be sorted");
1160 assert(llvm::adjacent_find(Indices) == Indices.end() &&
1161 "Indices should not contain duplicate elements");
1162 unsigned NumOfFields = Indices.size();
1163 bool IsTailGapOnly = NumOfFields > 1 && (NumOfFields == Indices.back() + 1);
1164 if (IsTailGapOnly &&
1165 NumOfFields <= TLI->getMaxSupportedInterleaveFactor()) {
1166 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: FVTy);
1167 if (LT.second.isVector() &&
1168 FVTy->getElementCount().isKnownMultipleOf(RHS: Factor)) {
1169 auto *SubVecTy = VectorType::get(
1170 ElementType: FVTy->getElementType(),
1171 EC: FVTy->getElementCount().divideCoefficientBy(RHS: Factor));
1172 if (TLI->isLegalInterleavedAccessType(VTy: SubVecTy, Factor: NumOfFields, Alignment,
1173 AddrSpace: AddressSpace, DL)) {
1174 // The cost is proportional to the total number of element accesses.
1175 unsigned NumAccesses = getEstimatedVLFor(Ty: FVTy);
1176 return NumAccesses * TTI::TCC_Basic;
1177 }
1178 }
1179 }
1180 }
1181
1182 InstructionCost MemCost =
1183 getMemoryOpCost(Opcode, Src: VecTy, Alignment, AddressSpace, CostKind);
1184 unsigned VF = FVTy->getNumElements() / Factor;
1185
1186 // An interleaved load will look like this for Factor=3:
1187 // %wide.vec = load <12 x i32>, ptr %3, align 4
1188 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1189 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1190 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1191 if (Opcode == Instruction::Load) {
1192 InstructionCost Cost = MemCost;
1193 for (unsigned Index : Indices) {
1194 FixedVectorType *VecTy =
1195 FixedVectorType::get(ElementType: FVTy->getElementType(), NumElts: VF * Factor);
1196 auto Mask = createStrideMask(Start: Index, Stride: Factor, VF);
1197 Mask.resize(N: VF * Factor, NV: -1);
1198 InstructionCost ShuffleCost =
1199 getShuffleCost(Kind: TTI::ShuffleKind::SK_PermuteSingleSrc, DstTy: VecTy, SrcTy: VecTy,
1200 Mask, CostKind, Index: 0, SubTp: nullptr, Args: {});
1201 Cost += ShuffleCost;
1202 }
1203 return Cost;
1204 }
1205
1206 // TODO: Model for NF > 2
1207 // We'll need to enhance getShuffleCost to model shuffles that are just
1208 // inserts and extracts into subvectors, since they won't have the full cost
1209 // of a vrgather.
1210 // An interleaved store for 3 vectors of 4 lanes will look like
1211 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1212 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1213 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1214 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1215 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1216 if (Factor != 2)
1217 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1218 Alignment, AddressSpace, CostKind,
1219 UseMaskForCond, UseMaskForGaps);
1220
1221 assert(Opcode == Instruction::Store && "Opcode must be a store");
1222 // For an interleaving store of 2 vectors, we perform one large interleaving
1223 // shuffle that goes into the wide store
1224 auto Mask = createInterleaveMask(VF, NumVecs: Factor);
1225 InstructionCost ShuffleCost =
1226 getShuffleCost(Kind: TTI::ShuffleKind::SK_PermuteSingleSrc, DstTy: FVTy, SrcTy: FVTy, Mask,
1227 CostKind, Index: 0, SubTp: nullptr, Args: {});
1228 return MemCost + ShuffleCost;
1229}
1230
1231InstructionCost
1232RISCVTTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
1233 TTI::TargetCostKind CostKind) const {
1234
1235 bool IsLoad = MICA.getID() == Intrinsic::masked_gather ||
1236 MICA.getID() == Intrinsic::vp_gather;
1237 unsigned Opcode = IsLoad ? Instruction::Load : Instruction::Store;
1238 Type *DataTy = MICA.getDataType();
1239 Align Alignment = MICA.getAlignment();
1240 if (CostKind != TTI::TCK_RecipThroughput)
1241 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1242
1243 if ((Opcode == Instruction::Load &&
1244 !isLegalMaskedGather(DataType: DataTy, Alignment: Align(Alignment))) ||
1245 (Opcode == Instruction::Store &&
1246 !isLegalMaskedScatter(DataType: DataTy, Alignment: Align(Alignment))))
1247 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1248
1249 // Cost is proportional to the number of memory operations implied. For
1250 // scalable vectors, we use an estimate on that number since we don't
1251 // know exactly what VL will be.
1252 auto &VTy = *cast<VectorType>(Val: DataTy);
1253 unsigned NumLoads = getEstimatedVLFor(Ty: &VTy);
1254 return NumLoads * TTI::TCC_Basic;
1255}
1256
1257InstructionCost RISCVTTIImpl::getExpandCompressMemoryOpCost(
1258 const MemIntrinsicCostAttributes &MICA,
1259 TTI::TargetCostKind CostKind) const {
1260 unsigned Opcode = MICA.getID() == Intrinsic::masked_expandload
1261 ? Instruction::Load
1262 : Instruction::Store;
1263 Type *DataTy = MICA.getDataType();
1264 bool VariableMask = MICA.getVariableMask();
1265 Align Alignment = MICA.getAlignment();
1266 bool IsLegal = (Opcode == Instruction::Store &&
1267 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1268 (Opcode == Instruction::Load &&
1269 isLegalMaskedExpandLoad(DataType: DataTy, Alignment));
1270 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1271 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1272 // Example compressstore sequence:
1273 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1274 // vcompress.vm v10, v8, v0
1275 // vcpop.m a1, v0
1276 // vsetvli zero, a1, e32, m2, ta, ma
1277 // vse32.v v10, (a0)
1278 // Example expandload sequence:
1279 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1280 // vcpop.m a1, v0
1281 // vsetvli zero, a1, e32, m2, ta, ma
1282 // vle32.v v10, (a0)
1283 // vsetivli zero, 8, e32, m2, ta, ma
1284 // viota.m v12, v0
1285 // vrgather.vv v8, v10, v12, v0.t
1286 auto MemOpCost =
1287 getMemoryOpCost(Opcode, Src: DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1288 auto LT = getTypeLegalizationCost(Ty: DataTy);
1289 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1290 if (VariableMask)
1291 Opcodes.push_back(Elt: RISCV::VCPOP_M);
1292 if (Opcode == Instruction::Store)
1293 Opcodes.append(IL: {RISCV::VCOMPRESS_VM});
1294 else
1295 Opcodes.append(IL: {RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1296 return MemOpCost +
1297 LT.first * getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
1298}
1299
1300InstructionCost
1301RISCVTTIImpl::getStridedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
1302 TTI::TargetCostKind CostKind) const {
1303
1304 unsigned Opcode = MICA.getID() == Intrinsic::experimental_vp_strided_load
1305 ? Instruction::Load
1306 : Instruction::Store;
1307
1308 Type *DataTy = MICA.getDataType();
1309 Align Alignment = MICA.getAlignment();
1310 const Instruction *I = MICA.getInst();
1311
1312 if (!isLegalStridedLoadStore(DataType: DataTy, Alignment))
1313 return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
1314
1315 if (CostKind == TTI::TCK_CodeSize)
1316 return TTI::TCC_Basic;
1317
1318 // Cost is proportional to the number of memory operations implied. For
1319 // scalable vectors, we use an estimate on that number since we don't
1320 // know exactly what VL will be.
1321 // FIXME: This will overcost for i64 on rv32 with +zve64x.
1322 auto &VTy = *cast<VectorType>(Val: DataTy);
1323 InstructionCost MemOpCost =
1324 getMemoryOpCost(Opcode, Src: VTy.getElementType(), Alignment, AddressSpace: 0, CostKind,
1325 OpdInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
1326 unsigned NumLoads = getEstimatedVLFor(Ty: &VTy);
1327 return NumLoads * MemOpCost;
1328}
1329
1330InstructionCost
1331RISCVTTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
1332 // FIXME: This is a property of the default vector convention, not
1333 // all possible calling conventions. Fixing that will require
1334 // some TTI API and SLP rework.
1335 InstructionCost Cost = 0;
1336 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1337 for (auto *Ty : Tys) {
1338 if (!Ty->isVectorTy())
1339 continue;
1340 Align A = DL.getPrefTypeAlign(Ty);
1341 Cost += getMemoryOpCost(Opcode: Instruction::Store, Src: Ty, Alignment: A, AddressSpace: 0, CostKind) +
1342 getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment: A, AddressSpace: 0, CostKind);
1343 }
1344 return Cost;
1345}
1346
1347// Currently, these represent both throughput and codesize costs
1348// for the respective intrinsics. The costs in this table are simply
1349// instruction counts with the following adjustments made:
1350// * One vsetvli is considered free.
1351static const CostTblEntry VectorIntrinsicCostTable[]{
1352 {.ISD: Intrinsic::floor, .Type: MVT::f32, .Cost: 9},
1353 {.ISD: Intrinsic::floor, .Type: MVT::f64, .Cost: 9},
1354 {.ISD: Intrinsic::ceil, .Type: MVT::f32, .Cost: 9},
1355 {.ISD: Intrinsic::ceil, .Type: MVT::f64, .Cost: 9},
1356 {.ISD: Intrinsic::trunc, .Type: MVT::f32, .Cost: 7},
1357 {.ISD: Intrinsic::trunc, .Type: MVT::f64, .Cost: 7},
1358 {.ISD: Intrinsic::round, .Type: MVT::f32, .Cost: 9},
1359 {.ISD: Intrinsic::round, .Type: MVT::f64, .Cost: 9},
1360 {.ISD: Intrinsic::roundeven, .Type: MVT::f32, .Cost: 9},
1361 {.ISD: Intrinsic::roundeven, .Type: MVT::f64, .Cost: 9},
1362 {.ISD: Intrinsic::rint, .Type: MVT::f32, .Cost: 7},
1363 {.ISD: Intrinsic::rint, .Type: MVT::f64, .Cost: 7},
1364 {.ISD: Intrinsic::nearbyint, .Type: MVT::f32, .Cost: 9},
1365 {.ISD: Intrinsic::nearbyint, .Type: MVT::f64, .Cost: 9},
1366 {.ISD: Intrinsic::bswap, .Type: MVT::i16, .Cost: 3},
1367 {.ISD: Intrinsic::bswap, .Type: MVT::i32, .Cost: 12},
1368 {.ISD: Intrinsic::bswap, .Type: MVT::i64, .Cost: 31},
1369 {.ISD: Intrinsic::vp_bswap, .Type: MVT::i16, .Cost: 3},
1370 {.ISD: Intrinsic::vp_bswap, .Type: MVT::i32, .Cost: 12},
1371 {.ISD: Intrinsic::vp_bswap, .Type: MVT::i64, .Cost: 31},
1372 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i8, .Cost: 7},
1373 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i16, .Cost: 7},
1374 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i32, .Cost: 7},
1375 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i64, .Cost: 7},
1376 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i8, .Cost: 7},
1377 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i16, .Cost: 7},
1378 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i32, .Cost: 7},
1379 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i64, .Cost: 7},
1380 {.ISD: Intrinsic::bitreverse, .Type: MVT::i8, .Cost: 17},
1381 {.ISD: Intrinsic::bitreverse, .Type: MVT::i16, .Cost: 24},
1382 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 33},
1383 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 52},
1384 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i8, .Cost: 17},
1385 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i16, .Cost: 24},
1386 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i32, .Cost: 33},
1387 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i64, .Cost: 52},
1388 {.ISD: Intrinsic::ctpop, .Type: MVT::i8, .Cost: 12},
1389 {.ISD: Intrinsic::ctpop, .Type: MVT::i16, .Cost: 19},
1390 {.ISD: Intrinsic::ctpop, .Type: MVT::i32, .Cost: 20},
1391 {.ISD: Intrinsic::ctpop, .Type: MVT::i64, .Cost: 21},
1392 {.ISD: Intrinsic::ctlz, .Type: MVT::i8, .Cost: 19},
1393 {.ISD: Intrinsic::ctlz, .Type: MVT::i16, .Cost: 28},
1394 {.ISD: Intrinsic::ctlz, .Type: MVT::i32, .Cost: 31},
1395 {.ISD: Intrinsic::ctlz, .Type: MVT::i64, .Cost: 35},
1396 {.ISD: Intrinsic::cttz, .Type: MVT::i8, .Cost: 16},
1397 {.ISD: Intrinsic::cttz, .Type: MVT::i16, .Cost: 23},
1398 {.ISD: Intrinsic::cttz, .Type: MVT::i32, .Cost: 24},
1399 {.ISD: Intrinsic::cttz, .Type: MVT::i64, .Cost: 25},
1400 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i8, .Cost: 12},
1401 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i16, .Cost: 19},
1402 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i32, .Cost: 20},
1403 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i64, .Cost: 21},
1404 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i8, .Cost: 19},
1405 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i16, .Cost: 28},
1406 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i32, .Cost: 31},
1407 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i64, .Cost: 35},
1408 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i8, .Cost: 16},
1409 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i16, .Cost: 23},
1410 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i32, .Cost: 24},
1411 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i64, .Cost: 25},
1412};
1413
1414InstructionCost
1415RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1416 TTI::TargetCostKind CostKind) const {
1417 auto *RetTy = ICA.getReturnType();
1418 switch (ICA.getID()) {
1419 case Intrinsic::lrint:
1420 case Intrinsic::llrint:
1421 case Intrinsic::lround:
1422 case Intrinsic::llround: {
1423 auto LT = getTypeLegalizationCost(Ty: RetTy);
1424 Type *SrcTy = ICA.getArgTypes().front();
1425 auto SrcLT = getTypeLegalizationCost(Ty: SrcTy);
1426 if (ST->hasVInstructions() && LT.second.isVector()) {
1427 SmallVector<unsigned, 2> Ops;
1428 unsigned SrcEltSz = DL.getTypeSizeInBits(Ty: SrcTy->getScalarType());
1429 unsigned DstEltSz = DL.getTypeSizeInBits(Ty: RetTy->getScalarType());
1430 if (LT.second.getVectorElementType() == MVT::bf16) {
1431 if (!ST->hasVInstructionsBF16Minimal())
1432 return InstructionCost::getInvalid();
1433 if (DstEltSz == 32)
1434 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1435 else
1436 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1437 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1438 !ST->hasVInstructionsF16()) {
1439 if (!ST->hasVInstructionsF16Minimal())
1440 return InstructionCost::getInvalid();
1441 if (DstEltSz == 32)
1442 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1443 else
1444 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1445
1446 } else if (SrcEltSz > DstEltSz) {
1447 Ops = {RISCV::VFNCVT_X_F_W};
1448 } else if (SrcEltSz < DstEltSz) {
1449 Ops = {RISCV::VFWCVT_X_F_V};
1450 } else {
1451 Ops = {RISCV::VFCVT_X_F_V};
1452 }
1453
1454 // We need to use the source LMUL in the case of a narrowing op, and the
1455 // destination LMUL otherwise.
1456 if (SrcEltSz > DstEltSz)
1457 return SrcLT.first *
1458 getRISCVInstructionCost(OpCodes: Ops, VT: SrcLT.second, CostKind);
1459 return LT.first * getRISCVInstructionCost(OpCodes: Ops, VT: LT.second, CostKind);
1460 }
1461 break;
1462 }
1463 case Intrinsic::ceil:
1464 case Intrinsic::floor:
1465 case Intrinsic::trunc:
1466 case Intrinsic::rint:
1467 case Intrinsic::round:
1468 case Intrinsic::roundeven: {
1469 // These all use the same code.
1470 auto LT = getTypeLegalizationCost(Ty: RetTy);
1471 if (!LT.second.isVector() && TLI->isOperationCustom(Op: ISD::FCEIL, VT: LT.second))
1472 return LT.first * 8;
1473 break;
1474 }
1475 case Intrinsic::umin:
1476 case Intrinsic::umax:
1477 case Intrinsic::smin:
1478 case Intrinsic::smax: {
1479 auto LT = getTypeLegalizationCost(Ty: RetTy);
1480 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1481 return LT.first;
1482
1483 if (ST->hasVInstructions() && LT.second.isVector()) {
1484 unsigned Op;
1485 switch (ICA.getID()) {
1486 case Intrinsic::umin:
1487 Op = RISCV::VMINU_VV;
1488 break;
1489 case Intrinsic::umax:
1490 Op = RISCV::VMAXU_VV;
1491 break;
1492 case Intrinsic::smin:
1493 Op = RISCV::VMIN_VV;
1494 break;
1495 case Intrinsic::smax:
1496 Op = RISCV::VMAX_VV;
1497 break;
1498 }
1499 return LT.first * getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind);
1500 }
1501 break;
1502 }
1503 case Intrinsic::sadd_sat:
1504 case Intrinsic::ssub_sat:
1505 case Intrinsic::uadd_sat:
1506 case Intrinsic::usub_sat: {
1507 auto LT = getTypeLegalizationCost(Ty: RetTy);
1508 if (ST->hasVInstructions() && LT.second.isVector()) {
1509 unsigned Op;
1510 switch (ICA.getID()) {
1511 case Intrinsic::sadd_sat:
1512 Op = RISCV::VSADD_VV;
1513 break;
1514 case Intrinsic::ssub_sat:
1515 Op = RISCV::VSSUB_VV;
1516 break;
1517 case Intrinsic::uadd_sat:
1518 Op = RISCV::VSADDU_VV;
1519 break;
1520 case Intrinsic::usub_sat:
1521 Op = RISCV::VSSUBU_VV;
1522 break;
1523 }
1524 return LT.first * getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind);
1525 }
1526 break;
1527 }
1528 case Intrinsic::fma:
1529 case Intrinsic::fmuladd: {
1530 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1531 auto LT = getTypeLegalizationCost(Ty: RetTy);
1532 if (ST->hasVInstructions() && LT.second.isVector())
1533 return LT.first *
1534 getRISCVInstructionCost(OpCodes: RISCV::VFMADD_VV, VT: LT.second, CostKind);
1535 break;
1536 }
1537 case Intrinsic::fabs: {
1538 auto LT = getTypeLegalizationCost(Ty: RetTy);
1539 if (ST->hasVInstructions() && LT.second.isVector()) {
1540 // lui a0, 8
1541 // addi a0, a0, -1
1542 // vsetvli a1, zero, e16, m1, ta, ma
1543 // vand.vx v8, v8, a0
1544 // f16 with zvfhmin and bf16 with zvfbfmin
1545 if (LT.second.getVectorElementType() == MVT::bf16 ||
1546 (LT.second.getVectorElementType() == MVT::f16 &&
1547 !ST->hasVInstructionsF16()))
1548 return LT.first * getRISCVInstructionCost(OpCodes: RISCV::VAND_VX, VT: LT.second,
1549 CostKind) +
1550 2;
1551 else
1552 return LT.first *
1553 getRISCVInstructionCost(OpCodes: RISCV::VFSGNJX_VV, VT: LT.second, CostKind);
1554 }
1555 break;
1556 }
1557 case Intrinsic::sqrt: {
1558 auto LT = getTypeLegalizationCost(Ty: RetTy);
1559 if (ST->hasVInstructions() && LT.second.isVector()) {
1560 SmallVector<unsigned, 4> ConvOp;
1561 SmallVector<unsigned, 2> FsqrtOp;
1562 MVT ConvType = LT.second;
1563 MVT FsqrtType = LT.second;
      // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
      // will be split.
1566 if (LT.second.getVectorElementType() == MVT::bf16) {
1567 if (LT.second == MVT::nxv32bf16) {
1568 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1569 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1570 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1571 ConvType = MVT::nxv16f16;
1572 FsqrtType = MVT::nxv16f32;
1573 } else {
1574 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1575 FsqrtOp = {RISCV::VFSQRT_V};
1576 FsqrtType = TLI->getTypeToPromoteTo(Op: ISD::FSQRT, VT: FsqrtType);
1577 }
1578 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1579 !ST->hasVInstructionsF16()) {
1580 if (LT.second == MVT::nxv32f16) {
1581 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1582 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1583 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1584 ConvType = MVT::nxv16f16;
1585 FsqrtType = MVT::nxv16f32;
1586 } else {
1587 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1588 FsqrtOp = {RISCV::VFSQRT_V};
1589 FsqrtType = TLI->getTypeToPromoteTo(Op: ISD::FSQRT, VT: FsqrtType);
1590 }
1591 } else {
1592 FsqrtOp = {RISCV::VFSQRT_V};
1593 }
1594
1595 return LT.first * (getRISCVInstructionCost(OpCodes: FsqrtOp, VT: FsqrtType, CostKind) +
1596 getRISCVInstructionCost(OpCodes: ConvOp, VT: ConvType, CostKind));
1597 }
1598 break;
1599 }
1600 case Intrinsic::cttz:
1601 case Intrinsic::ctlz:
1602 case Intrinsic::ctpop: {
1603 auto LT = getTypeLegalizationCost(Ty: RetTy);
1604 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1605 unsigned Op;
1606 switch (ICA.getID()) {
1607 case Intrinsic::cttz:
1608 Op = RISCV::VCTZ_V;
1609 break;
1610 case Intrinsic::ctlz:
1611 Op = RISCV::VCLZ_V;
1612 break;
1613 case Intrinsic::ctpop:
1614 Op = RISCV::VCPOP_V;
1615 break;
1616 }
1617 return LT.first * getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind);
1618 }
1619 break;
1620 }
1621 case Intrinsic::abs: {
1622 auto LT = getTypeLegalizationCost(Ty: RetTy);
1623 if (ST->hasVInstructions() && LT.second.isVector()) {
1624 // vabs.v v10, v8
1625 if (ST->hasStdExtZvabd())
1626 return LT.first *
1627 getRISCVInstructionCost(OpCodes: {RISCV::VABS_V}, VT: LT.second, CostKind);
1628
1629 // vrsub.vi v10, v8, 0
1630 // vmax.vv v8, v8, v10
1631 return LT.first *
1632 getRISCVInstructionCost(OpCodes: {RISCV::VRSUB_VI, RISCV::VMAX_VV},
1633 VT: LT.second, CostKind);
1634 }
1635 break;
1636 }
1637 case Intrinsic::fshl:
1638 case Intrinsic::fshr: {
1639 if (ICA.getArgs().empty())
1640 break;
1641
1642 // Funnel-shifts are ROTL/ROTR when the first and second operand are equal.
1643 // When Zbb/Zbkb is enabled we can use a single ROL(W)/ROR(I)(W)
1644 // instruction.
1645 if ((ST->hasStdExtZbb() || ST->hasStdExtZbkb()) && RetTy->isIntegerTy() &&
1646 ICA.getArgs()[0] == ICA.getArgs()[1] &&
1647 (RetTy->getIntegerBitWidth() == 32 ||
1648 RetTy->getIntegerBitWidth() == 64) &&
1649 RetTy->getIntegerBitWidth() <= ST->getXLen()) {
1650 return 1;
1651 }
1652 break;
1653 }
1654 case Intrinsic::masked_udiv:
1655 return getArithmeticInstrCost(Opcode: Instruction::UDiv, Ty: ICA.getReturnType(),
1656 CostKind);
1657 case Intrinsic::masked_sdiv:
1658 return getArithmeticInstrCost(Opcode: Instruction::SDiv, Ty: ICA.getReturnType(),
1659 CostKind);
1660 case Intrinsic::masked_urem:
1661 return getArithmeticInstrCost(Opcode: Instruction::URem, Ty: ICA.getReturnType(),
1662 CostKind);
1663 case Intrinsic::masked_srem:
1664 return getArithmeticInstrCost(Opcode: Instruction::SRem, Ty: ICA.getReturnType(),
1665 CostKind);
1666 case Intrinsic::get_active_lane_mask: {
1667 if (ST->hasVInstructions()) {
1668 Type *ExpRetTy = VectorType::get(
1669 ElementType: ICA.getArgTypes()[0], EC: cast<VectorType>(Val: RetTy)->getElementCount());
1670 auto LT = getTypeLegalizationCost(Ty: ExpRetTy);
1671
1672 // vid.v v8 // considered hoisted
1673 // vsaddu.vx v8, v8, a0
1674 // vmsltu.vx v0, v8, a1
1675 return LT.first *
1676 getRISCVInstructionCost(OpCodes: {RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1677 VT: LT.second, CostKind);
1678 }
1679 break;
1680 }
1681 // TODO: add more intrinsic
1682 case Intrinsic::stepvector: {
1683 auto LT = getTypeLegalizationCost(Ty: RetTy);
1684 // Legalisation of illegal types involves an `index' instruction plus
1685 // (LT.first - 1) vector adds.
1686 if (ST->hasVInstructions())
1687 return getRISCVInstructionCost(OpCodes: RISCV::VID_V, VT: LT.second, CostKind) +
1688 (LT.first - 1) *
1689 getRISCVInstructionCost(OpCodes: RISCV::VADD_VX, VT: LT.second, CostKind);
1690 return 1 + (LT.first - 1);
1691 }
1692 case Intrinsic::vector_splice_left:
1693 case Intrinsic::vector_splice_right: {
1694 auto LT = getTypeLegalizationCost(Ty: RetTy);
1695 // Constant offsets fall through to getShuffleCost.
1696 if (!ICA.isTypeBasedOnly() && isa<ConstantInt>(Val: ICA.getArgs()[2]))
1697 break;
1698 if (ST->hasVInstructions() && LT.second.isVector()) {
1699 return LT.first *
1700 getRISCVInstructionCost(OpCodes: {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX},
1701 VT: LT.second, CostKind);
1702 }
1703 break;
1704 }
1705 case Intrinsic::experimental_cttz_elts: {
1706 Type *ArgTy = ICA.getArgTypes()[0];
1707 EVT ArgType = TLI->getValueType(DL, Ty: ArgTy, AllowUnknown: true);
1708 if (getTLI()->shouldExpandCttzElements(VT: ArgType))
1709 break;
1710 InstructionCost Cost = getRISCVInstructionCost(
1711 OpCodes: RISCV::VFIRST_M, VT: getTypeLegalizationCost(Ty: ArgTy).second, CostKind);
1712
1713 // If zero_is_poison is false, then we will generate additional
1714 // cmp + select instructions to convert -1 to EVL.
1715 Type *BoolTy = Type::getInt1Ty(C&: RetTy->getContext());
1716 if (ICA.getArgs().size() > 1 &&
1717 cast<ConstantInt>(Val: ICA.getArgs()[1])->isZero())
1718 Cost += getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: BoolTy, CondTy: RetTy,
1719 VecPred: CmpInst::ICMP_SLT, CostKind) +
1720 getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: RetTy, CondTy: BoolTy,
1721 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
1722
1723 return Cost;
1724 }
1725 case Intrinsic::experimental_vp_splice: {
1726 // To support type-based query from vectorizer, set the index to 0.
1727 // Note that index only change the cost from vslide.vx to vslide.vi and in
1728 // current implementations they have same costs.
1729 return getShuffleCost(Kind: TTI::SK_Splice, DstTy: cast<VectorType>(Val: ICA.getReturnType()),
1730 SrcTy: cast<VectorType>(Val: ICA.getArgTypes()[0]), Mask: {}, CostKind,
1731 Index: 0, SubTp: cast<VectorType>(Val: ICA.getReturnType()));
1732 }
1733 case Intrinsic::fptoui_sat:
1734 case Intrinsic::fptosi_sat: {
1735 InstructionCost Cost = 0;
1736 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1737 Type *SrcTy = ICA.getArgTypes()[0];
1738
1739 auto SrcLT = getTypeLegalizationCost(Ty: SrcTy);
1740 auto DstLT = getTypeLegalizationCost(Ty: RetTy);
1741 if (!SrcTy->isVectorTy())
1742 break;
1743
1744 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1745 return InstructionCost::getInvalid();
1746
1747 Cost +=
1748 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1749 Dst: RetTy, Src: SrcTy, CCH: TTI::CastContextHint::None, CostKind);
1750
1751 // Handle NaN.
1752 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1753 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1754 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
1755 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: SrcTy, CondTy,
1756 VecPred: CmpInst::FCMP_UNO, CostKind);
1757 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
1758 VecPred: CmpInst::FCMP_UNO, CostKind);
1759 return Cost;
1760 }
1761 case Intrinsic::experimental_vector_extract_last_active: {
1762 auto *ValTy = cast<VectorType>(Val: ICA.getArgTypes()[0]);
1763 auto *MaskTy = cast<VectorType>(Val: ICA.getArgTypes()[1]);
1764
1765 auto ValLT = getTypeLegalizationCost(Ty: ValTy);
1766 auto MaskLT = getTypeLegalizationCost(Ty: MaskTy);
1767
1768 // TODO: Return cheaper cost when the entire lane is inactive.
1769 // The expected asm sequence is:
1770 // vcpop.m a0, v0
1771 // beqz a0, exit # Return passthru when the entire lane is inactive.
1772 // vid v10, v0.t
1773 // vredmaxu.vs v10, v10, v10
1774 // vmv.x.s a0, v10
1775 // zext.b a0, a0
1776 // vslidedown.vx v8, v8, a0
1777 // vmv.x.s a0, v8
1778 // exit:
1779 // ...
1780
1781 // Find a suitable type for a stepvector.
1782 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(numBits: 64));
1783 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1784 RetVT: TLI->getVectorIdxTy(DL: getDataLayout()), EC: MaskTy->getElementCount(),
1785 /*ZeroIsPoison=*/true, VScaleRange: &VScaleRange);
1786 EltWidth = std::max(a: EltWidth, b: MaskTy->getScalarSizeInBits());
1787 Type *StepTy = Type::getIntNTy(C&: MaskTy->getContext(), N: EltWidth);
1788 auto *StepVecTy = VectorType::get(ElementType: StepTy, EC: ValTy->getElementCount());
1789 auto StepLT = getTypeLegalizationCost(Ty: StepVecTy);
1790
1791 // Currently expandVectorFindLastActive cannot handle step vector split.
1792 // So return invalid when the type needs split.
1793 // FIXME: Remove this if expandVectorFindLastActive supports split vector.
1794 if (StepLT.first > 1)
1795 return InstructionCost::getInvalid();
1796
1797 InstructionCost Cost = 0;
1798 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1799
1800 Cost += MaskLT.first *
1801 getRISCVInstructionCost(OpCodes: RISCV::VCPOP_M, VT: MaskLT.second, CostKind);
1802 Cost += getCFInstrCost(Opcode: Instruction::CondBr, CostKind, I: nullptr);
1803 Cost += StepLT.first *
1804 getRISCVInstructionCost(OpCodes: Opcodes, VT: StepLT.second, CostKind);
1805 Cost += getCastInstrCost(Opcode: Instruction::ZExt,
1806 Dst: Type::getInt64Ty(C&: ValTy->getContext()), Src: StepTy,
1807 CCH: TTI::CastContextHint::None, CostKind, I: nullptr);
1808 Cost += ValLT.first *
1809 getRISCVInstructionCost(OpCodes: {RISCV::VSLIDEDOWN_VI, RISCV::VMV_X_S},
1810 VT: ValLT.second, CostKind);
1811 return Cost;
1812 }
1813 }
1814
1815 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1816 if (auto LT = getTypeLegalizationCost(Ty: RetTy);
1817 LT.second.isVector()) {
1818 MVT EltTy = LT.second.getVectorElementType();
1819 if (const auto *Entry = CostTableLookup(Table: VectorIntrinsicCostTable,
1820 ISD: ICA.getID(), Ty: EltTy))
1821 return LT.first * Entry->Cost;
1822 }
1823 }
1824
1825 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1826}
1827
1828InstructionCost
1829RISCVTTIImpl::getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
1830 const SCEV *Ptr,
1831 TTI::TargetCostKind CostKind) const {
1832 // Address computations for vector indexed load/store likely require an offset
1833 // and/or scaling.
1834 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1835 return getArithmeticInstrCost(Opcode: Instruction::Add, Ty: PtrTy, CostKind);
1836
1837 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1838}
1839
1840InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1841 Type *Src,
1842 TTI::CastContextHint CCH,
1843 TTI::TargetCostKind CostKind,
1844 const Instruction *I) const {
1845 bool IsVectorType = isa<VectorType>(Val: Dst) && isa<VectorType>(Val: Src);
1846 if (!IsVectorType)
1847 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1848
1849 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
1850 // For now, skip all fixed vector cost analysis when P extension is available
1851 // to avoid crashes in getMinRVVVectorSizeInBits()
1852 if (ST->hasStdExtP() &&
1853 (isa<FixedVectorType>(Val: Dst) || isa<FixedVectorType>(Val: Src))) {
1854 return 1; // Treat as single instruction cost for now
1855 }
1856
1857 // FIXME: Need to compute legalizing cost for illegal types. The current
1858 // code handles only legal types and those which can be trivially
1859 // promoted to legal.
1860 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1861 Dst->getScalarSizeInBits() > ST->getELen())
1862 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1863
1864 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1865 assert(ISD && "Invalid opcode");
1866 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
1867 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: Dst);
1868
1869 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1870 // The shared implementation doesn't model vector widening during legalization
1871 // and instead assumes scalarization. In order to scalarize an <N x i1>
1872 // vector, we need to extend/trunc to/from i8. If we don't special case
1873 // this, we can get an infinite recursion cycle.
1874 switch (ISD) {
1875 default:
1876 break;
1877 case ISD::SIGN_EXTEND:
1878 case ISD::ZERO_EXTEND:
1879 if (Src->getScalarSizeInBits() == 1) {
1880 // We do not use vsext/vzext to extend from mask vector.
1881 // Instead we use the following instructions to extend from mask vector:
1882 // vmv.v.i v8, 0
1883 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1884 return getRISCVInstructionCost(OpCodes: RISCV::VMV_V_I, VT: DstLT.second, CostKind) +
1885 DstLT.first * getRISCVInstructionCost(OpCodes: RISCV::VMERGE_VIM,
1886 VT: DstLT.second, CostKind) +
1887 DstLT.first - 1;
1888 }
1889 break;
1890 case ISD::TRUNCATE:
1891 if (Dst->getScalarSizeInBits() == 1) {
1892 // We do not use several vncvt to truncate to mask vector. So we could
1893 // not use PowDiff to calculate it.
1894 // Instead we use the following instructions to truncate to mask vector:
1895 // vand.vi v8, v8, 1
1896 // vmsne.vi v0, v8, 0
1897 return SrcLT.first *
1898 getRISCVInstructionCost(OpCodes: {RISCV::VAND_VI, RISCV::VMSNE_VI},
1899 VT: SrcLT.second, CostKind) +
1900 SrcLT.first - 1;
1901 }
1902 break;
1903 };
1904
1905 // Our actual lowering for the case where a wider legal type is available
1906 // uses promotion to the wider type. This is reflected in the result of
1907 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1908 // scalarized if the legalized Src and Dst are not equal sized.
1909 const DataLayout &DL = this->getDataLayout();
1910 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1911 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1912 !TypeSize::isKnownLE(LHS: DL.getTypeSizeInBits(Ty: Src),
1913 RHS: SrcLT.second.getSizeInBits()) ||
1914 !TypeSize::isKnownLE(LHS: DL.getTypeSizeInBits(Ty: Dst),
1915 RHS: DstLT.second.getSizeInBits()) ||
1916 SrcLT.first > 1 || DstLT.first > 1)
1917 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1918
1919 // The split cost is handled by the base getCastInstrCost
1920 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1921
1922 int PowDiff = (int)Log2_32(Value: DstLT.second.getScalarSizeInBits()) -
1923 (int)Log2_32(Value: SrcLT.second.getScalarSizeInBits());
1924 switch (ISD) {
1925 case ISD::SIGN_EXTEND:
1926 case ISD::ZERO_EXTEND: {
1927 if ((PowDiff < 1) || (PowDiff > 3))
1928 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1929 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1930 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1931 unsigned Op =
1932 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1933 return getRISCVInstructionCost(OpCodes: Op, VT: DstLT.second, CostKind);
1934 }
1935 case ISD::TRUNCATE:
1936 case ISD::FP_EXTEND:
1937 case ISD::FP_ROUND: {
1938 // Counts of narrow/widen instructions.
1939 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1940 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1941
1942 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1943 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1944 : RISCV::VFNCVT_F_F_W;
1945 InstructionCost Cost = 0;
1946 for (; SrcEltSize != DstEltSize;) {
1947 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1948 ? MVT::getIntegerVT(BitWidth: DstEltSize)
1949 : MVT::getFloatingPointVT(BitWidth: DstEltSize);
1950 MVT DstMVT = DstLT.second.changeVectorElementType(EltVT: ElementMVT);
1951 DstEltSize =
1952 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1953 Cost += getRISCVInstructionCost(OpCodes: Op, VT: DstMVT, CostKind);
1954 }
1955 return Cost;
1956 }
1957 case ISD::FP_TO_SINT:
1958 case ISD::FP_TO_UINT: {
1959 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1960 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1961 unsigned FWCVT =
1962 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1963 unsigned FNCVT =
1964 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1965 unsigned SrcEltSize = Src->getScalarSizeInBits();
1966 unsigned DstEltSize = Dst->getScalarSizeInBits();
1967 InstructionCost Cost = 0;
1968 if ((SrcEltSize == 16) &&
1969 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1970 // If the target only supports zvfhmin or it is fp16-to-i64 conversion
1971 // pre-widening to f32 and then convert f32 to integer
1972 VectorType *VecF32Ty =
1973 VectorType::get(ElementType: Type::getFloatTy(C&: Dst->getContext()),
1974 EC: cast<VectorType>(Val: Dst)->getElementCount());
1975 std::pair<InstructionCost, MVT> VecF32LT =
1976 getTypeLegalizationCost(Ty: VecF32Ty);
1977 Cost +=
1978 VecF32LT.first * getRISCVInstructionCost(OpCodes: RISCV::VFWCVT_F_F_V,
1979 VT: VecF32LT.second, CostKind);
1980 Cost += getCastInstrCost(Opcode, Dst, Src: VecF32Ty, CCH, CostKind, I);
1981 return Cost;
1982 }
1983 if (DstEltSize == SrcEltSize)
1984 Cost += getRISCVInstructionCost(OpCodes: FCVT, VT: DstLT.second, CostKind);
1985 else if (DstEltSize > SrcEltSize)
1986 Cost += getRISCVInstructionCost(OpCodes: FWCVT, VT: DstLT.second, CostKind);
1987 else { // (SrcEltSize > DstEltSize)
1988 // First do a narrowing conversion to an integer half the size, then
1989 // truncate if needed.
1990 MVT ElementVT = MVT::getIntegerVT(BitWidth: SrcEltSize / 2);
1991 MVT VecVT = DstLT.second.changeVectorElementType(EltVT: ElementVT);
1992 Cost += getRISCVInstructionCost(OpCodes: FNCVT, VT: VecVT, CostKind);
1993 if ((SrcEltSize / 2) > DstEltSize) {
1994 Type *VecTy = EVT(VecVT).getTypeForEVT(Context&: Dst->getContext());
1995 Cost +=
1996 getCastInstrCost(Opcode: Instruction::Trunc, Dst, Src: VecTy, CCH, CostKind, I);
1997 }
1998 }
1999 return Cost;
2000 }
2001 case ISD::SINT_TO_FP:
2002 case ISD::UINT_TO_FP: {
2003 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
2004 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
2005 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
2006 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
2007 unsigned SrcEltSize = Src->getScalarSizeInBits();
2008 unsigned DstEltSize = Dst->getScalarSizeInBits();
2009
2010 InstructionCost Cost = 0;
2011 if ((DstEltSize == 16) &&
2012 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
2013 // If the target only supports zvfhmin or it is i64-to-fp16 conversion
2014 // it is converted to f32 and then converted to f16
2015 VectorType *VecF32Ty =
2016 VectorType::get(ElementType: Type::getFloatTy(C&: Dst->getContext()),
2017 EC: cast<VectorType>(Val: Dst)->getElementCount());
2018 std::pair<InstructionCost, MVT> VecF32LT =
2019 getTypeLegalizationCost(Ty: VecF32Ty);
2020 Cost += getCastInstrCost(Opcode, Dst: VecF32Ty, Src, CCH, CostKind, I);
2021 Cost += VecF32LT.first * getRISCVInstructionCost(OpCodes: RISCV::VFNCVT_F_F_W,
2022 VT: DstLT.second, CostKind);
2023 return Cost;
2024 }
2025
2026 if (DstEltSize == SrcEltSize)
2027 Cost += getRISCVInstructionCost(OpCodes: FCVT, VT: DstLT.second, CostKind);
2028 else if (DstEltSize > SrcEltSize) {
2029 if ((DstEltSize / 2) > SrcEltSize) {
2030 VectorType *VecTy =
2031 VectorType::get(ElementType: IntegerType::get(C&: Dst->getContext(), NumBits: DstEltSize / 2),
2032 EC: cast<VectorType>(Val: Dst)->getElementCount());
2033 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
2034 Cost += getCastInstrCost(Opcode: Op, Dst: VecTy, Src, CCH, CostKind, I);
2035 }
2036 Cost += getRISCVInstructionCost(OpCodes: FWCVT, VT: DstLT.second, CostKind);
2037 } else
2038 Cost += getRISCVInstructionCost(OpCodes: FNCVT, VT: DstLT.second, CostKind);
2039 return Cost;
2040 }
2041 }
2042 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
2043}
2044
2045unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
2046 if (isa<ScalableVectorType>(Val: Ty)) {
2047 const unsigned EltSize = DL.getTypeSizeInBits(Ty: Ty->getElementType());
2048 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
2049 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
2050 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
2051 }
2052 return cast<FixedVectorType>(Val: Ty)->getNumElements();
2053}
2054
2055InstructionCost
2056RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2057 FastMathFlags FMF,
2058 TTI::TargetCostKind CostKind) const {
2059 if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors())
2060 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2061
2062 // Skip if scalar size of Ty is bigger than ELEN.
2063 if (Ty->getScalarSizeInBits() > ST->getELen())
2064 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2065
2066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2067 if (Ty->getElementType()->isIntegerTy(BitWidth: 1)) {
2068 // SelectionDAGBuilder does following transforms:
2069 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
2070 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
2071 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
2072 return getArithmeticReductionCost(Opcode: Instruction::Or, Ty, FMF, CostKind);
2073 else
2074 return getArithmeticReductionCost(Opcode: Instruction::And, Ty, FMF, CostKind);
2075 }
2076
2077 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
2078 SmallVector<unsigned, 3> Opcodes;
2079 InstructionCost ExtraCost = 0;
2080 switch (IID) {
2081 case Intrinsic::maximum:
2082 if (FMF.noNaNs()) {
2083 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2084 } else {
2085 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
2086 RISCV::VFMV_F_S};
2087 // Cost of Canonical Nan + branch
2088 // lui a0, 523264
2089 // fmv.w.x fa0, a0
2090 Type *DstTy = Ty->getScalarType();
2091 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
2092 Type *SrcTy = IntegerType::getIntNTy(C&: DstTy->getContext(), N: EltTyBits);
2093 ExtraCost = 1 +
2094 getCastInstrCost(Opcode: Instruction::UIToFP, Dst: DstTy, Src: SrcTy,
2095 CCH: TTI::CastContextHint::None, CostKind) +
2096 getCFInstrCost(Opcode: Instruction::CondBr, CostKind);
2097 }
2098 break;
2099
2100 case Intrinsic::minimum:
2101 if (FMF.noNaNs()) {
2102 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2103 } else {
2104 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
2105 RISCV::VFMV_F_S};
2106 // Cost of Canonical Nan + branch
2107 // lui a0, 523264
2108 // fmv.w.x fa0, a0
2109 Type *DstTy = Ty->getScalarType();
2110 const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: DstTy);
2111 Type *SrcTy = IntegerType::getIntNTy(C&: DstTy->getContext(), N: EltTyBits);
2112 ExtraCost = 1 +
2113 getCastInstrCost(Opcode: Instruction::UIToFP, Dst: DstTy, Src: SrcTy,
2114 CCH: TTI::CastContextHint::None, CostKind) +
2115 getCFInstrCost(Opcode: Instruction::CondBr, CostKind);
2116 }
2117 break;
2118 }
2119 return ExtraCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
2120 }
2121
2122 // IR Reduction is composed by one rvv reduction instruction and vmv
2123 unsigned SplitOp;
2124 SmallVector<unsigned, 3> Opcodes;
2125 switch (IID) {
2126 default:
2127 llvm_unreachable("Unsupported intrinsic");
2128 case Intrinsic::smax:
2129 SplitOp = RISCV::VMAX_VV;
2130 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
2131 break;
2132 case Intrinsic::smin:
2133 SplitOp = RISCV::VMIN_VV;
2134 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
2135 break;
2136 case Intrinsic::umax:
2137 SplitOp = RISCV::VMAXU_VV;
2138 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
2139 break;
2140 case Intrinsic::umin:
2141 SplitOp = RISCV::VMINU_VV;
2142 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
2143 break;
2144 case Intrinsic::maxnum:
2145 SplitOp = RISCV::VFMAX_VV;
2146 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
2147 break;
2148 case Intrinsic::minnum:
2149 SplitOp = RISCV::VFMIN_VV;
2150 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
2151 break;
2152 }
2153 // Add a cost for data larger than LMUL8
2154 InstructionCost SplitCost =
2155 (LT.first > 1) ? (LT.first - 1) *
2156 getRISCVInstructionCost(OpCodes: SplitOp, VT: LT.second, CostKind)
2157 : 0;
2158 return SplitCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
2159}
2160
2161InstructionCost
2162RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2163 std::optional<FastMathFlags> FMF,
2164 TTI::TargetCostKind CostKind) const {
2165 if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors())
2166 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2167
2168 // Skip if scalar size of Ty is bigger than ELEN.
2169 if (Ty->getScalarSizeInBits() > ST->getELen())
2170 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2171
2172 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2173 assert(ISD && "Invalid opcode");
2174
2175 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
2176 ISD != ISD::FADD)
2177 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2178
2179 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2180 Type *ElementTy = Ty->getElementType();
2181 if (ElementTy->isIntegerTy(BitWidth: 1)) {
2182 // Example sequences:
2183 // vfirst.m a0, v0
2184 // seqz a0, a0
2185 if (LT.second == MVT::v1i1)
2186 return getRISCVInstructionCost(OpCodes: RISCV::VFIRST_M, VT: LT.second, CostKind) +
2187 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy,
2188 VecPred: CmpInst::ICMP_EQ, CostKind);
2189
2190 if (ISD == ISD::AND) {
2191 // Example sequences:
2192 // vmand.mm v8, v9, v8 ; needed every time type is split
2193 // vmnot.m v8, v0 ; alias for vmnand
2194 // vcpop.m a0, v8
2195 // seqz a0, a0
2196
2197 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
2198 // For LMUL <= 8, there is no splitting,
2199 // the sequences are vmnot, vcpop and seqz.
2200 // When LMUL > 8 and split = 1,
2201 // the sequences are vmnand, vcpop and seqz.
2202 // When LMUL > 8 and split > 1,
2203 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
2204 return ((LT.first > 2) ? (LT.first - 2) : 0) *
2205 getRISCVInstructionCost(OpCodes: RISCV::VMAND_MM, VT: LT.second, CostKind) +
2206 getRISCVInstructionCost(OpCodes: RISCV::VMNAND_MM, VT: LT.second, CostKind) +
2207 getRISCVInstructionCost(OpCodes: RISCV::VCPOP_M, VT: LT.second, CostKind) +
2208 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy,
2209 VecPred: CmpInst::ICMP_EQ, CostKind);
2210 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
2211 // Example sequences:
2212 // vsetvli a0, zero, e8, mf8, ta, ma
2213 // vmxor.mm v8, v0, v8 ; needed every time type is split
2214 // vcpop.m a0, v8
2215 // andi a0, a0, 1
2216 return (LT.first - 1) *
2217 getRISCVInstructionCost(OpCodes: RISCV::VMXOR_MM, VT: LT.second, CostKind) +
2218 getRISCVInstructionCost(OpCodes: RISCV::VCPOP_M, VT: LT.second, CostKind) + 1;
2219 } else {
2220 assert(ISD == ISD::OR);
2221 // Example sequences:
2222 // vsetvli a0, zero, e8, mf8, ta, ma
2223 // vmor.mm v8, v9, v8 ; needed every time type is split
2224 // vcpop.m a0, v0
2225 // snez a0, a0
2226 return (LT.first - 1) *
2227 getRISCVInstructionCost(OpCodes: RISCV::VMOR_MM, VT: LT.second, CostKind) +
2228 getRISCVInstructionCost(OpCodes: RISCV::VCPOP_M, VT: LT.second, CostKind) +
2229 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy,
2230 VecPred: CmpInst::ICMP_NE, CostKind);
2231 }
2232 }
2233
2234 // IR Reduction of or/and is composed by one vmv and one rvv reduction
2235 // instruction, and others is composed by two vmv and one rvv reduction
2236 // instruction
2237 unsigned SplitOp;
2238 SmallVector<unsigned, 3> Opcodes;
2239 switch (ISD) {
2240 case ISD::ADD:
2241 SplitOp = RISCV::VADD_VV;
2242 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2243 break;
2244 case ISD::OR:
2245 SplitOp = RISCV::VOR_VV;
2246 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2247 break;
2248 case ISD::XOR:
2249 SplitOp = RISCV::VXOR_VV;
2250 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2251 break;
2252 case ISD::AND:
2253 SplitOp = RISCV::VAND_VV;
2254 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2255 break;
2256 case ISD::FADD:
2257 // We can't promote f16/bf16 fadd reductions.
2258 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2259 LT.second.getScalarType() == MVT::bf16)
2260 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2261 if (TTI::requiresOrderedReduction(FMF)) {
2262 Opcodes.push_back(Elt: RISCV::VFMV_S_F);
2263 for (unsigned i = 0; i < LT.first.getValue(); i++)
2264 Opcodes.push_back(Elt: RISCV::VFREDOSUM_VS);
2265 Opcodes.push_back(Elt: RISCV::VFMV_F_S);
2266 return getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
2267 }
2268 SplitOp = RISCV::VFADD_VV;
2269 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2270 break;
2271 }
2272 // Add a cost for data larger than LMUL8
2273 InstructionCost SplitCost =
2274 (LT.first > 1) ? (LT.first - 1) *
2275 getRISCVInstructionCost(OpCodes: SplitOp, VT: LT.second, CostKind)
2276 : 0;
2277 return SplitCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
2278}
2279
2280InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2281 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2282 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2283 if (isa<FixedVectorType>(Val: ValTy) && !ST->useRVVForFixedLengthVectors())
2284 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
2285 FMF, CostKind);
2286
2287 // Skip if scalar size of ResTy is bigger than ELEN.
2288 if (ResTy->getScalarSizeInBits() > ST->getELen())
2289 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
2290 FMF, CostKind);
2291
2292 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2293 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
2294 FMF, CostKind);
2295
2296 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
2297
2298 if (IsUnsigned && Opcode == Instruction::Add &&
2299 LT.second.isFixedLengthVectorOf(EltVT: MVT::i1)) {
2300 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2301 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
2302 return LT.first *
2303 getRISCVInstructionCost(OpCodes: RISCV::VCPOP_M, VT: LT.second, CostKind);
2304 }
2305
2306 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2307 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
2308 FMF, CostKind);
2309
2310 return (LT.first - 1) +
2311 getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
2312}
2313
2314InstructionCost
2315RISCVTTIImpl::getStoreImmCost(Type *Ty, TTI::OperandValueInfo OpInfo,
2316 TTI::TargetCostKind CostKind) const {
2317 assert(OpInfo.isConstant() && "non constant operand?");
2318 if (!isa<VectorType>(Val: Ty))
2319 // FIXME: We need to account for immediate materialization here, but doing
2320 // a decent job requires more knowledge about the immediate than we
2321 // currently have here.
2322 return 0;
2323
2324 if (OpInfo.isUniform())
2325 // vmv.v.i, vmv.v.x, or vfmv.v.f
2326 // We ignore the cost of the scalar constant materialization to be consistent
2327 // with how we treat scalar constants themselves just above.
2328 return 1;
2329
2330 return getConstantPoolLoadCost(Ty, CostKind);
2331}
2332
2333InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2334 Align Alignment,
2335 unsigned AddressSpace,
2336 TTI::TargetCostKind CostKind,
2337 TTI::OperandValueInfo OpInfo,
2338 const Instruction *I) const {
2339 EVT VT = TLI->getValueType(DL, Ty: Src, AllowUnknown: true);
2340 // Type legalization can't handle structs, and load latency isn't handled here
2341 if (VT == MVT::Other ||
2342 (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency))
2343 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2344 CostKind, OpInfo, I);
2345
2346 InstructionCost Cost = 0;
2347 if (Opcode == Instruction::Store && OpInfo.isConstant())
2348 Cost += getStoreImmCost(Ty: Src, OpInfo, CostKind);
2349
2350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
2351
2352 InstructionCost BaseCost = [&]() {
2353 InstructionCost Cost = LT.first;
2354 if (CostKind != TTI::TCK_RecipThroughput)
2355 return Cost;
2356
2357 // Our actual lowering for the case where a wider legal type is available
2358 // uses the a VL predicated load on the wider type. This is reflected in
2359 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2360 // widened cases are scalarized.
2361 const DataLayout &DL = this->getDataLayout();
2362 if (Src->isVectorTy() && LT.second.isVector() &&
2363 TypeSize::isKnownLT(LHS: DL.getTypeStoreSizeInBits(Ty: Src),
2364 RHS: LT.second.getSizeInBits()))
2365 return Cost;
2366
2367 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2368 CostKind, OpInfo, I);
2369 }();
2370
2371 // Assume memory ops cost scale with the number of vector registers
2372 // possible accessed by the instruction. Note that BasicTTI already
2373 // handles the LT.first term for us.
2374 if (ST->hasVInstructions() && LT.second.isVector() &&
2375 CostKind != TTI::TCK_CodeSize)
2376 BaseCost *= TLI->getLMULCost(VT: LT.second);
2377 return Cost + BaseCost;
2378}
2379
2380InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2381 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2382 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2383 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2384 if (CostKind != TTI::TCK_RecipThroughput)
2385 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2386 Op1Info, Op2Info, I);
2387
2388 if (isa<FixedVectorType>(Val: ValTy) && !ST->useRVVForFixedLengthVectors())
2389 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2390 Op1Info, Op2Info, I);
2391
2392 // Skip if scalar size of ValTy is bigger than ELEN.
2393 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2394 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2395 Op1Info, Op2Info, I);
2396
2397 auto GetConstantMatCost =
2398 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2399 if (OpInfo.isUniform())
2400 // We return 0 we currently ignore the cost of materializing scalar
2401 // constants in GPRs.
2402 return 0;
2403
2404 return getConstantPoolLoadCost(Ty: ValTy, CostKind);
2405 };
2406
2407 InstructionCost ConstantMatCost;
2408 if (Op1Info.isConstant())
2409 ConstantMatCost += GetConstantMatCost(Op1Info);
2410 if (Op2Info.isConstant())
2411 ConstantMatCost += GetConstantMatCost(Op2Info);
2412
2413 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
2414 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2415 if (CondTy->isVectorTy()) {
2416 if (ValTy->getScalarSizeInBits() == 1) {
2417 // vmandn.mm v8, v8, v9
2418 // vmand.mm v9, v0, v9
2419 // vmor.mm v0, v9, v8
2420 return ConstantMatCost +
2421 LT.first *
2422 getRISCVInstructionCost(
2423 OpCodes: {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2424 VT: LT.second, CostKind);
2425 }
2426 // vselect and max/min are supported natively.
2427 return ConstantMatCost +
2428 LT.first * getRISCVInstructionCost(OpCodes: RISCV::VMERGE_VVM, VT: LT.second,
2429 CostKind);
2430 }
2431
2432 if (ValTy->getScalarSizeInBits() == 1) {
2433 // vmv.v.x v9, a0
2434 // vmsne.vi v9, v9, 0
2435 // vmandn.mm v8, v8, v9
2436 // vmand.mm v9, v0, v9
2437 // vmor.mm v0, v9, v8
2438 MVT InterimVT = LT.second.changeVectorElementType(EltVT: MVT::i8);
2439 return ConstantMatCost +
2440 LT.first *
2441 getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI},
2442 VT: InterimVT, CostKind) +
2443 LT.first * getRISCVInstructionCost(
2444 OpCodes: {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2445 VT: LT.second, CostKind);
2446 }
2447
2448 // vmv.v.x v10, a0
2449 // vmsne.vi v0, v10, 0
2450 // vmerge.vvm v8, v9, v8, v0
2451 return ConstantMatCost +
2452 LT.first * getRISCVInstructionCost(
2453 OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2454 VT: LT.second, CostKind);
2455 }
2456
2457 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2458 CmpInst::isIntPredicate(P: VecPred)) {
2459 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2460 // provided they incur the same cost across all implementations
2461 return ConstantMatCost + LT.first * getRISCVInstructionCost(OpCodes: RISCV::VMSLT_VV,
2462 VT: LT.second,
2463 CostKind);
2464 }
2465
2466 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2467 CmpInst::isFPPredicate(P: VecPred)) {
2468
2469 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2470 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2471 return ConstantMatCost +
2472 getRISCVInstructionCost(OpCodes: RISCV::VMXOR_MM, VT: LT.second, CostKind);
2473
2474 // If we do not support the input floating point vector type, use the base
2475 // one which will calculate as:
2476 // ScalarizeCost + Num * Cost for fixed vector,
2477 // InvalidCost for scalable vector.
2478 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2479 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2480 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2481 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2482 Op1Info, Op2Info, I);
2483
2484 // Assuming vector fp compare and mask instructions are all the same cost
2485 // until a need arises to differentiate them.
2486 switch (VecPred) {
2487 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2488 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2489 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2490 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2491 return ConstantMatCost +
2492 LT.first * getRISCVInstructionCost(
2493 OpCodes: {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2494 VT: LT.second, CostKind);
2495
2496 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2497 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2498 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2499 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2500 return ConstantMatCost +
2501 LT.first *
2502 getRISCVInstructionCost(OpCodes: {RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2503 VT: LT.second, CostKind);
2504
2505 case CmpInst::FCMP_OEQ: // vmfeq.vv
2506 case CmpInst::FCMP_OGT: // vmflt.vv
2507 case CmpInst::FCMP_OGE: // vmfle.vv
2508 case CmpInst::FCMP_OLT: // vmflt.vv
2509 case CmpInst::FCMP_OLE: // vmfle.vv
2510 case CmpInst::FCMP_UNE: // vmfne.vv
2511 return ConstantMatCost +
2512 LT.first *
2513 getRISCVInstructionCost(OpCodes: RISCV::VMFLT_VV, VT: LT.second, CostKind);
2514 default:
2515 break;
2516 }
2517 }
2518
2519 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2520 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2521 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2522 // be (0 + select instr cost).
2523 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(Val: I) &&
2524 ValTy->isIntegerTy() && !I->user_empty()) {
2525 if (all_of(Range: I->users(), P: [&](const User *U) {
2526 return match(V: U, P: m_Select(C: m_Specific(V: I), L: m_Value(), R: m_Value())) &&
2527 U->getType()->isIntegerTy() &&
2528 !isa<ConstantData>(Val: U->getOperand(i: 1)) &&
2529 !isa<ConstantData>(Val: U->getOperand(i: 2));
2530 }))
2531 return 0;
2532 }
2533
2534 // TODO: Add cost for scalar type.
2535
2536 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2537 Op1Info, Op2Info, I);
2538}
2539
2540InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2541 TTI::TargetCostKind CostKind,
2542 const Instruction *I) const {
2543 if (CostKind != TTI::TCK_RecipThroughput)
2544 return Opcode == Instruction::PHI ? 0 : 1;
2545 // Branches are assumed to be predicted.
2546 return 0;
2547}
2548
2549InstructionCost RISCVTTIImpl::getVectorInstrCost(
2550 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
2551 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
2552 assert(Val->isVectorTy() && "This must be a vector type");
2553
2554 // TODO: Add proper cost model for P extension fixed vectors (e.g., v4i16)
2555 // For now, skip all fixed vector cost analysis when P extension is available
2556 // to avoid crashes in getMinRVVVectorSizeInBits()
2557 if (ST->hasStdExtP() && isa<FixedVectorType>(Val)) {
2558 return 1; // Treat as single instruction cost for now
2559 }
2560
2561 if (Opcode != Instruction::ExtractElement &&
2562 Opcode != Instruction::InsertElement)
2563 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1,
2564 VIC);
2565
2566 // Legalize the type.
2567 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
2568
2569 // This type is legalized to a scalar type.
2570 if (!LT.second.isVector()) {
2571 auto *FixedVecTy = cast<FixedVectorType>(Val);
2572 // If Index is a known constant, cost is zero.
2573 if (Index != -1U)
2574 return 0;
2575 // Extract/InsertElement with non-constant index is very costly when
2576 // scalarized; estimate cost of loads/stores sequence via the stack:
2577 // ExtractElement cost: store vector to stack, load scalar;
2578 // InsertElement cost: store vector to stack, store scalar, load vector.
2579 Type *ElemTy = FixedVecTy->getElementType();
2580 auto NumElems = FixedVecTy->getNumElements();
2581 auto Align = DL.getPrefTypeAlign(Ty: ElemTy);
2582 InstructionCost LoadCost =
2583 getMemoryOpCost(Opcode: Instruction::Load, Src: ElemTy, Alignment: Align, AddressSpace: 0, CostKind);
2584 InstructionCost StoreCost =
2585 getMemoryOpCost(Opcode: Instruction::Store, Src: ElemTy, Alignment: Align, AddressSpace: 0, CostKind);
2586 return Opcode == Instruction::ExtractElement
2587 ? StoreCost * NumElems + LoadCost
2588 : (StoreCost + LoadCost) * NumElems + StoreCost;
2589 }
2590
2591 // For unsupported scalable vector.
2592 if (LT.second.isScalableVector() && !LT.first.isValid())
2593 return LT.first;
2594
2595 // Mask vector extract/insert is expanded via e8.
2596 if (Val->getScalarSizeInBits() == 1) {
2597 VectorType *WideTy =
2598 VectorType::get(ElementType: IntegerType::get(C&: Val->getContext(), NumBits: 8),
2599 EC: cast<VectorType>(Val)->getElementCount());
2600 if (Opcode == Instruction::ExtractElement) {
2601 InstructionCost ExtendCost
2602 = getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: Val,
2603 CCH: TTI::CastContextHint::None, CostKind);
2604 InstructionCost ExtractCost
2605 = getVectorInstrCost(Opcode, Val: WideTy, CostKind, Index, Op0: nullptr, Op1: nullptr);
2606 return ExtendCost + ExtractCost;
2607 }
2608 InstructionCost ExtendCost
2609 = getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: Val,
2610 CCH: TTI::CastContextHint::None, CostKind);
2611 InstructionCost InsertCost
2612 = getVectorInstrCost(Opcode, Val: WideTy, CostKind, Index, Op0: nullptr, Op1: nullptr);
2613 InstructionCost TruncCost
2614 = getCastInstrCost(Opcode: Instruction::Trunc, Dst: Val, Src: WideTy,
2615 CCH: TTI::CastContextHint::None, CostKind);
2616 return ExtendCost + InsertCost + TruncCost;
2617 }
2618
2619
2620 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2621 // and vslideup + vmv.s.x to insert element to vector.
2622 unsigned BaseCost = 1;
2623 // When insertelement we should add the index with 1 as the input of vslideup.
2624 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2625
2626 if (Index != -1U) {
2627 // The type may be split. For fixed-width vectors we can normalize the
2628 // index to the new type.
2629 if (LT.second.isFixedLengthVector()) {
2630 unsigned Width = LT.second.getVectorNumElements();
2631 Index = Index % Width;
2632 }
2633
2634 // If exact VLEN is known, we will insert/extract into the appropriate
2635 // subvector with no additional subvector insert/extract cost.
2636 if (auto VLEN = ST->getRealVLen()) {
2637 unsigned EltSize = LT.second.getScalarSizeInBits();
2638 unsigned M1Max = *VLEN / EltSize;
2639 Index = Index % M1Max;
2640 }
2641
2642 if (Index == 0)
2643 // We can extract/insert the first element without vslidedown/vslideup.
2644 SlideCost = 0;
2645 else if (Opcode == Instruction::InsertElement)
2646 SlideCost = 1; // With a constant index, we do not need to use addi.
2647 }
2648
2649 // When the vector needs to split into multiple register groups and the index
2650 // exceeds single vector register group, we need to insert/extract the element
2651 // via stack.
2652 if (LT.first > 1 &&
2653 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2654 LT.second.isScalableVector()))) {
2655 Type *ScalarType = Val->getScalarType();
2656 Align VecAlign = DL.getPrefTypeAlign(Ty: Val);
2657 Align SclAlign = DL.getPrefTypeAlign(Ty: ScalarType);
2658 // Extra addi for unknown index.
2659 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2660
2661 // Store all split vectors into stack and load the target element.
2662 if (Opcode == Instruction::ExtractElement)
2663 return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
2664 getMemoryOpCost(Opcode: Instruction::Load, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
2665 CostKind) +
2666 IdxCost;
2667
2668 // Store all split vectors into stack and store the target element and load
2669 // vectors back.
2670 return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
2671 getMemoryOpCost(Opcode: Instruction::Load, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
2672 getMemoryOpCost(Opcode: Instruction::Store, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
2673 CostKind) +
2674 IdxCost;
2675 }
2676
2677 // Extract i64 in the target that has XLEN=32 need more instruction.
2678 if (Val->getScalarType()->isIntegerTy() &&
2679 ST->getXLen() < Val->getScalarSizeInBits()) {
2680 // For extractelement, we need the following instructions:
2681 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2682 // vslidedown.vx v8, v8, a0
2683 // vmv.x.s a0, v8
2684 // li a1, 32
2685 // vsrl.vx v8, v8, a1
2686 // vmv.x.s a1, v8
2687
2688 // For insertelement, we need the following instructions:
2689 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2690 // vmv.v.i v12, 0
2691 // vslide1up.vx v16, v12, a1
2692 // vslide1up.vx v12, v16, a0
2693 // addi a0, a2, 1
2694 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2695 // vslideup.vx v8, v12, a2
2696
2697 // TODO: should we count these special vsetvlis?
2698 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2699 }
2700 return BaseCost + SlideCost;
2701}
2702
2703InstructionCost
2704RISCVTTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
2705 TTI::TargetCostKind CostKind,
2706 unsigned Index) const {
2707 if (isa<FixedVectorType>(Val))
2708 return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
2709 Index);
2710
2711 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2712 // for the cost of extracting the last lane of a scalable vector. It probably
2713 // needs a more accurate cost.
2714 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2715 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2716 return getVectorInstrCost(Opcode, Val, CostKind,
2717 Index: EC.getKnownMinValue() - 1 - Index, Op0: nullptr,
2718 Op1: nullptr);
2719}
2720
2721/// Check to see if this instruction is expected to be combined to a simpler
2722/// operation during/before lowering. If so return the cost of the combined
2723/// operation rather than provided one. For instance, `udiv i16 %X, 2` is likely
2724/// to be combined to `lshr i16 %X, 1`, so return the cost of a `lshr` rather
2725/// than the cost of a `udiv`
2726std::optional<InstructionCost>
2727RISCVTTIImpl::getCombinedArithmeticInstructionCost(
2728 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2729 TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
2730 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2731 // Vector unsigned division/remainder will be simplified to shifts/masks.
2732 if ((Opcode == Instruction::UDiv || Opcode == Instruction::URem) &&
2733 Opd2Info.isConstant() && Opd2Info.isPowerOf2()) {
2734 if (Opcode == Instruction::UDiv)
2735 return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, Op1Info: Opd1Info,
2736 Op2Info: Opd2Info.getNoProps());
2737 // UREM
2738 return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, Op1Info: Opd1Info,
2739 Op2Info: Opd2Info.getNoProps());
2740 }
2741 return std::nullopt;
2742}
2743
2744InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2745 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2746 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2747 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2748
2749 // TODO: Handle more cost kinds.
2750 if (CostKind != TTI::TCK_RecipThroughput)
2751 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
2752 Args, CxtI);
2753
2754 if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors())
2755 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
2756 Args, CxtI);
2757
2758 // Skip if scalar size of Ty is bigger than ELEN.
2759 if (isa<VectorType>(Val: Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2760 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
2761 Args, CxtI);
2762
2763 if (std::optional<InstructionCost> CombinedCost =
2764 getCombinedArithmeticInstructionCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
2765 Opd2Info: Op2Info, Args, CxtI))
2766 return *CombinedCost;
2767
2768 // Legalize the type.
2769 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2770 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2771
2772 // TODO: Handle scalar type.
2773 if (!LT.second.isVector()) {
2774 static const CostTblEntry DivTbl[]{
2775 {.ISD: ISD::UDIV, .Type: MVT::i32, .Cost: TTI::TCC_Expensive},
2776 {.ISD: ISD::UDIV, .Type: MVT::i64, .Cost: TTI::TCC_Expensive},
2777 {.ISD: ISD::SDIV, .Type: MVT::i32, .Cost: TTI::TCC_Expensive},
2778 {.ISD: ISD::SDIV, .Type: MVT::i64, .Cost: TTI::TCC_Expensive},
2779 {.ISD: ISD::UREM, .Type: MVT::i32, .Cost: TTI::TCC_Expensive},
2780 {.ISD: ISD::UREM, .Type: MVT::i64, .Cost: TTI::TCC_Expensive},
2781 {.ISD: ISD::SREM, .Type: MVT::i32, .Cost: TTI::TCC_Expensive},
2782 {.ISD: ISD::SREM, .Type: MVT::i64, .Cost: TTI::TCC_Expensive}};
2783 if (TLI->isOperationLegalOrPromote(Op: ISDOpcode, VT: LT.second))
2784 if (const auto *Entry = CostTableLookup(Table: DivTbl, ISD: ISDOpcode, Ty: LT.second))
2785 return Entry->Cost * LT.first;
2786
2787 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
2788 Args, CxtI);
2789 }
2790
2791 // f16 with zvfhmin and bf16 will be promoted to f32.
2792 // FIXME: nxv32[b]f16 will be custom lowered and split.
2793 InstructionCost CastCost = 0;
2794 if ((LT.second.getVectorElementType() == MVT::f16 ||
2795 LT.second.getVectorElementType() == MVT::bf16) &&
2796 TLI->getOperationAction(Op: ISDOpcode, VT: LT.second) ==
2797 TargetLoweringBase::LegalizeAction::Promote) {
2798 MVT PromotedVT = TLI->getTypeToPromoteTo(Op: ISDOpcode, VT: LT.second);
2799 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Context&: Ty->getContext());
2800 Type *LegalTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
2801 // Add cost of extending arguments
2802 CastCost += LT.first * Args.size() *
2803 getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: LegalTy,
2804 CCH: TTI::CastContextHint::None, CostKind);
2805 // Add cost of truncating result
2806 CastCost +=
2807 LT.first * getCastInstrCost(Opcode: Instruction::FPTrunc, Dst: LegalTy, Src: PromotedTy,
2808 CCH: TTI::CastContextHint::None, CostKind);
2809 // Compute cost of op in promoted type
2810 LT.second = PromotedVT;
2811 }
2812
2813 auto getConstantMatCost =
2814 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2815 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2816 // Two sub-cases:
2817 // * Has a 5 bit immediate operand which can be splatted.
2818 // * Has a larger immediate which must be materialized in scalar register
2819 // We return 0 for both as we currently ignore the cost of materializing
2820 // scalar constants in GPRs.
2821 return 0;
2822
2823 return getConstantPoolLoadCost(Ty, CostKind);
2824 };
2825
2826 // Add the cost of materializing any constant vectors required.
2827 InstructionCost ConstantMatCost = 0;
2828 if (Op1Info.isConstant())
2829 ConstantMatCost += getConstantMatCost(0, Op1Info);
2830 if (Op2Info.isConstant())
2831 ConstantMatCost += getConstantMatCost(1, Op2Info);
2832
2833 unsigned Op;
2834 switch (ISDOpcode) {
2835 case ISD::ADD:
2836 case ISD::SUB:
2837 Op = RISCV::VADD_VV;
2838 break;
2839 case ISD::SHL:
2840 case ISD::SRL:
2841 case ISD::SRA:
2842 Op = RISCV::VSLL_VV;
2843 break;
2844 case ISD::AND:
2845 case ISD::OR:
2846 case ISD::XOR:
2847 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2848 break;
2849 case ISD::MUL:
2850 case ISD::MULHS:
2851 case ISD::MULHU:
2852 Op = RISCV::VMUL_VV;
2853 break;
2854 case ISD::SDIV:
2855 case ISD::UDIV:
2856 Op = RISCV::VDIV_VV;
2857 break;
2858 case ISD::SREM:
2859 case ISD::UREM:
2860 Op = RISCV::VREM_VV;
2861 break;
2862 case ISD::FADD:
2863 case ISD::FSUB:
2864 Op = RISCV::VFADD_VV;
2865 break;
2866 case ISD::FMUL:
2867 Op = RISCV::VFMUL_VV;
2868 break;
2869 case ISD::FDIV:
2870 Op = RISCV::VFDIV_VV;
2871 break;
2872 case ISD::FNEG:
2873 Op = RISCV::VFSGNJN_VV;
2874 break;
2875 default:
2876 // Assuming all other instructions have the same cost until a need arises to
2877 // differentiate them.
2878 return CastCost + ConstantMatCost +
2879 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
2880 Args, CxtI);
2881 }
2882
2883 InstructionCost InstrCost = getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind);
2884 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2885 // ops are twice as expensive as integer ops. Do the same for vectors so
2886 // scalar floating point ops aren't cheaper than their vector equivalents.
2887 if (Ty->isFPOrFPVectorTy())
2888 InstrCost *= 2;
2889 return CastCost + ConstantMatCost + LT.first * InstrCost;
2890}
2891
2892// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2893InstructionCost RISCVTTIImpl::getPointersChainCost(
2894 ArrayRef<const Value *> Ptrs, const Value *Base,
2895 const TTI::PointersChainInfo &Info, Type *AccessTy,
2896 TTI::TargetCostKind CostKind) const {
2897 InstructionCost Cost = TTI::TCC_Free;
2898 // In the basic model we take into account GEP instructions only
2899 // (although here can come alloca instruction, a value, constants and/or
2900 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2901 // pointer). Typically, if Base is a not a GEP-instruction and all the
2902 // pointers are relative to the same base address, all the rest are
2903 // either GEP instructions, PHIs, bitcasts or constants. When we have same
2904 // base, we just calculate cost of each non-Base GEP as an ADD operation if
2905 // any their index is a non-const.
2906 // If no known dependencies between the pointers cost is calculated as a sum
2907 // of costs of GEP instructions.
2908 for (auto [I, V] : enumerate(First&: Ptrs)) {
2909 const auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
2910 if (!GEP)
2911 continue;
2912 if (Info.isSameBase() && V != Base) {
2913 if (GEP->hasAllConstantIndices())
2914 continue;
2915 // If the chain is unit-stride and BaseReg + stride*i is a legal
2916 // addressing mode, then presume the base GEP is sitting around in a
2917 // register somewhere and check if we can fold the offset relative to
2918 // it.
2919 unsigned Stride = DL.getTypeStoreSize(Ty: AccessTy);
2920 if (Info.isUnitStride() &&
2921 isLegalAddressingMode(Ty: AccessTy,
2922 /* BaseGV */ nullptr,
2923 /* BaseOffset */ Stride * I,
2924 /* HasBaseReg */ true,
2925 /* Scale */ 0,
2926 AddrSpace: GEP->getType()->getPointerAddressSpace()))
2927 continue;
2928 Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty: GEP->getType(), CostKind,
2929 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
2930 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Args: {});
2931 } else {
2932 SmallVector<const Value *> Indices(GEP->indices());
2933 Cost += getGEPCost(PointeeType: GEP->getSourceElementType(), Ptr: GEP->getPointerOperand(),
2934 Operands: Indices, AccessType: AccessTy, CostKind);
2935 }
2936 }
2937 return Cost;
2938}
2939
2940void RISCVTTIImpl::getUnrollingPreferences(
2941 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
2942 OptimizationRemarkEmitter *ORE) const {
2943 // TODO: More tuning on benchmarks and metrics with changes as needed
2944 // would apply to all settings below to enable performance.
2945
2946
2947 if (ST->enableDefaultUnroll())
2948 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2949
2950 // Enable Upper bound unrolling universally, not dependent upon the conditions
2951 // below.
2952 UP.UpperBound = true;
2953
2954 // Disable loop unrolling for Oz and Os.
2955 UP.OptSizeThreshold = 0;
2956 UP.PartialOptSizeThreshold = 0;
2957 if (L->getHeader()->getParent()->hasOptSize())
2958 return;
2959
2960 SmallVector<BasicBlock *, 4> ExitingBlocks;
2961 L->getExitingBlocks(ExitingBlocks);
2962 LLVM_DEBUG(dbgs() << "Loop has:\n"
2963 << "Blocks: " << L->getNumBlocks() << "\n"
2964 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2965
2966 // Only allow another exit other than the latch. This acts as an early exit
2967 // as it mirrors the profitability calculation of the runtime unroller.
2968 if (ExitingBlocks.size() > 2)
2969 return;
2970
2971 // Limit the CFG of the loop body for targets with a branch predictor.
2972 // Allowing 4 blocks permits if-then-else diamonds in the body.
2973 if (L->getNumBlocks() > 4)
2974 return;
2975
2976 // Scan the loop: don't unroll loops with calls as this could prevent
2977 // inlining. Don't unroll auto-vectorized loops either, though do allow
2978 // unrolling of the scalar remainder.
2979 bool IsVectorized = getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized");
2980 InstructionCost Cost = 0;
2981 for (auto *BB : L->getBlocks()) {
2982 for (auto &I : *BB) {
2983 // Both auto-vectorized loops and the scalar remainder have the
2984 // isvectorized attribute, so differentiate between them by the presence
2985 // of vector instructions.
2986 if (IsVectorized && (I.getType()->isVectorTy() ||
2987 llvm::any_of(Range: I.operand_values(), P: [](Value *V) {
2988 return V->getType()->isVectorTy();
2989 })))
2990 return;
2991
2992 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
2993 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
2994 if (!isLoweredToCall(F))
2995 continue;
2996 }
2997 return;
2998 }
2999
3000 SmallVector<const Value *> Operands(I.operand_values());
3001 Cost += getInstructionCost(U: &I, Operands,
3002 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
3003 }
3004 }
3005
3006 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
3007
3008 UP.Partial = true;
3009 UP.Runtime = true;
3010 UP.UnrollRemainder = true;
3011 UP.UnrollAndJam = true;
3012
3013 // Force unrolling small loops can be very useful because of the branch
3014 // taken cost of the backedge.
3015 if (Cost < 12)
3016 UP.Force = true;
3017}
3018
3019void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3020 TTI::PeelingPreferences &PP) const {
3021 BaseT::getPeelingPreferences(L, SE, PP);
3022}
3023
3024bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3025 MemIntrinsicInfo &Info) const {
3026 const DataLayout &DL = getDataLayout();
3027 Intrinsic::ID IID = Inst->getIntrinsicID();
3028 LLVMContext &C = Inst->getContext();
3029 bool HasMask = false;
3030
3031 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
3032 bool IsWrite) -> int64_t {
3033 if (auto *TarExtTy =
3034 dyn_cast<TargetExtType>(Val: II->getArgOperand(i: 0)->getType()))
3035 return TarExtTy->getIntParameter(i: 0);
3036
3037 return 1;
3038 };
3039
3040 switch (IID) {
3041 case Intrinsic::riscv_vle_mask:
3042 case Intrinsic::riscv_vse_mask:
3043 case Intrinsic::riscv_vlseg2_mask:
3044 case Intrinsic::riscv_vlseg3_mask:
3045 case Intrinsic::riscv_vlseg4_mask:
3046 case Intrinsic::riscv_vlseg5_mask:
3047 case Intrinsic::riscv_vlseg6_mask:
3048 case Intrinsic::riscv_vlseg7_mask:
3049 case Intrinsic::riscv_vlseg8_mask:
3050 case Intrinsic::riscv_vsseg2_mask:
3051 case Intrinsic::riscv_vsseg3_mask:
3052 case Intrinsic::riscv_vsseg4_mask:
3053 case Intrinsic::riscv_vsseg5_mask:
3054 case Intrinsic::riscv_vsseg6_mask:
3055 case Intrinsic::riscv_vsseg7_mask:
3056 case Intrinsic::riscv_vsseg8_mask:
3057 HasMask = true;
3058 [[fallthrough]];
3059 case Intrinsic::riscv_vle:
3060 case Intrinsic::riscv_vse:
3061 case Intrinsic::riscv_vlseg2:
3062 case Intrinsic::riscv_vlseg3:
3063 case Intrinsic::riscv_vlseg4:
3064 case Intrinsic::riscv_vlseg5:
3065 case Intrinsic::riscv_vlseg6:
3066 case Intrinsic::riscv_vlseg7:
3067 case Intrinsic::riscv_vlseg8:
3068 case Intrinsic::riscv_vsseg2:
3069 case Intrinsic::riscv_vsseg3:
3070 case Intrinsic::riscv_vsseg4:
3071 case Intrinsic::riscv_vsseg5:
3072 case Intrinsic::riscv_vsseg6:
3073 case Intrinsic::riscv_vsseg7:
3074 case Intrinsic::riscv_vsseg8: {
3075 // Intrinsic interface:
3076 // riscv_vle(merge, ptr, vl)
3077 // riscv_vle_mask(merge, ptr, mask, vl, policy)
3078 // riscv_vse(val, ptr, vl)
3079 // riscv_vse_mask(val, ptr, mask, vl, policy)
3080 // riscv_vlseg#(merge, ptr, vl, sew)
3081 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
3082 // riscv_vsseg#(val, ptr, vl, sew)
3083 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
3084 bool IsWrite = Inst->getType()->isVoidTy();
3085 Type *Ty = IsWrite ? Inst->getArgOperand(i: 0)->getType() : Inst->getType();
3086 // The results of segment loads are TargetExtType.
3087 if (auto *TarExtTy = dyn_cast<TargetExtType>(Val: Ty)) {
3088 unsigned SEW =
3089 1 << cast<ConstantInt>(Val: Inst->getArgOperand(i: Inst->arg_size() - 1))
3090 ->getZExtValue();
3091 Ty = TarExtTy->getTypeParameter(i: 0U);
3092 Ty = ScalableVectorType::get(
3093 ElementType: IntegerType::get(C, NumBits: SEW),
3094 MinNumElts: cast<ScalableVectorType>(Val: Ty)->getMinNumElements() * 8 / SEW);
3095 }
3096 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntrinsicID: IID);
3097 unsigned VLIndex = RVVIInfo->VLOperand;
3098 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
3099 MaybeAlign Alignment =
3100 Inst->getArgOperand(i: PtrOperandNo)->getPointerAlignment(DL);
3101 Type *MaskType = Ty->getWithNewType(EltTy: Type::getInt1Ty(C));
3102 Value *Mask = ConstantInt::getTrue(Ty: MaskType);
3103 if (HasMask)
3104 Mask = Inst->getArgOperand(i: VLIndex - 1);
3105 Value *EVL = Inst->getArgOperand(i: VLIndex);
3106 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3107 // RVV uses contiguous elements as a segment.
3108 if (SegNum > 1) {
3109 unsigned ElemSize = Ty->getScalarSizeInBits();
3110 auto *SegTy = IntegerType::get(C, NumBits: ElemSize * SegNum);
3111 Ty = VectorType::get(ElementType: SegTy, Other: cast<VectorType>(Val: Ty));
3112 }
3113 Info.InterestingOperands.emplace_back(Args&: Inst, Args&: PtrOperandNo, Args&: IsWrite, Args&: Ty,
3114 Args&: Alignment, Args&: Mask, Args&: EVL);
3115 return true;
3116 }
3117 case Intrinsic::riscv_vlse_mask:
3118 case Intrinsic::riscv_vsse_mask:
3119 case Intrinsic::riscv_vlsseg2_mask:
3120 case Intrinsic::riscv_vlsseg3_mask:
3121 case Intrinsic::riscv_vlsseg4_mask:
3122 case Intrinsic::riscv_vlsseg5_mask:
3123 case Intrinsic::riscv_vlsseg6_mask:
3124 case Intrinsic::riscv_vlsseg7_mask:
3125 case Intrinsic::riscv_vlsseg8_mask:
3126 case Intrinsic::riscv_vssseg2_mask:
3127 case Intrinsic::riscv_vssseg3_mask:
3128 case Intrinsic::riscv_vssseg4_mask:
3129 case Intrinsic::riscv_vssseg5_mask:
3130 case Intrinsic::riscv_vssseg6_mask:
3131 case Intrinsic::riscv_vssseg7_mask:
3132 case Intrinsic::riscv_vssseg8_mask:
3133 HasMask = true;
3134 [[fallthrough]];
3135 case Intrinsic::riscv_vlse:
3136 case Intrinsic::riscv_vsse:
3137 case Intrinsic::riscv_vlsseg2:
3138 case Intrinsic::riscv_vlsseg3:
3139 case Intrinsic::riscv_vlsseg4:
3140 case Intrinsic::riscv_vlsseg5:
3141 case Intrinsic::riscv_vlsseg6:
3142 case Intrinsic::riscv_vlsseg7:
3143 case Intrinsic::riscv_vlsseg8:
3144 case Intrinsic::riscv_vssseg2:
3145 case Intrinsic::riscv_vssseg3:
3146 case Intrinsic::riscv_vssseg4:
3147 case Intrinsic::riscv_vssseg5:
3148 case Intrinsic::riscv_vssseg6:
3149 case Intrinsic::riscv_vssseg7:
3150 case Intrinsic::riscv_vssseg8: {
3151 // Intrinsic interface:
3152 // riscv_vlse(merge, ptr, stride, vl)
3153 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
3154 // riscv_vsse(val, ptr, stride, vl)
3155 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
3156 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
3157 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
3158 // riscv_vssseg#(val, ptr, offset, vl, sew)
3159 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
3160 bool IsWrite = Inst->getType()->isVoidTy();
3161 Type *Ty = IsWrite ? Inst->getArgOperand(i: 0)->getType() : Inst->getType();
3162 // The results of segment loads are TargetExtType.
3163 if (auto *TarExtTy = dyn_cast<TargetExtType>(Val: Ty)) {
3164 unsigned SEW =
3165 1 << cast<ConstantInt>(Val: Inst->getArgOperand(i: Inst->arg_size() - 1))
3166 ->getZExtValue();
3167 Ty = TarExtTy->getTypeParameter(i: 0U);
3168 Ty = ScalableVectorType::get(
3169 ElementType: IntegerType::get(C, NumBits: SEW),
3170 MinNumElts: cast<ScalableVectorType>(Val: Ty)->getMinNumElements() * 8 / SEW);
3171 }
3172 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntrinsicID: IID);
3173 unsigned VLIndex = RVVIInfo->VLOperand;
3174 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3175 MaybeAlign Alignment =
3176 Inst->getArgOperand(i: PtrOperandNo)->getPointerAlignment(DL);
3177
3178 Value *Stride = Inst->getArgOperand(i: PtrOperandNo + 1);
3179 // Use the pointer alignment as the element alignment if the stride is a
3180 // multiple of the pointer alignment. Otherwise, the element alignment
3181 // should be the greatest common divisor of pointer alignment and stride.
3182 // For simplicity, just consider unalignment for elements.
3183 unsigned PointerAlign = Alignment.valueOrOne().value();
3184 if (!isa<ConstantInt>(Val: Stride) ||
3185 cast<ConstantInt>(Val: Stride)->getZExtValue() % PointerAlign != 0)
3186 Alignment = Align(1);
3187
3188 Type *MaskType = Ty->getWithNewType(EltTy: Type::getInt1Ty(C));
3189 Value *Mask = ConstantInt::getTrue(Ty: MaskType);
3190 if (HasMask)
3191 Mask = Inst->getArgOperand(i: VLIndex - 1);
3192 Value *EVL = Inst->getArgOperand(i: VLIndex);
3193 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3194 // RVV uses contiguous elements as a segment.
3195 if (SegNum > 1) {
3196 unsigned ElemSize = Ty->getScalarSizeInBits();
3197 auto *SegTy = IntegerType::get(C, NumBits: ElemSize * SegNum);
3198 Ty = VectorType::get(ElementType: SegTy, Other: cast<VectorType>(Val: Ty));
3199 }
3200 Info.InterestingOperands.emplace_back(Args&: Inst, Args&: PtrOperandNo, Args&: IsWrite, Args&: Ty,
3201 Args&: Alignment, Args&: Mask, Args&: EVL, Args&: Stride);
3202 return true;
3203 }
3204 case Intrinsic::riscv_vloxei_mask:
3205 case Intrinsic::riscv_vluxei_mask:
3206 case Intrinsic::riscv_vsoxei_mask:
3207 case Intrinsic::riscv_vsuxei_mask:
3208 case Intrinsic::riscv_vloxseg2_mask:
3209 case Intrinsic::riscv_vloxseg3_mask:
3210 case Intrinsic::riscv_vloxseg4_mask:
3211 case Intrinsic::riscv_vloxseg5_mask:
3212 case Intrinsic::riscv_vloxseg6_mask:
3213 case Intrinsic::riscv_vloxseg7_mask:
3214 case Intrinsic::riscv_vloxseg8_mask:
3215 case Intrinsic::riscv_vluxseg2_mask:
3216 case Intrinsic::riscv_vluxseg3_mask:
3217 case Intrinsic::riscv_vluxseg4_mask:
3218 case Intrinsic::riscv_vluxseg5_mask:
3219 case Intrinsic::riscv_vluxseg6_mask:
3220 case Intrinsic::riscv_vluxseg7_mask:
3221 case Intrinsic::riscv_vluxseg8_mask:
3222 case Intrinsic::riscv_vsoxseg2_mask:
3223 case Intrinsic::riscv_vsoxseg3_mask:
3224 case Intrinsic::riscv_vsoxseg4_mask:
3225 case Intrinsic::riscv_vsoxseg5_mask:
3226 case Intrinsic::riscv_vsoxseg6_mask:
3227 case Intrinsic::riscv_vsoxseg7_mask:
3228 case Intrinsic::riscv_vsoxseg8_mask:
3229 case Intrinsic::riscv_vsuxseg2_mask:
3230 case Intrinsic::riscv_vsuxseg3_mask:
3231 case Intrinsic::riscv_vsuxseg4_mask:
3232 case Intrinsic::riscv_vsuxseg5_mask:
3233 case Intrinsic::riscv_vsuxseg6_mask:
3234 case Intrinsic::riscv_vsuxseg7_mask:
3235 case Intrinsic::riscv_vsuxseg8_mask:
3236 HasMask = true;
3237 [[fallthrough]];
3238 case Intrinsic::riscv_vloxei:
3239 case Intrinsic::riscv_vluxei:
3240 case Intrinsic::riscv_vsoxei:
3241 case Intrinsic::riscv_vsuxei:
3242 case Intrinsic::riscv_vloxseg2:
3243 case Intrinsic::riscv_vloxseg3:
3244 case Intrinsic::riscv_vloxseg4:
3245 case Intrinsic::riscv_vloxseg5:
3246 case Intrinsic::riscv_vloxseg6:
3247 case Intrinsic::riscv_vloxseg7:
3248 case Intrinsic::riscv_vloxseg8:
3249 case Intrinsic::riscv_vluxseg2:
3250 case Intrinsic::riscv_vluxseg3:
3251 case Intrinsic::riscv_vluxseg4:
3252 case Intrinsic::riscv_vluxseg5:
3253 case Intrinsic::riscv_vluxseg6:
3254 case Intrinsic::riscv_vluxseg7:
3255 case Intrinsic::riscv_vluxseg8:
3256 case Intrinsic::riscv_vsoxseg2:
3257 case Intrinsic::riscv_vsoxseg3:
3258 case Intrinsic::riscv_vsoxseg4:
3259 case Intrinsic::riscv_vsoxseg5:
3260 case Intrinsic::riscv_vsoxseg6:
3261 case Intrinsic::riscv_vsoxseg7:
3262 case Intrinsic::riscv_vsoxseg8:
3263 case Intrinsic::riscv_vsuxseg2:
3264 case Intrinsic::riscv_vsuxseg3:
3265 case Intrinsic::riscv_vsuxseg4:
3266 case Intrinsic::riscv_vsuxseg5:
3267 case Intrinsic::riscv_vsuxseg6:
3268 case Intrinsic::riscv_vsuxseg7:
3269 case Intrinsic::riscv_vsuxseg8: {
3270 // Intrinsic interface (only listed ordered version):
3271 // riscv_vloxei(merge, ptr, index, vl)
3272 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
3273 // riscv_vsoxei(val, ptr, index, vl)
3274 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
3275 // riscv_vloxseg#(merge, ptr, index, vl, sew)
3276 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
3277 // riscv_vsoxseg#(val, ptr, index, vl, sew)
3278 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3279 bool IsWrite = Inst->getType()->isVoidTy();
3280 Type *Ty = IsWrite ? Inst->getArgOperand(i: 0)->getType() : Inst->getType();
3281 // The results of segment loads are TargetExtType.
3282 if (auto *TarExtTy = dyn_cast<TargetExtType>(Val: Ty)) {
3283 unsigned SEW =
3284 1 << cast<ConstantInt>(Val: Inst->getArgOperand(i: Inst->arg_size() - 1))
3285 ->getZExtValue();
3286 Ty = TarExtTy->getTypeParameter(i: 0U);
3287 Ty = ScalableVectorType::get(
3288 ElementType: IntegerType::get(C, NumBits: SEW),
3289 MinNumElts: cast<ScalableVectorType>(Val: Ty)->getMinNumElements() * 8 / SEW);
3290 }
3291 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntrinsicID: IID);
3292 unsigned VLIndex = RVVIInfo->VLOperand;
3293 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3294 Value *Mask;
3295 if (HasMask) {
3296 Mask = Inst->getArgOperand(i: VLIndex - 1);
3297 } else {
3298 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3299 // and casting that to scalar i64 triggers a vector/scalar mismatch
3300 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3301 // via extractelement instead.
3302 Type *MaskType = Ty->getWithNewType(EltTy: Type::getInt1Ty(C));
3303 Mask = ConstantInt::getTrue(Ty: MaskType);
3304 }
3305 Value *EVL = Inst->getArgOperand(i: VLIndex);
3306 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3307 // RVV uses contiguous elements as a segment.
3308 if (SegNum > 1) {
3309 unsigned ElemSize = Ty->getScalarSizeInBits();
3310 auto *SegTy = IntegerType::get(C, NumBits: ElemSize * SegNum);
3311 Ty = VectorType::get(ElementType: SegTy, Other: cast<VectorType>(Val: Ty));
3312 }
3313 Value *OffsetOp = Inst->getArgOperand(i: PtrOperandNo + 1);
3314 Info.InterestingOperands.emplace_back(Args&: Inst, Args&: PtrOperandNo, Args&: IsWrite, Args&: Ty,
3315 Args: Align(1), Args&: Mask, Args&: EVL,
3316 /* Stride */ Args: nullptr, Args&: OffsetOp);
3317 return true;
3318 }
3319 }
3320 return false;
3321}
3322
3323unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3324 if (Ty->isVectorTy()) {
3325 // f16 with only zvfhmin and bf16 will be promoted to f32
3326 Type *EltTy = cast<VectorType>(Val: Ty)->getElementType();
3327 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3328 EltTy->isBFloatTy())
3329 Ty = VectorType::get(ElementType: Type::getFloatTy(C&: Ty->getContext()),
3330 Other: cast<VectorType>(Val: Ty));
3331
3332 TypeSize Size = DL.getTypeSizeInBits(Ty);
3333 if (Size.isScalable() && ST->hasVInstructions())
3334 return divideCeil(Numerator: Size.getKnownMinValue(), Denominator: RISCV::RVVBitsPerBlock);
3335
3336 if (ST->useRVVForFixedLengthVectors())
3337 return divideCeil(Numerator: Size, Denominator: ST->getRealMinVLen());
3338 }
3339
3340 return BaseT::getRegUsageForType(Ty);
3341}
3342
3343unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3344 if (SLPMaxVF.getNumOccurrences())
3345 return SLPMaxVF;
3346
3347 // Return how many elements can fit in getRegisterBitwidth. This is the
3348 // same routine as used in LoopVectorizer. We should probably be
3349 // accounting for whether we actually have instructions with the right
3350 // lane type, but we don't have enough information to do that without
3351 // some additional plumbing which hasn't been justified yet.
3352 TypeSize RegWidth =
3353 getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector);
3354 // If no vector registers, or absurd element widths, disable
3355 // vectorization by returning 1.
3356 return std::max<unsigned>(a: 1U, b: RegWidth.getFixedValue() / ElemWidth);
3357}
3358
3359unsigned RISCVTTIImpl::getMinTripCountTailFoldingThreshold() const {
3360 return RVVMinTripCount;
3361}
3362
3363bool RISCVTTIImpl::preferAlternateOpcodeVectorization() const {
3364 return ST->enableUnalignedVectorMem();
3365}
3366
3367TTI::AddressingModeKind
3368RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3369 ScalarEvolution *SE) const {
3370 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3371 return TTI::AMK_PostIndexed;
3372
3373 return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
3374}
3375
3376bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3377 const TargetTransformInfo::LSRCost &C2) const {
3378 // RISC-V specific here are "instruction number 1st priority".
3379 // If we need to emit adds inside the loop to add up base registers, then
3380 // we need at least one extra temporary register.
3381 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3382 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3383 return std::tie(args: C1.Insns, args&: C1NumRegs, args: C1.AddRecCost,
3384 args: C1.NumIVMuls, args: C1.NumBaseAdds,
3385 args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
3386 std::tie(args: C2.Insns, args&: C2NumRegs, args: C2.AddRecCost,
3387 args: C2.NumIVMuls, args: C2.NumBaseAdds,
3388 args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
3389}
3390
3391bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3392 Align Alignment) const {
3393 auto *VTy = dyn_cast<VectorType>(Val: DataTy);
3394 if (!VTy || VTy->isScalableTy())
3395 return false;
3396
3397 if (!isLegalMaskedLoadStore(DataType: DataTy, Alignment))
3398 return false;
3399
3400 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3401 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3402 if (VTy->getElementType()->isIntegerTy(BitWidth: 8))
3403 if (VTy->getElementCount().getFixedValue() > 256)
3404 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3405 ST->getMaxLMULForFixedLengthVectors();
3406 return true;
3407}
3408
3409bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
3410 Align Alignment) const {
3411 auto *VTy = dyn_cast<VectorType>(Val: DataTy);
3412 if (!VTy || VTy->isScalableTy())
3413 return false;
3414
3415 if (!isLegalMaskedLoadStore(DataType: DataTy, Alignment))
3416 return false;
3417 return true;
3418}
3419
3420bool RISCVTTIImpl::isLegalBroadcastLoad(Type *ElementTy,
3421 ElementCount NumElements) const {
3422 // Optimized zero-stride loads can be treated as broadcasts.
3423 if (!ST->hasVInstructions() || !ST->hasOptimizedZeroStrideLoad())
3424 return false;
3425
3426 return TLI->isLegalElementTypeForRVV(ScalarTy: TLI->getValueType(DL, Ty: ElementTy));
3427}
3428
3429/// See if \p I should be considered for address type promotion. We check if \p
3430/// I is a sext with right type and used in memory accesses. If it used in a
3431/// "complex" getelementptr, we allow it to be promoted without finding other
3432/// sext instructions that sign extended the same initial value. A getelementptr
3433/// is considered as "complex" if it has more than 2 operands.
3434bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3435 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3436 bool Considerable = false;
3437 AllowPromotionWithoutCommonHeader = false;
3438 if (!isa<SExtInst>(Val: &I))
3439 return false;
3440 Type *ConsideredSExtType =
3441 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
3442 if (I.getType() != ConsideredSExtType)
3443 return false;
3444 // See if the sext is the one with the right type and used in at least one
3445 // GetElementPtrInst.
3446 for (const User *U : I.users()) {
3447 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
3448 Considerable = true;
3449 // A getelementptr is considered as "complex" if it has more than 2
3450 // operands. We will promote a SExt used in such complex GEP as we
3451 // expect some computation to be merged if they are done on 64 bits.
3452 if (GEPInst->getNumOperands() > 2) {
3453 AllowPromotionWithoutCommonHeader = true;
3454 break;
3455 }
3456 }
3457 }
3458 return Considerable;
3459}
3460
3461bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3462 switch (Opcode) {
3463 case Instruction::Add:
3464 case Instruction::Sub:
3465 case Instruction::Mul:
3466 case Instruction::And:
3467 case Instruction::Or:
3468 case Instruction::Xor:
3469 case Instruction::FAdd:
3470 case Instruction::FSub:
3471 case Instruction::FMul:
3472 case Instruction::FDiv:
3473 case Instruction::ICmp:
3474 case Instruction::FCmp:
3475 return true;
3476 case Instruction::Shl:
3477 case Instruction::LShr:
3478 case Instruction::AShr:
3479 case Instruction::UDiv:
3480 case Instruction::SDiv:
3481 case Instruction::URem:
3482 case Instruction::SRem:
3483 case Instruction::Select:
3484 return Operand == 1;
3485 default:
3486 return false;
3487 }
3488}
3489
3490bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3491 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3492 return false;
3493
3494 if (canSplatOperand(Opcode: I->getOpcode(), Operand))
3495 return true;
3496
3497 auto *II = dyn_cast<IntrinsicInst>(Val: I);
3498 if (!II)
3499 return false;
3500
3501 switch (II->getIntrinsicID()) {
3502 case Intrinsic::fma:
3503 case Intrinsic::vp_fma:
3504 case Intrinsic::fmuladd:
3505 case Intrinsic::vp_fmuladd:
3506 return Operand == 0 || Operand == 1;
3507 case Intrinsic::vp_shl:
3508 case Intrinsic::vp_lshr:
3509 case Intrinsic::vp_ashr:
3510 case Intrinsic::vp_udiv:
3511 case Intrinsic::vp_sdiv:
3512 case Intrinsic::vp_urem:
3513 case Intrinsic::vp_srem:
3514 case Intrinsic::ssub_sat:
3515 case Intrinsic::vp_ssub_sat:
3516 case Intrinsic::usub_sat:
3517 case Intrinsic::vp_usub_sat:
3518 case Intrinsic::vp_select:
3519 return Operand == 1;
3520 // These intrinsics are commutative.
3521 case Intrinsic::vp_add:
3522 case Intrinsic::vp_mul:
3523 case Intrinsic::vp_and:
3524 case Intrinsic::vp_or:
3525 case Intrinsic::vp_xor:
3526 case Intrinsic::vp_fadd:
3527 case Intrinsic::vp_fmul:
3528 case Intrinsic::vp_icmp:
3529 case Intrinsic::vp_fcmp:
3530 case Intrinsic::smin:
3531 case Intrinsic::vp_smin:
3532 case Intrinsic::umin:
3533 case Intrinsic::vp_umin:
3534 case Intrinsic::smax:
3535 case Intrinsic::vp_smax:
3536 case Intrinsic::umax:
3537 case Intrinsic::vp_umax:
3538 case Intrinsic::sadd_sat:
3539 case Intrinsic::vp_sadd_sat:
3540 case Intrinsic::uadd_sat:
3541 case Intrinsic::vp_uadd_sat:
3542 // These intrinsics have 'vr' versions.
3543 case Intrinsic::vp_sub:
3544 case Intrinsic::vp_fsub:
3545 case Intrinsic::vp_fdiv:
3546 return Operand == 0 || Operand == 1;
3547 default:
3548 return false;
3549 }
3550}
3551
3552/// Check if sinking \p I's operands to I's basic block is profitable, because
3553/// the operands can be folded into a target instruction, e.g.
3554/// splats of scalars can fold into vector instructions.
3555bool RISCVTTIImpl::isProfitableToSinkOperands(
3556 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3557 using namespace llvm::PatternMatch;
3558
3559 if (I->isBitwiseLogicOp()) {
3560 if (!I->getType()->isVectorTy()) {
3561 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3562 for (auto &Op : I->operands()) {
3563 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3564 if (match(V: Op.get(), P: m_Not(V: m_Value()))) {
3565 Ops.push_back(Elt: &Op);
3566 return true;
3567 }
3568 }
3569 }
3570 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3571 for (auto &Op : I->operands()) {
3572 // (and X, (not Y)) -> (vandn.vv X, Y)
3573 if (match(V: Op.get(), P: m_Not(V: m_Value()))) {
3574 Ops.push_back(Elt: &Op);
3575 return true;
3576 }
3577 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3578 if (match(V: Op.get(), P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Not(V: m_Value()),
3579 Idx: m_ZeroInt()),
3580 v2: m_Value(), mask: m_ZeroMask()))) {
3581 Use &InsertElt = cast<Instruction>(Val&: Op)->getOperandUse(i: 0);
3582 Use &Not = cast<Instruction>(Val&: InsertElt)->getOperandUse(i: 1);
3583 Ops.push_back(Elt: &Not);
3584 Ops.push_back(Elt: &InsertElt);
3585 Ops.push_back(Elt: &Op);
3586 return true;
3587 }
3588 }
3589 }
3590 }
3591
3592 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3593 return false;
3594
3595 // Don't sink splat operands if the target prefers it. Some targets requires
3596 // S2V transfer buffers and we can run out of them copying the same value
3597 // repeatedly.
3598 // FIXME: It could still be worth doing if it would improve vector register
3599 // pressure and prevent a vector spill.
3600 if (!ST->sinkSplatOperands())
3601 return false;
3602
3603 for (auto OpIdx : enumerate(First: I->operands())) {
3604 if (!canSplatOperand(I, Operand: OpIdx.index()))
3605 continue;
3606
3607 Instruction *Op = dyn_cast<Instruction>(Val: OpIdx.value().get());
3608 // Make sure we are not already sinking this operand
3609 if (!Op || any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
3610 continue;
3611
3612 // We are looking for a splat that can be sunk.
3613 if (!match(V: Op, P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ZeroInt()),
3614 v2: m_Value(), mask: m_ZeroMask())))
3615 continue;
3616
3617 // Don't sink i1 splats.
3618 if (cast<VectorType>(Val: Op->getType())->getElementType()->isIntegerTy(BitWidth: 1))
3619 continue;
3620
3621 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
3622 // and vector registers
3623 for (Use &U : Op->uses()) {
3624 Instruction *Insn = cast<Instruction>(Val: U.getUser());
3625 if (!canSplatOperand(I: Insn, Operand: U.getOperandNo()))
3626 return false;
3627 }
3628
3629 // Sink any fpexts since they might be used in a widening fp pattern.
3630 Use *InsertEltUse = &Op->getOperandUse(i: 0);
3631 auto *InsertElt = cast<InsertElementInst>(Val: InsertEltUse);
3632 if (isa<FPExtInst>(Val: InsertElt->getOperand(i_nocapture: 1)))
3633 Ops.push_back(Elt: &InsertElt->getOperandUse(i: 1));
3634 Ops.push_back(Elt: InsertEltUse);
3635 Ops.push_back(Elt: &OpIdx.value());
3636 }
3637 return true;
3638}
3639
3640RISCVTTIImpl::TTI::MemCmpExpansionOptions
3641RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3642 TTI::MemCmpExpansionOptions Options;
3643 // TODO: Enable expansion when unaligned access is not supported after we fix
3644 // issues in ExpandMemcmp.
3645 if (!ST->enableUnalignedScalarMem())
3646 return Options;
3647
3648 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3649 return Options;
3650
3651 Options.AllowOverlappingLoads = true;
3652 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3653 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3654 if (ST->is64Bit()) {
3655 Options.LoadSizes = {8, 4, 2, 1};
3656 Options.AllowedTailExpansions = {3, 5, 6};
3657 } else {
3658 Options.LoadSizes = {4, 2, 1};
3659 Options.AllowedTailExpansions = {3};
3660 }
3661
3662 if (IsZeroCmp && ST->hasVInstructions()) {
3663 unsigned VLenB = ST->getRealMinVLen() / 8;
3664 // The minimum size should be `XLen / 8 + 1`, and the maxinum size should be
3665 // `VLenB * MaxLMUL` so that it fits in a single register group.
3666 unsigned MinSize = ST->getXLen() / 8 + 1;
3667 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3668 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3669 Options.LoadSizes.insert(I: Options.LoadSizes.begin(), Elt: Size);
3670 }
3671 return Options;
3672}
3673
3674bool RISCVTTIImpl::shouldTreatInstructionLikeSelect(
3675 const Instruction *I) const {
3676 if (EnableOrLikeSelectOpt) {
3677 // For the binary operators (e.g. or) we need to be more careful than
3678 // selects, here we only transform them if they are already at a natural
3679 // break point in the code - the end of a block with an unconditional
3680 // terminator.
3681 if (I->getOpcode() == Instruction::Or &&
3682 isa<UncondBrInst>(Val: I->getNextNode()))
3683 return true;
3684
3685 if (I->getOpcode() == Instruction::Add ||
3686 I->getOpcode() == Instruction::Sub)
3687 return true;
3688 }
3689 return BaseT::shouldTreatInstructionLikeSelect(I);
3690}
3691
3692bool RISCVTTIImpl::shouldCopyAttributeWhenOutliningFrom(
3693 const Function *Caller, const Attribute &Attr) const {
3694 // "interrupt" controls the prolog/epilog of interrupt handlers (and includes
3695 // restrictions on their signatures). We can outline from the bodies of these
3696 // handlers, but when we do we need to make sure we don't mark the outlined
3697 // function as an interrupt handler too.
3698 if (Attr.isStringAttribute() && Attr.getKindAsString() == "interrupt")
3699 return false;
3700
3701 return BaseT::shouldCopyAttributeWhenOutliningFrom(Caller, Attr);
3702}
3703
3704std::optional<Instruction *>
3705RISCVTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
3706 // If all operands of a vmv.v.x are constant, fold a bitcast(vmv.v.x) to scale
3707 // the vmv.v.x, enabling removal of the bitcast. The transform helps avoid
3708 // creating redundant masks.
3709 const DataLayout &DL = IC.getDataLayout();
3710 if (II.user_empty())
3711 return {};
3712 auto *TargetVecTy = dyn_cast<ScalableVectorType>(Val: II.user_back()->getType());
3713 if (!TargetVecTy)
3714 return {};
3715 const APInt *Scalar;
3716 uint64_t VL;
3717 if (!match(V: &II, P: m_Intrinsic<Intrinsic::riscv_vmv_v_x>(
3718 Op0: m_Poison(), Op1: m_APInt(Res&: Scalar), Op2: m_ConstantInt(V&: VL))) ||
3719 !all_of(Range: II.users(), P: [TargetVecTy](User *U) {
3720 return U->getType() == TargetVecTy && match(V: U, P: m_BitCast(Op: m_Value()));
3721 }))
3722 return {};
3723 auto *SourceVecTy = cast<ScalableVectorType>(Val: II.getType());
3724 unsigned TargetEltBW = DL.getTypeSizeInBits(Ty: TargetVecTy->getElementType());
3725 unsigned SourceEltBW = DL.getTypeSizeInBits(Ty: SourceVecTy->getElementType());
3726 if (TargetEltBW % SourceEltBW)
3727 return {};
3728 unsigned TargetScale = TargetEltBW / SourceEltBW;
3729 if (VL % TargetScale || TargetScale == 1)
3730 return {};
3731 Type *VLTy = II.getOperand(i_nocapture: 2)->getType();
3732 ElementCount SourceEC = SourceVecTy->getElementCount();
3733 unsigned NewEltBW = SourceEltBW * TargetScale;
3734 if (!SourceEC.isKnownMultipleOf(RHS: TargetScale) ||
3735 !DL.fitsInLegalInteger(Width: NewEltBW))
3736 return {};
3737 auto *NewEltTy = IntegerType::get(C&: II.getContext(), NumBits: NewEltBW);
3738 if (!TLI->isLegalElementTypeForRVV(ScalarTy: TLI->getValueType(DL, Ty: NewEltTy)))
3739 return {};
3740 ElementCount NewEC = SourceEC.divideCoefficientBy(RHS: TargetScale);
3741 Type *RetTy = VectorType::get(ElementType: NewEltTy, EC: NewEC);
3742 assert(SourceVecTy->canLosslesslyBitCastTo(RetTy) &&
3743 "Lossless bitcast between types expected");
3744 APInt NewScalar = APInt::getSplat(NewLen: NewEltBW, V: *Scalar);
3745 return IC.replaceInstUsesWith(
3746 I&: II,
3747 V: IC.Builder.CreateBitCast(
3748 V: IC.Builder.CreateIntrinsic(
3749 RetTy, ID: Intrinsic::riscv_vmv_v_x,
3750 Args: {PoisonValue::get(T: RetTy), ConstantInt::get(Ty: NewEltTy, V: NewScalar),
3751 ConstantInt::get(Ty: VLTy, V: VL / TargetScale)}),
3752 DestTy: SourceVecTy));
3753}
3754