1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "RISCVTargetTransformInfo.h"
10#include "MCTargetDesc/RISCVMatInt.h"
11#include "llvm/ADT/STLExtras.h"
12#include "llvm/Analysis/TargetTransformInfo.h"
13#include "llvm/CodeGen/BasicTTIImpl.h"
14#include "llvm/CodeGen/CostTable.h"
15#include "llvm/CodeGen/TargetLowering.h"
16#include "llvm/IR/Instructions.h"
17#include "llvm/IR/PatternMatch.h"
18#include <cmath>
19#include <optional>
20using namespace llvm;
21using namespace llvm::PatternMatch;
22
23#define DEBUG_TYPE "riscvtti"
24
25static cl::opt<unsigned> RVVRegisterWidthLMUL(
26 "riscv-v-register-bit-width-lmul",
27 cl::desc(
28 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
29 "by autovectorized code. Fractional LMULs are not supported."),
30 cl::init(Val: 2), cl::Hidden);
31
32static cl::opt<unsigned> SLPMaxVF(
33 "riscv-v-slp-max-vf",
34 cl::desc(
35 "Overrides result used for getMaximumVF query which is used "
36 "exclusively by SLP vectorizer."),
37 cl::Hidden);
38
39InstructionCost
40RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
41 TTI::TargetCostKind CostKind) {
  // Check if the type is valid for all CostKinds.
43 if (!VT.isVector())
44 return InstructionCost::getInvalid();
45 size_t NumInstr = OpCodes.size();
46 if (CostKind == TTI::TCK_CodeSize)
47 return NumInstr;
48 InstructionCost LMULCost = TLI->getLMULCost(VT);
49 if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
50 return LMULCost * NumInstr;
51 InstructionCost Cost = 0;
52 for (auto Op : OpCodes) {
53 switch (Op) {
54 case RISCV::VRGATHER_VI:
55 Cost += TLI->getVRGatherVICost(VT);
56 break;
57 case RISCV::VRGATHER_VV:
58 Cost += TLI->getVRGatherVVCost(VT);
59 break;
60 case RISCV::VSLIDEUP_VI:
61 case RISCV::VSLIDEDOWN_VI:
62 Cost += TLI->getVSlideVICost(VT);
63 break;
64 case RISCV::VSLIDEUP_VX:
65 case RISCV::VSLIDEDOWN_VX:
66 Cost += TLI->getVSlideVXCost(VT);
67 break;
68 case RISCV::VREDMAX_VS:
69 case RISCV::VREDMIN_VS:
70 case RISCV::VREDMAXU_VS:
71 case RISCV::VREDMINU_VS:
72 case RISCV::VREDSUM_VS:
73 case RISCV::VREDAND_VS:
74 case RISCV::VREDOR_VS:
75 case RISCV::VREDXOR_VS:
76 case RISCV::VFREDMAX_VS:
77 case RISCV::VFREDMIN_VS:
78 case RISCV::VFREDUSUM_VS: {
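      // Unordered reductions are costed as a reduction tree, so the cost is
      // modelled as growing with log2 of the (estimated) element count.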
79 unsigned VL = VT.getVectorMinNumElements();
80 if (!VT.isFixedLengthVector())
81 VL *= *getVScaleForTuning();
82 Cost += Log2_32_Ceil(Value: VL);
83 break;
84 }
85 case RISCV::VFREDOSUM_VS: {
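      // Ordered FP reductions are serial, so the cost is modelled as linear in
      // the (estimated) element count.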
86 unsigned VL = VT.getVectorMinNumElements();
87 if (!VT.isFixedLengthVector())
88 VL *= *getVScaleForTuning();
89 Cost += VL;
90 break;
91 }
92 case RISCV::VMV_X_S:
93 case RISCV::VMV_S_X:
94 case RISCV::VFMV_F_S:
95 case RISCV::VFMV_S_F:
96 case RISCV::VMOR_MM:
97 case RISCV::VMXOR_MM:
98 case RISCV::VMAND_MM:
99 case RISCV::VMANDN_MM:
100 case RISCV::VMNAND_MM:
101 case RISCV::VCPOP_M:
102 case RISCV::VFIRST_M:
103 Cost += 1;
104 break;
105 default:
106 Cost += LMULCost;
107 }
108 }
109 return Cost;
110}
111
112static InstructionCost getIntImmCostImpl(const DataLayout &DL,
113 const RISCVSubtarget *ST,
114 const APInt &Imm, Type *Ty,
115 TTI::TargetCostKind CostKind,
116 bool FreeZeroes) {
117 assert(Ty->isIntegerTy() &&
118 "getIntImmCost can only estimate cost of materialising integers");
119
120 // We have a Zero register, so 0 is always free.
121 if (Imm == 0)
122 return TTI::TCC_Free;
123
124 // Otherwise, we check how many instructions it will take to materialise.
125 return RISCVMatInt::getIntMatCost(Val: Imm, Size: DL.getTypeSizeInBits(Ty), STI: *ST,
126 /*CompressionCost=*/false, FreeZeroes);
127}
128
129InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
130 TTI::TargetCostKind CostKind) {
131 return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind, FreeZeroes: false);
132}
133
134// Look for patterns of shift followed by AND that can be turned into a pair of
135// shifts. We won't need to materialize an immediate for the AND so these can
136// be considered free.
137static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
138 uint64_t Mask = Imm.getZExtValue();
139 auto *BO = dyn_cast<BinaryOperator>(Val: Inst->getOperand(i: 0));
140 if (!BO || !BO->hasOneUse())
141 return false;
142
143 if (BO->getOpcode() != Instruction::Shl)
144 return false;
145
146 if (!isa<ConstantInt>(Val: BO->getOperand(i_nocapture: 1)))
147 return false;
148
149 unsigned ShAmt = cast<ConstantInt>(Val: BO->getOperand(i_nocapture: 1))->getZExtValue();
150 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
151 // is a mask shifted by c2 bits with c3 leading zeros.
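  // For example (illustrative, RV64): with c2 = 11 and c1 = 0xFFFFF800 (a
  // 21-bit mask shifted left by 11, leaving c3 = 32 leading zeros),
  // (and (shl x, 11), 0xFFFFF800) becomes (srli (slli x, 43), 32).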
152 if (isShiftedMask_64(Value: Mask)) {
153 unsigned Trailing = llvm::countr_zero(Val: Mask);
154 if (ShAmt == Trailing)
155 return true;
156 }
157
158 return false;
159}
160
161InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
162 const APInt &Imm, Type *Ty,
163 TTI::TargetCostKind CostKind,
164 Instruction *Inst) {
165 assert(Ty->isIntegerTy() &&
166 "getIntImmCost can only estimate cost of materialising integers");
167
168 // We have a Zero register, so 0 is always free.
169 if (Imm == 0)
170 return TTI::TCC_Free;
171
  // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
  // commutative; for others the immediate must be in a specific argument
  // position.
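  // E.g. addi/andi/ori/xori accept a sign-extended 12-bit immediate.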
174 bool Takes12BitImm = false;
175 unsigned ImmArgIdx = ~0U;
176
177 switch (Opcode) {
178 case Instruction::GetElementPtr:
179 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
180 // split up large offsets in GEP into better parts than ConstantHoisting
181 // can.
182 return TTI::TCC_Free;
183 case Instruction::Store: {
    // Use the materialization cost regardless of whether it is the address or
    // the value that is constant, except when the store is misaligned and
    // misaligned accesses are not legal (experience shows constant hoisting
    // can sometimes be harmful in such cases).
188 if (Idx == 1 || !Inst)
189 return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind,
190 /*FreeZeroes=*/true);
191
192 StoreInst *ST = cast<StoreInst>(Val: Inst);
193 if (!getTLI()->allowsMemoryAccessForAlignment(
194 Context&: Ty->getContext(), DL, VT: getTLI()->getValueType(DL, Ty),
195 AddrSpace: ST->getPointerAddressSpace(), Alignment: ST->getAlign()))
196 return TTI::TCC_Free;
197
198 return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind,
199 /*FreeZeroes=*/true);
200 }
201 case Instruction::Load:
202 // If the address is a constant, use the materialization cost.
203 return getIntImmCost(Imm, Ty, CostKind);
204 case Instruction::And:
205 // zext.h
206 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
207 return TTI::TCC_Free;
208 // zext.w
209 if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
210 return TTI::TCC_Free;
211 // bclri
212 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
213 return TTI::TCC_Free;
214 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
215 canUseShiftPair(Inst, Imm))
216 return TTI::TCC_Free;
217 Takes12BitImm = true;
218 break;
219 case Instruction::Add:
220 Takes12BitImm = true;
221 break;
222 case Instruction::Or:
223 case Instruction::Xor:
224 // bseti/binvi
225 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
226 return TTI::TCC_Free;
227 Takes12BitImm = true;
228 break;
229 case Instruction::Mul:
230 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
231 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
232 return TTI::TCC_Free;
233 // One more or less than a power of 2 can use SLLI+ADD/SUB.
234 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
235 return TTI::TCC_Free;
236 // FIXME: There is no MULI instruction.
237 Takes12BitImm = true;
238 break;
239 case Instruction::Sub:
240 case Instruction::Shl:
241 case Instruction::LShr:
242 case Instruction::AShr:
243 Takes12BitImm = true;
244 ImmArgIdx = 1;
245 break;
246 default:
247 break;
248 }
249
250 if (Takes12BitImm) {
251 // Check immediate is the correct argument...
252 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
253 // ... and fits into the 12-bit immediate.
254 if (Imm.getSignificantBits() <= 64 &&
255 getTLI()->isLegalAddImmediate(Imm: Imm.getSExtValue())) {
256 return TTI::TCC_Free;
257 }
258 }
259
260 // Otherwise, use the full materialisation cost.
261 return getIntImmCost(Imm, Ty, CostKind);
262 }
263
264 // By default, prevent hoisting.
265 return TTI::TCC_Free;
266}
267
268InstructionCost
269RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
270 const APInt &Imm, Type *Ty,
271 TTI::TargetCostKind CostKind) {
272 // Prevent hoisting in unknown cases.
273 return TTI::TCC_Free;
274}
275
276bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const {
277 return ST->hasVInstructions();
278}
279
280TargetTransformInfo::PopcntSupportKind
281RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) {
282 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
283 return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip()
284 ? TTI::PSK_FastHardware
285 : TTI::PSK_Software;
286}
287
288bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
289 // Currently, the ExpandReductions pass can't expand scalable-vector
290 // reductions, but we still request expansion as RVV doesn't support certain
291 // reductions and the SelectionDAG can't legalize them either.
292 switch (II->getIntrinsicID()) {
293 default:
294 return false;
295 // These reductions have no equivalent in RVV
296 case Intrinsic::vector_reduce_mul:
297 case Intrinsic::vector_reduce_fmul:
298 return true;
299 }
300}
301
302std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
303 if (ST->hasVInstructions())
304 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
305 return BaseT::getMaxVScale();
306}
307
308std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
309 if (ST->hasVInstructions())
310 if (unsigned MinVLen = ST->getRealMinVLen();
311 MinVLen >= RISCV::RVVBitsPerBlock)
312 return MinVLen / RISCV::RVVBitsPerBlock;
313 return BaseT::getVScaleForTuning();
314}
315
316TypeSize
317RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
318 unsigned LMUL =
319 llvm::bit_floor(Value: std::clamp<unsigned>(val: RVVRegisterWidthLMUL, lo: 1, hi: 8));
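  // E.g. (illustrative): with the default LMUL of 2 and a guaranteed minimum
  // VLEN of 128, fixed-length vectorization sees 256-bit registers and
  // scalable vectorization sees a minimum of 2 * 64 = 128 bits per part.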
320 switch (K) {
321 case TargetTransformInfo::RGK_Scalar:
322 return TypeSize::getFixed(ExactSize: ST->getXLen());
323 case TargetTransformInfo::RGK_FixedWidthVector:
324 return TypeSize::getFixed(
325 ExactSize: ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
326 case TargetTransformInfo::RGK_ScalableVector:
327 return TypeSize::getScalable(
328 MinimumSize: (ST->hasVInstructions() &&
329 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
330 ? LMUL * RISCV::RVVBitsPerBlock
331 : 0);
332 }
333
334 llvm_unreachable("Unsupported register kind");
335}
336
337InstructionCost
338RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) {
339 // Add a cost of address generation + the cost of the load. The address
340 // is expected to be a PC relative offset to a constant pool entry
341 // using auipc/addi.
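  // Illustrative sequence (assuming a vector constant pool entry):
  //   auipc a0, %pcrel_hi(.LCPI0_0)
  //   addi  a0, a0, %pcrel_lo(<pcrel label>)
  //   <load from 0(a0)>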
342 return 2 + getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment: DL.getABITypeAlign(Ty),
343 /*AddressSpace=*/0, CostKind);
344}
345
346static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST,
347 LLVMContext &C) {
348 assert((DataVT.getScalarSizeInBits() != 8 ||
349 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
350 MVT IndexVT = DataVT.changeTypeToInteger();
351 if (IndexVT.getScalarType().bitsGT(VT: ST.getXLenVT()))
352 IndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i16);
353 return cast<VectorType>(Val: EVT(IndexVT).getTypeForEVT(Context&: C));
354}
355
356InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
357 VectorType *Tp, ArrayRef<int> Mask,
358 TTI::TargetCostKind CostKind,
359 int Index, VectorType *SubTp,
360 ArrayRef<const Value *> Args,
361 const Instruction *CxtI) {
362 Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp);
363
364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
365
366 // First, handle cases where having a fixed length vector enables us to
367 // give a more accurate cost than falling back to generic scalable codegen.
368 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
369 if (isa<FixedVectorType>(Val: Tp)) {
370 switch (Kind) {
371 default:
372 break;
373 case TTI::SK_PermuteSingleSrc: {
374 if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) {
375 MVT EltTp = LT.second.getVectorElementType();
        // If the element size is < ELEN, then shuffles that interleave or
        // deinterleave 2 vectors can be lowered into the following sequences.
379 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
380 // Example sequence:
381 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
382 // vwaddu.vv v10, v8, v9
383 // li a0, -1 (ignored)
384 // vwmaccu.vx v10, a0, v9
385 if (ShuffleVectorInst::isInterleaveMask(Mask, Factor: 2, NumInputElts: Mask.size()))
386 return 2 * LT.first * TLI->getLMULCost(VT: LT.second);
387
388 if (Mask[0] == 0 || Mask[0] == 1) {
389 auto DeinterleaveMask = createStrideMask(Start: Mask[0], Stride: 2, VF: Mask.size());
390 // Example sequence:
391 // vnsrl.wi v10, v8, 0
392 if (equal(LRange&: DeinterleaveMask, RRange&: Mask))
393 return LT.first * getRISCVInstructionCost(OpCodes: RISCV::VNSRL_WI,
394 VT: LT.second, CostKind);
395 }
396 }
397 }
398 // vrgather + cost of generating the mask constant.
399 // We model this for an unknown mask with a single vrgather.
400 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
401 (LT.second.getScalarSizeInBits() != 8 ||
402 LT.second.getVectorNumElements() <= 256)) {
403 VectorType *IdxTy = getVRGatherIndexType(DataVT: LT.second, ST: *ST, C&: Tp->getContext());
404 InstructionCost IndexCost = getConstantPoolLoadCost(Ty: IdxTy, CostKind);
405 return IndexCost +
406 getRISCVInstructionCost(OpCodes: RISCV::VRGATHER_VV, VT: LT.second, CostKind);
407 }
408 [[fallthrough]];
409 }
410 case TTI::SK_Transpose:
411 case TTI::SK_PermuteTwoSrc: {
412 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
413 // register for the second vrgather. We model this for an unknown
414 // (shuffle) mask.
415 if (LT.second.isFixedLengthVector() && LT.first == 1 &&
416 (LT.second.getScalarSizeInBits() != 8 ||
417 LT.second.getVectorNumElements() <= 256)) {
418 auto &C = Tp->getContext();
419 auto EC = Tp->getElementCount();
420 VectorType *IdxTy = getVRGatherIndexType(DataVT: LT.second, ST: *ST, C);
421 VectorType *MaskTy = VectorType::get(ElementType: IntegerType::getInt1Ty(C), EC);
422 InstructionCost IndexCost = getConstantPoolLoadCost(Ty: IdxTy, CostKind);
423 InstructionCost MaskCost = getConstantPoolLoadCost(Ty: MaskTy, CostKind);
424 return 2 * IndexCost +
425 getRISCVInstructionCost(OpCodes: {RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
426 VT: LT.second, CostKind) +
427 MaskCost;
428 }
429 [[fallthrough]];
430 }
431 case TTI::SK_Select: {
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. Provide an accurate cost only for splits where
      // the element type remains the same.
435 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
436 LT.second.isFixedLengthVector() &&
437 LT.second.getVectorElementType().getSizeInBits() ==
438 Tp->getElementType()->getPrimitiveSizeInBits() &&
439 LT.second.getVectorNumElements() <
440 cast<FixedVectorType>(Val: Tp)->getNumElements() &&
441 divideCeil(Numerator: Mask.size(),
442 Denominator: cast<FixedVectorType>(Val: Tp)->getNumElements()) ==
443 static_cast<unsigned>(*LT.first.getValue())) {
444 unsigned NumRegs = *LT.first.getValue();
445 unsigned VF = cast<FixedVectorType>(Val: Tp)->getNumElements();
446 unsigned SubVF = PowerOf2Ceil(A: VF / NumRegs);
447 auto *SubVecTy = FixedVectorType::get(ElementType: Tp->getElementType(), NumElts: SubVF);
448
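        // Cost each sub-register-sized piece as its own shuffle: remap the
        // original mask elements into the sub-vector's index space and track
        // whether each piece reads from one or two sources.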
449 InstructionCost Cost = 0;
450 for (unsigned I = 0; I < NumRegs; ++I) {
451 bool IsSingleVector = true;
452 SmallVector<int> SubMask(SubVF, PoisonMaskElem);
453 transform(Range: Mask.slice(N: I * SubVF,
454 M: I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
455 d_first: SubMask.begin(), F: [&](int I) {
456 bool SingleSubVector = I / VF == 0;
457 IsSingleVector &= SingleSubVector;
458 return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
459 });
460 Cost += getShuffleCost(Kind: IsSingleVector ? TTI::SK_PermuteSingleSrc
461 : TTI::SK_PermuteTwoSrc,
462 Tp: SubVecTy, Mask: SubMask, CostKind, Index: 0, SubTp: nullptr);
463 return Cost;
464 }
465 }
466 break;
467 }
468 }
469 };
470
471 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
472 switch (Kind) {
473 default:
474 // Fallthrough to generic handling.
475 // TODO: Most of these cases will return getInvalid in generic code, and
476 // must be implemented here.
477 break;
478 case TTI::SK_ExtractSubvector:
479 // Extract at zero is always a subregister extract
480 if (Index == 0)
481 return TTI::TCC_Free;
482
483 // If we're extracting a subvector of at most m1 size at a sub-register
484 // boundary - which unfortunately we need exact vlen to identify - this is
485 // a subregister extract at worst and thus won't require a vslidedown.
486 // TODO: Extend for aligned m2, m4 subvector extracts
    // TODO: Extend for misaligned (but contained) extracts
488 // TODO: Extend for scalable subvector types
489 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
490 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
491 const unsigned MinVLen = ST->getRealMinVLen();
492 const unsigned MaxVLen = ST->getRealMaxVLen();
493 if (MinVLen == MaxVLen &&
494 SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 &&
495 SubLT.second.getSizeInBits() <= MinVLen)
496 return TTI::TCC_Free;
497 }
498
499 // Example sequence:
500 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
501 // vslidedown.vi v8, v9, 2
502 return LT.first *
503 getRISCVInstructionCost(OpCodes: RISCV::VSLIDEDOWN_VI, VT: LT.second, CostKind);
504 case TTI::SK_InsertSubvector:
505 // Example sequence:
506 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
507 // vslideup.vi v8, v9, 2
508 return LT.first *
509 getRISCVInstructionCost(OpCodes: RISCV::VSLIDEUP_VI, VT: LT.second, CostKind);
510 case TTI::SK_Select: {
511 // Example sequence:
512 // li a0, 90
513 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
514 // vmv.s.x v0, a0
515 // vmerge.vvm v8, v9, v8, v0
516 // We use 2 for the cost of the mask materialization as this is the true
517 // cost for small masks and most shuffles are small. At worst, this cost
518 // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
520 return LT.first *
521 (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_S_X, RISCV::VMERGE_VVM},
522 VT: LT.second, CostKind));
523 }
524 case TTI::SK_Broadcast: {
525 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(V: Args[0]) ==
526 Instruction::InsertElement);
527 if (LT.second.getScalarSizeInBits() == 1) {
528 if (HasScalar) {
529 // Example sequence:
530 // andi a0, a0, 1
531 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
532 // vmv.v.x v8, a0
533 // vmsne.vi v0, v8, 0
534 return LT.first *
535 (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI},
536 VT: LT.second, CostKind));
537 }
538 // Example sequence:
539 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
540 // vmv.v.i v8, 0
541 // vmerge.vim v8, v8, 1, v0
542 // vmv.x.s a0, v8
543 // andi a0, a0, 1
544 // vmv.v.x v8, a0
545 // vmsne.vi v0, v8, 0
546
547 return LT.first *
548 (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_I, RISCV::VMERGE_VIM,
549 RISCV::VMV_X_S, RISCV::VMV_V_X,
550 RISCV::VMSNE_VI},
551 VT: LT.second, CostKind));
552 }
553
554 if (HasScalar) {
555 // Example sequence:
556 // vmv.v.x v8, a0
557 return LT.first *
558 getRISCVInstructionCost(OpCodes: RISCV::VMV_V_X, VT: LT.second, CostKind);
559 }
560
561 // Example sequence:
562 // vrgather.vi v9, v8, 0
563 return LT.first *
564 getRISCVInstructionCost(OpCodes: RISCV::VRGATHER_VI, VT: LT.second, CostKind);
565 }
566 case TTI::SK_Splice: {
567 // vslidedown+vslideup.
568 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
569 // of similar code, but I think we expand through memory.
570 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
571 if (Index >= 0 && Index < 32)
572 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
573 else if (Index < 0 && Index > -32)
574 Opcodes[1] = RISCV::VSLIDEUP_VI;
575 return LT.first * getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
576 }
577 case TTI::SK_Reverse: {
578 // TODO: Cases to improve here:
579 // * Illegal vector types
580 // * i64 on RV32
581 // * i1 vector
582 // At low LMUL, most of the cost is producing the vrgather index register.
583 // At high LMUL, the cost of the vrgather itself will dominate.
584 // Example sequence:
585 // csrr a0, vlenb
586 // srli a0, a0, 3
587 // addi a0, a0, -1
588 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
589 // vid.v v9
590 // vrsub.vx v10, v9, a0
591 // vrgather.vv v9, v8, v10
592 InstructionCost LenCost = 3;
593 if (LT.second.isFixedLengthVector())
594 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
595 LenCost = isInt<5>(x: LT.second.getVectorNumElements() - 1) ? 0 : 1;
596 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
597 if (LT.second.isFixedLengthVector() &&
598 isInt<5>(x: LT.second.getVectorNumElements() - 1))
599 Opcodes[1] = RISCV::VRSUB_VI;
600 InstructionCost GatherCost =
601 getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
    // Reversing a mask vector additionally requires an extend and a truncate.
603 InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(Bitwidth: 1) ? 3 : 0;
604 return LT.first * (LenCost + GatherCost + ExtendCost);
605 }
606 }
607 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
608}
609
610InstructionCost
611RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
612 unsigned AddressSpace,
613 TTI::TargetCostKind CostKind) {
614 if (!isLegalMaskedLoadStore(DataType: Src, Alignment) ||
615 CostKind != TTI::TCK_RecipThroughput)
616 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
617 CostKind);
618
619 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
620}
621
622InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
623 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
624 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
625 bool UseMaskForCond, bool UseMaskForGaps) {
626 if (isa<ScalableVectorType>(Val: VecTy) && Factor != 2)
627 return InstructionCost::getInvalid();
628
  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store combined with a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
633 if (!UseMaskForCond && !UseMaskForGaps &&
634 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
635 auto *VTy = cast<VectorType>(Val: VecTy);
636 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VTy);
    // Need to make sure the type hasn't been scalarized.
638 if (LT.second.isVector()) {
639 auto *SubVecTy =
640 VectorType::get(ElementType: VTy->getElementType(),
641 EC: VTy->getElementCount().divideCoefficientBy(RHS: Factor));
642
643 if (VTy->getElementCount().isKnownMultipleOf(RHS: Factor) &&
644 TLI->isLegalInterleavedAccessType(VTy: SubVecTy, Factor, Alignment,
645 AddrSpace: AddressSpace, DL)) {
        // FIXME: We use the memory op cost of the *legalized* type here
        // because getMemoryOpCost returns a really expensive cost for types
        // like <6 x i8>, which show up when doing interleaves of Factor=3
        // etc. Should the memory op cost of these be cheaper?
650 auto *LegalVTy = VectorType::get(ElementType: VTy->getElementType(),
651 EC: LT.second.getVectorElementCount());
652 InstructionCost LegalMemCost = getMemoryOpCost(
653 Opcode, Src: LegalVTy, Alignment, AddressSpace, CostKind);
654 return LT.first + LegalMemCost;
655 }
656 }
657 }
658
659 // TODO: Return the cost of interleaved accesses for scalable vector when
660 // unable to convert to segment accesses instructions.
661 if (isa<ScalableVectorType>(Val: VecTy))
662 return InstructionCost::getInvalid();
663
664 auto *FVTy = cast<FixedVectorType>(Val: VecTy);
665 InstructionCost MemCost =
666 getMemoryOpCost(Opcode, Src: VecTy, Alignment, AddressSpace, CostKind);
667 unsigned VF = FVTy->getNumElements() / Factor;
668
669 // An interleaved load will look like this for Factor=3:
670 // %wide.vec = load <12 x i32>, ptr %3, align 4
671 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
672 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
673 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
674 if (Opcode == Instruction::Load) {
675 InstructionCost Cost = MemCost;
676 for (unsigned Index : Indices) {
677 FixedVectorType *SubVecTy =
678 FixedVectorType::get(ElementType: FVTy->getElementType(), NumElts: VF * Factor);
679 auto Mask = createStrideMask(Start: Index, Stride: Factor, VF);
680 InstructionCost ShuffleCost =
681 getShuffleCost(Kind: TTI::ShuffleKind::SK_PermuteSingleSrc, Tp: SubVecTy, Mask,
682 CostKind, Index: 0, SubTp: nullptr, Args: {});
683 Cost += ShuffleCost;
684 }
685 return Cost;
686 }
687
688 // TODO: Model for NF > 2
689 // We'll need to enhance getShuffleCost to model shuffles that are just
690 // inserts and extracts into subvectors, since they won't have the full cost
691 // of a vrgather.
692 // An interleaved store for 3 vectors of 4 lanes will look like
693 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
694 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
695 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
696 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
697 // store <12 x i32> %interleaved.vec, ptr %10, align 4
698 if (Factor != 2)
699 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
700 Alignment, AddressSpace, CostKind,
701 UseMaskForCond, UseMaskForGaps);
702
703 assert(Opcode == Instruction::Store && "Opcode must be a store");
704 // For an interleaving store of 2 vectors, we perform one large interleaving
705 // shuffle that goes into the wide store
706 auto Mask = createInterleaveMask(VF, NumVecs: Factor);
707 InstructionCost ShuffleCost =
708 getShuffleCost(Kind: TTI::ShuffleKind::SK_PermuteSingleSrc, Tp: FVTy, Mask,
709 CostKind, Index: 0, SubTp: nullptr, Args: {});
710 return MemCost + ShuffleCost;
711}
712
713InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
714 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
715 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
716 if (CostKind != TTI::TCK_RecipThroughput)
717 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
718 Alignment, CostKind, I);
719
720 if ((Opcode == Instruction::Load &&
721 !isLegalMaskedGather(DataType: DataTy, Alignment: Align(Alignment))) ||
722 (Opcode == Instruction::Store &&
723 !isLegalMaskedScatter(DataType: DataTy, Alignment: Align(Alignment))))
724 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
725 Alignment, CostKind, I);
726
727 // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate of that number since we don't
729 // know exactly what VL will be.
730 auto &VTy = *cast<VectorType>(Val: DataTy);
731 InstructionCost MemOpCost =
732 getMemoryOpCost(Opcode, Src: VTy.getElementType(), Alignment, AddressSpace: 0, CostKind,
733 OpdInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
734 unsigned NumLoads = getEstimatedVLFor(Ty: &VTy);
735 return NumLoads * MemOpCost;
736}
737
738InstructionCost RISCVTTIImpl::getStridedMemoryOpCost(
739 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
740 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
741 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
742 !isLegalStridedLoadStore(DataType: DataTy, Alignment)) ||
743 (Opcode != Instruction::Load && Opcode != Instruction::Store))
744 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
745 Alignment, CostKind, I);
746
747 if (CostKind == TTI::TCK_CodeSize)
748 return TTI::TCC_Basic;
749
750 // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate of that number since we don't
752 // know exactly what VL will be.
753 auto &VTy = *cast<VectorType>(Val: DataTy);
754 InstructionCost MemOpCost =
755 getMemoryOpCost(Opcode, Src: VTy.getElementType(), Alignment, AddressSpace: 0, CostKind,
756 OpdInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
757 unsigned NumLoads = getEstimatedVLFor(Ty: &VTy);
758 return NumLoads * MemOpCost;
759}
760
761// Currently, these represent both throughput and codesize costs
762// for the respective intrinsics. The costs in this table are simply
763// instruction counts with the following adjustments made:
764// * One vsetvli is considered free.
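// Entries are keyed on the element type of the legalized vector; the lookup
// site in getIntrinsicInstrCost scales the result by the legalization factor
// (LT.first).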
765static const CostTblEntry VectorIntrinsicCostTable[]{
766 {.ISD: Intrinsic::floor, .Type: MVT::f32, .Cost: 9},
767 {.ISD: Intrinsic::floor, .Type: MVT::f64, .Cost: 9},
768 {.ISD: Intrinsic::ceil, .Type: MVT::f32, .Cost: 9},
769 {.ISD: Intrinsic::ceil, .Type: MVT::f64, .Cost: 9},
770 {.ISD: Intrinsic::trunc, .Type: MVT::f32, .Cost: 7},
771 {.ISD: Intrinsic::trunc, .Type: MVT::f64, .Cost: 7},
772 {.ISD: Intrinsic::round, .Type: MVT::f32, .Cost: 9},
773 {.ISD: Intrinsic::round, .Type: MVT::f64, .Cost: 9},
774 {.ISD: Intrinsic::roundeven, .Type: MVT::f32, .Cost: 9},
775 {.ISD: Intrinsic::roundeven, .Type: MVT::f64, .Cost: 9},
776 {.ISD: Intrinsic::rint, .Type: MVT::f32, .Cost: 7},
777 {.ISD: Intrinsic::rint, .Type: MVT::f64, .Cost: 7},
778 {.ISD: Intrinsic::lrint, .Type: MVT::i32, .Cost: 1},
779 {.ISD: Intrinsic::lrint, .Type: MVT::i64, .Cost: 1},
780 {.ISD: Intrinsic::llrint, .Type: MVT::i64, .Cost: 1},
781 {.ISD: Intrinsic::nearbyint, .Type: MVT::f32, .Cost: 9},
782 {.ISD: Intrinsic::nearbyint, .Type: MVT::f64, .Cost: 9},
783 {.ISD: Intrinsic::bswap, .Type: MVT::i16, .Cost: 3},
784 {.ISD: Intrinsic::bswap, .Type: MVT::i32, .Cost: 12},
785 {.ISD: Intrinsic::bswap, .Type: MVT::i64, .Cost: 31},
786 {.ISD: Intrinsic::vp_bswap, .Type: MVT::i16, .Cost: 3},
787 {.ISD: Intrinsic::vp_bswap, .Type: MVT::i32, .Cost: 12},
788 {.ISD: Intrinsic::vp_bswap, .Type: MVT::i64, .Cost: 31},
789 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i8, .Cost: 7},
790 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i16, .Cost: 7},
791 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i32, .Cost: 7},
792 {.ISD: Intrinsic::vp_fshl, .Type: MVT::i64, .Cost: 7},
793 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i8, .Cost: 7},
794 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i16, .Cost: 7},
795 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i32, .Cost: 7},
796 {.ISD: Intrinsic::vp_fshr, .Type: MVT::i64, .Cost: 7},
797 {.ISD: Intrinsic::bitreverse, .Type: MVT::i8, .Cost: 17},
798 {.ISD: Intrinsic::bitreverse, .Type: MVT::i16, .Cost: 24},
799 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 33},
800 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 52},
801 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i8, .Cost: 17},
802 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i16, .Cost: 24},
803 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i32, .Cost: 33},
804 {.ISD: Intrinsic::vp_bitreverse, .Type: MVT::i64, .Cost: 52},
805 {.ISD: Intrinsic::ctpop, .Type: MVT::i8, .Cost: 12},
806 {.ISD: Intrinsic::ctpop, .Type: MVT::i16, .Cost: 19},
807 {.ISD: Intrinsic::ctpop, .Type: MVT::i32, .Cost: 20},
808 {.ISD: Intrinsic::ctpop, .Type: MVT::i64, .Cost: 21},
809 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i8, .Cost: 12},
810 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i16, .Cost: 19},
811 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i32, .Cost: 20},
812 {.ISD: Intrinsic::vp_ctpop, .Type: MVT::i64, .Cost: 21},
813 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i8, .Cost: 19},
814 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i16, .Cost: 28},
815 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i32, .Cost: 31},
816 {.ISD: Intrinsic::vp_ctlz, .Type: MVT::i64, .Cost: 35},
817 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i8, .Cost: 16},
818 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i16, .Cost: 23},
819 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i32, .Cost: 24},
820 {.ISD: Intrinsic::vp_cttz, .Type: MVT::i64, .Cost: 25},
821};
822
823static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) {
824 switch (ID) {
825#define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \
826 case Intrinsic::VPID: \
827 return ISD::VPSD;
828#include "llvm/IR/VPIntrinsics.def"
829#undef HELPER_MAP_VPID_TO_VPSD
830 }
831 return ISD::DELETED_NODE;
832}
833
834InstructionCost
835RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
836 TTI::TargetCostKind CostKind) {
837 auto *RetTy = ICA.getReturnType();
838 switch (ICA.getID()) {
839 case Intrinsic::ceil:
840 case Intrinsic::floor:
841 case Intrinsic::trunc:
842 case Intrinsic::rint:
843 case Intrinsic::lrint:
844 case Intrinsic::llrint:
845 case Intrinsic::round:
846 case Intrinsic::roundeven: {
847 // These all use the same code.
848 auto LT = getTypeLegalizationCost(Ty: RetTy);
849 if (!LT.second.isVector() && TLI->isOperationCustom(Op: ISD::FCEIL, VT: LT.second))
850 return LT.first * 8;
851 break;
852 }
853 case Intrinsic::umin:
854 case Intrinsic::umax:
855 case Intrinsic::smin:
856 case Intrinsic::smax: {
857 auto LT = getTypeLegalizationCost(Ty: RetTy);
858 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
859 return LT.first;
860
861 if (ST->hasVInstructions() && LT.second.isVector()) {
862 unsigned Op;
863 switch (ICA.getID()) {
864 case Intrinsic::umin:
865 Op = RISCV::VMINU_VV;
866 break;
867 case Intrinsic::umax:
868 Op = RISCV::VMAXU_VV;
869 break;
870 case Intrinsic::smin:
871 Op = RISCV::VMIN_VV;
872 break;
873 case Intrinsic::smax:
874 Op = RISCV::VMAX_VV;
875 break;
876 }
877 return LT.first * getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind);
878 }
879 break;
880 }
881 case Intrinsic::sadd_sat:
882 case Intrinsic::ssub_sat:
883 case Intrinsic::uadd_sat:
884 case Intrinsic::usub_sat:
885 case Intrinsic::fabs:
886 case Intrinsic::sqrt: {
887 auto LT = getTypeLegalizationCost(Ty: RetTy);
888 if (ST->hasVInstructions() && LT.second.isVector())
889 return LT.first;
890 break;
891 }
892 case Intrinsic::ctpop: {
893 auto LT = getTypeLegalizationCost(Ty: RetTy);
894 if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector())
895 return LT.first;
896 break;
897 }
898 case Intrinsic::abs: {
899 auto LT = getTypeLegalizationCost(Ty: RetTy);
900 if (ST->hasVInstructions() && LT.second.isVector()) {
901 // vrsub.vi v10, v8, 0
902 // vmax.vv v8, v8, v10
903 return LT.first * 2;
904 }
905 break;
906 }
907 case Intrinsic::get_active_lane_mask: {
908 if (ST->hasVInstructions()) {
909 Type *ExpRetTy = VectorType::get(
910 ElementType: ICA.getArgTypes()[0], EC: cast<VectorType>(Val: RetTy)->getElementCount());
911 auto LT = getTypeLegalizationCost(Ty: ExpRetTy);
912
913 // vid.v v8 // considered hoisted
914 // vsaddu.vx v8, v8, a0
915 // vmsltu.vx v0, v8, a1
916 return LT.first *
917 getRISCVInstructionCost(OpCodes: {RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
918 VT: LT.second, CostKind);
919 }
920 break;
921 }
  // TODO: add more intrinsics
923 case Intrinsic::experimental_stepvector: {
924 auto LT = getTypeLegalizationCost(Ty: RetTy);
925 // Legalisation of illegal types involves an `index' instruction plus
926 // (LT.first - 1) vector adds.
927 if (ST->hasVInstructions())
928 return getRISCVInstructionCost(OpCodes: RISCV::VID_V, VT: LT.second, CostKind) +
929 (LT.first - 1) *
930 getRISCVInstructionCost(OpCodes: RISCV::VADD_VX, VT: LT.second, CostKind);
931 return 1 + (LT.first - 1);
932 }
933 case Intrinsic::experimental_cttz_elts: {
934 Type *ArgTy = ICA.getArgTypes()[0];
935 EVT ArgType = TLI->getValueType(DL, Ty: ArgTy, AllowUnknown: true);
936 if (getTLI()->shouldExpandCttzElements(VT: ArgType))
937 break;
938 InstructionCost Cost = getRISCVInstructionCost(
939 OpCodes: RISCV::VFIRST_M, VT: getTypeLegalizationCost(Ty: ArgTy).second, CostKind);
940
941 // If zero_is_poison is false, then we will generate additional
942 // cmp + select instructions to convert -1 to EVL.
943 Type *BoolTy = Type::getInt1Ty(C&: RetTy->getContext());
944 if (ICA.getArgs().size() > 1 &&
945 cast<ConstantInt>(Val: ICA.getArgs()[1])->isZero())
946 Cost += getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: BoolTy, CondTy: RetTy,
947 VecPred: CmpInst::ICMP_SLT, CostKind) +
948 getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: RetTy, CondTy: BoolTy,
949 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
950
951 return Cost;
952 }
953 case Intrinsic::vp_rint: {
    // The RISC-V target uses at least 5 instructions to lower rounding
    // intrinsics.
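    // Example sequence (illustrative), where fa5 holds the integer-rounding
    // threshold constant:
    //   vfabs.v     v9, v8
    //   vmflt.vf    v0, v9, fa5
    //   vfcvt.x.f.v v9, v8, v0.t
    //   vfcvt.f.x.v v9, v9, v0.t
    //   vfsgnj.vv   v8, v9, v8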
955 unsigned Cost = 5;
956 auto LT = getTypeLegalizationCost(Ty: RetTy);
957 if (TLI->isOperationCustom(Op: ISD::VP_FRINT, VT: LT.second))
958 return Cost * LT.first;
959 break;
960 }
961 case Intrinsic::vp_nearbyint: {
    // One more read and one write of fflags than vp_rint.
963 unsigned Cost = 7;
964 auto LT = getTypeLegalizationCost(Ty: RetTy);
965 if (TLI->isOperationCustom(Op: ISD::VP_FRINT, VT: LT.second))
966 return Cost * LT.first;
967 break;
968 }
969 case Intrinsic::vp_ceil:
970 case Intrinsic::vp_floor:
971 case Intrinsic::vp_round:
972 case Intrinsic::vp_roundeven:
973 case Intrinsic::vp_roundtozero: {
    // Rounding with a static rounding mode needs two more instructions than
    // vp_rint to swap/write FRM.
976 unsigned Cost = 7;
977 auto LT = getTypeLegalizationCost(Ty: RetTy);
978 unsigned VPISD = getISDForVPIntrinsicID(ID: ICA.getID());
979 if (TLI->isOperationCustom(Op: VPISD, VT: LT.second))
980 return Cost * LT.first;
981 break;
982 }
983 // vp integer arithmetic ops.
984 case Intrinsic::vp_add:
985 case Intrinsic::vp_and:
986 case Intrinsic::vp_ashr:
987 case Intrinsic::vp_lshr:
988 case Intrinsic::vp_mul:
989 case Intrinsic::vp_or:
990 case Intrinsic::vp_sdiv:
991 case Intrinsic::vp_shl:
992 case Intrinsic::vp_srem:
993 case Intrinsic::vp_sub:
994 case Intrinsic::vp_udiv:
995 case Intrinsic::vp_urem:
996 case Intrinsic::vp_xor:
997 // vp float arithmetic ops.
998 case Intrinsic::vp_fadd:
999 case Intrinsic::vp_fsub:
1000 case Intrinsic::vp_fmul:
1001 case Intrinsic::vp_fdiv:
1002 case Intrinsic::vp_frem: {
1003 std::optional<unsigned> FOp =
1004 VPIntrinsic::getFunctionalOpcodeForVP(ID: ICA.getID());
1005 if (FOp)
1006 return getArithmeticInstrCost(Opcode: *FOp, Ty: ICA.getReturnType(), CostKind);
1007 break;
1008 }
1009 }
1010
1011 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1012 if (auto LT = getTypeLegalizationCost(Ty: RetTy);
1013 LT.second.isVector()) {
1014 MVT EltTy = LT.second.getVectorElementType();
1015 if (const auto *Entry = CostTableLookup(Table: VectorIntrinsicCostTable,
1016 ISD: ICA.getID(), Ty: EltTy))
1017 return LT.first * Entry->Cost;
1018 }
1019 }
1020
1021 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1022}
1023
1024InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
1025 Type *Src,
1026 TTI::CastContextHint CCH,
1027 TTI::TargetCostKind CostKind,
1028 const Instruction *I) {
1029 bool IsVectorType = isa<VectorType>(Val: Dst) && isa<VectorType>(Val: Src);
1030 if (!IsVectorType)
1031 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1032
1033 bool IsTypeLegal = isTypeLegal(Ty: Src) && isTypeLegal(Ty: Dst) &&
1034 (Src->getScalarSizeInBits() <= ST->getELen()) &&
1035 (Dst->getScalarSizeInBits() <= ST->getELen());
1036
1037 // FIXME: Need to compute legalizing cost for illegal types.
1038 if (!IsTypeLegal)
1039 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1040
1041 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
1042 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: Dst);
1043
1044 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1045 assert(ISD && "Invalid opcode");
1046
1047 int PowDiff = (int)Log2_32(Value: Dst->getScalarSizeInBits()) -
1048 (int)Log2_32(Value: Src->getScalarSizeInBits());
1049 switch (ISD) {
1050 case ISD::SIGN_EXTEND:
1051 case ISD::ZERO_EXTEND: {
1052 const unsigned SrcEltSize = Src->getScalarSizeInBits();
1053 if (SrcEltSize == 1) {
      // We do not use vsext/vzext to extend from a mask vector. Instead we use
      // the following instruction sequence:
1056 // vmv.v.i v8, 0
1057 // vmerge.vim v8, v8, -1, v0
1058 return getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_I, RISCV::VMERGE_VIM},
1059 VT: DstLT.second, CostKind);
1060 }
1061 if ((PowDiff < 1) || (PowDiff > 3))
1062 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1063 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1064 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1065 unsigned Op =
1066 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1067 return getRISCVInstructionCost(OpCodes: Op, VT: DstLT.second, CostKind);
1068 }
1069 case ISD::TRUNCATE:
1070 if (Dst->getScalarSizeInBits() == 1) {
      // We do not use a series of vncvt instructions to truncate to a mask
      // vector, so we cannot use PowDiff to calculate the cost. Instead we use
      // the following instruction sequence:
1074 // vand.vi v8, v8, 1
1075 // vmsne.vi v0, v8, 0
1076 return getRISCVInstructionCost(OpCodes: {RISCV::VAND_VI, RISCV::VMSNE_VI},
1077 VT: SrcLT.second, CostKind);
1078 }
1079 [[fallthrough]];
1080 case ISD::FP_EXTEND:
1081 case ISD::FP_ROUND: {
1082 // Counts of narrow/widen instructions.
1083 unsigned SrcEltSize = Src->getScalarSizeInBits();
1084 unsigned DstEltSize = Dst->getScalarSizeInBits();
1085
1086 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1087 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1088 : RISCV::VFNCVT_F_F_W;
1089 InstructionCost Cost = 0;
1090 for (; SrcEltSize != DstEltSize;) {
1091 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1092 ? MVT::getIntegerVT(BitWidth: DstEltSize)
1093 : MVT::getFloatingPointVT(BitWidth: DstEltSize);
1094 MVT DstMVT = DstLT.second.changeVectorElementType(EltVT: ElementMVT);
1095 DstEltSize =
1096 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1097 Cost += getRISCVInstructionCost(OpCodes: Op, VT: DstMVT, CostKind);
1098 }
1099 return Cost;
1100 }
1101 case ISD::FP_TO_SINT:
1102 case ISD::FP_TO_UINT:
1103 case ISD::SINT_TO_FP:
1104 case ISD::UINT_TO_FP:
1105 if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) {
      // The cost of converting from or to a mask vector is different from the
      // other cases, so we cannot use PowDiff to calculate it.
1108 // For mask vector to fp, we should use the following instructions:
1109 // vmv.v.i v8, 0
1110 // vmerge.vim v8, v8, -1, v0
1111 // vfcvt.f.x.v v8, v8
1112
1113 // And for fp vector to mask, we use:
1114 // vfncvt.rtz.x.f.w v9, v8
1115 // vand.vi v8, v9, 1
1116 // vmsne.vi v0, v8, 0
1117 return 3;
1118 }
1119 if (std::abs(x: PowDiff) <= 1)
1120 return 1;
    // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
    // so it only needs two conversions.
1123 if (Src->isIntOrIntVectorTy())
1124 return 2;
1125 // Counts of narrow/widen instructions.
1126 return std::abs(x: PowDiff);
1127 }
1128 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1129}
1130
1131unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) {
1132 if (isa<ScalableVectorType>(Val: Ty)) {
1133 const unsigned EltSize = DL.getTypeSizeInBits(Ty: Ty->getElementType());
1134 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1135 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1136 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1137 }
1138 return cast<FixedVectorType>(Val: Ty)->getNumElements();
1139}
1140
1141InstructionCost
1142RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1143 FastMathFlags FMF,
1144 TTI::TargetCostKind CostKind) {
1145 if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors())
1146 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1147
1148 // Skip if scalar size of Ty is bigger than ELEN.
1149 if (Ty->getScalarSizeInBits() > ST->getELen())
1150 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1151
1152 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1153 if (Ty->getElementType()->isIntegerTy(Bitwidth: 1)) {
    // SelectionDAGBuilder does the following transforms:
1155 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1156 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1157 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1158 return getArithmeticReductionCost(Opcode: Instruction::Or, Ty, FMF, CostKind);
1159 else
1160 return getArithmeticReductionCost(Opcode: Instruction::And, Ty, FMF, CostKind);
1161 }
1162
1163 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1164 SmallVector<unsigned, 3> Opcodes;
1165 InstructionCost ExtraCost = 0;
1166 switch (IID) {
1167 case Intrinsic::maximum:
1168 if (FMF.noNaNs()) {
1169 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1170 } else {
1171 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1172 RISCV::VFMV_F_S};
        // Cost of canonical NaN + branch
1174 // lui a0, 523264
1175 // fmv.w.x fa0, a0
1176 Type *DstTy = Ty->getScalarType();
1177 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1178 Type *SrcTy = IntegerType::getIntNTy(C&: DstTy->getContext(), N: EltTyBits);
1179 ExtraCost = 1 +
1180 getCastInstrCost(Opcode: Instruction::UIToFP, Dst: DstTy, Src: SrcTy,
1181 CCH: TTI::CastContextHint::None, CostKind) +
1182 getCFInstrCost(Opcode: Instruction::Br, CostKind);
1183 }
1184 break;
1185
1186 case Intrinsic::minimum:
1187 if (FMF.noNaNs()) {
1188 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1189 } else {
1190 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1191 RISCV::VFMV_F_S};
        // Cost of canonical NaN + branch
1193 // lui a0, 523264
1194 // fmv.w.x fa0, a0
1195 Type *DstTy = Ty->getScalarType();
1196 const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: DstTy);
1197 Type *SrcTy = IntegerType::getIntNTy(C&: DstTy->getContext(), N: EltTyBits);
1198 ExtraCost = 1 +
1199 getCastInstrCost(Opcode: Instruction::UIToFP, Dst: DstTy, Src: SrcTy,
1200 CCH: TTI::CastContextHint::None, CostKind) +
1201 getCFInstrCost(Opcode: Instruction::Br, CostKind);
1202 }
1203 break;
1204 }
1205 return ExtraCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
1206 }
1207
  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
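  // Example (illustrative) for vector_reduce_umax:
  //   vmv.s.x     v9, a0          (neutral/start value)
  //   vredmaxu.vs v9, v8, v9
  //   vmv.x.s     a0, v9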
1209 unsigned SplitOp;
1210 SmallVector<unsigned, 3> Opcodes;
1211 switch (IID) {
1212 default:
1213 llvm_unreachable("Unsupported intrinsic");
1214 case Intrinsic::smax:
1215 SplitOp = RISCV::VMAX_VV;
1216 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1217 break;
1218 case Intrinsic::smin:
1219 SplitOp = RISCV::VMIN_VV;
1220 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1221 break;
1222 case Intrinsic::umax:
1223 SplitOp = RISCV::VMAXU_VV;
1224 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1225 break;
1226 case Intrinsic::umin:
1227 SplitOp = RISCV::VMINU_VV;
1228 Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1229 break;
1230 case Intrinsic::maxnum:
1231 SplitOp = RISCV::VFMAX_VV;
1232 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1233 break;
1234 case Intrinsic::minnum:
1235 SplitOp = RISCV::VFMIN_VV;
1236 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1237 break;
1238 }
1239 // Add a cost for data larger than LMUL8
1240 InstructionCost SplitCost =
1241 (LT.first > 1) ? (LT.first - 1) *
1242 getRISCVInstructionCost(OpCodes: SplitOp, VT: LT.second, CostKind)
1243 : 0;
1244 return SplitCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
1245}
1246
1247InstructionCost
1248RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1249 std::optional<FastMathFlags> FMF,
1250 TTI::TargetCostKind CostKind) {
1251 if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors())
1252 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1253
1254 // Skip if scalar size of Ty is bigger than ELEN.
1255 if (Ty->getScalarSizeInBits() > ST->getELen())
1256 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1257
1258 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1259 assert(ISD && "Invalid opcode");
1260
1261 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1262 ISD != ISD::FADD)
1263 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1264
1265 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1266 SmallVector<unsigned, 3> Opcodes;
1267 Type *ElementTy = Ty->getElementType();
1268 if (ElementTy->isIntegerTy(Bitwidth: 1)) {
1269 if (ISD == ISD::AND) {
1270 // Example sequences:
1271 // vsetvli a0, zero, e8, mf8, ta, ma
1272 // vmnot.m v8, v0
1273 // vcpop.m a0, v8
1274 // seqz a0, a0
1275 Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M};
1276 return (LT.first - 1) +
1277 getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind) +
1278 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy,
1279 VecPred: CmpInst::ICMP_EQ, CostKind);
1280 } else {
1281 // Example sequences:
1282 // vsetvli a0, zero, e8, mf8, ta, ma
1283 // vcpop.m a0, v0
1284 // snez a0, a0
1285 Opcodes = {RISCV::VCPOP_M};
1286 return (LT.first - 1) +
1287 getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind) +
1288 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy,
1289 VecPred: CmpInst::ICMP_NE, CostKind);
1290 }
1291 }
1292
  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
1294 if (TTI::requiresOrderedReduction(FMF)) {
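    // Ordered FP reductions must be evaluated serially, so one vfredosum.vs is
    // emitted per legalized register to carry the scalar accumulator across
    // the split pieces.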
1295 Opcodes.push_back(Elt: RISCV::VFMV_S_F);
1296 for (unsigned i = 0; i < LT.first.getValue(); i++)
1297 Opcodes.push_back(Elt: RISCV::VFREDOSUM_VS);
1298 Opcodes.push_back(Elt: RISCV::VFMV_F_S);
1299 return getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
1300 }
1301 unsigned SplitOp;
1302 switch (ISD) {
1303 case ISD::ADD:
1304 SplitOp = RISCV::VADD_VV;
1305 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
1306 break;
1307 case ISD::OR:
1308 SplitOp = RISCV::VOR_VV;
1309 Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S};
1310 break;
1311 case ISD::XOR:
1312 SplitOp = RISCV::VXOR_VV;
1313 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
1314 break;
1315 case ISD::AND:
1316 SplitOp = RISCV::VAND_VV;
1317 Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
1318 break;
1319 case ISD::FADD:
1320 SplitOp = RISCV::VFADD_VV;
1321 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
1322 break;
1323 }
1324 // Add a cost for data larger than LMUL8
1325 InstructionCost SplitCost =
1326 (LT.first > 1) ? (LT.first - 1) *
1327 getRISCVInstructionCost(OpCodes: SplitOp, VT: LT.second, CostKind)
1328 : 0;
1329 return SplitCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind);
1330}
1331
1332InstructionCost RISCVTTIImpl::getExtendedReductionCost(
1333 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1334 FastMathFlags FMF, TTI::TargetCostKind CostKind) {
1335 if (isa<FixedVectorType>(Val: ValTy) && !ST->useRVVForFixedLengthVectors())
1336 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
1337 FMF, CostKind);
1338
1339 // Skip if scalar size of ResTy is bigger than ELEN.
1340 if (ResTy->getScalarSizeInBits() > ST->getELen())
1341 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
1342 FMF, CostKind);
1343
1344 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
1345 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
1346 FMF, CostKind);
1347
1348 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1349
1350 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
1351 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy,
1352 FMF, CostKind);
1353
1354 return (LT.first - 1) +
1355 getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
1356}
1357
1358InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
1359 TTI::OperandValueInfo OpInfo,
1360 TTI::TargetCostKind CostKind) {
1361 assert(OpInfo.isConstant() && "non constant operand?");
1362 if (!isa<VectorType>(Val: Ty))
1363 // FIXME: We need to account for immediate materialization here, but doing
1364 // a decent job requires more knowledge about the immediate than we
1365 // currently have here.
1366 return 0;
1367
1368 if (OpInfo.isUniform())
1369 // vmv.x.i, vmv.v.x, or vfmv.v.f
1370 // We ignore the cost of the scalar constant materialization to be consistent
1371 // with how we treat scalar constants themselves just above.
1372 return 1;
1373
1374 return getConstantPoolLoadCost(Ty, CostKind);
1375}
1376
1377
1378InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1379 MaybeAlign Alignment,
1380 unsigned AddressSpace,
1381 TTI::TargetCostKind CostKind,
1382 TTI::OperandValueInfo OpInfo,
1383 const Instruction *I) {
1384 EVT VT = TLI->getValueType(DL, Ty: Src, AllowUnknown: true);
1385 // Type legalization can't handle structs
1386 if (VT == MVT::Other)
1387 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1388 CostKind, OpInfo, I);
1389
1390 InstructionCost Cost = 0;
1391 if (Opcode == Instruction::Store && OpInfo.isConstant())
1392 Cost += getStoreImmCost(Ty: Src, OpInfo, CostKind);
1393 InstructionCost BaseCost =
1394 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1395 CostKind, OpInfo, I);
  // Assume memory op costs scale with the number of vector registers possibly
  // accessed by the instruction. Note that BasicTTI already handles the
  // LT.first term for us.
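  // E.g. (rough model): an LMUL=4 access is costed at roughly four times an
  // equivalent LMUL=1 access.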
1399 if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);
1400 LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
1401 BaseCost *= TLI->getLMULCost(VT: LT.second);
1402 return Cost + BaseCost;
}
1405
1406InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
1407 Type *CondTy,
1408 CmpInst::Predicate VecPred,
1409 TTI::TargetCostKind CostKind,
1410 const Instruction *I) {
1411 if (CostKind != TTI::TCK_RecipThroughput)
1412 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1413 I);
1414
1415 if (isa<FixedVectorType>(Val: ValTy) && !ST->useRVVForFixedLengthVectors())
1416 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1417 I);
1418
1419 // Skip if scalar size of ValTy is bigger than ELEN.
1420 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
1421 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1422 I);
1423
1424 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
1425 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
1426 if (CondTy->isVectorTy()) {
1427 if (ValTy->getScalarSizeInBits() == 1) {
1428 // vmandn.mm v8, v8, v9
1429 // vmand.mm v9, v0, v9
1430 // vmor.mm v0, v9, v8
1431 return LT.first *
1432 getRISCVInstructionCost(
1433 OpCodes: {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1434 VT: LT.second, CostKind);
1435 }
1436 // vselect and max/min are supported natively.
1437 return LT.first *
1438 getRISCVInstructionCost(OpCodes: RISCV::VMERGE_VVM, VT: LT.second, CostKind);
1439 }
1440
1441 if (ValTy->getScalarSizeInBits() == 1) {
1442 // vmv.v.x v9, a0
1443 // vmsne.vi v9, v9, 0
1444 // vmandn.mm v8, v8, v9
1445 // vmand.mm v9, v0, v9
1446 // vmor.mm v0, v9, v8
1447 MVT InterimVT = LT.second.changeVectorElementType(EltVT: MVT::i8);
1448 return LT.first *
1449 getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI},
1450 VT: InterimVT, CostKind) +
1451 LT.first * getRISCVInstructionCost(
1452 OpCodes: {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
1453 VT: LT.second, CostKind);
1454 }
1455
1456 // vmv.v.x v10, a0
1457 // vmsne.vi v0, v10, 0
1458 // vmerge.vvm v8, v9, v8, v0
1459 return LT.first * getRISCVInstructionCost(
1460 OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
1461 VT: LT.second, CostKind);
1462 }
1463
1464 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
1465 CmpInst::isIntPredicate(P: VecPred)) {
1466    // Use VMSLT_VV as a stand-in for VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, and
1467    // VMSLE, since they are assumed to incur the same cost on all implementations.
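    // E.g. an 'icmp slt' on a legal integer vector type is modeled as a single
    // mask-producing compare such as vmslt.vv (illustrative; the predicate may
    // actually map to vmseq/vmsne/vmsle/etc.).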
1468 return LT.first *
1469 getRISCVInstructionCost(OpCodes: RISCV::VMSLT_VV, VT: LT.second, CostKind);
1470 }
1471
1472 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
1473 CmpInst::isFPPredicate(P: VecPred)) {
1474
1475    // Use VMXOR_MM and VMXNOR_MM to generate an all-false / all-true mask.
1476 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
1477 return getRISCVInstructionCost(OpCodes: RISCV::VMXOR_MM, VT: LT.second, CostKind);
1478
1479    // If we do not support the input floating-point vector type, fall back to
1480    // the base implementation, which computes the cost as:
1481 // ScalarizeCost + Num * Cost for fixed vector,
1482 // InvalidCost for scalable vector.
1483 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
1484 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
1485 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
1486 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1487 I);
1488
1489 // Assuming vector fp compare and mask instructions are all the same cost
1490 // until a need arises to differentiate them.
1491 switch (VecPred) {
1492 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
1493 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
1494 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
1495 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
1496 return LT.first * getRISCVInstructionCost(
1497 OpCodes: {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
1498 VT: LT.second, CostKind);
1499
1500 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
1501 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
1502 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
1503 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
1504 return LT.first *
1505 getRISCVInstructionCost(OpCodes: {RISCV::VMFLT_VV, RISCV::VMNAND_MM},
1506 VT: LT.second, CostKind);
1507
1508 case CmpInst::FCMP_OEQ: // vmfeq.vv
1509 case CmpInst::FCMP_OGT: // vmflt.vv
1510 case CmpInst::FCMP_OGE: // vmfle.vv
1511 case CmpInst::FCMP_OLT: // vmflt.vv
1512 case CmpInst::FCMP_OLE: // vmfle.vv
1513 case CmpInst::FCMP_UNE: // vmfne.vv
1514 return LT.first *
1515 getRISCVInstructionCost(OpCodes: RISCV::VMFLT_VV, VT: LT.second, CostKind);
1516 default:
1517 break;
1518 }
1519 }
1520
1521  // With ShortForwardBranchOpt or ConditionalMoveFusion, a scalar icmp + select
1522  // pair lowers to SELECT_CC and then to PseudoCCMOVGPR, which expands to a
1523  // conditional branch + mv. The cost of the scalar (icmp + select) is
1524  // therefore modeled as (0 + select instruction cost).
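  // Rough sketch (illustrative):
  //   %c = icmp eq i32 %a, %b
  //   %r = select i1 %c, i32 %x, i32 %y
  // becomes a short forward branch that conditionally skips a single mv, so
  // the compare itself is treated as free and only the move is paid for.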
1525 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(Val: I) &&
1526 ValTy->isIntegerTy() && !I->user_empty()) {
1527 if (all_of(Range: I->users(), P: [&](const User *U) {
1528 return match(V: U, P: m_Select(C: m_Specific(V: I), L: m_Value(), R: m_Value())) &&
1529 U->getType()->isIntegerTy() &&
1530 !isa<ConstantData>(Val: U->getOperand(i: 1)) &&
1531 !isa<ConstantData>(Val: U->getOperand(i: 2));
1532 }))
1533 return 0;
1534 }
1535
1536 // TODO: Add cost for scalar type.
1537
1538 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
1539}
1540
1541InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
1542 TTI::TargetCostKind CostKind,
1543 const Instruction *I) {
1544 if (CostKind != TTI::TCK_RecipThroughput)
1545 return Opcode == Instruction::PHI ? 0 : 1;
1546 // Branches are assumed to be predicted.
1547 return 0;
1548}
1549
1550InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1551 TTI::TargetCostKind CostKind,
1552 unsigned Index, Value *Op0,
1553 Value *Op1) {
1554 assert(Val->isVectorTy() && "This must be a vector type");
1555
1556 if (Opcode != Instruction::ExtractElement &&
1557 Opcode != Instruction::InsertElement)
1558 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1559
1560 // Legalize the type.
1561 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
1562
1563 // This type is legalized to a scalar type.
1564 if (!LT.second.isVector()) {
1565 auto *FixedVecTy = cast<FixedVectorType>(Val);
1566 // If Index is a known constant, cost is zero.
1567 if (Index != -1U)
1568 return 0;
1569 // Extract/InsertElement with non-constant index is very costly when
1570 // scalarized; estimate cost of loads/stores sequence via the stack:
1571 // ExtractElement cost: store vector to stack, load scalar;
1572 // InsertElement cost: store vector to stack, store scalar, load vector.
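    // E.g. (counting scalar memory ops) a 4-element vector costed this way is
    // 4 stores + 1 load for an extract, and 5 stores + 4 loads for an insert.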
1573 Type *ElemTy = FixedVecTy->getElementType();
1574 auto NumElems = FixedVecTy->getNumElements();
1575 auto Align = DL.getPrefTypeAlign(Ty: ElemTy);
1576 InstructionCost LoadCost =
1577 getMemoryOpCost(Opcode: Instruction::Load, Src: ElemTy, Alignment: Align, AddressSpace: 0, CostKind);
1578 InstructionCost StoreCost =
1579 getMemoryOpCost(Opcode: Instruction::Store, Src: ElemTy, Alignment: Align, AddressSpace: 0, CostKind);
1580 return Opcode == Instruction::ExtractElement
1581 ? StoreCost * NumElems + LoadCost
1582 : (StoreCost + LoadCost) * NumElems + StoreCost;
1583 }
1584
1585 // For unsupported scalable vector.
1586 if (LT.second.isScalableVector() && !LT.first.isValid())
1587 return LT.first;
1588
1589 if (!isTypeLegal(Ty: Val))
1590 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1591
1592 // Mask vector extract/insert is expanded via e8.
1593 if (Val->getScalarSizeInBits() == 1) {
1594 VectorType *WideTy =
1595 VectorType::get(ElementType: IntegerType::get(C&: Val->getContext(), NumBits: 8),
1596 EC: cast<VectorType>(Val)->getElementCount());
1597 if (Opcode == Instruction::ExtractElement) {
1598 InstructionCost ExtendCost
1599 = getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: Val,
1600 CCH: TTI::CastContextHint::None, CostKind);
1601 InstructionCost ExtractCost
1602 = getVectorInstrCost(Opcode, Val: WideTy, CostKind, Index, Op0: nullptr, Op1: nullptr);
1603 return ExtendCost + ExtractCost;
1604 }
1605 InstructionCost ExtendCost
1606 = getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: Val,
1607 CCH: TTI::CastContextHint::None, CostKind);
1608 InstructionCost InsertCost
1609 = getVectorInstrCost(Opcode, Val: WideTy, CostKind, Index, Op0: nullptr, Op1: nullptr);
1610 InstructionCost TruncCost
1611 = getCastInstrCost(Opcode: Instruction::Trunc, Dst: Val, Src: WideTy,
1612 CCH: TTI::CastContextHint::None, CostKind);
1613 return ExtendCost + InsertCost + TruncCost;
1614 }
1615
1616
1617  // In RVV, we can use vslidedown + vmv.x.s to extract an element from a
1618  // vector, and vslideup + vmv.s.x to insert an element into a vector.
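  // E.g. extracting element 2 of an integer vector (sketch):
  //   vslidedown.vi v8, v8, 2
  //   vmv.x.s       a0, v8
  // and a non-zero-index insert similarly pairs vmv.s.x with a vslideup.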
1619 unsigned BaseCost = 1;
1620  // For insertelement we also need an addi to compute index + 1 for vslideup.
1621 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
1622
1623 if (Index != -1U) {
1624 // The type may be split. For fixed-width vectors we can normalize the
1625 // index to the new type.
1626 if (LT.second.isFixedLengthVector()) {
1627 unsigned Width = LT.second.getVectorNumElements();
1628 Index = Index % Width;
1629 }
1630
1631 // We could extract/insert the first element without vslidedown/vslideup.
1632 if (Index == 0)
1633 SlideCost = 0;
1634 else if (Opcode == Instruction::InsertElement)
1635 SlideCost = 1; // With a constant index, we do not need to use addi.
1636 }
1637
1638  // Extracting an i64 element on a target with XLEN=32 needs more instructions.
1639 if (Val->getScalarType()->isIntegerTy() &&
1640 ST->getXLen() < Val->getScalarSizeInBits()) {
1641 // For extractelement, we need the following instructions:
1642 // vsetivli zero, 1, e64, m1, ta, mu (not count)
1643 // vslidedown.vx v8, v8, a0
1644 // vmv.x.s a0, v8
1645 // li a1, 32
1646 // vsrl.vx v8, v8, a1
1647 // vmv.x.s a1, v8
1648
1649 // For insertelement, we need the following instructions:
1650 // vsetivli zero, 2, e32, m4, ta, mu (not count)
1651 // vmv.v.i v12, 0
1652 // vslide1up.vx v16, v12, a1
1653 // vslide1up.vx v12, v16, a0
1654 // addi a0, a2, 1
1655 // vsetvli zero, a0, e64, m4, tu, mu (not count)
1656 // vslideup.vx v8, v12, a2
1657
1658 // TODO: should we count these special vsetvlis?
1659 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
1660 }
1661 return BaseCost + SlideCost;
1662}
1663
1664InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
1665 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1666 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
1667 ArrayRef<const Value *> Args, const Instruction *CxtI) {
1668
1669 // TODO: Handle more cost kinds.
1670 if (CostKind != TTI::TCK_RecipThroughput)
1671 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
1672 Args, CxtI);
1673
1674 if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors())
1675 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
1676 Args, CxtI);
1677
1678 // Skip if scalar size of Ty is bigger than ELEN.
1679 if (isa<VectorType>(Val: Ty) && Ty->getScalarSizeInBits() > ST->getELen())
1680 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
1681 Args, CxtI);
1682
1683 // Legalize the type.
1684 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1685
1686 // TODO: Handle scalar type.
1687 if (!LT.second.isVector())
1688 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
1689 Args, CxtI);
1690
1691 auto getConstantMatCost =
1692 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
1693 if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
1694 // Two sub-cases:
1695      // * Has a 5-bit immediate operand which can be splatted.
1696      // * Has a larger immediate which must be materialized in a scalar register.
1697 // We return 0 for both as we currently ignore the cost of materializing
1698 // scalar constants in GPRs.
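      // E.g. 'add %v, splat (i64 5)' can use vadd.vi directly, while
      // 'add %v, splat (i64 1234)' materializes 1234 in a GPR and uses
      // vadd.vx; both constants are treated as free here (illustrative).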
1699 return 0;
1700
1701 return getConstantPoolLoadCost(Ty, CostKind);
1702 };
1703
1704 // Add the cost of materializing any constant vectors required.
1705 InstructionCost ConstantMatCost = 0;
1706 if (Op1Info.isConstant())
1707 ConstantMatCost += getConstantMatCost(0, Op1Info);
1708 if (Op2Info.isConstant())
1709 ConstantMatCost += getConstantMatCost(1, Op2Info);
1710
1711 unsigned Op;
1712 switch (TLI->InstructionOpcodeToISD(Opcode)) {
1713 case ISD::ADD:
1714 case ISD::SUB:
1715 Op = RISCV::VADD_VV;
1716 break;
1717 case ISD::SHL:
1718 case ISD::SRL:
1719 case ISD::SRA:
1720 Op = RISCV::VSLL_VV;
1721 break;
1722 case ISD::AND:
1723 case ISD::OR:
1724 case ISD::XOR:
1725 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
1726 break;
1727 case ISD::MUL:
1728 case ISD::MULHS:
1729 case ISD::MULHU:
1730 Op = RISCV::VMUL_VV;
1731 break;
1732 case ISD::SDIV:
1733 case ISD::UDIV:
1734 Op = RISCV::VDIV_VV;
1735 break;
1736 case ISD::SREM:
1737 case ISD::UREM:
1738 Op = RISCV::VREM_VV;
1739 break;
1740 case ISD::FADD:
1741 case ISD::FSUB:
1742 // TODO: Address FP16 with VFHMIN
1743 Op = RISCV::VFADD_VV;
1744 break;
1745 case ISD::FMUL:
1746 // TODO: Address FP16 with VFHMIN
1747 Op = RISCV::VFMUL_VV;
1748 break;
1749 case ISD::FDIV:
1750 Op = RISCV::VFDIV_VV;
1751 break;
1752 case ISD::FNEG:
1753 Op = RISCV::VFSGNJN_VV;
1754 break;
1755 default:
1756 // Assuming all other instructions have the same cost until a need arises to
1757 // differentiate them.
1758 return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
1759 Opd1Info: Op1Info, Opd2Info: Op2Info,
1760 Args, CxtI);
1761 }
1762
1763 InstructionCost InstrCost = getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind);
1764 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
1765 // ops are twice as expensive as integer ops. Do the same for vectors so
1766 // scalar floating point ops aren't cheaper than their vector equivalents.
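  // E.g. for a type legalizing to a single LMUL=1 register this typically
  // models vfadd.vv as cost 2 versus cost 1 for vadd.vv (illustrative).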
1767 if (Ty->isFPOrFPVectorTy())
1768 InstrCost *= 2;
1769 return ConstantMatCost + LT.first * InstrCost;
1770}
1771
1772// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
1773InstructionCost RISCVTTIImpl::getPointersChainCost(
1774 ArrayRef<const Value *> Ptrs, const Value *Base,
1775 const TTI::PointersChainInfo &Info, Type *AccessTy,
1776 TTI::TargetCostKind CostKind) {
1777 InstructionCost Cost = TTI::TCC_Free;
1778  // In the basic model we only take GEP instructions into account (although a
1779  // pointer here can also be an alloca, a plain value, a constant or constant
1780  // expression, a PHI, a bitcast, or anything else that is allowed to be used
1781  // as a pointer). Typically, if Base is not a GEP instruction and all the
1782  // pointers are relative to the same base address, the rest are GEP
1783  // instructions, PHIs, bitcasts, or constants. When the pointers share a
1784  // base, we cost each non-Base GEP as an ADD operation if any of its
1785  // indices is non-constant.
1786  // If there are no known dependencies between the pointers, the cost is
1787  // calculated as the sum of the costs of the GEP instructions.
1788 for (auto [I, V] : enumerate(First&: Ptrs)) {
1789 const auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
1790 if (!GEP)
1791 continue;
1792 if (Info.isSameBase() && V != Base) {
1793 if (GEP->hasAllConstantIndices())
1794 continue;
1795 // If the chain is unit-stride and BaseReg + stride*i is a legal
1796 // addressing mode, then presume the base GEP is sitting around in a
1797 // register somewhere and check if we can fold the offset relative to
1798 // it.
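      // E.g. with AccessTy = i32, the I-th pointer of a unit-stride chain is
      // Base + 4*I; if a reg+imm addressing mode can fold that offset, the
      // GEP is treated as free.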
1799 unsigned Stride = DL.getTypeStoreSize(Ty: AccessTy);
1800 if (Info.isUnitStride() &&
1801 isLegalAddressingMode(Ty: AccessTy,
1802 /* BaseGV */ nullptr,
1803 /* BaseOffset */ Stride * I,
1804 /* HasBaseReg */ true,
1805 /* Scale */ 0,
1806 AddrSpace: GEP->getType()->getPointerAddressSpace()))
1807 continue;
1808 Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty: GEP->getType(), CostKind,
1809 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
1810 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
1811 Args: std::nullopt);
1812 } else {
1813 SmallVector<const Value *> Indices(GEP->indices());
1814 Cost += getGEPCost(PointeeType: GEP->getSourceElementType(), Ptr: GEP->getPointerOperand(),
1815 Operands: Indices, AccessType: AccessTy, CostKind);
1816 }
1817 }
1818 return Cost;
1819}
1820
1821void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1822 TTI::UnrollingPreferences &UP,
1823 OptimizationRemarkEmitter *ORE) {
1824  // TODO: All of the settings below need more tuning on benchmarks and
1825  //       metrics, with changes applied as needed to improve performance.
1826
1828 if (ST->enableDefaultUnroll())
1829 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
1830
1831  // Enable upper-bound unrolling universally; it is not dependent on the
1832  // conditions below.
1833 UP.UpperBound = true;
1834
1835 // Disable loop unrolling for Oz and Os.
1836 UP.OptSizeThreshold = 0;
1837 UP.PartialOptSizeThreshold = 0;
1838 if (L->getHeader()->getParent()->hasOptSize())
1839 return;
1840
1841 SmallVector<BasicBlock *, 4> ExitingBlocks;
1842 L->getExitingBlocks(ExitingBlocks);
1843 LLVM_DEBUG(dbgs() << "Loop has:\n"
1844 << "Blocks: " << L->getNumBlocks() << "\n"
1845 << "Exit blocks: " << ExitingBlocks.size() << "\n");
1846
1847  // Allow at most one exit other than the latch. This acts as an early bail-out,
1848  // as it mirrors the profitability calculation of the runtime unroller.
1849 if (ExitingBlocks.size() > 2)
1850 return;
1851
1852 // Limit the CFG of the loop body for targets with a branch predictor.
1853 // Allowing 4 blocks permits if-then-else diamonds in the body.
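  // E.g. a single if-then-else diamond typically contributes four blocks:
  // the header, the 'then' block, the 'else' block, and the latch.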
1854 if (L->getNumBlocks() > 4)
1855 return;
1856
1857 // Don't unroll vectorized loops, including the remainder loop
1858 if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized"))
1859 return;
1860
1861 // Scan the loop: don't unroll loops with calls as this could prevent
1862 // inlining.
1863 InstructionCost Cost = 0;
1864 for (auto *BB : L->getBlocks()) {
1865 for (auto &I : *BB) {
1866 // Initial setting - Don't unroll loops containing vectorized
1867 // instructions.
1868 if (I.getType()->isVectorTy())
1869 return;
1870
1871 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
1872 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
1873 if (!isLoweredToCall(F))
1874 continue;
1875 }
1876 return;
1877 }
1878
1879 SmallVector<const Value *> Operands(I.operand_values());
1880 Cost += getInstructionCost(U: &I, Operands,
1881 CostKind: TargetTransformInfo::TCK_SizeAndLatency);
1882 }
1883 }
1884
1885 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
1886
1887 UP.Partial = true;
1888 UP.Runtime = true;
1889 UP.UnrollRemainder = true;
1890 UP.UnrollAndJam = true;
1891 UP.UnrollAndJamInnerLoopThreshold = 60;
1892
1893  // Forcing unrolling of small loops can be very useful because of the
1894  // taken-branch cost of the backedge.
1895 if (Cost < 12)
1896 UP.Force = true;
1897}
1898
1899void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1900 TTI::PeelingPreferences &PP) {
1901 BaseT::getPeelingPreferences(L, SE, PP);
1902}
1903
1904unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) {
1905 TypeSize Size = DL.getTypeSizeInBits(Ty);
1906 if (Ty->isVectorTy()) {
1907 if (Size.isScalable() && ST->hasVInstructions())
1908 return divideCeil(Numerator: Size.getKnownMinValue(), Denominator: RISCV::RVVBitsPerBlock);
1909
1910 if (ST->useRVVForFixedLengthVectors())
1911 return divideCeil(Numerator: Size, Denominator: ST->getRealMinVLen());
1912 }
1913
1914 return BaseT::getRegUsageForType(Ty);
1915}
1916
1917unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1918 if (SLPMaxVF.getNumOccurrences())
1919 return SLPMaxVF;
1920
1921  // Return how many elements can fit in getRegisterBitWidth. This is the
1922  // same routine as used in the LoopVectorizer. We should probably be
1923  // accounting for whether we actually have instructions with the right
1924  // lane type, but we don't have enough information to do that without
1925  // some additional plumbing, which hasn't been justified yet.
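  // E.g. if getRegisterBitWidth reports 256 bits and ElemWidth is 32, the
  // maximum VF returned is 8.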
1926 TypeSize RegWidth =
1927 getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector);
1928 // If no vector registers, or absurd element widths, disable
1929 // vectorization by returning 1.
1930 return std::max<unsigned>(a: 1U, b: RegWidth.getFixedValue() / ElemWidth);
1931}
1932
1933bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
1934 const TargetTransformInfo::LSRCost &C2) {
1935  // The RISC-V-specific heuristic here is "instruction count first priority".
1936 // If we need to emit adds inside the loop to add up base registers, then
1937 // we need at least one extra temporary register.
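  // E.g. a solution with {Insns = 3, NumRegs = 5} is preferred over one with
  // {Insns = 4, NumRegs = 3}, since Insns is compared first (illustrative).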
1938 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
1939 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
1940 return std::tie(args: C1.Insns, args&: C1NumRegs, args: C1.AddRecCost,
1941 args: C1.NumIVMuls, args: C1.NumBaseAdds,
1942 args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
1943 std::tie(args: C2.Insns, args&: C2NumRegs, args: C2.AddRecCost,
1944 args: C2.NumIVMuls, args: C2.NumBaseAdds,
1945 args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
1946}
1947
1948bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
1949 auto *VTy = dyn_cast<VectorType>(Val: DataTy);
1950 if (!VTy || VTy->isScalableTy())
1951 return false;
1952
1953 if (!isLegalMaskedLoadStore(DataType: DataTy, Alignment))
1954 return false;
1955 return true;
1956}
1957
1958bool RISCVTTIImpl::areInlineCompatible(const Function *Caller,
1959 const Function *Callee) const {
1960 const TargetMachine &TM = getTLI()->getTargetMachine();
1961
1962 const FeatureBitset &CallerBits =
1963 TM.getSubtargetImpl(*Caller)->getFeatureBits();
1964 const FeatureBitset &CalleeBits =
1965 TM.getSubtargetImpl(*Callee)->getFeatureBits();
1966
1967  // Inline a callee if its target-features are a subset of the caller's
1968 // target-features.
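  // E.g. a callee requiring only +v can be inlined into a caller built with
  // +v,+zba, but not into a caller that lacks the V extension (illustrative).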
1969 return (CallerBits & CalleeBits) == CalleeBits;
1970}
1971