1 | //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "RISCVTargetTransformInfo.h" |
10 | #include "MCTargetDesc/RISCVMatInt.h" |
11 | #include "llvm/ADT/STLExtras.h" |
12 | #include "llvm/Analysis/TargetTransformInfo.h" |
13 | #include "llvm/CodeGen/BasicTTIImpl.h" |
14 | #include "llvm/CodeGen/CostTable.h" |
15 | #include "llvm/CodeGen/TargetLowering.h" |
16 | #include "llvm/IR/Instructions.h" |
17 | #include "llvm/IR/PatternMatch.h" |
18 | #include <cmath> |
19 | #include <optional> |
20 | using namespace llvm; |
21 | using namespace llvm::PatternMatch; |
22 | |
23 | #define DEBUG_TYPE "riscvtti" |
24 | |
25 | static cl::opt<unsigned> RVVRegisterWidthLMUL( |
26 | "riscv-v-register-bit-width-lmul" , |
27 | cl::desc( |
28 | "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used " |
29 | "by autovectorized code. Fractional LMULs are not supported." ), |
30 | cl::init(Val: 2), cl::Hidden); |
31 | |
32 | static cl::opt<unsigned> SLPMaxVF( |
33 | "riscv-v-slp-max-vf" , |
34 | cl::desc( |
35 | "Overrides result used for getMaximumVF query which is used " |
36 | "exclusively by SLP vectorizer." ), |
37 | cl::Hidden); |
38 | |
39 | InstructionCost |
40 | RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, |
41 | TTI::TargetCostKind CostKind) { |
42 | // Check if the type is valid for all CostKind |
43 | if (!VT.isVector()) |
44 | return InstructionCost::getInvalid(); |
45 | size_t NumInstr = OpCodes.size(); |
46 | if (CostKind == TTI::TCK_CodeSize) |
47 | return NumInstr; |
48 | InstructionCost LMULCost = TLI->getLMULCost(VT); |
49 | if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) |
50 | return LMULCost * NumInstr; |
51 | InstructionCost Cost = 0; |
52 | for (auto Op : OpCodes) { |
53 | switch (Op) { |
54 | case RISCV::VRGATHER_VI: |
55 | Cost += TLI->getVRGatherVICost(VT); |
56 | break; |
57 | case RISCV::VRGATHER_VV: |
58 | Cost += TLI->getVRGatherVVCost(VT); |
59 | break; |
60 | case RISCV::VSLIDEUP_VI: |
61 | case RISCV::VSLIDEDOWN_VI: |
62 | Cost += TLI->getVSlideVICost(VT); |
63 | break; |
64 | case RISCV::VSLIDEUP_VX: |
65 | case RISCV::VSLIDEDOWN_VX: |
66 | Cost += TLI->getVSlideVXCost(VT); |
67 | break; |
68 | case RISCV::VREDMAX_VS: |
69 | case RISCV::VREDMIN_VS: |
70 | case RISCV::VREDMAXU_VS: |
71 | case RISCV::VREDMINU_VS: |
72 | case RISCV::VREDSUM_VS: |
73 | case RISCV::VREDAND_VS: |
74 | case RISCV::VREDOR_VS: |
75 | case RISCV::VREDXOR_VS: |
76 | case RISCV::VFREDMAX_VS: |
77 | case RISCV::VFREDMIN_VS: |
78 | case RISCV::VFREDUSUM_VS: { |
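      // Unordered reductions are modeled as a binary reduction tree, so the
      // cost added below grows roughly with log2(VL).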
79 | unsigned VL = VT.getVectorMinNumElements(); |
80 | if (!VT.isFixedLengthVector()) |
81 | VL *= *getVScaleForTuning(); |
82 | Cost += Log2_32_Ceil(Value: VL); |
83 | break; |
84 | } |
85 | case RISCV::VFREDOSUM_VS: { |
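      // Ordered FP reductions must accumulate elements serially, so the cost
      // added below is linear in VL.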
86 | unsigned VL = VT.getVectorMinNumElements(); |
87 | if (!VT.isFixedLengthVector()) |
88 | VL *= *getVScaleForTuning(); |
89 | Cost += VL; |
90 | break; |
91 | } |
92 | case RISCV::VMV_X_S: |
93 | case RISCV::VMV_S_X: |
94 | case RISCV::VFMV_F_S: |
95 | case RISCV::VFMV_S_F: |
96 | case RISCV::VMOR_MM: |
97 | case RISCV::VMXOR_MM: |
98 | case RISCV::VMAND_MM: |
99 | case RISCV::VMANDN_MM: |
100 | case RISCV::VMNAND_MM: |
101 | case RISCV::VCPOP_M: |
102 | case RISCV::VFIRST_M: |
103 | Cost += 1; |
104 | break; |
105 | default: |
106 | Cost += LMULCost; |
107 | } |
108 | } |
109 | return Cost; |
110 | } |
111 | |
112 | static InstructionCost getIntImmCostImpl(const DataLayout &DL, |
113 | const RISCVSubtarget *ST, |
114 | const APInt &Imm, Type *Ty, |
115 | TTI::TargetCostKind CostKind, |
116 | bool FreeZeroes) { |
117 | assert(Ty->isIntegerTy() && |
118 | "getIntImmCost can only estimate cost of materialising integers" ); |
119 | |
120 | // We have a Zero register, so 0 is always free. |
121 | if (Imm == 0) |
122 | return TTI::TCC_Free; |
123 | |
124 | // Otherwise, we check how many instructions it will take to materialise. |
125 | return RISCVMatInt::getIntMatCost(Val: Imm, Size: DL.getTypeSizeInBits(Ty), STI: *ST, |
126 | /*CompressionCost=*/false, FreeZeroes); |
127 | } |
128 | |
129 | InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
130 | TTI::TargetCostKind CostKind) { |
131 | return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind, FreeZeroes: false); |
132 | } |
133 | |
134 | // Look for patterns of shift followed by AND that can be turned into a pair of |
135 | // shifts. We won't need to materialize an immediate for the AND so these can |
136 | // be considered free. |
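// For example, on RV64 (and (shl x, 4), 0xFF0) can become
// (srli (slli x, 56), 52), so the 0xFF0 mask never has to be materialized.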
137 | static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { |
138 | uint64_t Mask = Imm.getZExtValue(); |
139 | auto *BO = dyn_cast<BinaryOperator>(Val: Inst->getOperand(i: 0)); |
140 | if (!BO || !BO->hasOneUse()) |
141 | return false; |
142 | |
143 | if (BO->getOpcode() != Instruction::Shl) |
144 | return false; |
145 | |
146 | if (!isa<ConstantInt>(Val: BO->getOperand(i_nocapture: 1))) |
147 | return false; |
148 | |
149 | unsigned ShAmt = cast<ConstantInt>(Val: BO->getOperand(i_nocapture: 1))->getZExtValue(); |
150 | // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1 |
151 | // is a mask shifted by c2 bits with c3 leading zeros. |
152 | if (isShiftedMask_64(Value: Mask)) { |
153 | unsigned Trailing = llvm::countr_zero(Val: Mask); |
154 | if (ShAmt == Trailing) |
155 | return true; |
156 | } |
157 | |
158 | return false; |
159 | } |
160 | |
161 | InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
162 | const APInt &Imm, Type *Ty, |
163 | TTI::TargetCostKind CostKind, |
164 | Instruction *Inst) { |
165 | assert(Ty->isIntegerTy() && |
166 | "getIntImmCost can only estimate cost of materialising integers" ); |
167 | |
168 | // We have a Zero register, so 0 is always free. |
169 | if (Imm == 0) |
170 | return TTI::TCC_Free; |
171 | |
172 | // Some instructions in RISC-V can take a 12-bit immediate. Some of these are |
173 | // commutative, in others the immediate comes from a specific argument index. |
174 | bool Takes12BitImm = false; |
175 | unsigned ImmArgIdx = ~0U; |
176 | |
177 | switch (Opcode) { |
178 | case Instruction::GetElementPtr: |
179 | // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will |
180 | // split up large offsets in GEP into better parts than ConstantHoisting |
181 | // can. |
182 | return TTI::TCC_Free; |
183 | case Instruction::Store: { |
    // Use the materialization cost regardless of whether it's the address or
    // the value that is constant, except if the store is misaligned and
    // misaligned accesses are not legal (experience shows constant hoisting
    // can sometimes be harmful in such cases).
188 | if (Idx == 1 || !Inst) |
189 | return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind, |
190 | /*FreeZeroes=*/true); |
191 | |
192 | StoreInst *ST = cast<StoreInst>(Val: Inst); |
193 | if (!getTLI()->allowsMemoryAccessForAlignment( |
194 | Context&: Ty->getContext(), DL, VT: getTLI()->getValueType(DL, Ty), |
195 | AddrSpace: ST->getPointerAddressSpace(), Alignment: ST->getAlign())) |
196 | return TTI::TCC_Free; |
197 | |
198 | return getIntImmCostImpl(DL: getDataLayout(), ST: getST(), Imm, Ty, CostKind, |
199 | /*FreeZeroes=*/true); |
200 | } |
201 | case Instruction::Load: |
202 | // If the address is a constant, use the materialization cost. |
203 | return getIntImmCost(Imm, Ty, CostKind); |
204 | case Instruction::And: |
205 | // zext.h |
206 | if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb()) |
207 | return TTI::TCC_Free; |
208 | // zext.w |
209 | if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba()) |
210 | return TTI::TCC_Free; |
211 | // bclri |
212 | if (ST->hasStdExtZbs() && (~Imm).isPowerOf2()) |
213 | return TTI::TCC_Free; |
214 | if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && |
215 | canUseShiftPair(Inst, Imm)) |
216 | return TTI::TCC_Free; |
217 | Takes12BitImm = true; |
218 | break; |
219 | case Instruction::Add: |
220 | Takes12BitImm = true; |
221 | break; |
222 | case Instruction::Or: |
223 | case Instruction::Xor: |
224 | // bseti/binvi |
225 | if (ST->hasStdExtZbs() && Imm.isPowerOf2()) |
226 | return TTI::TCC_Free; |
227 | Takes12BitImm = true; |
228 | break; |
229 | case Instruction::Mul: |
230 | // Power of 2 is a shift. Negated power of 2 is a shift and a negate. |
231 | if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2()) |
232 | return TTI::TCC_Free; |
233 | // One more or less than a power of 2 can use SLLI+ADD/SUB. |
234 | if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2()) |
235 | return TTI::TCC_Free; |
236 | // FIXME: There is no MULI instruction. |
237 | Takes12BitImm = true; |
238 | break; |
239 | case Instruction::Sub: |
240 | case Instruction::Shl: |
241 | case Instruction::LShr: |
242 | case Instruction::AShr: |
243 | Takes12BitImm = true; |
244 | ImmArgIdx = 1; |
245 | break; |
246 | default: |
247 | break; |
248 | } |
249 | |
250 | if (Takes12BitImm) { |
251 | // Check immediate is the correct argument... |
252 | if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) { |
253 | // ... and fits into the 12-bit immediate. |
254 | if (Imm.getSignificantBits() <= 64 && |
255 | getTLI()->isLegalAddImmediate(Imm: Imm.getSExtValue())) { |
256 | return TTI::TCC_Free; |
257 | } |
258 | } |
259 | |
260 | // Otherwise, use the full materialisation cost. |
261 | return getIntImmCost(Imm, Ty, CostKind); |
262 | } |
263 | |
264 | // By default, prevent hoisting. |
265 | return TTI::TCC_Free; |
266 | } |
267 | |
268 | InstructionCost |
269 | RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
270 | const APInt &Imm, Type *Ty, |
271 | TTI::TargetCostKind CostKind) { |
272 | // Prevent hoisting in unknown cases. |
273 | return TTI::TCC_Free; |
274 | } |
275 | |
276 | bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const { |
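  // RVV natively supports an active vector length (VL) operand, so VP-style
  // predication is available whenever vector instructions are present.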
277 | return ST->hasVInstructions(); |
278 | } |
279 | |
280 | TargetTransformInfo::PopcntSupportKind |
281 | RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) { |
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
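  // Zbb and the vendor XCVbitmanip extension are treated as providing fast
  // hardware popcount; otherwise popcount must be emulated in software.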
283 | return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip() |
284 | ? TTI::PSK_FastHardware |
285 | : TTI::PSK_Software; |
286 | } |
287 | |
288 | bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { |
289 | // Currently, the ExpandReductions pass can't expand scalable-vector |
290 | // reductions, but we still request expansion as RVV doesn't support certain |
291 | // reductions and the SelectionDAG can't legalize them either. |
292 | switch (II->getIntrinsicID()) { |
293 | default: |
294 | return false; |
295 | // These reductions have no equivalent in RVV |
296 | case Intrinsic::vector_reduce_mul: |
297 | case Intrinsic::vector_reduce_fmul: |
298 | return true; |
299 | } |
300 | } |
301 | |
302 | std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const { |
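  // vscale corresponds to VLEN / RVVBitsPerBlock, so the maximum vscale
  // follows directly from the largest supported VLEN.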
303 | if (ST->hasVInstructions()) |
304 | return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock; |
305 | return BaseT::getMaxVScale(); |
306 | } |
307 | |
308 | std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const { |
309 | if (ST->hasVInstructions()) |
310 | if (unsigned MinVLen = ST->getRealMinVLen(); |
311 | MinVLen >= RISCV::RVVBitsPerBlock) |
312 | return MinVLen / RISCV::RVVBitsPerBlock; |
313 | return BaseT::getVScaleForTuning(); |
314 | } |
315 | |
316 | TypeSize |
317 | RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
318 | unsigned LMUL = |
319 | llvm::bit_floor(Value: std::clamp<unsigned>(val: RVVRegisterWidthLMUL, lo: 1, hi: 8)); |
320 | switch (K) { |
321 | case TargetTransformInfo::RGK_Scalar: |
322 | return TypeSize::getFixed(ExactSize: ST->getXLen()); |
323 | case TargetTransformInfo::RGK_FixedWidthVector: |
324 | return TypeSize::getFixed( |
325 | ExactSize: ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0); |
326 | case TargetTransformInfo::RGK_ScalableVector: |
327 | return TypeSize::getScalable( |
328 | MinimumSize: (ST->hasVInstructions() && |
329 | ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock) |
330 | ? LMUL * RISCV::RVVBitsPerBlock |
331 | : 0); |
332 | } |
333 | |
334 | llvm_unreachable("Unsupported register kind" ); |
335 | } |
336 | |
337 | InstructionCost |
338 | RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) { |
339 | // Add a cost of address generation + the cost of the load. The address |
340 | // is expected to be a PC relative offset to a constant pool entry |
341 | // using auipc/addi. |
342 | return 2 + getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment: DL.getABITypeAlign(Ty), |
343 | /*AddressSpace=*/0, CostKind); |
344 | } |
345 | |
346 | static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, |
347 | LLVMContext &C) { |
348 | assert((DataVT.getScalarSizeInBits() != 8 || |
          DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
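  // Use an integer vector of the same shape as the data for the indices; if
  // that element type is wider than XLEN, fall back to i16 indices.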
350 | MVT IndexVT = DataVT.changeTypeToInteger(); |
351 | if (IndexVT.getScalarType().bitsGT(VT: ST.getXLenVT())) |
352 | IndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i16); |
353 | return cast<VectorType>(Val: EVT(IndexVT).getTypeForEVT(Context&: C)); |
354 | } |
355 | |
356 | InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
357 | VectorType *Tp, ArrayRef<int> Mask, |
358 | TTI::TargetCostKind CostKind, |
359 | int Index, VectorType *SubTp, |
360 | ArrayRef<const Value *> Args, |
361 | const Instruction *CxtI) { |
362 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp); |
363 | |
364 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
365 | |
366 | // First, handle cases where having a fixed length vector enables us to |
367 | // give a more accurate cost than falling back to generic scalable codegen. |
368 | // TODO: Each of these cases hints at a modeling gap around scalable vectors. |
369 | if (isa<FixedVectorType>(Val: Tp)) { |
370 | switch (Kind) { |
371 | default: |
372 | break; |
373 | case TTI::SK_PermuteSingleSrc: { |
374 | if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) { |
375 | MVT EltTp = LT.second.getVectorElementType(); |
376 | // If the size of the element is < ELEN then shuffles of interleaves and |
377 | // deinterleaves of 2 vectors can be lowered into the following |
378 | // sequences |
379 | if (EltTp.getScalarSizeInBits() < ST->getELen()) { |
380 | // Example sequence: |
381 | // vsetivli zero, 4, e8, mf4, ta, ma (ignored) |
382 | // vwaddu.vv v10, v8, v9 |
383 | // li a0, -1 (ignored) |
384 | // vwmaccu.vx v10, a0, v9 |
385 | if (ShuffleVectorInst::isInterleaveMask(Mask, Factor: 2, NumInputElts: Mask.size())) |
386 | return 2 * LT.first * TLI->getLMULCost(VT: LT.second); |
387 | |
388 | if (Mask[0] == 0 || Mask[0] == 1) { |
389 | auto DeinterleaveMask = createStrideMask(Start: Mask[0], Stride: 2, VF: Mask.size()); |
390 | // Example sequence: |
391 | // vnsrl.wi v10, v8, 0 |
392 | if (equal(LRange&: DeinterleaveMask, RRange&: Mask)) |
393 | return LT.first * getRISCVInstructionCost(OpCodes: RISCV::VNSRL_WI, |
394 | VT: LT.second, CostKind); |
395 | } |
396 | } |
397 | } |
398 | // vrgather + cost of generating the mask constant. |
399 | // We model this for an unknown mask with a single vrgather. |
400 | if (LT.second.isFixedLengthVector() && LT.first == 1 && |
401 | (LT.second.getScalarSizeInBits() != 8 || |
402 | LT.second.getVectorNumElements() <= 256)) { |
403 | VectorType *IdxTy = getVRGatherIndexType(DataVT: LT.second, ST: *ST, C&: Tp->getContext()); |
404 | InstructionCost IndexCost = getConstantPoolLoadCost(Ty: IdxTy, CostKind); |
405 | return IndexCost + |
406 | getRISCVInstructionCost(OpCodes: RISCV::VRGATHER_VV, VT: LT.second, CostKind); |
407 | } |
408 | [[fallthrough]]; |
409 | } |
410 | case TTI::SK_Transpose: |
411 | case TTI::SK_PermuteTwoSrc: { |
412 | // 2 x (vrgather + cost of generating the mask constant) + cost of mask |
413 | // register for the second vrgather. We model this for an unknown |
414 | // (shuffle) mask. |
415 | if (LT.second.isFixedLengthVector() && LT.first == 1 && |
416 | (LT.second.getScalarSizeInBits() != 8 || |
417 | LT.second.getVectorNumElements() <= 256)) { |
418 | auto &C = Tp->getContext(); |
419 | auto EC = Tp->getElementCount(); |
420 | VectorType *IdxTy = getVRGatherIndexType(DataVT: LT.second, ST: *ST, C); |
421 | VectorType *MaskTy = VectorType::get(ElementType: IntegerType::getInt1Ty(C), EC); |
422 | InstructionCost IndexCost = getConstantPoolLoadCost(Ty: IdxTy, CostKind); |
423 | InstructionCost MaskCost = getConstantPoolLoadCost(Ty: MaskTy, CostKind); |
424 | return 2 * IndexCost + |
425 | getRISCVInstructionCost(OpCodes: {RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, |
426 | VT: LT.second, CostKind) + |
427 | MaskCost; |
428 | } |
429 | [[fallthrough]]; |
430 | } |
431 | case TTI::SK_Select: { |
      // We are going to permute multiple sources and the result will be in
      // multiple destinations. We provide an accurate cost only for splits
      // where the element type remains the same.
435 | if (!Mask.empty() && LT.first.isValid() && LT.first != 1 && |
436 | LT.second.isFixedLengthVector() && |
437 | LT.second.getVectorElementType().getSizeInBits() == |
438 | Tp->getElementType()->getPrimitiveSizeInBits() && |
439 | LT.second.getVectorNumElements() < |
440 | cast<FixedVectorType>(Val: Tp)->getNumElements() && |
441 | divideCeil(Numerator: Mask.size(), |
442 | Denominator: cast<FixedVectorType>(Val: Tp)->getNumElements()) == |
443 | static_cast<unsigned>(*LT.first.getValue())) { |
444 | unsigned NumRegs = *LT.first.getValue(); |
445 | unsigned VF = cast<FixedVectorType>(Val: Tp)->getNumElements(); |
446 | unsigned SubVF = PowerOf2Ceil(A: VF / NumRegs); |
447 | auto *SubVecTy = FixedVectorType::get(ElementType: Tp->getElementType(), NumElts: SubVF); |
448 | |
449 | InstructionCost Cost = 0; |
450 | for (unsigned I = 0; I < NumRegs; ++I) { |
451 | bool IsSingleVector = true; |
452 | SmallVector<int> SubMask(SubVF, PoisonMaskElem); |
453 | transform(Range: Mask.slice(N: I * SubVF, |
454 | M: I == NumRegs - 1 ? Mask.size() % SubVF : SubVF), |
455 | d_first: SubMask.begin(), F: [&](int I) { |
456 | bool SingleSubVector = I / VF == 0; |
457 | IsSingleVector &= SingleSubVector; |
458 | return (SingleSubVector ? 0 : 1) * SubVF + I % VF; |
459 | }); |
460 | Cost += getShuffleCost(Kind: IsSingleVector ? TTI::SK_PermuteSingleSrc |
461 | : TTI::SK_PermuteTwoSrc, |
462 | Tp: SubVecTy, Mask: SubMask, CostKind, Index: 0, SubTp: nullptr); |
463 | return Cost; |
464 | } |
465 | } |
466 | break; |
467 | } |
468 | } |
469 | }; |
470 | |
471 | // Handle scalable vectors (and fixed vectors legalized to scalable vectors). |
472 | switch (Kind) { |
473 | default: |
474 | // Fallthrough to generic handling. |
475 | // TODO: Most of these cases will return getInvalid in generic code, and |
476 | // must be implemented here. |
477 | break; |
478 | case TTI::SK_ExtractSubvector: |
479 | // Extract at zero is always a subregister extract |
480 | if (Index == 0) |
481 | return TTI::TCC_Free; |
482 | |
483 | // If we're extracting a subvector of at most m1 size at a sub-register |
484 | // boundary - which unfortunately we need exact vlen to identify - this is |
485 | // a subregister extract at worst and thus won't require a vslidedown. |
486 | // TODO: Extend for aligned m2, m4 subvector extracts |
    // TODO: Extend for misaligned (but contained) extracts
488 | // TODO: Extend for scalable subvector types |
489 | if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
490 | SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) { |
491 | const unsigned MinVLen = ST->getRealMinVLen(); |
492 | const unsigned MaxVLen = ST->getRealMaxVLen(); |
493 | if (MinVLen == MaxVLen && |
494 | SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 && |
495 | SubLT.second.getSizeInBits() <= MinVLen) |
496 | return TTI::TCC_Free; |
497 | } |
498 | |
499 | // Example sequence: |
500 | // vsetivli zero, 4, e8, mf2, tu, ma (ignored) |
501 | // vslidedown.vi v8, v9, 2 |
502 | return LT.first * |
503 | getRISCVInstructionCost(OpCodes: RISCV::VSLIDEDOWN_VI, VT: LT.second, CostKind); |
504 | case TTI::SK_InsertSubvector: |
505 | // Example sequence: |
506 | // vsetivli zero, 4, e8, mf2, tu, ma (ignored) |
507 | // vslideup.vi v8, v9, 2 |
508 | return LT.first * |
509 | getRISCVInstructionCost(OpCodes: RISCV::VSLIDEUP_VI, VT: LT.second, CostKind); |
510 | case TTI::SK_Select: { |
511 | // Example sequence: |
512 | // li a0, 90 |
513 | // vsetivli zero, 8, e8, mf2, ta, ma (ignored) |
514 | // vmv.s.x v0, a0 |
515 | // vmerge.vvm v8, v9, v8, v0 |
516 | // We use 2 for the cost of the mask materialization as this is the true |
517 | // cost for small masks and most shuffles are small. At worst, this cost |
518 | // should be a very small constant for the constant pool load. As such, |
    // we may bias towards large selects slightly more than truly warranted.
520 | return LT.first * |
521 | (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_S_X, RISCV::VMERGE_VVM}, |
522 | VT: LT.second, CostKind)); |
523 | } |
524 | case TTI::SK_Broadcast: { |
525 | bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(V: Args[0]) == |
526 | Instruction::InsertElement); |
527 | if (LT.second.getScalarSizeInBits() == 1) { |
528 | if (HasScalar) { |
529 | // Example sequence: |
530 | // andi a0, a0, 1 |
531 | // vsetivli zero, 2, e8, mf8, ta, ma (ignored) |
532 | // vmv.v.x v8, a0 |
533 | // vmsne.vi v0, v8, 0 |
534 | return LT.first * |
535 | (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI}, |
536 | VT: LT.second, CostKind)); |
537 | } |
538 | // Example sequence: |
539 | // vsetivli zero, 2, e8, mf8, ta, mu (ignored) |
540 | // vmv.v.i v8, 0 |
541 | // vmerge.vim v8, v8, 1, v0 |
542 | // vmv.x.s a0, v8 |
543 | // andi a0, a0, 1 |
544 | // vmv.v.x v8, a0 |
545 | // vmsne.vi v0, v8, 0 |
546 | |
547 | return LT.first * |
548 | (1 + getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_I, RISCV::VMERGE_VIM, |
549 | RISCV::VMV_X_S, RISCV::VMV_V_X, |
550 | RISCV::VMSNE_VI}, |
551 | VT: LT.second, CostKind)); |
552 | } |
553 | |
554 | if (HasScalar) { |
555 | // Example sequence: |
556 | // vmv.v.x v8, a0 |
557 | return LT.first * |
558 | getRISCVInstructionCost(OpCodes: RISCV::VMV_V_X, VT: LT.second, CostKind); |
559 | } |
560 | |
561 | // Example sequence: |
562 | // vrgather.vi v9, v8, 0 |
563 | return LT.first * |
564 | getRISCVInstructionCost(OpCodes: RISCV::VRGATHER_VI, VT: LT.second, CostKind); |
565 | } |
566 | case TTI::SK_Splice: { |
567 | // vslidedown+vslideup. |
568 | // TODO: Multiplying by LT.first implies this legalizes into multiple copies |
569 | // of similar code, but I think we expand through memory. |
570 | unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; |
571 | if (Index >= 0 && Index < 32) |
572 | Opcodes[0] = RISCV::VSLIDEDOWN_VI; |
573 | else if (Index < 0 && Index > -32) |
574 | Opcodes[1] = RISCV::VSLIDEUP_VI; |
575 | return LT.first * getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind); |
576 | } |
577 | case TTI::SK_Reverse: { |
578 | // TODO: Cases to improve here: |
579 | // * Illegal vector types |
580 | // * i64 on RV32 |
581 | // * i1 vector |
582 | // At low LMUL, most of the cost is producing the vrgather index register. |
583 | // At high LMUL, the cost of the vrgather itself will dominate. |
584 | // Example sequence: |
585 | // csrr a0, vlenb |
586 | // srli a0, a0, 3 |
587 | // addi a0, a0, -1 |
588 | // vsetvli a1, zero, e8, mf8, ta, mu (ignored) |
589 | // vid.v v9 |
590 | // vrsub.vx v10, v9, a0 |
591 | // vrgather.vv v9, v8, v10 |
592 | InstructionCost LenCost = 3; |
593 | if (LT.second.isFixedLengthVector()) |
594 | // vrsub.vi has a 5 bit immediate field, otherwise an li suffices |
595 | LenCost = isInt<5>(x: LT.second.getVectorNumElements() - 1) ? 0 : 1; |
596 | unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; |
597 | if (LT.second.isFixedLengthVector() && |
598 | isInt<5>(x: LT.second.getVectorNumElements() - 1)) |
599 | Opcodes[1] = RISCV::VRSUB_VI; |
600 | InstructionCost GatherCost = |
601 | getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind); |
    // Mask operations additionally require an extend and a truncate.
603 | InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(Bitwidth: 1) ? 3 : 0; |
604 | return LT.first * (LenCost + GatherCost + ExtendCost); |
605 | } |
606 | } |
607 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); |
608 | } |
609 | |
610 | InstructionCost |
611 | RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
612 | unsigned AddressSpace, |
613 | TTI::TargetCostKind CostKind) { |
614 | if (!isLegalMaskedLoadStore(DataType: Src, Alignment) || |
615 | CostKind != TTI::TCK_RecipThroughput) |
616 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
617 | CostKind); |
618 | |
619 | return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); |
620 | } |
621 | |
622 | InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( |
623 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
624 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
625 | bool UseMaskForCond, bool UseMaskForGaps) { |
626 | if (isa<ScalableVectorType>(Val: VecTy) && Factor != 2) |
627 | return InstructionCost::getInvalid(); |
628 | |
  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store combined with a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
633 | if (!UseMaskForCond && !UseMaskForGaps && |
634 | Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
635 | auto *VTy = cast<VectorType>(Val: VecTy); |
636 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VTy); |
    // Need to make sure the type hasn't been scalarized.
638 | if (LT.second.isVector()) { |
639 | auto *SubVecTy = |
640 | VectorType::get(ElementType: VTy->getElementType(), |
641 | EC: VTy->getElementCount().divideCoefficientBy(RHS: Factor)); |
642 | |
643 | if (VTy->getElementCount().isKnownMultipleOf(RHS: Factor) && |
644 | TLI->isLegalInterleavedAccessType(VTy: SubVecTy, Factor, Alignment, |
645 | AddrSpace: AddressSpace, DL)) { |
        // FIXME: We use the memory op cost of the *legalized* type here
        // because getMemoryOpCost returns a really expensive cost for types
        // like <6 x i8>, which show up when doing interleaves of Factor=3
        // etc. Should the memory op cost of these be cheaper?
650 | auto *LegalVTy = VectorType::get(ElementType: VTy->getElementType(), |
651 | EC: LT.second.getVectorElementCount()); |
652 | InstructionCost LegalMemCost = getMemoryOpCost( |
653 | Opcode, Src: LegalVTy, Alignment, AddressSpace, CostKind); |
654 | return LT.first + LegalMemCost; |
655 | } |
656 | } |
657 | } |
658 | |
  // TODO: Return the cost of interleaved accesses for scalable vectors when
  // they cannot be converted to segment access instructions.
661 | if (isa<ScalableVectorType>(Val: VecTy)) |
662 | return InstructionCost::getInvalid(); |
663 | |
664 | auto *FVTy = cast<FixedVectorType>(Val: VecTy); |
665 | InstructionCost MemCost = |
666 | getMemoryOpCost(Opcode, Src: VecTy, Alignment, AddressSpace, CostKind); |
667 | unsigned VF = FVTy->getNumElements() / Factor; |
668 | |
669 | // An interleaved load will look like this for Factor=3: |
670 | // %wide.vec = load <12 x i32>, ptr %3, align 4 |
671 | // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask> |
672 | // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> |
673 | // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> |
674 | if (Opcode == Instruction::Load) { |
675 | InstructionCost Cost = MemCost; |
676 | for (unsigned Index : Indices) { |
677 | FixedVectorType *SubVecTy = |
678 | FixedVectorType::get(ElementType: FVTy->getElementType(), NumElts: VF * Factor); |
679 | auto Mask = createStrideMask(Start: Index, Stride: Factor, VF); |
680 | InstructionCost ShuffleCost = |
681 | getShuffleCost(Kind: TTI::ShuffleKind::SK_PermuteSingleSrc, Tp: SubVecTy, Mask, |
682 | CostKind, Index: 0, SubTp: nullptr, Args: {}); |
683 | Cost += ShuffleCost; |
684 | } |
685 | return Cost; |
686 | } |
687 | |
688 | // TODO: Model for NF > 2 |
689 | // We'll need to enhance getShuffleCost to model shuffles that are just |
690 | // inserts and extracts into subvectors, since they won't have the full cost |
691 | // of a vrgather. |
692 | // An interleaved store for 3 vectors of 4 lanes will look like |
693 | // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7> |
694 | // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3> |
695 | // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11> |
696 | // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask> |
697 | // store <12 x i32> %interleaved.vec, ptr %10, align 4 |
698 | if (Factor != 2) |
699 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
700 | Alignment, AddressSpace, CostKind, |
701 | UseMaskForCond, UseMaskForGaps); |
702 | |
  assert(Opcode == Instruction::Store && "Opcode must be a store");
704 | // For an interleaving store of 2 vectors, we perform one large interleaving |
705 | // shuffle that goes into the wide store |
706 | auto Mask = createInterleaveMask(VF, NumVecs: Factor); |
707 | InstructionCost ShuffleCost = |
708 | getShuffleCost(Kind: TTI::ShuffleKind::SK_PermuteSingleSrc, Tp: FVTy, Mask, |
709 | CostKind, Index: 0, SubTp: nullptr, Args: {}); |
710 | return MemCost + ShuffleCost; |
711 | } |
712 | |
713 | InstructionCost RISCVTTIImpl::getGatherScatterOpCost( |
714 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
715 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
716 | if (CostKind != TTI::TCK_RecipThroughput) |
717 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
718 | Alignment, CostKind, I); |
719 | |
720 | if ((Opcode == Instruction::Load && |
721 | !isLegalMaskedGather(DataType: DataTy, Alignment: Align(Alignment))) || |
722 | (Opcode == Instruction::Store && |
723 | !isLegalMaskedScatter(DataType: DataTy, Alignment: Align(Alignment)))) |
724 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
725 | Alignment, CostKind, I); |
726 | |
727 | // Cost is proportional to the number of memory operations implied. For |
728 | // scalable vectors, we use an estimate on that number since we don't |
729 | // know exactly what VL will be. |
730 | auto &VTy = *cast<VectorType>(Val: DataTy); |
731 | InstructionCost MemOpCost = |
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
734 | unsigned NumLoads = getEstimatedVLFor(Ty: &VTy); |
735 | return NumLoads * MemOpCost; |
736 | } |
737 | |
738 | InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( |
739 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
740 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
741 | if (((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
742 | !isLegalStridedLoadStore(DataType: DataTy, Alignment)) || |
743 | (Opcode != Instruction::Load && Opcode != Instruction::Store)) |
744 | return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, |
745 | Alignment, CostKind, I); |
746 | |
747 | if (CostKind == TTI::TCK_CodeSize) |
748 | return TTI::TCC_Basic; |
749 | |
750 | // Cost is proportional to the number of memory operations implied. For |
751 | // scalable vectors, we use an estimate on that number since we don't |
752 | // know exactly what VL will be. |
753 | auto &VTy = *cast<VectorType>(Val: DataTy); |
754 | InstructionCost MemOpCost = |
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
757 | unsigned NumLoads = getEstimatedVLFor(Ty: &VTy); |
758 | return NumLoads * MemOpCost; |
759 | } |
760 | |
761 | // Currently, these represent both throughput and codesize costs |
762 | // for the respective intrinsics. The costs in this table are simply |
763 | // instruction counts with the following adjustments made: |
764 | // * One vsetvli is considered free. |
765 | static const CostTblEntry VectorIntrinsicCostTable[]{ |
    {Intrinsic::floor, MVT::f32, 9},
    {Intrinsic::floor, MVT::f64, 9},
    {Intrinsic::ceil, MVT::f32, 9},
    {Intrinsic::ceil, MVT::f64, 9},
    {Intrinsic::trunc, MVT::f32, 7},
    {Intrinsic::trunc, MVT::f64, 7},
    {Intrinsic::round, MVT::f32, 9},
    {Intrinsic::round, MVT::f64, 9},
    {Intrinsic::roundeven, MVT::f32, 9},
    {Intrinsic::roundeven, MVT::f64, 9},
    {Intrinsic::rint, MVT::f32, 7},
    {Intrinsic::rint, MVT::f64, 7},
    {Intrinsic::lrint, MVT::i32, 1},
    {Intrinsic::lrint, MVT::i64, 1},
    {Intrinsic::llrint, MVT::i64, 1},
    {Intrinsic::nearbyint, MVT::f32, 9},
    {Intrinsic::nearbyint, MVT::f64, 9},
    {Intrinsic::bswap, MVT::i16, 3},
    {Intrinsic::bswap, MVT::i32, 12},
    {Intrinsic::bswap, MVT::i64, 31},
    {Intrinsic::vp_bswap, MVT::i16, 3},
    {Intrinsic::vp_bswap, MVT::i32, 12},
    {Intrinsic::vp_bswap, MVT::i64, 31},
    {Intrinsic::vp_fshl, MVT::i8, 7},
    {Intrinsic::vp_fshl, MVT::i16, 7},
    {Intrinsic::vp_fshl, MVT::i32, 7},
    {Intrinsic::vp_fshl, MVT::i64, 7},
    {Intrinsic::vp_fshr, MVT::i8, 7},
    {Intrinsic::vp_fshr, MVT::i16, 7},
    {Intrinsic::vp_fshr, MVT::i32, 7},
    {Intrinsic::vp_fshr, MVT::i64, 7},
    {Intrinsic::bitreverse, MVT::i8, 17},
    {Intrinsic::bitreverse, MVT::i16, 24},
    {Intrinsic::bitreverse, MVT::i32, 33},
    {Intrinsic::bitreverse, MVT::i64, 52},
    {Intrinsic::vp_bitreverse, MVT::i8, 17},
    {Intrinsic::vp_bitreverse, MVT::i16, 24},
    {Intrinsic::vp_bitreverse, MVT::i32, 33},
    {Intrinsic::vp_bitreverse, MVT::i64, 52},
    {Intrinsic::ctpop, MVT::i8, 12},
    {Intrinsic::ctpop, MVT::i16, 19},
    {Intrinsic::ctpop, MVT::i32, 20},
    {Intrinsic::ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctpop, MVT::i8, 12},
    {Intrinsic::vp_ctpop, MVT::i16, 19},
    {Intrinsic::vp_ctpop, MVT::i32, 20},
    {Intrinsic::vp_ctpop, MVT::i64, 21},
    {Intrinsic::vp_ctlz, MVT::i8, 19},
    {Intrinsic::vp_ctlz, MVT::i16, 28},
    {Intrinsic::vp_ctlz, MVT::i32, 31},
    {Intrinsic::vp_ctlz, MVT::i64, 35},
    {Intrinsic::vp_cttz, MVT::i8, 16},
    {Intrinsic::vp_cttz, MVT::i16, 23},
    {Intrinsic::vp_cttz, MVT::i32, 24},
    {Intrinsic::vp_cttz, MVT::i64, 25},
821 | }; |
822 | |
823 | static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) { |
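  // Map a VP intrinsic ID to its corresponding VP SelectionDAG opcode using
  // the table in VPIntrinsics.def; return ISD::DELETED_NODE if there is none.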
824 | switch (ID) { |
825 | #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \ |
826 | case Intrinsic::VPID: \ |
827 | return ISD::VPSD; |
828 | #include "llvm/IR/VPIntrinsics.def" |
829 | #undef HELPER_MAP_VPID_TO_VPSD |
830 | } |
831 | return ISD::DELETED_NODE; |
832 | } |
833 | |
834 | InstructionCost |
835 | RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
836 | TTI::TargetCostKind CostKind) { |
837 | auto *RetTy = ICA.getReturnType(); |
838 | switch (ICA.getID()) { |
839 | case Intrinsic::ceil: |
840 | case Intrinsic::floor: |
841 | case Intrinsic::trunc: |
842 | case Intrinsic::rint: |
843 | case Intrinsic::lrint: |
844 | case Intrinsic::llrint: |
845 | case Intrinsic::round: |
846 | case Intrinsic::roundeven: { |
847 | // These all use the same code. |
848 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
849 | if (!LT.second.isVector() && TLI->isOperationCustom(Op: ISD::FCEIL, VT: LT.second)) |
850 | return LT.first * 8; |
851 | break; |
852 | } |
853 | case Intrinsic::umin: |
854 | case Intrinsic::umax: |
855 | case Intrinsic::smin: |
856 | case Intrinsic::smax: { |
857 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
858 | if (LT.second.isScalarInteger() && ST->hasStdExtZbb()) |
859 | return LT.first; |
860 | |
861 | if (ST->hasVInstructions() && LT.second.isVector()) { |
862 | unsigned Op; |
863 | switch (ICA.getID()) { |
864 | case Intrinsic::umin: |
865 | Op = RISCV::VMINU_VV; |
866 | break; |
867 | case Intrinsic::umax: |
868 | Op = RISCV::VMAXU_VV; |
869 | break; |
870 | case Intrinsic::smin: |
871 | Op = RISCV::VMIN_VV; |
872 | break; |
873 | case Intrinsic::smax: |
874 | Op = RISCV::VMAX_VV; |
875 | break; |
876 | } |
877 | return LT.first * getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind); |
878 | } |
879 | break; |
880 | } |
881 | case Intrinsic::sadd_sat: |
882 | case Intrinsic::ssub_sat: |
883 | case Intrinsic::uadd_sat: |
884 | case Intrinsic::usub_sat: |
885 | case Intrinsic::fabs: |
886 | case Intrinsic::sqrt: { |
887 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
888 | if (ST->hasVInstructions() && LT.second.isVector()) |
889 | return LT.first; |
890 | break; |
891 | } |
892 | case Intrinsic::ctpop: { |
893 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
894 | if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) |
895 | return LT.first; |
896 | break; |
897 | } |
898 | case Intrinsic::abs: { |
899 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
900 | if (ST->hasVInstructions() && LT.second.isVector()) { |
901 | // vrsub.vi v10, v8, 0 |
902 | // vmax.vv v8, v8, v10 |
903 | return LT.first * 2; |
904 | } |
905 | break; |
906 | } |
907 | case Intrinsic::get_active_lane_mask: { |
908 | if (ST->hasVInstructions()) { |
909 | Type *ExpRetTy = VectorType::get( |
910 | ElementType: ICA.getArgTypes()[0], EC: cast<VectorType>(Val: RetTy)->getElementCount()); |
911 | auto LT = getTypeLegalizationCost(Ty: ExpRetTy); |
912 | |
913 | // vid.v v8 // considered hoisted |
914 | // vsaddu.vx v8, v8, a0 |
915 | // vmsltu.vx v0, v8, a1 |
916 | return LT.first * |
917 | getRISCVInstructionCost(OpCodes: {RISCV::VSADDU_VX, RISCV::VMSLTU_VX}, |
918 | VT: LT.second, CostKind); |
919 | } |
920 | break; |
921 | } |
922 | // TODO: add more intrinsic |
923 | case Intrinsic::experimental_stepvector: { |
924 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
925 | // Legalisation of illegal types involves an `index' instruction plus |
926 | // (LT.first - 1) vector adds. |
927 | if (ST->hasVInstructions()) |
928 | return getRISCVInstructionCost(OpCodes: RISCV::VID_V, VT: LT.second, CostKind) + |
929 | (LT.first - 1) * |
930 | getRISCVInstructionCost(OpCodes: RISCV::VADD_VX, VT: LT.second, CostKind); |
931 | return 1 + (LT.first - 1); |
932 | } |
933 | case Intrinsic::experimental_cttz_elts: { |
934 | Type *ArgTy = ICA.getArgTypes()[0]; |
935 | EVT ArgType = TLI->getValueType(DL, Ty: ArgTy, AllowUnknown: true); |
936 | if (getTLI()->shouldExpandCttzElements(VT: ArgType)) |
937 | break; |
938 | InstructionCost Cost = getRISCVInstructionCost( |
939 | OpCodes: RISCV::VFIRST_M, VT: getTypeLegalizationCost(Ty: ArgTy).second, CostKind); |
940 | |
941 | // If zero_is_poison is false, then we will generate additional |
942 | // cmp + select instructions to convert -1 to EVL. |
943 | Type *BoolTy = Type::getInt1Ty(C&: RetTy->getContext()); |
944 | if (ICA.getArgs().size() > 1 && |
945 | cast<ConstantInt>(Val: ICA.getArgs()[1])->isZero()) |
946 | Cost += getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: BoolTy, CondTy: RetTy, |
947 | VecPred: CmpInst::ICMP_SLT, CostKind) + |
948 | getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: RetTy, CondTy: BoolTy, |
949 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
950 | |
951 | return Cost; |
952 | } |
953 | case Intrinsic::vp_rint: { |
954 | // RISC-V target uses at least 5 instructions to lower rounding intrinsics. |
955 | unsigned Cost = 5; |
956 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
957 | if (TLI->isOperationCustom(Op: ISD::VP_FRINT, VT: LT.second)) |
958 | return Cost * LT.first; |
959 | break; |
960 | } |
961 | case Intrinsic::vp_nearbyint: { |
    // Needs an extra read and write of fflags compared to vp_rint.
963 | unsigned Cost = 7; |
964 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
965 | if (TLI->isOperationCustom(Op: ISD::VP_FRINT, VT: LT.second)) |
966 | return Cost * LT.first; |
967 | break; |
968 | } |
969 | case Intrinsic::vp_ceil: |
970 | case Intrinsic::vp_floor: |
971 | case Intrinsic::vp_round: |
972 | case Intrinsic::vp_roundeven: |
973 | case Intrinsic::vp_roundtozero: { |
974 | // Rounding with static rounding mode needs two more instructions to |
975 | // swap/write FRM than vp_rint. |
976 | unsigned Cost = 7; |
977 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
978 | unsigned VPISD = getISDForVPIntrinsicID(ID: ICA.getID()); |
979 | if (TLI->isOperationCustom(Op: VPISD, VT: LT.second)) |
980 | return Cost * LT.first; |
981 | break; |
982 | } |
983 | // vp integer arithmetic ops. |
984 | case Intrinsic::vp_add: |
985 | case Intrinsic::vp_and: |
986 | case Intrinsic::vp_ashr: |
987 | case Intrinsic::vp_lshr: |
988 | case Intrinsic::vp_mul: |
989 | case Intrinsic::vp_or: |
990 | case Intrinsic::vp_sdiv: |
991 | case Intrinsic::vp_shl: |
992 | case Intrinsic::vp_srem: |
993 | case Intrinsic::vp_sub: |
994 | case Intrinsic::vp_udiv: |
995 | case Intrinsic::vp_urem: |
996 | case Intrinsic::vp_xor: |
997 | // vp float arithmetic ops. |
998 | case Intrinsic::vp_fadd: |
999 | case Intrinsic::vp_fsub: |
1000 | case Intrinsic::vp_fmul: |
1001 | case Intrinsic::vp_fdiv: |
1002 | case Intrinsic::vp_frem: { |
1003 | std::optional<unsigned> FOp = |
1004 | VPIntrinsic::getFunctionalOpcodeForVP(ID: ICA.getID()); |
1005 | if (FOp) |
1006 | return getArithmeticInstrCost(Opcode: *FOp, Ty: ICA.getReturnType(), CostKind); |
1007 | break; |
1008 | } |
1009 | } |
1010 | |
1011 | if (ST->hasVInstructions() && RetTy->isVectorTy()) { |
1012 | if (auto LT = getTypeLegalizationCost(Ty: RetTy); |
1013 | LT.second.isVector()) { |
1014 | MVT EltTy = LT.second.getVectorElementType(); |
1015 | if (const auto *Entry = CostTableLookup(Table: VectorIntrinsicCostTable, |
1016 | ISD: ICA.getID(), Ty: EltTy)) |
1017 | return LT.first * Entry->Cost; |
1018 | } |
1019 | } |
1020 | |
1021 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
1022 | } |
1023 | |
1024 | InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
1025 | Type *Src, |
1026 | TTI::CastContextHint CCH, |
1027 | TTI::TargetCostKind CostKind, |
1028 | const Instruction *I) { |
1029 | bool IsVectorType = isa<VectorType>(Val: Dst) && isa<VectorType>(Val: Src); |
1030 | if (!IsVectorType) |
1031 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1032 | |
1033 | bool IsTypeLegal = isTypeLegal(Ty: Src) && isTypeLegal(Ty: Dst) && |
1034 | (Src->getScalarSizeInBits() <= ST->getELen()) && |
1035 | (Dst->getScalarSizeInBits() <= ST->getELen()); |
1036 | |
1037 | // FIXME: Need to compute legalizing cost for illegal types. |
1038 | if (!IsTypeLegal) |
1039 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1040 | |
1041 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
1042 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: Dst); |
1043 | |
1044 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1045 | assert(ISD && "Invalid opcode" ); |
1046 | |
1047 | int PowDiff = (int)Log2_32(Value: Dst->getScalarSizeInBits()) - |
1048 | (int)Log2_32(Value: Src->getScalarSizeInBits()); |
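  // PowDiff is the log2 ratio of destination to source element width; each
  // RVV widening/narrowing conversion changes the element width by 2x.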
1049 | switch (ISD) { |
1050 | case ISD::SIGN_EXTEND: |
1051 | case ISD::ZERO_EXTEND: { |
1052 | const unsigned SrcEltSize = Src->getScalarSizeInBits(); |
1053 | if (SrcEltSize == 1) { |
1054 | // We do not use vsext/vzext to extend from mask vector. |
1055 | // Instead we use the following instructions to extend from mask vector: |
1056 | // vmv.v.i v8, 0 |
1057 | // vmerge.vim v8, v8, -1, v0 |
1058 | return getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_I, RISCV::VMERGE_VIM}, |
1059 | VT: DstLT.second, CostKind); |
1060 | } |
1061 | if ((PowDiff < 1) || (PowDiff > 3)) |
1062 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1063 | unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8}; |
1064 | unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8}; |
1065 | unsigned Op = |
1066 | (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1]; |
1067 | return getRISCVInstructionCost(OpCodes: Op, VT: DstLT.second, CostKind); |
1068 | } |
1069 | case ISD::TRUNCATE: |
1070 | if (Dst->getScalarSizeInBits() == 1) { |
      // We do not use a sequence of vncvt instructions to truncate to a mask
      // vector, so PowDiff cannot be used to compute the cost here.
      // Instead we use the following instructions to truncate to mask vector:
1074 | // vand.vi v8, v8, 1 |
1075 | // vmsne.vi v0, v8, 0 |
1076 | return getRISCVInstructionCost(OpCodes: {RISCV::VAND_VI, RISCV::VMSNE_VI}, |
1077 | VT: SrcLT.second, CostKind); |
1078 | } |
1079 | [[fallthrough]]; |
1080 | case ISD::FP_EXTEND: |
1081 | case ISD::FP_ROUND: { |
1082 | // Counts of narrow/widen instructions. |
1083 | unsigned SrcEltSize = Src->getScalarSizeInBits(); |
1084 | unsigned DstEltSize = Dst->getScalarSizeInBits(); |
1085 | |
1086 | unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI |
1087 | : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V |
1088 | : RISCV::VFNCVT_F_F_W; |
1089 | InstructionCost Cost = 0; |
1090 | for (; SrcEltSize != DstEltSize;) { |
1091 | MVT ElementMVT = (ISD == ISD::TRUNCATE) |
1092 | ? MVT::getIntegerVT(BitWidth: DstEltSize) |
1093 | : MVT::getFloatingPointVT(BitWidth: DstEltSize); |
1094 | MVT DstMVT = DstLT.second.changeVectorElementType(EltVT: ElementMVT); |
1095 | DstEltSize = |
1096 | (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1; |
1097 | Cost += getRISCVInstructionCost(OpCodes: Op, VT: DstMVT, CostKind); |
1098 | } |
1099 | return Cost; |
1100 | } |
1101 | case ISD::FP_TO_SINT: |
1102 | case ISD::FP_TO_UINT: |
1103 | case ISD::SINT_TO_FP: |
1104 | case ISD::UINT_TO_FP: |
1105 | if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) { |
      // The cost of converting from or to a mask vector is different from the
      // other cases, so PowDiff cannot be used to compute it.
1108 | // For mask vector to fp, we should use the following instructions: |
1109 | // vmv.v.i v8, 0 |
1110 | // vmerge.vim v8, v8, -1, v0 |
1111 | // vfcvt.f.x.v v8, v8 |
1112 | |
1113 | // And for fp vector to mask, we use: |
1114 | // vfncvt.rtz.x.f.w v9, v8 |
1115 | // vand.vi v8, v9, 1 |
1116 | // vmsne.vi v0, v8, 0 |
1117 | return 3; |
1118 | } |
1119 | if (std::abs(x: PowDiff) <= 1) |
1120 | return 1; |
1121 | // Backend could lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8), |
    // so it only needs two conversions.
1123 | if (Src->isIntOrIntVectorTy()) |
1124 | return 2; |
1125 | // Counts of narrow/widen instructions. |
1126 | return std::abs(x: PowDiff); |
1127 | } |
1128 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1129 | } |
1130 | |
1131 | unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) { |
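  // For scalable vectors, estimate VL as the VLMAX implied by the tuning
  // vscale; for fixed-length vectors the element count is exact.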
1132 | if (isa<ScalableVectorType>(Val: Ty)) { |
1133 | const unsigned EltSize = DL.getTypeSizeInBits(Ty: Ty->getElementType()); |
1134 | const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue(); |
1135 | const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock; |
1136 | return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize); |
1137 | } |
1138 | return cast<FixedVectorType>(Val: Ty)->getNumElements(); |
1139 | } |
1140 | |
1141 | InstructionCost |
1142 | RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
1143 | FastMathFlags FMF, |
1144 | TTI::TargetCostKind CostKind) { |
1145 | if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors()) |
1146 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
1147 | |
1148 | // Skip if scalar size of Ty is bigger than ELEN. |
1149 | if (Ty->getScalarSizeInBits() > ST->getELen()) |
1150 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
1151 | |
1152 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
1153 | if (Ty->getElementType()->isIntegerTy(Bitwidth: 1)) { |
    // SelectionDAGBuilder does the following transforms:
1155 | // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>) |
1156 | // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>) |
1157 | if (IID == Intrinsic::umax || IID == Intrinsic::smin) |
1158 | return getArithmeticReductionCost(Opcode: Instruction::Or, Ty, FMF, CostKind); |
1159 | else |
1160 | return getArithmeticReductionCost(Opcode: Instruction::And, Ty, FMF, CostKind); |
1161 | } |
1162 | |
1163 | if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) { |
1164 | SmallVector<unsigned, 3> Opcodes; |
    InstructionCost ExtraCost = 0;
1166 | switch (IID) { |
1167 | case Intrinsic::maximum: |
1168 | if (FMF.noNaNs()) { |
1169 | Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S}; |
1170 | } else { |
1171 | Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS, |
1172 | RISCV::VFMV_F_S}; |
1173 | // Cost of Canonical Nan + branch |
1174 | // lui a0, 523264 |
1175 | // fmv.w.x fa0, a0 |
1176 | Type *DstTy = Ty->getScalarType(); |
1177 | const unsigned EltTyBits = DstTy->getScalarSizeInBits(); |
1178 | Type *SrcTy = IntegerType::getIntNTy(C&: DstTy->getContext(), N: EltTyBits); |
1179 | ExtraCost = 1 + |
1180 | getCastInstrCost(Opcode: Instruction::UIToFP, Dst: DstTy, Src: SrcTy, |
1181 | CCH: TTI::CastContextHint::None, CostKind) + |
1182 | getCFInstrCost(Opcode: Instruction::Br, CostKind); |
1183 | } |
1184 | break; |
1185 | |
1186 | case Intrinsic::minimum: |
1187 | if (FMF.noNaNs()) { |
1188 | Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S}; |
1189 | } else { |
1190 | Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS, |
1191 | RISCV::VFMV_F_S}; |
1192 | // Cost of Canonical Nan + branch |
1193 | // lui a0, 523264 |
1194 | // fmv.w.x fa0, a0 |
1195 | Type *DstTy = Ty->getScalarType(); |
1196 | const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: DstTy); |
1197 | Type *SrcTy = IntegerType::getIntNTy(C&: DstTy->getContext(), N: EltTyBits); |
1198 | ExtraCost = 1 + |
1199 | getCastInstrCost(Opcode: Instruction::UIToFP, Dst: DstTy, Src: SrcTy, |
1200 | CCH: TTI::CastContextHint::None, CostKind) + |
1201 | getCFInstrCost(Opcode: Instruction::Br, CostKind); |
1202 | } |
1203 | break; |
1204 | } |
1205 | return ExtraCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind); |
1206 | } |
1207 | |
  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
1209 | unsigned SplitOp; |
1210 | SmallVector<unsigned, 3> Opcodes; |
1211 | switch (IID) { |
1212 | default: |
1213 | llvm_unreachable("Unsupported intrinsic" ); |
1214 | case Intrinsic::smax: |
1215 | SplitOp = RISCV::VMAX_VV; |
1216 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S}; |
1217 | break; |
1218 | case Intrinsic::smin: |
1219 | SplitOp = RISCV::VMIN_VV; |
1220 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S}; |
1221 | break; |
1222 | case Intrinsic::umax: |
1223 | SplitOp = RISCV::VMAXU_VV; |
1224 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S}; |
1225 | break; |
1226 | case Intrinsic::umin: |
1227 | SplitOp = RISCV::VMINU_VV; |
1228 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S}; |
1229 | break; |
1230 | case Intrinsic::maxnum: |
1231 | SplitOp = RISCV::VFMAX_VV; |
1232 | Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S}; |
1233 | break; |
1234 | case Intrinsic::minnum: |
1235 | SplitOp = RISCV::VFMIN_VV; |
1236 | Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S}; |
1237 | break; |
1238 | } |
1239 | // Add a cost for data larger than LMUL8 |
1240 | InstructionCost SplitCost = |
1241 | (LT.first > 1) ? (LT.first - 1) * |
1242 | getRISCVInstructionCost(OpCodes: SplitOp, VT: LT.second, CostKind) |
1243 | : 0; |
1244 | return SplitCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind); |
1245 | } |
1246 | |
1247 | InstructionCost |
1248 | RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
1249 | std::optional<FastMathFlags> FMF, |
1250 | TTI::TargetCostKind CostKind) { |
1251 | if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors()) |
1252 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
1253 | |
1254 | // Skip if scalar size of Ty is bigger than ELEN. |
1255 | if (Ty->getScalarSizeInBits() > ST->getELen()) |
1256 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
1257 | |
1258 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
1259 | assert(ISD && "Invalid opcode" ); |
1260 | |
1261 | if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND && |
1262 | ISD != ISD::FADD) |
1263 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
1264 | |
1265 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
1266 | SmallVector<unsigned, 3> Opcodes; |
1267 | Type *ElementTy = Ty->getElementType(); |
1268 | if (ElementTy->isIntegerTy(Bitwidth: 1)) { |
1269 | if (ISD == ISD::AND) { |
1270 | // Example sequences: |
1271 | // vsetvli a0, zero, e8, mf8, ta, ma |
1272 | // vmnot.m v8, v0 |
1273 | // vcpop.m a0, v8 |
1274 | // seqz a0, a0 |
1275 | Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M}; |
1276 | return (LT.first - 1) + |
1277 | getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind) + |
1278 | getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy, |
1279 | VecPred: CmpInst::ICMP_EQ, CostKind); |
1280 | } else { |
1281 | // Example sequences: |
1282 | // vsetvli a0, zero, e8, mf8, ta, ma |
1283 | // vcpop.m a0, v0 |
1284 | // snez a0, a0 |
1285 | Opcodes = {RISCV::VCPOP_M}; |
1286 | return (LT.first - 1) + |
1287 | getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind) + |
1288 | getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: ElementTy, CondTy: ElementTy, |
1289 | VecPred: CmpInst::ICMP_NE, CostKind); |
1290 | } |
1291 | } |
1292 | |
  // An IR reduction is composed of two vmv instructions and one RVV reduction
  // instruction.
1294 | if (TTI::requiresOrderedReduction(FMF)) { |
1295 | Opcodes.push_back(Elt: RISCV::VFMV_S_F); |
1296 | for (unsigned i = 0; i < LT.first.getValue(); i++) |
1297 | Opcodes.push_back(Elt: RISCV::VFREDOSUM_VS); |
1298 | Opcodes.push_back(Elt: RISCV::VFMV_F_S); |
1299 | return getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind); |
1300 | } |
1301 | unsigned SplitOp; |
1302 | switch (ISD) { |
1303 | case ISD::ADD: |
1304 | SplitOp = RISCV::VADD_VV; |
1305 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S}; |
1306 | break; |
1307 | case ISD::OR: |
1308 | SplitOp = RISCV::VOR_VV; |
1309 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S}; |
1310 | break; |
1311 | case ISD::XOR: |
1312 | SplitOp = RISCV::VXOR_VV; |
1313 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S}; |
1314 | break; |
1315 | case ISD::AND: |
1316 | SplitOp = RISCV::VAND_VV; |
1317 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S}; |
1318 | break; |
1319 | case ISD::FADD: |
1320 | SplitOp = RISCV::VFADD_VV; |
1321 | Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S}; |
1322 | break; |
1323 | } |
1324 | // Add a cost for data larger than LMUL8 |
1325 | InstructionCost SplitCost = |
1326 | (LT.first > 1) ? (LT.first - 1) * |
1327 | getRISCVInstructionCost(OpCodes: SplitOp, VT: LT.second, CostKind) |
1328 | : 0; |
1329 | return SplitCost + getRISCVInstructionCost(OpCodes: Opcodes, VT: LT.second, CostKind); |
1330 | } |
1331 | |
1332 | InstructionCost RISCVTTIImpl::getExtendedReductionCost( |
1333 | unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, |
1334 | FastMathFlags FMF, TTI::TargetCostKind CostKind) { |
1335 | if (isa<FixedVectorType>(Val: ValTy) && !ST->useRVVForFixedLengthVectors()) |
1336 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, |
1337 | FMF, CostKind); |
1338 | |
1339 | // Skip if scalar size of ResTy is bigger than ELEN. |
1340 | if (ResTy->getScalarSizeInBits() > ST->getELen()) |
1341 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, |
1342 | FMF, CostKind); |
1343 | |
1344 | if (Opcode != Instruction::Add && Opcode != Instruction::FAdd) |
1345 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, |
1346 | FMF, CostKind); |
1347 | |
1348 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
1349 | |
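  // Illustrative reasoning (assumption, not from the source): a reduction
  // that sign/zero-extends <N x i16> elements and accumulates into an i32
  // result satisfies the 2x-widening check below, so it is costed like the
  // plain reduction on the source type plus the split term.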
1350 | if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits()) |
1351 | return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: ValTy, |
1352 | FMF, CostKind); |
1353 | |
1354 | return (LT.first - 1) + |
1355 | getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
1356 | } |
1357 | |
1358 | InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty, |
1359 | TTI::OperandValueInfo OpInfo, |
1360 | TTI::TargetCostKind CostKind) { |
1361 | assert(OpInfo.isConstant() && "non constant operand?" ); |
1362 | if (!isa<VectorType>(Val: Ty)) |
1363 | // FIXME: We need to account for immediate materialization here, but doing |
1364 | // a decent job requires more knowledge about the immediate than we |
1365 | // currently have here. |
1366 | return 0; |
1367 | |
1368 | if (OpInfo.isUniform()) |
1369 | // vmv.x.i, vmv.v.x, or vfmv.v.f |
1370 | // We ignore the cost of the scalar constant materialization to be consistent |
1371 | // with how we treat scalar constants themselves just above. |
1372 | return 1; |
1373 | |
1374 | return getConstantPoolLoadCost(Ty, CostKind); |
1375 | } |
1376 | |
1378 | InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, |
1379 | MaybeAlign Alignment, |
1380 | unsigned AddressSpace, |
1381 | TTI::TargetCostKind CostKind, |
1382 | TTI::OperandValueInfo OpInfo, |
1383 | const Instruction *I) { |
1384 | EVT VT = TLI->getValueType(DL, Ty: Src, AllowUnknown: true); |
1385 | // Type legalization can't handle structs |
1386 | if (VT == MVT::Other) |
1387 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
1388 | CostKind, OpInfo, I); |
1389 | |
1390 | InstructionCost Cost = 0; |
1391 | if (Opcode == Instruction::Store && OpInfo.isConstant()) |
1392 | Cost += getStoreImmCost(Ty: Src, OpInfo, CostKind); |
1393 | InstructionCost BaseCost = |
1394 | BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
1395 | CostKind, OpInfo, I); |
1396 | // Assume memory ops cost scale with the number of vector registers |
  // possibly accessed by the instruction. Note that BasicTTI already
1398 | // handles the LT.first term for us. |
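  // For example (illustrative): on a VLEN=128 machine a fixed <32 x i32>
  // access legalizes to an LMUL=8 type, so the base cost is scaled to account
  // for all eight vector registers touched by the access.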
1399 | if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src); |
1400 | LT.second.isVector() && CostKind != TTI::TCK_CodeSize) |
1401 | BaseCost *= TLI->getLMULCost(VT: LT.second); |
1402 | return Cost + BaseCost; |
1404 | } |
1405 | |
1406 | InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
1407 | Type *CondTy, |
1408 | CmpInst::Predicate VecPred, |
1409 | TTI::TargetCostKind CostKind, |
1410 | const Instruction *I) { |
1411 | if (CostKind != TTI::TCK_RecipThroughput) |
1412 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1413 | I); |
1414 | |
1415 | if (isa<FixedVectorType>(Val: ValTy) && !ST->useRVVForFixedLengthVectors()) |
1416 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1417 | I); |
1418 | |
1419 | // Skip if scalar size of ValTy is bigger than ELEN. |
1420 | if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen()) |
1421 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1422 | I); |
1423 | |
1424 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
1425 | if (Opcode == Instruction::Select && ValTy->isVectorTy()) { |
1426 | if (CondTy->isVectorTy()) { |
1427 | if (ValTy->getScalarSizeInBits() == 1) { |
1428 | // vmandn.mm v8, v8, v9 |
1429 | // vmand.mm v9, v0, v9 |
1430 | // vmor.mm v0, v9, v8 |
1431 | return LT.first * |
1432 | getRISCVInstructionCost( |
1433 | OpCodes: {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM}, |
1434 | VT: LT.second, CostKind); |
1435 | } |
1436 | // vselect and max/min are supported natively. |
1437 | return LT.first * |
1438 | getRISCVInstructionCost(OpCodes: RISCV::VMERGE_VVM, VT: LT.second, CostKind); |
1439 | } |
1440 | |
1441 | if (ValTy->getScalarSizeInBits() == 1) { |
1442 | // vmv.v.x v9, a0 |
1443 | // vmsne.vi v9, v9, 0 |
1444 | // vmandn.mm v8, v8, v9 |
1445 | // vmand.mm v9, v0, v9 |
1446 | // vmor.mm v0, v9, v8 |
1447 | MVT InterimVT = LT.second.changeVectorElementType(EltVT: MVT::i8); |
1448 | return LT.first * |
1449 | getRISCVInstructionCost(OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI}, |
1450 | VT: InterimVT, CostKind) + |
1451 | LT.first * getRISCVInstructionCost( |
1452 | OpCodes: {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM}, |
1453 | VT: LT.second, CostKind); |
1454 | } |
1455 | |
1456 | // vmv.v.x v10, a0 |
1457 | // vmsne.vi v0, v10, 0 |
1458 | // vmerge.vvm v8, v9, v8, v0 |
1459 | return LT.first * getRISCVInstructionCost( |
1460 | OpCodes: {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM}, |
1461 | VT: LT.second, CostKind); |
1462 | } |
1463 | |
1464 | if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() && |
1465 | CmpInst::isIntPredicate(P: VecPred)) { |
1466 | // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE |
1467 | // provided they incur the same cost across all implementations |
1468 | return LT.first * |
1469 | getRISCVInstructionCost(OpCodes: RISCV::VMSLT_VV, VT: LT.second, CostKind); |
1470 | } |
1471 | |
1472 | if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() && |
1473 | CmpInst::isFPPredicate(P: VecPred)) { |
1474 | |
    // Use VMXOR_MM and VMXNOR_MM to generate an all-true/all-false mask.
1476 | if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE)) |
1477 | return getRISCVInstructionCost(OpCodes: RISCV::VMXOR_MM, VT: LT.second, CostKind); |
1478 | |
1479 | // If we do not support the input floating point vector type, use the base |
1480 | // one which will calculate as: |
1481 | // ScalarizeCost + Num * Cost for fixed vector, |
1482 | // InvalidCost for scalable vector. |
1483 | if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) || |
1484 | (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) || |
1485 | (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64())) |
1486 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1487 | I); |
1488 | |
1489 | // Assuming vector fp compare and mask instructions are all the same cost |
1490 | // until a need arises to differentiate them. |
1491 | switch (VecPred) { |
1492 | case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm |
1493 | case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm |
1494 | case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm |
1495 | case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm |
1496 | return LT.first * getRISCVInstructionCost( |
1497 | OpCodes: {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM}, |
1498 | VT: LT.second, CostKind); |
1499 | |
1500 | case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m |
1501 | case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m |
1502 | case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m |
1503 | case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m |
1504 | return LT.first * |
1505 | getRISCVInstructionCost(OpCodes: {RISCV::VMFLT_VV, RISCV::VMNAND_MM}, |
1506 | VT: LT.second, CostKind); |
1507 | |
1508 | case CmpInst::FCMP_OEQ: // vmfeq.vv |
1509 | case CmpInst::FCMP_OGT: // vmflt.vv |
1510 | case CmpInst::FCMP_OGE: // vmfle.vv |
1511 | case CmpInst::FCMP_OLT: // vmflt.vv |
1512 | case CmpInst::FCMP_OLE: // vmfle.vv |
1513 | case CmpInst::FCMP_UNE: // vmfne.vv |
1514 | return LT.first * |
1515 | getRISCVInstructionCost(OpCodes: RISCV::VMFLT_VV, VT: LT.second, CostKind); |
1516 | default: |
1517 | break; |
1518 | } |
1519 | } |
1520 | |
  // With ShortForwardBranchOpt or ConditionalMoveFusion, a scalar icmp + select
  // pair lowers to SELECT_CC, which in turn becomes PseudoCCMOVGPR and finally
  // a conditional branch + mv. The cost of the scalar (icmp + select) pair is
  // therefore modeled as (0 + select instruction cost).
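  // Illustrative IR shape (assumption) that takes the zero-cost path, given
  // both select arms are non-constant:
  //   %c = icmp slt i32 %a, %b
  //   %r = select i1 %c, i32 %x, i32 %y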
1525 | if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(Val: I) && |
1526 | ValTy->isIntegerTy() && !I->user_empty()) { |
1527 | if (all_of(Range: I->users(), P: [&](const User *U) { |
1528 | return match(V: U, P: m_Select(C: m_Specific(V: I), L: m_Value(), R: m_Value())) && |
1529 | U->getType()->isIntegerTy() && |
1530 | !isa<ConstantData>(Val: U->getOperand(i: 1)) && |
1531 | !isa<ConstantData>(Val: U->getOperand(i: 2)); |
1532 | })) |
1533 | return 0; |
1534 | } |
1535 | |
1536 | // TODO: Add cost for scalar type. |
1537 | |
1538 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
1539 | } |
1540 | |
1541 | InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode, |
1542 | TTI::TargetCostKind CostKind, |
1543 | const Instruction *I) { |
1544 | if (CostKind != TTI::TCK_RecipThroughput) |
1545 | return Opcode == Instruction::PHI ? 0 : 1; |
1546 | // Branches are assumed to be predicted. |
1547 | return 0; |
1548 | } |
1549 | |
1550 | InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
1551 | TTI::TargetCostKind CostKind, |
1552 | unsigned Index, Value *Op0, |
1553 | Value *Op1) { |
1554 | assert(Val->isVectorTy() && "This must be a vector type" ); |
1555 | |
1556 | if (Opcode != Instruction::ExtractElement && |
1557 | Opcode != Instruction::InsertElement) |
1558 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); |
1559 | |
1560 | // Legalize the type. |
1561 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val); |
1562 | |
1563 | // This type is legalized to a scalar type. |
1564 | if (!LT.second.isVector()) { |
1565 | auto *FixedVecTy = cast<FixedVectorType>(Val); |
1566 | // If Index is a known constant, cost is zero. |
1567 | if (Index != -1U) |
1568 | return 0; |
1569 | // Extract/InsertElement with non-constant index is very costly when |
1570 | // scalarized; estimate cost of loads/stores sequence via the stack: |
1571 | // ExtractElement cost: store vector to stack, load scalar; |
1572 | // InsertElement cost: store vector to stack, store scalar, load vector. |
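    // E.g. (illustrative) extracting one element from an N-element vector that
    // was legalized to scalars is costed as N scalar stores plus one scalar
    // load; inserting costs N (store + load) pairs plus one extra store.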
1573 | Type *ElemTy = FixedVecTy->getElementType(); |
1574 | auto NumElems = FixedVecTy->getNumElements(); |
1575 | auto Align = DL.getPrefTypeAlign(Ty: ElemTy); |
1576 | InstructionCost LoadCost = |
1577 | getMemoryOpCost(Opcode: Instruction::Load, Src: ElemTy, Alignment: Align, AddressSpace: 0, CostKind); |
1578 | InstructionCost StoreCost = |
1579 | getMemoryOpCost(Opcode: Instruction::Store, Src: ElemTy, Alignment: Align, AddressSpace: 0, CostKind); |
1580 | return Opcode == Instruction::ExtractElement |
1581 | ? StoreCost * NumElems + LoadCost |
1582 | : (StoreCost + LoadCost) * NumElems + StoreCost; |
1583 | } |
1584 | |
1585 | // For unsupported scalable vector. |
1586 | if (LT.second.isScalableVector() && !LT.first.isValid()) |
1587 | return LT.first; |
1588 | |
1589 | if (!isTypeLegal(Ty: Val)) |
1590 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); |
1591 | |
1592 | // Mask vector extract/insert is expanded via e8. |
1593 | if (Val->getScalarSizeInBits() == 1) { |
1594 | VectorType *WideTy = |
1595 | VectorType::get(ElementType: IntegerType::get(C&: Val->getContext(), NumBits: 8), |
1596 | EC: cast<VectorType>(Val)->getElementCount()); |
1597 | if (Opcode == Instruction::ExtractElement) { |
1598 | InstructionCost ExtendCost |
1599 | = getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: Val, |
1600 | CCH: TTI::CastContextHint::None, CostKind); |
      InstructionCost ExtractCost
        = getVectorInstrCost(Opcode, Val: WideTy, CostKind, Index, Op0: nullptr, Op1: nullptr);
1603 | return ExtendCost + ExtractCost; |
1604 | } |
1605 | InstructionCost ExtendCost |
1606 | = getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideTy, Src: Val, |
1607 | CCH: TTI::CastContextHint::None, CostKind); |
1608 | InstructionCost InsertCost |
1609 | = getVectorInstrCost(Opcode, Val: WideTy, CostKind, Index, Op0: nullptr, Op1: nullptr); |
1610 | InstructionCost TruncCost |
1611 | = getCastInstrCost(Opcode: Instruction::Trunc, Dst: Val, Src: WideTy, |
1612 | CCH: TTI::CastContextHint::None, CostKind); |
1613 | return ExtendCost + InsertCost + TruncCost; |
1614 | } |
1615 | |
1617 | // In RVV, we could use vslidedown + vmv.x.s to extract element from vector |
1618 | // and vslideup + vmv.s.x to insert element to vector. |
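  // For example (illustrative), extracting element 2 of a v4i32 value might
  // be:
  //   vslidedown.vi v9, v8, 2
  //   vmv.x.s       a0, v9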
1619 | unsigned BaseCost = 1; |
  // For insertelement we additionally need an addi to compute index+1, which
  // is used as the VL input of vslideup.
1621 | unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1; |
1622 | |
1623 | if (Index != -1U) { |
1624 | // The type may be split. For fixed-width vectors we can normalize the |
1625 | // index to the new type. |
1626 | if (LT.second.isFixedLengthVector()) { |
1627 | unsigned Width = LT.second.getVectorNumElements(); |
1628 | Index = Index % Width; |
1629 | } |
1630 | |
1631 | // We could extract/insert the first element without vslidedown/vslideup. |
1632 | if (Index == 0) |
1633 | SlideCost = 0; |
1634 | else if (Opcode == Instruction::InsertElement) |
1635 | SlideCost = 1; // With a constant index, we do not need to use addi. |
1636 | } |
1637 | |
  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
1639 | if (Val->getScalarType()->isIntegerTy() && |
1640 | ST->getXLen() < Val->getScalarSizeInBits()) { |
1641 | // For extractelement, we need the following instructions: |
1642 | // vsetivli zero, 1, e64, m1, ta, mu (not count) |
1643 | // vslidedown.vx v8, v8, a0 |
1644 | // vmv.x.s a0, v8 |
1645 | // li a1, 32 |
1646 | // vsrl.vx v8, v8, a1 |
1647 | // vmv.x.s a1, v8 |
1648 | |
1649 | // For insertelement, we need the following instructions: |
1650 | // vsetivli zero, 2, e32, m4, ta, mu (not count) |
1651 | // vmv.v.i v12, 0 |
1652 | // vslide1up.vx v16, v12, a1 |
1653 | // vslide1up.vx v12, v16, a0 |
1654 | // addi a0, a2, 1 |
1655 | // vsetvli zero, a0, e64, m4, tu, mu (not count) |
1656 | // vslideup.vx v8, v12, a2 |
1657 | |
1658 | // TODO: should we count these special vsetvlis? |
1659 | BaseCost = Opcode == Instruction::InsertElement ? 3 : 4; |
1660 | } |
1661 | return BaseCost + SlideCost; |
1662 | } |
1663 | |
1664 | InstructionCost RISCVTTIImpl::getArithmeticInstrCost( |
1665 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
1666 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
1667 | ArrayRef<const Value *> Args, const Instruction *CxtI) { |
1668 | |
1669 | // TODO: Handle more cost kinds. |
1670 | if (CostKind != TTI::TCK_RecipThroughput) |
1671 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1672 | Args, CxtI); |
1673 | |
1674 | if (isa<FixedVectorType>(Val: Ty) && !ST->useRVVForFixedLengthVectors()) |
1675 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1676 | Args, CxtI); |
1677 | |
1678 | // Skip if scalar size of Ty is bigger than ELEN. |
1679 | if (isa<VectorType>(Val: Ty) && Ty->getScalarSizeInBits() > ST->getELen()) |
1680 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1681 | Args, CxtI); |
1682 | |
1683 | // Legalize the type. |
1684 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
1685 | |
1686 | // TODO: Handle scalar type. |
1687 | if (!LT.second.isVector()) |
1688 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1689 | Args, CxtI); |
1690 | |
1691 | auto getConstantMatCost = |
1692 | [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost { |
1693 | if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand)) |
1694 | // Two sub-cases: |
1695 | // * Has a 5 bit immediate operand which can be splatted. |
1696 | // * Has a larger immediate which must be materialized in scalar register |
1697 | // We return 0 for both as we currently ignore the cost of materializing |
1698 | // scalar constants in GPRs. |
1699 | return 0; |
1700 | |
1701 | return getConstantPoolLoadCost(Ty, CostKind); |
1702 | }; |
1703 | |
1704 | // Add the cost of materializing any constant vectors required. |
1705 | InstructionCost ConstantMatCost = 0; |
1706 | if (Op1Info.isConstant()) |
1707 | ConstantMatCost += getConstantMatCost(0, Op1Info); |
1708 | if (Op2Info.isConstant()) |
1709 | ConstantMatCost += getConstantMatCost(1, Op2Info); |
1710 | |
1711 | unsigned Op; |
1712 | switch (TLI->InstructionOpcodeToISD(Opcode)) { |
1713 | case ISD::ADD: |
1714 | case ISD::SUB: |
1715 | Op = RISCV::VADD_VV; |
1716 | break; |
1717 | case ISD::SHL: |
1718 | case ISD::SRL: |
1719 | case ISD::SRA: |
1720 | Op = RISCV::VSLL_VV; |
1721 | break; |
1722 | case ISD::AND: |
1723 | case ISD::OR: |
1724 | case ISD::XOR: |
1725 | Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV; |
1726 | break; |
1727 | case ISD::MUL: |
1728 | case ISD::MULHS: |
1729 | case ISD::MULHU: |
1730 | Op = RISCV::VMUL_VV; |
1731 | break; |
1732 | case ISD::SDIV: |
1733 | case ISD::UDIV: |
1734 | Op = RISCV::VDIV_VV; |
1735 | break; |
1736 | case ISD::SREM: |
1737 | case ISD::UREM: |
1738 | Op = RISCV::VREM_VV; |
1739 | break; |
1740 | case ISD::FADD: |
1741 | case ISD::FSUB: |
1742 | // TODO: Address FP16 with VFHMIN |
1743 | Op = RISCV::VFADD_VV; |
1744 | break; |
1745 | case ISD::FMUL: |
1746 | // TODO: Address FP16 with VFHMIN |
1747 | Op = RISCV::VFMUL_VV; |
1748 | break; |
1749 | case ISD::FDIV: |
1750 | Op = RISCV::VFDIV_VV; |
1751 | break; |
1752 | case ISD::FNEG: |
1753 | Op = RISCV::VFSGNJN_VV; |
1754 | break; |
1755 | default: |
1756 | // Assuming all other instructions have the same cost until a need arises to |
1757 | // differentiate them. |
1758 | return ConstantMatCost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, |
1759 | Opd1Info: Op1Info, Opd2Info: Op2Info, |
1760 | Args, CxtI); |
1761 | } |
1762 | |
1763 | InstructionCost InstrCost = getRISCVInstructionCost(OpCodes: Op, VT: LT.second, CostKind); |
1764 | // We use BasicTTIImpl to calculate scalar costs, which assumes floating point |
1765 | // ops are twice as expensive as integer ops. Do the same for vectors so |
1766 | // scalar floating point ops aren't cheaper than their vector equivalents. |
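  // E.g. (illustrative) a vfadd.vv ends up costed at twice the LMUL-scaled
  // cost of a vadd.vv of the same LMUL.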
1767 | if (Ty->isFPOrFPVectorTy()) |
1768 | InstrCost *= 2; |
1769 | return ConstantMatCost + LT.first * InstrCost; |
1770 | } |
1771 | |
1772 | // TODO: Deduplicate from TargetTransformInfoImplCRTPBase. |
1773 | InstructionCost RISCVTTIImpl::getPointersChainCost( |
1774 | ArrayRef<const Value *> Ptrs, const Value *Base, |
1775 | const TTI::PointersChainInfo &Info, Type *AccessTy, |
1776 | TTI::TargetCostKind CostKind) { |
1777 | InstructionCost Cost = TTI::TCC_Free; |
  // In the basic model we only take GEP instructions into account (although
  // an alloca instruction, a plain value, constants and/or constant
  // expressions, PHIs, bitcasts ... whatever is allowed to be used as a
  // pointer may appear here). Typically, if Base is not a GEP instruction and
  // all the pointers are relative to the same base address, the rest are
  // either GEP instructions, PHIs, bitcasts or constants. When we have the
  // same base, we just calculate the cost of each non-Base GEP as an ADD
  // operation if any of its indices is a non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as the sum of the costs of the GEP instructions.
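  // Illustrative example (assumption): for unit-stride GEPs over i32 with a
  // shared base, the byte offsets 0, 4, 8, ... can typically be folded into
  // the load/store addressing mode and add no cost, while a GEP with a
  // variable index is costed as one scalar add.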
1788 | for (auto [I, V] : enumerate(First&: Ptrs)) { |
1789 | const auto *GEP = dyn_cast<GetElementPtrInst>(Val: V); |
1790 | if (!GEP) |
1791 | continue; |
1792 | if (Info.isSameBase() && V != Base) { |
1793 | if (GEP->hasAllConstantIndices()) |
1794 | continue; |
1795 | // If the chain is unit-stride and BaseReg + stride*i is a legal |
1796 | // addressing mode, then presume the base GEP is sitting around in a |
1797 | // register somewhere and check if we can fold the offset relative to |
1798 | // it. |
1799 | unsigned Stride = DL.getTypeStoreSize(Ty: AccessTy); |
1800 | if (Info.isUnitStride() && |
1801 | isLegalAddressingMode(Ty: AccessTy, |
1802 | /* BaseGV */ nullptr, |
1803 | /* BaseOffset */ Stride * I, |
1804 | /* HasBaseReg */ true, |
1805 | /* Scale */ 0, |
1806 | AddrSpace: GEP->getType()->getPointerAddressSpace())) |
1807 | continue; |
1808 | Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty: GEP->getType(), CostKind, |
1809 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
1810 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
1811 | Args: std::nullopt); |
1812 | } else { |
1813 | SmallVector<const Value *> Indices(GEP->indices()); |
1814 | Cost += getGEPCost(PointeeType: GEP->getSourceElementType(), Ptr: GEP->getPointerOperand(), |
1815 | Operands: Indices, AccessType: AccessTy, CostKind); |
1816 | } |
1817 | } |
1818 | return Cost; |
1819 | } |
1820 | |
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1822 | TTI::UnrollingPreferences &UP, |
1823 | OptimizationRemarkEmitter *ORE) { |
1824 | // TODO: More tuning on benchmarks and metrics with changes as needed |
1825 | // would apply to all settings below to enable performance. |
1826 | |
1828 | if (ST->enableDefaultUnroll()) |
1829 | return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); |
1830 | |
  // Enable upper-bound unrolling universally; it is not dependent upon the
  // conditions below.
1833 | UP.UpperBound = true; |
1834 | |
1835 | // Disable loop unrolling for Oz and Os. |
1836 | UP.OptSizeThreshold = 0; |
1837 | UP.PartialOptSizeThreshold = 0; |
1838 | if (L->getHeader()->getParent()->hasOptSize()) |
1839 | return; |
1840 | |
1841 | SmallVector<BasicBlock *, 4> ExitingBlocks; |
1842 | L->getExitingBlocks(ExitingBlocks); |
1843 | LLVM_DEBUG(dbgs() << "Loop has:\n" |
1844 | << "Blocks: " << L->getNumBlocks() << "\n" |
1845 | << "Exit blocks: " << ExitingBlocks.size() << "\n" ); |
1846 | |
1847 | // Only allow another exit other than the latch. This acts as an early exit |
1848 | // as it mirrors the profitability calculation of the runtime unroller. |
1849 | if (ExitingBlocks.size() > 2) |
1850 | return; |
1851 | |
1852 | // Limit the CFG of the loop body for targets with a branch predictor. |
1853 | // Allowing 4 blocks permits if-then-else diamonds in the body. |
1854 | if (L->getNumBlocks() > 4) |
1855 | return; |
1856 | |
1857 | // Don't unroll vectorized loops, including the remainder loop |
1858 | if (getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized" )) |
1859 | return; |
1860 | |
1861 | // Scan the loop: don't unroll loops with calls as this could prevent |
1862 | // inlining. |
1863 | InstructionCost Cost = 0; |
1864 | for (auto *BB : L->getBlocks()) { |
1865 | for (auto &I : *BB) { |
1866 | // Initial setting - Don't unroll loops containing vectorized |
1867 | // instructions. |
1868 | if (I.getType()->isVectorTy()) |
1869 | return; |
1870 | |
1871 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) { |
1872 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) { |
1873 | if (!isLoweredToCall(F)) |
1874 | continue; |
1875 | } |
1876 | return; |
1877 | } |
1878 | |
1879 | SmallVector<const Value *> Operands(I.operand_values()); |
1880 | Cost += getInstructionCost(U: &I, Operands, |
1881 | CostKind: TargetTransformInfo::TCK_SizeAndLatency); |
1882 | } |
1883 | } |
1884 | |
1885 | LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n" ); |
1886 | |
1887 | UP.Partial = true; |
1888 | UP.Runtime = true; |
1889 | UP.UnrollRemainder = true; |
1890 | UP.UnrollAndJam = true; |
1891 | UP.UnrollAndJamInnerLoopThreshold = 60; |
1892 | |
  // Force-unrolling small loops can be very useful because of the
  // branch-taken cost of the backedge.
1895 | if (Cost < 12) |
1896 | UP.Force = true; |
1897 | } |
1898 | |
1899 | void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1900 | TTI::PeelingPreferences &PP) { |
1901 | BaseT::getPeelingPreferences(L, SE, PP); |
1902 | } |
1903 | |
1904 | unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) { |
1905 | TypeSize Size = DL.getTypeSizeInBits(Ty); |
1906 | if (Ty->isVectorTy()) { |
1907 | if (Size.isScalable() && ST->hasVInstructions()) |
1908 | return divideCeil(Numerator: Size.getKnownMinValue(), Denominator: RISCV::RVVBitsPerBlock); |
1909 | |
1910 | if (ST->useRVVForFixedLengthVectors()) |
1911 | return divideCeil(Numerator: Size, Denominator: ST->getRealMinVLen()); |
1912 | } |
1913 | |
1914 | return BaseT::getRegUsageForType(Ty); |
1915 | } |
1916 | |
1917 | unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { |
1918 | if (SLPMaxVF.getNumOccurrences()) |
1919 | return SLPMaxVF; |
1920 | |
  // Return how many elements fit in a register of getRegisterBitWidth bits.
  // This is the same routine as used in the LoopVectorizer. We should
  // probably be
1923 | // accounting for whether we actually have instructions with the right |
1924 | // lane type, but we don't have enough information to do that without |
1925 | // some additional plumbing which hasn't been justified yet. |
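  // For example (illustrative): if getRegisterBitWidth reports 128 bits and
  // ElemWidth is 32, the maximum VF returned to the SLP vectorizer is 4.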
1926 | TypeSize RegWidth = |
1927 | getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector); |
1928 | // If no vector registers, or absurd element widths, disable |
1929 | // vectorization by returning 1. |
1930 | return std::max<unsigned>(a: 1U, b: RegWidth.getFixedValue() / ElemWidth); |
1931 | } |
1932 | |
1933 | bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
1934 | const TargetTransformInfo::LSRCost &C2) { |
  // The RISC-V-specific part here is that instruction count gets first
  // priority.
1936 | // If we need to emit adds inside the loop to add up base registers, then |
1937 | // we need at least one extra temporary register. |
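  // E.g. (illustrative) a candidate with NumBaseAdds != 0 is compared as if
  // it used one more register than it reports.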
1938 | unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0); |
1939 | unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0); |
1940 | return std::tie(args: C1.Insns, args&: C1NumRegs, args: C1.AddRecCost, |
1941 | args: C1.NumIVMuls, args: C1.NumBaseAdds, |
1942 | args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
1943 | std::tie(args: C2.Insns, args&: C2NumRegs, args: C2.AddRecCost, |
1944 | args: C2.NumIVMuls, args: C2.NumBaseAdds, |
1945 | args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
1946 | } |
1947 | |
1948 | bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { |
1949 | auto *VTy = dyn_cast<VectorType>(Val: DataTy); |
1950 | if (!VTy || VTy->isScalableTy()) |
1951 | return false; |
1952 | |
1953 | if (!isLegalMaskedLoadStore(DataType: DataTy, Alignment)) |
1954 | return false; |
1955 | return true; |
1956 | } |
1957 | |
1958 | bool RISCVTTIImpl::areInlineCompatible(const Function *Caller, |
1959 | const Function *Callee) const { |
1960 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
1961 | |
1962 | const FeatureBitset &CallerBits = |
1963 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
1964 | const FeatureBitset &CalleeBits = |
1965 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
1966 | |
  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
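  // For example (illustrative): a callee built for rv64gc can be inlined into
  // a caller built for rv64gcv, but not vice versa, since V would then be
  // missing from the caller's feature set.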
1969 | return (CallerBits & CalleeBits) == CalleeBits; |
1970 | } |
1971 | |