| 1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// \file |
| 9 | /// This file implements a TargetTransformInfo analysis pass specific to the |
| 10 | /// X86 target machine. It uses the target's detailed information to provide |
| 11 | /// more precise answers to certain TTI queries, while letting the target |
| 12 | /// independent and default TTI implementations handle the rest. |
| 13 | /// |
| 14 | //===----------------------------------------------------------------------===// |
| 15 | /// About Cost Model numbers used below it's necessary to say the following: |
| 16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of a |
| 17 | /// specific CPU model. Usually the numbers correspond to the CPU where the |
| 18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in |
| 19 | /// the lookups below the cost is based on Nehalem as that was the first CPU |
| 20 | /// to support that feature level and thus has most likely the worst case cost, |
| 21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). |
| 22 | /// |
| 23 | /// Some examples of other technologies/CPUs: |
| 24 | /// SSE 3 - Pentium4 / Athlon64 |
| 25 | /// SSE 4.1 - Penryn |
| 26 | /// SSE 4.2 - Nehalem / Silvermont |
| 27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer |
| 28 | /// AVX2 - Haswell / Ryzen |
| 29 | /// AVX-512 - Xeon Phi / Skylake |
| 30 | /// |
| 31 | /// And some examples of instruction target dependent costs (latency) |
| 32 | /// divss sqrtss rsqrtss |
| 33 | /// AMD K7 11-16 19 3 |
| 34 | /// Piledriver 9-24 13-15 5 |
| 35 | /// Jaguar 14 16 2 |
| 36 | /// Pentium II,III 18 30 2 |
| 37 | /// Nehalem 7-14 7-18 3 |
| 38 | /// Haswell 10-13 11 5 |
| 39 | /// |
| 40 | /// Interpreting the 4 TargetCostKind types: |
| 41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case |
| 42 | /// values reported by the CPU scheduler models (and llvm-mca). |
| 43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the |
| 44 | /// actual encoding size of the instruction. |
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values,
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
| 49 | //===----------------------------------------------------------------------===// |
| 50 | |
| 51 | #include "X86TargetTransformInfo.h" |
| 52 | #include "llvm/ADT/SmallBitVector.h" |
| 53 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 54 | #include "llvm/CodeGen/BasicTTIImpl.h" |
| 55 | #include "llvm/CodeGen/CostTable.h" |
| 56 | #include "llvm/CodeGen/TargetLowering.h" |
| 57 | #include "llvm/IR/InstIterator.h" |
| 58 | #include "llvm/IR/IntrinsicInst.h" |
| 59 | #include <optional> |
| 60 | |
| 61 | using namespace llvm; |
| 62 | |
| 63 | #define DEBUG_TYPE "x86tti" |
| 64 | |
| 65 | //===----------------------------------------------------------------------===// |
| 66 | // |
| 67 | // X86 cost model. |
| 68 | // |
| 69 | //===----------------------------------------------------------------------===// |
| 70 | |
| 71 | // Helper struct to store/access costs for each cost kind. |
| 72 | // TODO: Move this to allow other targets to use it? |
| 73 | struct CostKindCosts { |
| 74 | unsigned RecipThroughputCost = ~0U; |
| 75 | unsigned LatencyCost = ~0U; |
| 76 | unsigned CodeSizeCost = ~0U; |
| 77 | unsigned SizeAndLatencyCost = ~0U; |
| 78 | |
| 79 | std::optional<unsigned> |
| 80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { |
| 81 | unsigned Cost = ~0U; |
| 82 | switch (Kind) { |
| 83 | case TargetTransformInfo::TCK_RecipThroughput: |
| 84 | Cost = RecipThroughputCost; |
| 85 | break; |
| 86 | case TargetTransformInfo::TCK_Latency: |
| 87 | Cost = LatencyCost; |
| 88 | break; |
| 89 | case TargetTransformInfo::TCK_CodeSize: |
| 90 | Cost = CodeSizeCost; |
| 91 | break; |
| 92 | case TargetTransformInfo::TCK_SizeAndLatency: |
| 93 | Cost = SizeAndLatencyCost; |
| 94 | break; |
| 95 | } |
| 96 | if (Cost == ~0U) |
| 97 | return std::nullopt; |
| 98 | return Cost; |
| 99 | } |
| 100 | }; |
| 101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; |
| 102 | using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>; |
| 103 | |
| 104 | TargetTransformInfo::PopcntSupportKind |
| 105 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) const { |
| 106 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
| 107 | // TODO: Currently the __builtin_popcount() implementation using SSE3 |
| 108 | // instructions is inefficient. Once the problem is fixed, we should |
| 109 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). |
| 110 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; |
| 111 | } |
| 112 | |
| 113 | std::optional<unsigned> X86TTIImpl::getCacheSize( |
| 114 | TargetTransformInfo::CacheLevel Level) const { |
| 115 | switch (Level) { |
| 116 | case TargetTransformInfo::CacheLevel::L1D: |
| 117 | // - Penryn |
| 118 | // - Nehalem |
| 119 | // - Westmere |
| 120 | // - Sandy Bridge |
| 121 | // - Ivy Bridge |
| 122 | // - Haswell |
| 123 | // - Broadwell |
| 124 | // - Skylake |
| 125 | // - Kabylake |
| 126 | return 32 * 1024; // 32 KiB |
| 127 | case TargetTransformInfo::CacheLevel::L2D: |
| 128 | // - Penryn |
| 129 | // - Nehalem |
| 130 | // - Westmere |
| 131 | // - Sandy Bridge |
| 132 | // - Ivy Bridge |
| 133 | // - Haswell |
| 134 | // - Broadwell |
| 135 | // - Skylake |
| 136 | // - Kabylake |
| 137 | return 256 * 1024; // 256 KiB |
| 138 | } |
| 139 | |
| 140 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
| 141 | } |
| 142 | |
| 143 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( |
| 144 | TargetTransformInfo::CacheLevel Level) const { |
| 145 | // - Penryn |
| 146 | // - Nehalem |
| 147 | // - Westmere |
| 148 | // - Sandy Bridge |
| 149 | // - Ivy Bridge |
| 150 | // - Haswell |
| 151 | // - Broadwell |
| 152 | // - Skylake |
| 153 | // - Kabylake |
| 154 | switch (Level) { |
| 155 | case TargetTransformInfo::CacheLevel::L1D: |
| 156 | [[fallthrough]]; |
| 157 | case TargetTransformInfo::CacheLevel::L2D: |
| 158 | return 8; |
| 159 | } |
| 160 | |
| 161 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
| 162 | } |
| 163 | |
| 164 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
| 165 | bool Vector = (ClassID == 1); |
| 166 | if (Vector && !ST->hasSSE1()) |
| 167 | return 0; |
| 168 | |
| 169 | if (ST->is64Bit()) { |
| 170 | if (Vector && ST->hasAVX512()) |
| 171 | return 32; |
| 172 | if (!Vector && ST->hasEGPR()) |
| 173 | return 32; |
| 174 | return 16; |
| 175 | } |
| 176 | return 8; |
| 177 | } |
| 178 | |
| 179 | bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const { |
| 180 | if (!ST->hasCF()) |
| 181 | return false; |
| 182 | if (!Ty) |
| 183 | return true; |
| 184 | // Conditional faulting is supported by CFCMOV, which only accepts |
| 185 | // 16/32/64-bit operands. |
| 186 | // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's |
| 187 | // profitable. |
| 188 | auto *VTy = dyn_cast<FixedVectorType>(Val: Ty); |
| 189 | if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1)) |
| 190 | return false; |
| 191 | auto *ScalarTy = Ty->getScalarType(); |
| 192 | switch (cast<IntegerType>(Val: ScalarTy)->getBitWidth()) { |
| 193 | default: |
| 194 | return false; |
| 195 | case 16: |
| 196 | case 32: |
| 197 | case 64: |
| 198 | return true; |
| 199 | } |
| 200 | } |
| 201 | |
| 202 | TypeSize |
| 203 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
| 204 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); |
| 205 | switch (K) { |
| 206 | case TargetTransformInfo::RGK_Scalar: |
| 207 | return TypeSize::getFixed(ExactSize: ST->is64Bit() ? 64 : 32); |
| 208 | case TargetTransformInfo::RGK_FixedWidthVector: |
| 209 | if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) |
| 210 | return TypeSize::getFixed(ExactSize: 512); |
| 211 | if (ST->hasAVX() && PreferVectorWidth >= 256) |
| 212 | return TypeSize::getFixed(ExactSize: 256); |
| 213 | if (ST->hasSSE1() && PreferVectorWidth >= 128) |
| 214 | return TypeSize::getFixed(ExactSize: 128); |
| 215 | return TypeSize::getFixed(ExactSize: 0); |
| 216 | case TargetTransformInfo::RGK_ScalableVector: |
| 217 | return TypeSize::getScalable(MinimumSize: 0); |
| 218 | } |
| 219 | |
| 220 | llvm_unreachable("Unsupported register kind" ); |
| 221 | } |
| 222 | |
| 223 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { |
| 224 | return getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
| 225 | .getFixedValue(); |
| 226 | } |
| 227 | |
| 228 | unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) const { |
| 229 | // If the loop will not be vectorized, don't interleave the loop. |
| 230 | // Let regular unroll to unroll the loop, which saves the overflow |
| 231 | // check and memory check cost. |
| 232 | if (VF.isScalar()) |
| 233 | return 1; |
| 234 | |
| 235 | if (ST->isAtom()) |
| 236 | return 1; |
| 237 | |
| 238 | // Sandybridge and Haswell have multiple execution ports and pipelined |
| 239 | // vector units. |
| 240 | if (ST->hasAVX()) |
| 241 | return 4; |
| 242 | |
| 243 | return 2; |
| 244 | } |
| 245 | |
| 246 | InstructionCost X86TTIImpl::getArithmeticInstrCost( |
| 247 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
| 248 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
| 249 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
| 250 | |
| 251 | // vXi8 multiplications are always promoted to vXi16. |
| 252 | // Sub-128-bit types can be extended/packed more efficiently. |
| 253 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && |
| 254 | Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) { |
| 255 | Type *WideVecTy = |
| 256 | VectorType::getExtendedElementVectorType(VTy: cast<VectorType>(Val: Ty)); |
| 257 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: Ty, |
| 258 | CCH: TargetTransformInfo::CastContextHint::None, |
| 259 | CostKind) + |
| 260 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: Ty, Src: WideVecTy, |
| 261 | CCH: TargetTransformInfo::CastContextHint::None, |
| 262 | CostKind) + |
| 263 | getArithmeticInstrCost(Opcode, Ty: WideVecTy, CostKind, Op1Info, Op2Info); |
| 264 | } |
| 265 | |
| 266 | // Legalize the type. |
| 267 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
| 268 | |
| 269 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 270 | assert(ISD && "Invalid opcode" ); |
| 271 | |
| 272 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && |
| 273 | (LT.second.getScalarType() == MVT::i32 || |
| 274 | LT.second.getScalarType() == MVT::i64)) { |
| 275 | // Check if the operands can be represented as a smaller datatype. |
| 276 | bool Op1Signed = false, Op2Signed = false; |
| 277 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Val: Args[0], isSigned&: Op1Signed); |
| 278 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Val: Args[1], isSigned&: Op2Signed); |
| 279 | unsigned OpMinSize = std::max(a: Op1MinSize, b: Op2MinSize); |
| 280 | bool SignedMode = Op1Signed || Op2Signed; |
| 281 | |
| 282 | // If both vXi32 are representable as i15 and at least one is constant, |
| 283 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we |
| 284 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. |
| 285 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow() && |
| 286 | LT.second.getScalarType() == MVT::i32) { |
| 287 | bool Op1Constant = |
| 288 | isa<ConstantDataVector>(Val: Args[0]) || isa<ConstantVector>(Val: Args[0]); |
| 289 | bool Op2Constant = |
| 290 | isa<ConstantDataVector>(Val: Args[1]) || isa<ConstantVector>(Val: Args[1]); |
| 291 | bool Op1Sext = isa<SExtInst>(Val: Args[0]) && |
| 292 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); |
| 293 | bool Op2Sext = isa<SExtInst>(Val: Args[1]) && |
| 294 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); |
| 295 | |
| 296 | bool IsZeroExtended = !Op1Signed || !Op2Signed; |
| 297 | bool IsConstant = Op1Constant || Op2Constant; |
| 298 | bool IsSext = Op1Sext || Op2Sext; |
| 299 | if (IsConstant || IsZeroExtended || IsSext) |
| 300 | LT.second = |
| 301 | MVT::getVectorVT(VT: MVT::i16, NumElements: 2 * LT.second.getVectorNumElements()); |
| 302 | } |
| 303 | |
| 304 | // Check if the vXi32 operands can be shrunk into a smaller datatype. |
| 305 | // This should match the codegen from reduceVMULWidth. |
| 306 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). |
| 307 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { |
| 308 | if (OpMinSize <= 7) |
| 309 | return LT.first * 3; // pmullw/sext |
| 310 | if (!SignedMode && OpMinSize <= 8) |
| 311 | return LT.first * 3; // pmullw/zext |
| 312 | if (OpMinSize <= 15) |
| 313 | return LT.first * 5; // pmullw/pmulhw/pshuf |
| 314 | if (!SignedMode && OpMinSize <= 16) |
| 315 | return LT.first * 5; // pmullw/pmulhw/pshuf |
| 316 | } |
| 317 | |
| 318 | // If both vXi64 are representable as (unsigned) i32, then we can perform |
| 319 | // the multiple with a single PMULUDQ instruction. |
| 320 | // TODO: Add (SSE41+) PMULDQ handling for signed extensions. |
| 321 | if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64) |
| 322 | ISD = X86ISD::PMULUDQ; |
| 323 | } |
| 324 | |
| 325 | // Vector multiply by pow2 will be simplified to shifts. |
| 326 | // Vector multiply by -pow2 will be simplified to shifts/negates. |
| 327 | if (ISD == ISD::MUL && Op2Info.isConstant() && |
| 328 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { |
| 329 | InstructionCost Cost = |
| 330 | getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind, |
| 331 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 332 | if (Op2Info.isNegatedPowerOf2()) |
| 333 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind); |
| 334 | return Cost; |
| 335 | } |
| 336 | |
| 337 | // On X86, vector signed division by constants power-of-two are |
| 338 | // normally expanded to the sequence SRA + SRL + ADD + SRA. |
| 339 | // The OperandValue properties may not be the same as that of the previous |
| 340 | // operation; conservatively assume OP_None. |
| 341 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && |
| 342 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
| 343 | InstructionCost Cost = |
| 344 | 2 * getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
| 345 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 346 | Cost += getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
| 347 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 348 | Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
| 349 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 350 | |
| 351 | if (ISD == ISD::SREM) { |
| 352 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) |
| 353 | Cost += getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
| 354 | Op2Info: Op2Info.getNoProps()); |
| 355 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
| 356 | Op2Info: Op2Info.getNoProps()); |
| 357 | } |
| 358 | |
| 359 | return Cost; |
| 360 | } |
| 361 | |
| 362 | // Vector unsigned division/remainder will be simplified to shifts/masks. |
| 363 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && |
| 364 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
| 365 | if (ISD == ISD::UDIV) |
| 366 | return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
| 367 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 368 | // UREM |
| 369 | return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, |
| 370 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 371 | } |
| 372 | |
| 373 | static const CostKindTblEntry GFNIUniformConstCostTable[] = { |
| 374 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 375 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 376 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 377 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 378 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 379 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 380 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 381 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 382 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
| 383 | }; |
| 384 | |
| 385 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI()) |
| 386 | if (const auto *Entry = |
| 387 | CostTableLookup(Table: GFNIUniformConstCostTable, ISD, Ty: LT.second)) |
| 388 | if (auto KindCost = Entry->Cost[CostKind]) |
| 389 | return LT.first * *KindCost; |
| 390 | |
| 391 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { |
| 392 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
| 393 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
| 394 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw, pand, pxor, psubb. |
| 395 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
| 396 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
| 397 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw, pand, pxor, psubb. |
| 398 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
| 399 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
| 400 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
| 401 | |
| 402 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
| 403 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
| 404 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
| 405 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
| 406 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
| 407 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
| 408 | }; |
| 409 | |
| 410 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) |
| 411 | if (const auto *Entry = |
| 412 | CostTableLookup(Table: AVX512BWUniformConstCostTable, ISD, Ty: LT.second)) |
| 413 | if (auto KindCost = Entry->Cost[CostKind]) |
| 414 | return LT.first * *KindCost; |
| 415 | |
| 416 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { |
| 417 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 12, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psllw + pand. |
| 418 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 12, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw + pand. |
| 419 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 12, .SizeAndLatencyCost: 12 } }, // psrlw, pand, pxor, psubb. |
| 420 | |
| 421 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psllw + split. |
| 422 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psrlw + split. |
| 423 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psraw + split. |
| 424 | |
| 425 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
| 426 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
| 427 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
| 428 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
| 429 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
| 430 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
| 431 | |
| 432 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
| 433 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
| 434 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
| 435 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
| 436 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
| 437 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
| 438 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
| 439 | |
| 440 | { .ISD: ISD::SDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
| 441 | { .ISD: ISD::SREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
| 442 | { .ISD: ISD::UDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
| 443 | { .ISD: ISD::UREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
| 444 | }; |
| 445 | |
| 446 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) |
| 447 | if (const auto *Entry = |
| 448 | CostTableLookup(Table: AVX512UniformConstCostTable, ISD, Ty: LT.second)) |
| 449 | if (auto KindCost = Entry->Cost[CostKind]) |
| 450 | return LT.first * *KindCost; |
| 451 | |
| 452 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { |
| 453 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
| 454 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
| 455 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
| 456 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // psllw + pand. |
| 457 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // psrlw + pand. |
| 458 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // psrlw, pand, pxor, psubb. |
| 459 | |
| 460 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
| 461 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
| 462 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw |
| 463 | { .ISD: ISD::SHL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw |
| 464 | { .ISD: ISD::SRL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw |
| 465 | { .ISD: ISD::SRA, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw |
| 466 | |
| 467 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
| 468 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
| 469 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
| 470 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
| 471 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld |
| 472 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad |
| 473 | |
| 474 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
| 475 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
| 476 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // psrad + shuffle. |
| 477 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
| 478 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
| 479 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // psrad + shuffle + split. |
| 480 | |
| 481 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
| 482 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
| 483 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
| 484 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
| 485 | }; |
| 486 | |
| 487 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) |
| 488 | if (const auto *Entry = |
| 489 | CostTableLookup(Table: AVX2UniformConstCostTable, ISD, Ty: LT.second)) |
| 490 | if (auto KindCost = Entry->Cost[CostKind]) |
| 491 | return LT.first * *KindCost; |
| 492 | |
| 493 | static const CostKindTblEntry AVXUniformConstCostTable[] = { |
| 494 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
| 495 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
| 496 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
| 497 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2*(psllw + pand) + split. |
| 498 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2*(psrlw + pand) + split. |
| 499 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 12, .SizeAndLatencyCost: 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. |
| 500 | |
| 501 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw. |
| 502 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw. |
| 503 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw. |
| 504 | { .ISD: ISD::SHL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psllw + split. |
| 505 | { .ISD: ISD::SRL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw + split. |
| 506 | { .ISD: ISD::SRA, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psraw + split. |
| 507 | |
| 508 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld. |
| 509 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld. |
| 510 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad. |
| 511 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // pslld + split. |
| 512 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrld + split. |
| 513 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrad + split. |
| 514 | |
| 515 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq. |
| 516 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq. |
| 517 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // psrad + shuffle. |
| 518 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // 2 x psllq + split. |
    { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // 2 x psrlq + split.
| 520 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, // 2 x psrad + shuffle + split. |
| 521 | |
| 522 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmuludq sequence + split. |
| 523 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmuludq+mul+sub sequence + split. |
| 524 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 12 } }, // 2*pmuludq sequence + split. |
| 525 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 16 } }, // 2*pmuludq+mul+sub sequence + split. |
| 526 | }; |
| 527 | |
| 528 | // XOP has faster vXi8 shifts. |
| 529 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && |
| 530 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
| 531 | if (const auto *Entry = |
| 532 | CostTableLookup(Table: AVXUniformConstCostTable, ISD, Ty: LT.second)) |
| 533 | if (auto KindCost = Entry->Cost[CostKind]) |
| 534 | return LT.first * *KindCost; |
| 535 | |
| 536 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { |
| 537 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
| 538 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
| 539 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
| 540 | |
| 541 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw. |
| 542 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw. |
| 543 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw. |
| 544 | |
| 545 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
| 546 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld. |
| 547 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad. |
| 548 | |
| 549 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq. |
| 550 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq. |
| 551 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, // 2 x psrad + shuffle. |
| 552 | |
| 553 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
| 554 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
| 555 | { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
| 556 | { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
| 557 | }; |
| 558 | |
| 559 | // XOP has faster vXi8 shifts. |
| 560 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && |
| 561 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
| 562 | if (const auto *Entry = |
| 563 | CostTableLookup(Table: SSE2UniformConstCostTable, ISD, Ty: LT.second)) |
| 564 | if (auto KindCost = Entry->Cost[CostKind]) |
| 565 | return LT.first * *KindCost; |
| 566 | |
| 567 | static const CostKindTblEntry AVX512BWConstCostTable[] = { |
| 568 | { .ISD: ISD::SDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
| 569 | { .ISD: ISD::SREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
| 570 | { .ISD: ISD::UDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
| 571 | { .ISD: ISD::UREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
| 572 | |
| 573 | { .ISD: ISD::SDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhw sequence |
| 574 | { .ISD: ISD::SREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhw+mul+sub sequence |
| 575 | { .ISD: ISD::UDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhuw sequence |
| 576 | { .ISD: ISD::UREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhuw+mul+sub sequence |
| 577 | }; |
| 578 | |
| 579 | if (Op2Info.isConstant() && ST->hasBWI()) |
| 580 | if (const auto *Entry = |
| 581 | CostTableLookup(Table: AVX512BWConstCostTable, ISD, Ty: LT.second)) |
| 582 | if (auto KindCost = Entry->Cost[CostKind]) |
| 583 | return LT.first * *KindCost; |
| 584 | |
| 585 | static const CostKindTblEntry AVX512ConstCostTable[] = { |
| 586 | { .ISD: ISD::SDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 28 } }, // 4*ext+4*pmulhw sequence |
| 587 | { .ISD: ISD::SREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
| 588 | { .ISD: ISD::UDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 28 } }, // 4*ext+4*pmulhw sequence |
| 589 | { .ISD: ISD::UREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
| 590 | |
| 591 | { .ISD: ISD::SDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 12 } }, // 2*vpmulhw sequence |
| 592 | { .ISD: ISD::SREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 16 } }, // 2*vpmulhw+mul+sub sequence |
| 593 | { .ISD: ISD::UDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 12 } }, // 2*vpmulhuw sequence |
| 594 | { .ISD: ISD::UREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 16 } }, // 2*vpmulhuw+mul+sub sequence |
| 595 | |
| 596 | { .ISD: ISD::SDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuldq sequence |
| 597 | { .ISD: ISD::SREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 17 } }, // vpmuldq+mul+sub sequence |
| 598 | { .ISD: ISD::UDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
| 599 | { .ISD: ISD::UREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 17 } }, // vpmuludq+mul+sub sequence |
| 600 | }; |
| 601 | |
| 602 | if (Op2Info.isConstant() && ST->hasAVX512()) |
| 603 | if (const auto *Entry = |
| 604 | CostTableLookup(Table: AVX512ConstCostTable, ISD, Ty: LT.second)) |
| 605 | if (auto KindCost = Entry->Cost[CostKind]) |
| 606 | return LT.first * *KindCost; |
| 607 | |
| 608 | static const CostKindTblEntry AVX2ConstCostTable[] = { |
| 609 | { .ISD: ISD::SDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
| 610 | { .ISD: ISD::SREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
| 611 | { .ISD: ISD::UDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
| 612 | { .ISD: ISD::UREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
| 613 | |
| 614 | { .ISD: ISD::SDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhw sequence |
| 615 | { .ISD: ISD::SREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhw+mul+sub sequence |
| 616 | { .ISD: ISD::UDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhuw sequence |
| 617 | { .ISD: ISD::UREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhuw+mul+sub sequence |
| 618 | |
| 619 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuldq sequence |
| 620 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 19 } }, // vpmuldq+mul+sub sequence |
| 621 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
| 622 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 19 } }, // vpmuludq+mul+sub sequence |
| 623 | }; |
| 624 | |
| 625 | if (Op2Info.isConstant() && ST->hasAVX2()) |
| 626 | if (const auto *Entry = CostTableLookup(Table: AVX2ConstCostTable, ISD, Ty: LT.second)) |
| 627 | if (auto KindCost = Entry->Cost[CostKind]) |
| 628 | return LT.first * *KindCost; |
| 629 | |
| 630 | static const CostKindTblEntry AVXConstCostTable[] = { |
| 631 | { .ISD: ISD::SDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 30 } }, // 4*ext+4*pmulhw sequence + split. |
| 632 | { .ISD: ISD::SREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
| 633 | { .ISD: ISD::UDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 30 } }, // 4*ext+4*pmulhw sequence + split. |
| 634 | { .ISD: ISD::UREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
| 635 | |
| 636 | { .ISD: ISD::SDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmulhw sequence + split. |
| 637 | { .ISD: ISD::SREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmulhw+mul+sub sequence + split. |
| 638 | { .ISD: ISD::UDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmulhuw sequence + split. |
| 639 | { .ISD: ISD::UREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmulhuw+mul+sub sequence + split. |
| 640 | |
| 641 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 32 } }, // vpmuludq sequence |
| 642 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 38 } }, // vpmuludq+mul+sub sequence |
| 643 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 32 } }, // 2*pmuludq sequence + split. |
| 644 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 42 } }, // 2*pmuludq+mul+sub sequence + split. |
| 645 | }; |
| 646 | |
| 647 | if (Op2Info.isConstant() && ST->hasAVX()) |
| 648 | if (const auto *Entry = CostTableLookup(Table: AVXConstCostTable, ISD, Ty: LT.second)) |
| 649 | if (auto KindCost = Entry->Cost[CostKind]) |
| 650 | return LT.first * *KindCost; |
| 651 | |
| 652 | static const CostKindTblEntry SSE41ConstCostTable[] = { |
| 653 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
| 654 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20 } }, // vpmuludq+mul+sub sequence |
| 655 | }; |
| 656 | |
| 657 | if (Op2Info.isConstant() && ST->hasSSE41()) |
| 658 | if (const auto *Entry = |
| 659 | CostTableLookup(Table: SSE41ConstCostTable, ISD, Ty: LT.second)) |
| 660 | if (auto KindCost = Entry->Cost[CostKind]) |
| 661 | return LT.first * *KindCost; |
| 662 | |
| 663 | static const CostKindTblEntry SSE2ConstCostTable[] = { |
| 664 | { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
| 665 | { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
| 666 | { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
| 667 | { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
| 668 | |
| 669 | { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6 } }, // pmulhw sequence |
| 670 | { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8 } }, // pmulhw+mul+sub sequence |
| 671 | { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6 } }, // pmulhuw sequence |
| 672 | { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8 } }, // pmulhuw+mul+sub sequence |
| 673 | |
| 674 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19 } }, // pmuludq sequence |
| 675 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 24 } }, // pmuludq+mul+sub sequence |
| 676 | { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15 } }, // pmuludq sequence |
| 677 | { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20 } }, // pmuludq+mul+sub sequence |
| 678 | }; |
| 679 | |
| 680 | if (Op2Info.isConstant() && ST->hasSSE2()) |
| 681 | if (const auto *Entry = CostTableLookup(Table: SSE2ConstCostTable, ISD, Ty: LT.second)) |
| 682 | if (auto KindCost = Entry->Cost[CostKind]) |
| 683 | return LT.first * *KindCost; |
| 684 | |
| 685 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { |
| 686 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + pand. |
| 687 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
| 688 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4,.LatencyCost: 12, .CodeSizeCost: 8,.SizeAndLatencyCost: 12 } }, // psrlw, pand, pxor, psubb. |
| 689 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
| 690 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
| 691 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
| 692 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
| 693 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // psrlw + pand. |
| 694 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 15 } }, // psrlw, pand, pxor, psubb. |
| 695 | |
| 696 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw |
| 697 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw |
    { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psraw
| 699 | }; |
| 700 | |
| 701 | if (ST->hasBWI() && Op2Info.isUniform()) |
| 702 | if (const auto *Entry = |
| 703 | CostTableLookup(Table: AVX512BWUniformCostTable, ISD, Ty: LT.second)) |
| 704 | if (auto KindCost = Entry->Cost[CostKind]) |
| 705 | return LT.first * *KindCost; |
| 706 | |
| 707 | static const CostKindTblEntry AVX512UniformCostTable[] = { |
| 708 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + split. |
| 709 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrlw + split. |
| 710 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psraw + split. |
| 711 | |
| 712 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // pslld |
| 713 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrld |
| 714 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrad |
| 715 | |
| 716 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
| 717 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
| 718 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
| 719 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
| 720 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
| 721 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
| 722 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
| 723 | }; |
| 724 | |
| 725 | if (ST->hasAVX512() && Op2Info.isUniform()) |
| 726 | if (const auto *Entry = |
| 727 | CostTableLookup(Table: AVX512UniformCostTable, ISD, Ty: LT.second)) |
| 728 | if (auto KindCost = Entry->Cost[CostKind]) |
| 729 | return LT.first * *KindCost; |
| 730 | |
| 731 | static const CostKindTblEntry AVX2UniformCostTable[] = { |
| 732 | // Uniform splats are cheaper for the following instructions. |
| 733 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + pand. |
| 734 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
| 735 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
| 736 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
| 737 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
| 738 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // psrlw, pand, pxor, psubb. |
| 739 | |
| 740 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
| 741 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
| 742 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
| 743 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw. |
| 744 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw. |
| 745 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psraw. |
| 746 | |
| 747 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
| 748 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld |
| 749 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad |
| 750 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // pslld |
| 751 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrld |
| 752 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrad |
| 753 | |
| 754 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
| 755 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
| 756 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2 x psrad + shuffle. |
| 757 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
| 758 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
| 759 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // 2 x psrad + shuffle. |
| 760 | }; |
| 761 | |
| 762 | if (ST->hasAVX2() && Op2Info.isUniform()) |
| 763 | if (const auto *Entry = |
| 764 | CostTableLookup(Table: AVX2UniformCostTable, ISD, Ty: LT.second)) |
| 765 | if (auto KindCost = Entry->Cost[CostKind]) |
| 766 | return LT.first * *KindCost; |
| 767 | |
| 768 | static const CostKindTblEntry AVXUniformCostTable[] = { |
| 769 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
| 770 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
| 771 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
| 772 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 8,.CodeSizeCost: 11,.SizeAndLatencyCost: 14 } }, // psllw + pand + split. |
| 773 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 9,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // psrlw + pand + split. |
| 774 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 10,.LatencyCost: 11,.CodeSizeCost: 16,.SizeAndLatencyCost: 21 } }, // psrlw, pand, pxor, psubb + split. |
| 775 | |
| 776 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
| 777 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
| 778 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
| 779 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + split. |
| 780 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrlw + split. |
| 781 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psraw + split. |
| 782 | |
| 783 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld. |
| 784 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld. |
| 785 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad. |
| 786 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // pslld + split. |
| 787 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrld + split. |
| 788 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrad + split. |
| 789 | |
| 790 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq. |
| 791 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq. |
| 792 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2 x psrad + shuffle. |
| 793 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psllq + split. |
| 794 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psrlq + split. |
| 795 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 10,.SizeAndLatencyCost: 13 } }, // 2 x (2 x psrad + shuffle) + split. |
| 796 | }; |
| 797 | |
| 798 | // XOP has faster vXi8 shifts. |
| 799 | if (ST->hasAVX() && Op2Info.isUniform() && |
| 800 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
| 801 | if (const auto *Entry = |
| 802 | CostTableLookup(Table: AVXUniformCostTable, ISD, Ty: LT.second)) |
| 803 | if (auto KindCost = Entry->Cost[CostKind]) |
| 804 | return LT.first * *KindCost; |
| 805 | |
| 806 | static const CostKindTblEntry SSE2UniformCostTable[] = { |
| 807 | // Uniform splats are cheaper for the following instructions. |
| 808 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, // psllw + pand. |
| 809 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
| 810 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 15, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // pcmpgtb sequence. |
| 811 | |
| 812 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
| 813 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
| 814 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
| 815 | |
| 816 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
| 817 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld. |
| 818 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad. |
| 819 | |
| 820 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq. |
| 821 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq. |
| 822 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2*psrlq + xor + sub. |
| 823 | }; |
| 824 | |
| 825 | if (ST->hasSSE2() && Op2Info.isUniform() && |
| 826 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
| 827 | if (const auto *Entry = |
| 828 | CostTableLookup(Table: SSE2UniformCostTable, ISD, Ty: LT.second)) |
| 829 | if (auto KindCost = Entry->Cost[CostKind]) |
| 830 | return LT.first * *KindCost; |
| 831 | |
| 832 | static const CostKindTblEntry AVX512DQCostTable[] = { |
| 833 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmullq |
| 834 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmullq |
| 835 | { .ISD: ISD::MUL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } } // pmullq |
| 836 | }; |
| 837 | |
| 838 | // Look for AVX512DQ lowering tricks for custom cases. |
| 839 | if (ST->hasDQI()) |
| 840 | if (const auto *Entry = CostTableLookup(Table: AVX512DQCostTable, ISD, Ty: LT.second)) |
| 841 | if (auto KindCost = Entry->Cost[CostKind]) |
| 842 | return LT.first * *KindCost; |
| 843 | |
  // Costs for ops that AVX512BW improves: vXi16 variable shifts become single
  // vpsllvw/vpsrlvw/vpsravw instructions, vXi8 shifts use
  // extend/shift/pack sequences, and 512-bit vXi8/vXi16 add/sub/mul no longer
  // need splitting. Cost tuples are
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX512BWCostTable[] = {
    { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsllvw/pack sequence.
    { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsrlvw/pack sequence.
    { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsravw/pack sequence.
    { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 23,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // extend/vpsllvw/pack sequence.
    { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 30,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // extend/vpsrlvw/pack sequence.
    { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13,.CodeSizeCost: 24,.SizeAndLatencyCost: 30 } }, // extend/vpsravw/pack sequence.
    { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 19,.CodeSizeCost: 13,.SizeAndLatencyCost: 15 } }, // extend/vpsllvw/pack sequence.
    { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 27,.CodeSizeCost: 15,.SizeAndLatencyCost: 18 } }, // extend/vpsrlvw/pack sequence.
    { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 15,.CodeSizeCost: 30,.SizeAndLatencyCost: 30 } }, // extend/vpsravw/pack sequence.

    // vXi16 variable shifts map 1:1 onto the new BW shift instructions.
    { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw
    { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw
    { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw
    { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw
    { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw
    { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw
    { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw
    { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw
    { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw

    // 512-bit byte/word add/sub are single instructions with BWI.
    { .ISD: ISD::ADD, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddb
    { .ISD: ISD::ADD, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddw

    { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddb
    { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddw
    { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddd
    { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddq

    { .ISD: ISD::SUB, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubb
    { .ISD: ISD::SUB, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubw

    { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 12, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/pmullw/trunc
    { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // pmaddubsw
    { .ISD: ISD::MUL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // pmaddubsw
    { .ISD: ISD::MUL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw

    { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubb
    { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubw
    { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubd
    { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubq
  };
| 886 | |
| 887 | // Look for AVX512BW lowering tricks for custom cases. |
| 888 | if (ST->hasBWI()) |
| 889 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTable, ISD, Ty: LT.second)) |
| 890 | if (auto KindCost = Entry->Cost[CostKind]) |
| 891 | return LT.first * *KindCost; |
| 892 | |
  // Baseline AVX512F costs (no BWI/DQ): 512-bit byte/word ops must be split
  // into 256-bit halves or emulated via blend sequences, while i32/i64 shifts
  // and FP arithmetic are native (FP numbers taken from Skylake, agner.org).
  // Cost tuples are { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX512CostTable[] = {
    { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19,.CodeSizeCost: 27,.SizeAndLatencyCost: 33 } }, // vpblendv+split sequence.
    { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19,.CodeSizeCost: 30,.SizeAndLatencyCost: 36 } }, // vpblendv+split sequence.
    { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 37, .LatencyCost: 37,.CodeSizeCost: 51,.SizeAndLatencyCost: 63 } }, // vpblendv+split sequence.

    { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsrlvd/pack sequence.
    { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsravd/pack sequence.

    // i32/i64 variable shifts are single instructions at every width.
    { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    // Without BWI, 512-bit byte/word add/sub need two 256-bit halves.
    { .ISD: ISD::ADD, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*paddb + split
    { .ISD: ISD::ADD, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*paddw + split

    { .ISD: ISD::SUB, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*psubb + split
    { .ISD: ISD::SUB, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*psubw + split

    { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::XOR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::XOR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::XOR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::XOR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::MUL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org)
    { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org)
    { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org)
    { .ISD: ISD::MUL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // 3*pmuludq/3*shift/2*add
    { .ISD: ISD::MUL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Skylake from http://www.agner.org/

    { .ISD: X86ISD::PMULUDQ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::FNEG, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FADD, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FSUB, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/

    { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FDIV, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 23, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/

    { .ISD: ISD::FNEG, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FADD, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FSUB, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/

    { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/
    { .ISD: ISD::FDIV, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/
  };
| 981 | |
| 982 | if (ST->hasAVX512()) |
| 983 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTable, ISD, Ty: LT.second)) |
| 984 | if (auto KindCost = Entry->Cost[CostKind]) |
| 985 | return LT.first * *KindCost; |
| 986 | |
  // Costs for variable-amount i32/i64 shifts on AVX2 (vpsllv*/vpsrlv*/vpsrav*,
  // Haswell numbers from agner.org). Cost tuples are
  // { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
    // customize them to detect the cases where shift amount is a scalar one.
    { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsllvd (Haswell from agner.org)
    { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsrlvd (Haswell from agner.org)
    { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsravd (Haswell from agner.org)
    { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsllvd (Haswell from agner.org)
    { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsrlvd (Haswell from agner.org)
    { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsravd (Haswell from agner.org)
    { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvq (Haswell from agner.org)
    { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvq (Haswell from agner.org)
    { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vpsllvq (Haswell from agner.org)
    { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vpsrlvq (Haswell from agner.org)
    // NOTE: no vXi64 SRA entry — AVX2 has no vpsravq; handled elsewhere.
  };
| 1001 | |
| 1002 | if (ST->hasAVX512()) { |
| 1003 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) |
| 1004 | // On AVX512, a packed v32i16 shift left by a constant build_vector |
| 1005 | // is lowered into a vector multiply (vpmullw). |
| 1006 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
| 1007 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 1008 | } |
| 1009 | |
| 1010 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). |
| 1011 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { |
| 1012 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && |
| 1013 | Op2Info.isConstant()) |
| 1014 | // On AVX2, a packed v16i16 shift left by a constant build_vector |
| 1015 | // is lowered into a vector multiply (vpmullw). |
| 1016 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
| 1017 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 1018 | |
| 1019 | if (const auto *Entry = CostTableLookup(Table: AVX2ShiftCostTable, ISD, Ty: LT.second)) |
| 1020 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1021 | return LT.first * *KindCost; |
| 1022 | } |
| 1023 | |
  // Costs for shifts lowered via AMD XOP's per-element shift instructions.
  // Cost tuples are { recip-throughput, latency, code-size, size-and-latency }.
  // Used with ShiftISD, where constant right shifts have been remapped to SHL
  // (the negation folds away) — see the hasXOP() lookup below this table.
  static const CostKindTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
  };
| 1052 | |
| 1053 | // Look for XOP lowering tricks. |
| 1054 | if (ST->hasXOP()) { |
| 1055 | // If the right shift is constant then we'll fold the negation so |
| 1056 | // it's as cheap as a left shift. |
| 1057 | int ShiftISD = ISD; |
| 1058 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) |
| 1059 | ShiftISD = ISD::SHL; |
| 1060 | if (const auto *Entry = |
| 1061 | CostTableLookup(Table: XOPShiftCostTable, ISD: ShiftISD, Ty: LT.second)) |
| 1062 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1063 | return LT.first * *KindCost; |
| 1064 | } |
| 1065 | |
| 1066 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { |
| 1067 | MVT VT = LT.second; |
| 1068 | // Vector shift left by non uniform constant can be lowered |
| 1069 | // into vector multiply. |
| 1070 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || |
| 1071 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) |
| 1072 | ISD = ISD::MUL; |
| 1073 | } |
| 1074 | |
  // FP division costs for CPUs with useGLMDivSqrtCosts() set (Goldmont-class,
  // where divide is much slower than the generic model assumes). Cost tuples
  // are { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry GLMCostTable[] = {
    { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 19, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divss
    { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 35, .LatencyCost: 36, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divps
    { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 33, .LatencyCost: 34, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divsd
    { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 65, .LatencyCost: 66, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divpd
  };
| 1081 | |
| 1082 | if (ST->useGLMDivSqrtCosts()) |
| 1083 | if (const auto *Entry = CostTableLookup(Table: GLMCostTable, ISD, Ty: LT.second)) |
| 1084 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1085 | return LT.first * *KindCost; |
| 1086 | |
  // Arithmetic costs for CPUs with useSLMArithCosts() set (Silvermont-class),
  // whose multiply/divide throughput differs markedly from the generic model.
  // Cost tuples are { recip-throughput, latency, code-size, size-and-latency }.
  static const CostKindTblEntry SLMCostTable[] = {
    { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 7 } }, // pmulld
    { .ISD: ISD::MUL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw
    { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulsd
    { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulss
    { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulpd
    { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulps
    { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 19, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divss
    { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 39, .LatencyCost: 39, .CodeSizeCost: 1, .SizeAndLatencyCost: 6 } }, // divps
    { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 34, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divsd
    { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 69, .LatencyCost: 69, .CodeSizeCost: 1, .SizeAndLatencyCost: 6 } }, // divpd
    { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // addpd
    { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long:
    // multiplies(3), shifts(3) and adds(2)
    // slm muldq version throughput is 2 and addq throughput 4
    // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
    // 3X4 (addq throughput) = 17
    { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 22, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } },
    // slm addq\subq throughput is 4
    { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
  };
| 1110 | |
| 1111 | if (ST->useSLMArithCosts()) |
| 1112 | if (const auto *Entry = CostTableLookup(Table: SLMCostTable, ISD, Ty: LT.second)) |
| 1113 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1114 | return LT.first * *KindCost; |
| 1115 | |
| 1116 | static const CostKindTblEntry AVX2CostTable[] = { |
| 1117 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 21,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // vpblendvb sequence. |
| 1118 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 23,.CodeSizeCost: 11,.SizeAndLatencyCost: 22 } }, // vpblendvb sequence. |
| 1119 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsrlvd/pack sequence. |
| 1120 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsrlvd/pack sequence. |
| 1121 | |
| 1122 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 27,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // vpblendvb sequence. |
| 1123 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 30,.CodeSizeCost: 12,.SizeAndLatencyCost: 24 } }, // vpblendvb sequence. |
| 1124 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsrlvd/pack sequence. |
| 1125 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsrlvd/pack sequence. |
| 1126 | |
| 1127 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 17,.CodeSizeCost: 24,.SizeAndLatencyCost: 30 } }, // vpblendvb sequence. |
| 1128 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 20,.CodeSizeCost: 24,.SizeAndLatencyCost: 43 } }, // vpblendvb sequence. |
| 1129 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsravd/pack sequence. |
| 1130 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsravd/pack sequence. |
| 1131 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // srl/xor/sub sequence. |
| 1132 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // srl/xor/sub sequence. |
| 1133 | |
| 1134 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubb |
| 1135 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddb |
| 1136 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubw |
| 1137 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddw |
| 1138 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubd |
| 1139 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddd |
| 1140 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubq |
| 1141 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddq |
| 1142 | |
| 1143 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18, .CodeSizeCost: 6,.SizeAndLatencyCost: 12 } }, // extend/pmullw/pack |
| 1144 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 8,.SizeAndLatencyCost: 16 } }, // pmaddubsw |
| 1145 | { .ISD: ISD::MUL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmullw |
| 1146 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld |
| 1147 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld |
| 1148 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 8,.SizeAndLatencyCost: 13 } }, // 3*pmuludq/3*shift/2*add |
| 1149 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // 3*pmuludq/3*shift/2*add |
| 1150 | |
| 1151 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 1152 | |
| 1153 | { .ISD: ISD::FNEG, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorpd |
| 1154 | { .ISD: ISD::FNEG, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
| 1155 | |
| 1156 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddsd |
| 1157 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddss |
| 1158 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddpd |
| 1159 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddps |
| 1160 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vaddpd |
| 1161 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vaddps |
| 1162 | |
| 1163 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubsd |
| 1164 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubss |
| 1165 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubpd |
| 1166 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubps |
| 1167 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vsubpd |
| 1168 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vsubps |
| 1169 | |
| 1170 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulsd |
| 1171 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulss |
| 1172 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulpd |
| 1173 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulps |
| 1174 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vmulpd |
| 1175 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vmulps |
| 1176 | |
| 1177 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivss |
| 1178 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivps |
| 1179 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vdivps |
| 1180 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivsd |
| 1181 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivpd |
| 1182 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vdivpd |
| 1183 | }; |
| 1184 | |
| 1185 | // Look for AVX2 lowering tricks for custom cases.
|      | // Lookup order matters: AVX2 is consulted before the AVX1/SSE tables
|      | // below, so the most capable matching feature level wins. A table hit
|      | // only returns early when the entry defines a cost for the requested
|      | // CostKind; otherwise we fall through to the older tables.
|      | // LT.first is the legalization split count, so the per-op table cost is
|      | // scaled by the number of legalized operations.
| 1186 | if (ST->hasAVX2())
| 1187 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTable, ISD, Ty: LT.second))
| 1188 | if (auto KindCost = Entry->Cost[CostKind])
| 1189 | return LT.first * *KindCost;
| 1190 | 
|      | // AVX1 (no AVX2) cost overrides. Costs are per-CostKind tuples
|      | // { recip-throughput, latency, code-size, size+latency }; per-entry
|      | // comments name the instruction sequence and, where cited, the CPU whose
|      | // measured numbers (agner.org) were used. 256-bit integer ops have no
|      | // native AVX1 encoding, hence the "+ split" entries (two 128-bit halves
|      | // plus an extract/insert).
| 1191 | static const CostKindTblEntry AVX1CostTable[] = {
| 1192 | // We don't have to scalarize unsupported ops. We can issue two half-sized
| 1193 | // operations and we only need to extract the upper YMM half.
| 1194 | // Two ops + 1 extract + 1 insert = 4.
| 1195 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 11, .CodeSizeCost: 18, .SizeAndLatencyCost: 19 } }, // pmaddubsw + split
| 1196 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, // 2*pmaddubsw/3*and/psllw/or
| 1197 | { .ISD: ISD::MUL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // pmullw + split
| 1198 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, // pmulld + split
| 1199 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmulld
| 1200 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 15, .CodeSizeCost: 19, .SizeAndLatencyCost: 20 } },
| 1201 | 
|      | // AVX1 has no 256-bit integer logic ops, so these lower to the float
|      | // forms (vandps/vorps/vxorps), which are still single ops per 256 bits.
| 1202 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
| 1203 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
| 1204 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
| 1205 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
| 1206 | 
| 1207 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
| 1208 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
| 1209 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
| 1210 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
| 1211 | 
| 1212 | { .ISD: ISD::XOR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
| 1213 | { .ISD: ISD::XOR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
| 1214 | { .ISD: ISD::XOR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
| 1215 | { .ISD: ISD::XOR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
| 1216 | 
| 1217 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubb + split
| 1218 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddb + split
| 1219 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubw + split
| 1220 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddw + split
| 1221 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubd + split
| 1222 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddd + split
| 1223 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubq + split
| 1224 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddq + split
| 1225 | { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubq
| 1226 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddq
| 1227 | 
|      | // Variable shifts: AVX1 has no per-element vector shifts, so these model
|      | // blend-based select sequences or per-lane shift + blend expansions.
| 1228 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 21,.CodeSizeCost: 11,.SizeAndLatencyCost: 17 } }, // pblendvb sequence.
| 1229 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22,.CodeSizeCost: 27,.SizeAndLatencyCost: 40 } }, // pblendvb sequence + split.
| 1230 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9,.CodeSizeCost: 11,.SizeAndLatencyCost: 11 } }, // pblendvb sequence.
| 1231 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 24,.SizeAndLatencyCost: 25 } }, // pblendvb sequence + split.
| 1232 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // pslld/paddd/cvttps2dq/pmulld
| 1233 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 11,.CodeSizeCost: 12,.SizeAndLatencyCost: 17 } }, // pslld/paddd/cvttps2dq/pmulld + split
| 1234 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // Shift each lane + blend.
| 1235 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // Shift each lane + blend + split.
| 1236 | 
| 1237 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 27,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // pblendvb sequence.
| 1238 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 23, .LatencyCost: 23,.CodeSizeCost: 30,.SizeAndLatencyCost: 43 } }, // pblendvb sequence + split.
| 1239 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 14,.SizeAndLatencyCost: 22 } }, // pblendvb sequence.
| 1240 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30,.CodeSizeCost: 31,.SizeAndLatencyCost: 48 } }, // pblendvb sequence + split.
| 1241 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // Shift each lane + blend.
| 1242 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14,.CodeSizeCost: 26,.SizeAndLatencyCost: 34 } }, // Shift each lane + blend + split.
| 1243 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // Shift each lane + blend.
| 1244 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // Shift each lane + blend + split.
| 1245 | 
| 1246 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 22,.CodeSizeCost: 24,.SizeAndLatencyCost: 36 } }, // pblendvb sequence.
| 1247 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 45,.CodeSizeCost: 51,.SizeAndLatencyCost: 76 } }, // pblendvb sequence + split.
| 1248 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 14,.SizeAndLatencyCost: 22 } }, // pblendvb sequence.
| 1249 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30,.CodeSizeCost: 31,.SizeAndLatencyCost: 48 } }, // pblendvb sequence + split.
| 1250 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // Shift each lane + blend.
| 1251 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14,.CodeSizeCost: 26,.SizeAndLatencyCost: 34 } }, // Shift each lane + blend + split.
| 1252 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // Shift each lane + blend.
| 1253 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 22,.SizeAndLatencyCost: 30 } }, // Shift each lane + blend + split.
| 1254 | 
| 1255 | { .ISD: ISD::FNEG, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/
| 1256 | { .ISD: ISD::FNEG, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/
| 1257 | 
| 1258 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1259 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1260 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1261 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1262 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/
| 1263 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/
| 1264 | 
| 1265 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1266 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1267 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1268 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
| 1269 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/
| 1270 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/
| 1271 | 
| 1272 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
| 1273 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
| 1274 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
| 1275 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
| 1276 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/
| 1277 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/
| 1278 | 
| 1279 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
| 1280 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
| 1281 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 29, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // SNB from http://www.agner.org/
| 1282 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
| 1283 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
| 1284 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 45, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // SNB from http://www.agner.org/
| 1285 | };
| 1286 | 
|      | // Only reached when the AVX2 table above had no usable entry; return the
|      | // per-op cost scaled by the legalization split count (LT.first).
| 1287 | if (ST->hasAVX())
| 1288 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTable, ISD, Ty: LT.second))
| 1289 | if (auto KindCost = Entry->Cost[CostKind])
| 1290 | return LT.first * *KindCost;
| 1291 | |
|      | // SSE4.2-level cost overrides, mostly scalar/128-bit FP ops with Nehalem
|      | // timings from agner.org. Costs are { recip-throughput, latency,
|      | // code-size, size+latency }, keyed by the requested TargetCostKind.
| 1292 | static const CostKindTblEntry SSE42CostTable[] = {
| 1293 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1294 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1295 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1296 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1297 | 
| 1298 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1299 | { .ISD: ISD::FSUB, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1300 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1301 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1302 | 
| 1303 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1304 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1305 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1306 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1307 | 
| 1308 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1309 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1310 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1311 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
| 1312 | 
| 1313 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 10 } } // 3*pmuludq/3*shift/2*add
| 1314 | };
| 1315 | 
|      | // Reached only when the AVX2/AVX1 tables above had no usable entry.
| 1316 | if (ST->hasSSE42())
| 1317 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTable, ISD, Ty: LT.second))
| 1318 | if (auto KindCost = Entry->Cost[CostKind])
| 1319 | return LT.first * *KindCost;
| 1320 | |
|      | // SSE4.1-level overrides: variable vector shifts (no native per-element
|      | // shift pre-AVX2, so blend/select expansions) plus pmulld for v4i32 MUL.
|      | // Costs are { recip-throughput, latency, code-size, size+latency }.
| 1321 | static const CostKindTblEntry SSE41CostTable[] = {
| 1322 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 24,.CodeSizeCost: 17,.SizeAndLatencyCost: 22 } }, // pblendvb sequence.
| 1323 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 14,.CodeSizeCost: 11,.SizeAndLatencyCost: 11 } }, // pblendvb sequence.
| 1324 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 4,.SizeAndLatencyCost: 10 } }, // pslld/paddd/cvttps2dq/pmulld
| 1325 | 
| 1326 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 27,.CodeSizeCost: 18,.SizeAndLatencyCost: 24 } }, // pblendvb sequence.
| 1327 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 26,.CodeSizeCost: 23,.SizeAndLatencyCost: 27 } }, // pblendvb sequence.
| 1328 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 17,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend.
| 1329 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence.
| 1330 | 
| 1331 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 41,.CodeSizeCost: 30,.SizeAndLatencyCost: 36 } }, // pblendvb sequence.
| 1332 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 26,.CodeSizeCost: 23,.SizeAndLatencyCost: 27 } }, // pblendvb sequence.
| 1333 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 17,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend.
| 1334 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 17, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence.
| 1335 | 
| 1336 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } } // pmulld (Nehalem from agner.org)
| 1337 | };
| 1338 | 
|      | // Reached only when the more capable tables above had no usable entry.
| 1339 | if (ST->hasSSE41())
| 1340 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTable, ISD, Ty: LT.second))
| 1341 | if (auto KindCost = Entry->Cost[CostKind])
| 1342 | return LT.first * *KindCost;
| 1343 | |
|      | // SSSE3-level override: a single entry for v16i8 MUL, which lowers to a
|      | // pmaddubsw-based sequence cheaper than the SSE2 unpack/pmullw expansion.
| 1344 | static const CostKindTblEntry SSSE3CostTable[] = {
| 1345 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18,.CodeSizeCost: 10,.SizeAndLatencyCost: 12 } }, // 2*pmaddubsw/3*and/psllw/or
| 1346 | };
| 1347 | 
|      | // Reached only when the more capable tables above had no usable entry.
| 1348 | if (ST->hasSSSE3())
| 1349 | if (const auto *Entry = CostTableLookup(Table: SSSE3CostTable, ISD, Ty: LT.second))
| 1350 | if (auto KindCost = Entry->Cost[CostKind])
| 1351 | return LT.first * *KindCost;
| 1352 | |
| 1353 | static const CostKindTblEntry SSE2CostTable[] = { |
| 1354 | // We don't correctly identify costs of casts because they are marked as |
| 1355 | // custom. |
| 1356 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 21,.CodeSizeCost: 26,.SizeAndLatencyCost: 28 } }, // cmpgtb sequence. |
| 1357 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 27,.CodeSizeCost: 16,.SizeAndLatencyCost: 20 } }, // cmpgtw sequence. |
| 1358 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 19,.CodeSizeCost: 10,.SizeAndLatencyCost: 12 } }, // pslld/paddd/cvttps2dq/pmuludq. |
| 1359 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence. |
| 1360 | |
| 1361 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 28,.CodeSizeCost: 27,.SizeAndLatencyCost: 30 } }, // cmpgtb sequence. |
| 1362 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19,.CodeSizeCost: 31,.SizeAndLatencyCost: 31 } }, // cmpgtw sequence. |
| 1363 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend. |
| 1364 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence. |
| 1365 | |
| 1366 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 30,.CodeSizeCost: 54,.SizeAndLatencyCost: 54 } }, // unpacked cmpgtb sequence. |
| 1367 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19,.CodeSizeCost: 31,.SizeAndLatencyCost: 31 } }, // cmpgtw sequence. |
| 1368 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend. |
| 1369 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // srl/xor/sub splat+shuffle sequence. |
| 1370 | |
| 1371 | { .ISD: ISD::AND, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
| 1372 | { .ISD: ISD::AND, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
| 1373 | { .ISD: ISD::AND, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
| 1374 | { .ISD: ISD::AND, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
| 1375 | |
| 1376 | { .ISD: ISD::OR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
| 1377 | { .ISD: ISD::OR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
| 1378 | { .ISD: ISD::OR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
| 1379 | { .ISD: ISD::OR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
| 1380 | |
| 1381 | { .ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
| 1382 | { .ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
| 1383 | { .ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
| 1384 | { .ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
| 1385 | |
| 1386 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddq |
| 1387 | { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubq |
| 1388 | |
| 1389 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18,.CodeSizeCost: 12,.SizeAndLatencyCost: 12 } }, // 2*unpack/2*pmullw/2*and/pack |
| 1390 | { .ISD: ISD::MUL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw |
| 1391 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // 3*pmuludq/4*shuffle |
| 1392 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 10 } }, // 3*pmuludq/3*shift/2*add |
| 1393 | |
| 1394 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 1395 | |
| 1396 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 23, .LatencyCost: 23, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1397 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 39, .LatencyCost: 39, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1398 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 38, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1399 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 69, .LatencyCost: 69, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1400 | |
| 1401 | { .ISD: ISD::FNEG, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1402 | { .ISD: ISD::FNEG, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1403 | { .ISD: ISD::FNEG, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1404 | { .ISD: ISD::FNEG, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1405 | |
| 1406 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1407 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1408 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1409 | |
| 1410 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1411 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1412 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1413 | |
| 1414 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1415 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
| 1416 | }; |
| 1417 | |
| 1418 | if (ST->hasSSE2()) |
| 1419 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTable, ISD, Ty: LT.second)) |
| 1420 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1421 | return LT.first * *KindCost; |
| 1422 | |
| 1423 | static const CostKindTblEntry SSE1CostTable[] = { |
| 1424 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1425 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 34, .LatencyCost: 48, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1426 | |
| 1427 | { .ISD: ISD::FNEG, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
| 1428 | { .ISD: ISD::FNEG, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
| 1429 | |
| 1430 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1431 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1432 | |
| 1433 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1434 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1435 | |
| 1436 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1437 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1438 | }; |
| 1439 | |
| 1440 | if (ST->hasSSE1()) |
| 1441 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTable, ISD, Ty: LT.second)) |
| 1442 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1443 | return LT.first * *KindCost; |
| 1444 | |
| 1445 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
| 1446 | { .ISD: ISD::ADD, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Core (Merom) from http://www.agner.org/ |
| 1447 | { .ISD: ISD::SUB, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Core (Merom) from http://www.agner.org/ |
| 1448 | { .ISD: ISD::MUL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 1449 | }; |
| 1450 | |
| 1451 | if (ST->is64Bit()) |
| 1452 | if (const auto *Entry = CostTableLookup(Table: X64CostTbl, ISD, Ty: LT.second)) |
| 1453 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1454 | return LT.first * *KindCost; |
| 1455 | |
| 1456 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
| 1457 | { .ISD: ISD::ADD, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1458 | { .ISD: ISD::ADD, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1459 | { .ISD: ISD::ADD, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1460 | |
| 1461 | { .ISD: ISD::SUB, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1462 | { .ISD: ISD::SUB, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1463 | { .ISD: ISD::SUB, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
| 1464 | |
| 1465 | { .ISD: ISD::MUL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 1466 | { .ISD: ISD::MUL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 1467 | { .ISD: ISD::MUL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 1468 | |
| 1469 | { .ISD: ISD::FNEG, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // (x87) |
| 1470 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
| 1471 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
| 1472 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
| 1473 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 38, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
| 1474 | }; |
| 1475 | |
| 1476 | if (const auto *Entry = CostTableLookup(Table: X86CostTbl, ISD, Ty: LT.second)) |
| 1477 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1478 | return LT.first * *KindCost; |
| 1479 | |
| 1480 | // It is not a good idea to vectorize division. We have to scalarize it and |
| 1481 | // in the process we will often end up having to spill regular
| 1482 | // registers. The overhead of division is going to dominate most kernels |
| 1483 | // anyways so try hard to prevent vectorization of division - it is |
| 1484 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able |
| 1485 | // to hide "20 cycles" for each lane. |
| 1486 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && |
| 1487 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || |
| 1488 | ISD == ISD::UREM)) { |
| 1489 | InstructionCost ScalarCost = |
| 1490 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind, |
| 1491 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
| 1492 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; |
| 1493 | } |
| 1494 | |
| 1495 | // Handle some basic single instruction code size cases. |
| 1496 | if (CostKind == TTI::TCK_CodeSize) { |
| 1497 | switch (ISD) { |
| 1498 | case ISD::FADD: |
| 1499 | case ISD::FSUB: |
| 1500 | case ISD::FMUL: |
| 1501 | case ISD::FDIV: |
| 1502 | case ISD::FNEG: |
| 1503 | case ISD::AND: |
| 1504 | case ISD::OR: |
| 1505 | case ISD::XOR: |
| 1506 | return LT.first; |
| 1507 | break; |
| 1508 | } |
| 1509 | } |
| 1510 | |
| 1511 | // Fallback to the default implementation. |
| 1512 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
| 1513 | Args, CxtI); |
| 1514 | } |
| 1515 | |
| 1516 | InstructionCost
| 1517 | X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0,
| 1518 |                             unsigned Opcode1, const SmallBitVector &OpcodeMask,
| 1519 |                             TTI::TargetCostKind CostKind) const {
| 1520 |   // Alternating-opcode patterns the target cannot lower directly get an
| 1521 |   // invalid cost; otherwise assume a single instruction handles the pair.
| 1522 |   if (!isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
| 1523 |     return InstructionCost::getInvalid();
| 1524 |   return TTI::TCC_Basic;
| 1525 | }
| 1524 | |
| 1525 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
| 1526 | VectorType *DstTy, VectorType *SrcTy, |
| 1527 | ArrayRef<int> Mask, |
| 1528 | TTI::TargetCostKind CostKind, |
| 1529 | int Index, VectorType *SubTp, |
| 1530 | ArrayRef<const Value *> Args, |
| 1531 | const Instruction *CxtI) const { |
| 1532 | assert((Mask.empty() || DstTy->isScalableTy() || |
| 1533 | Mask.size() == DstTy->getElementCount().getKnownMinValue()) && |
| 1534 | "Expected the Mask to match the return size if given" ); |
| 1535 | assert(SrcTy->getScalarType() == DstTy->getScalarType() && |
| 1536 | "Expected the same scalar types" ); |
| 1537 | |
| 1538 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. |
| 1539 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. |
| 1540 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy); |
| 1541 | |
| 1542 | Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp); |
| 1543 | |
| 1544 | // If all args are constant than this will be constant folded away. |
| 1545 | if (!Args.empty() && |
| 1546 | all_of(Range&: Args, P: [](const Value *Arg) { return isa<Constant>(Val: Arg); })) |
| 1547 | return TTI::TCC_Free; |
| 1548 | |
| 1549 | // Recognize a basic concat_vector shuffle. |
| 1550 | if (Kind == TTI::SK_PermuteTwoSrc && |
| 1551 | Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) && |
| 1552 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size())) |
| 1553 | return getShuffleCost(Kind: TTI::SK_InsertSubvector, |
| 1554 | DstTy: VectorType::getDoubleElementsVectorType(VTy: SrcTy), |
| 1555 | SrcTy: VectorType::getDoubleElementsVectorType(VTy: SrcTy), Mask, |
| 1556 | CostKind, Index: Mask.size() / 2, SubTp: SrcTy); |
| 1557 | |
| 1558 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. |
| 1559 | if (Kind == TTI::SK_Transpose) |
| 1560 | Kind = TTI::SK_PermuteTwoSrc; |
| 1561 | |
| 1562 | if (Kind == TTI::SK_Broadcast) { |
| 1563 | // For Broadcasts we are splatting the first element from the first input |
| 1564 | // register, so only need to reference that input and all the output |
| 1565 | // registers are the same. |
| 1566 | LT.first = 1; |
| 1567 | |
| 1568 | // If we're broadcasting a load then AVX/AVX2 can do this for free. |
| 1569 | using namespace PatternMatch; |
| 1570 | if (!Args.empty() && match(V: Args[0], P: m_OneUse(SubPattern: m_Load(Op: m_Value()))) && |
| 1571 | (ST->hasAVX2() || |
| 1572 | (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32))) |
| 1573 | return TTI::TCC_Free; |
| 1574 | } |
| 1575 | |
| 1576 | // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector |
| 1577 | // permutation. |
| 1578 | // Attempt to detect a shuffle mask with a single defined element. |
| 1579 | bool IsInLaneShuffle = false; |
| 1580 | bool IsSingleElementMask = false; |
| 1581 | if (SrcTy->getPrimitiveSizeInBits() > 0 && |
| 1582 | (SrcTy->getPrimitiveSizeInBits() % 128) == 0 && |
| 1583 | SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
| 1584 | Mask.size() == SrcTy->getElementCount().getKnownMinValue()) { |
| 1585 | unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128; |
| 1586 | unsigned NumEltsPerLane = Mask.size() / NumLanes; |
| 1587 | if ((Mask.size() % NumLanes) == 0) { |
| 1588 | IsInLaneShuffle = all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) { |
| 1589 | return P.value() == PoisonMaskElem || |
| 1590 | ((P.value() % Mask.size()) / NumEltsPerLane) == |
| 1591 | (P.index() / NumEltsPerLane); |
| 1592 | }); |
| 1593 | IsSingleElementMask = |
| 1594 | (Mask.size() - 1) == static_cast<unsigned>(count_if(Range&: Mask, P: [](int M) { |
| 1595 | return M == PoisonMaskElem; |
| 1596 | })); |
| 1597 | } |
| 1598 | } |
| 1599 | |
| 1600 | // Treat <X x bfloat> shuffles as <X x half>. |
| 1601 | if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16) |
| 1602 | LT.second = LT.second.changeVectorElementType(EltVT: MVT::f16); |
| 1603 | |
| 1604 | // Subvector extractions are free if they start at the beginning of a |
| 1605 | // vector and cheap if the subvectors are aligned. |
| 1606 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { |
| 1607 | int NumElts = LT.second.getVectorNumElements(); |
| 1608 | if ((Index % NumElts) == 0) |
| 1609 | return TTI::TCC_Free; |
| 1610 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
| 1611 | if (SubLT.second.isVector()) { |
| 1612 | int NumSubElts = SubLT.second.getVectorNumElements(); |
| 1613 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
| 1614 | return SubLT.first; |
| 1615 | // Handle some cases for widening legalization. For now we only handle |
| 1616 | // cases where the original subvector was naturally aligned and evenly |
| 1617 | // fit in its legalized subvector type. |
| 1618 | // FIXME: Remove some of the alignment restrictions. |
| 1619 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit |
| 1620 | // vectors. |
| 1621 | int OrigSubElts = cast<FixedVectorType>(Val: SubTp)->getNumElements(); |
| 1622 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && |
| 1623 | (NumSubElts % OrigSubElts) == 0 && |
| 1624 | LT.second.getVectorElementType() == |
| 1625 | SubLT.second.getVectorElementType() && |
| 1626 | LT.second.getVectorElementType().getSizeInBits() == |
| 1627 | SrcTy->getElementType()->getPrimitiveSizeInBits()) { |
| 1628 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts && |
| 1629 | "Unexpected number of elements!" ); |
| 1630 | auto *VecTy = FixedVectorType::get(ElementType: SrcTy->getElementType(), |
| 1631 | NumElts: LT.second.getVectorNumElements()); |
| 1632 | auto *SubTy = FixedVectorType::get(ElementType: SrcTy->getElementType(), |
| 1633 | NumElts: SubLT.second.getVectorNumElements()); |
| 1634 | int = alignDown(Value: (Index % NumElts), Align: NumSubElts); |
| 1635 | InstructionCost = |
| 1636 | getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind, |
| 1637 | Index: ExtractIndex, SubTp: SubTy); |
| 1638 | |
| 1639 | // If the original size is 32-bits or more, we can use pshufd. Otherwise |
| 1640 | // if we have SSSE3 we can use pshufb. |
| 1641 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) |
| 1642 | return ExtractCost + 1; // pshufd or pshufb |
| 1643 | |
| 1644 | assert(SubTp->getPrimitiveSizeInBits() == 16 && |
| 1645 | "Unexpected vector size" ); |
| 1646 | |
| 1647 | return ExtractCost + 2; // worst case pshufhw + pshufd |
| 1648 | } |
| 1649 | } |
| 1650 | // If the extract subvector is not optimal, treat it as single op shuffle. |
| 1651 | Kind = TTI::SK_PermuteSingleSrc; |
| 1652 | } |
| 1653 | |
| 1654 | // Subvector insertions are cheap if the subvectors are aligned. |
| 1655 | // Note that in general, the insertion starting at the beginning of a vector |
| 1656 | // isn't free, because we need to preserve the rest of the wide vector, |
| 1657 | // but if the destination vector legalizes to the same width as the subvector |
| 1658 | // then the insertion will simplify to a (free) register copy. |
| 1659 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { |
| 1660 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: DstTy); |
| 1661 | int NumElts = DstLT.second.getVectorNumElements(); |
| 1662 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
| 1663 | if (SubLT.second.isVector()) { |
| 1664 | int NumSubElts = SubLT.second.getVectorNumElements(); |
| 1665 | bool MatchingTypes = |
| 1666 | NumElts == NumSubElts && |
| 1667 | (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0; |
| 1668 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
| 1669 | return MatchingTypes ? TTI::TCC_Free : SubLT.first; |
| 1670 | } |
| 1671 | |
| 1672 | // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have |
| 1673 | // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of |
| 1674 | // v1f32 (legalised to f32) into a v4f32. |
| 1675 | if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 && |
| 1676 | SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41())) |
| 1677 | return 1; |
| 1678 | |
| 1679 | // If the insertion is the lowest subvector then it will be blended |
| 1680 | // otherwise treat it like a 2-op shuffle. |
| 1681 | Kind = |
| 1682 | (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc; |
| 1683 | } |
| 1684 | |
| 1685 | // Handle some common (illegal) sub-vector types as they are often very cheap |
| 1686 | // to shuffle even on targets without PSHUFB. |
| 1687 | EVT VT = TLI->getValueType(DL, Ty: SrcTy); |
| 1688 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && |
| 1689 | !ST->hasSSSE3()) { |
| 1690 | static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = { |
| 1691 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
| 1692 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
| 1693 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
| 1694 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
| 1695 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // punpck |
| 1696 | |
| 1697 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
| 1698 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
| 1699 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // punpck/pshuflw/packus |
| 1700 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // punpck |
| 1701 | |
| 1702 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
| 1703 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
| 1704 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
| 1705 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
| 1706 | |
| 1707 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
| 1708 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
| 1709 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i8, .Cost: {.RecipThroughputCost: 7,.LatencyCost: 7,.CodeSizeCost: 7,.SizeAndLatencyCost: 7}}, // punpck/pshuflw |
| 1710 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // punpck/pshuflw |
| 1711 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck |
| 1712 | |
| 1713 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
| 1714 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
| 1715 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // punpck/pshuflw |
| 1716 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // punpck/pshuflw |
| 1717 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // punpck |
| 1718 | }; |
| 1719 | |
| 1720 | if (ST->hasSSE2()) |
| 1721 | if (const auto *Entry = |
| 1722 | CostTableLookup(Table: SSE2SubVectorShuffleTbl, ISD: Kind, Ty: VT.getSimpleVT())) |
| 1723 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1724 | return LT.first * *KindCost; |
| 1725 | } |
| 1726 | |
| 1727 | // We are going to permute multiple sources and the result will be in multiple |
| 1728 | // destinations. Providing an accurate cost only for splits where the element |
| 1729 | // type remains the same. |
| 1730 | if (LT.first != 1) { |
| 1731 | MVT LegalVT = LT.second; |
| 1732 | if (LegalVT.isVector() && |
| 1733 | LegalVT.getVectorElementType().getSizeInBits() == |
| 1734 | SrcTy->getElementType()->getPrimitiveSizeInBits() && |
| 1735 | LegalVT.getVectorNumElements() < |
| 1736 | cast<FixedVectorType>(Val: SrcTy)->getNumElements()) { |
| 1737 | unsigned VecTySize = DL.getTypeStoreSize(Ty: SrcTy); |
| 1738 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
| 1739 | // Number of source vectors after legalization: |
| 1740 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
| 1741 | // Number of destination vectors after legalization: |
| 1742 | InstructionCost NumOfDests = LT.first; |
| 1743 | |
| 1744 | auto *SingleOpTy = FixedVectorType::get(ElementType: SrcTy->getElementType(), |
| 1745 | NumElts: LegalVT.getVectorNumElements()); |
| 1746 | |
| 1747 | if (!Mask.empty() && NumOfDests.isValid()) { |
| 1748 | // Try to perform better estimation of the permutation. |
| 1749 | // 1. Split the source/destination vectors into real registers. |
| 1750 | // 2. Do the mask analysis to identify which real registers are |
| 1751 | // permuted. If more than 1 source registers are used for the |
| 1752 | // destination register building, the cost for this destination register |
| 1753 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one |
| 1754 | // source register is used, build mask and calculate the cost as a cost |
| 1755 | // of PermuteSingleSrc. |
| 1756 | // Also, for the single register permute we try to identify if the |
| 1757 | // destination register is just a copy of the source register or the |
| 1758 | // copy of the previous destination register (the cost is |
| 1759 | // TTI::TCC_Basic). If the source register is just reused, the cost for |
| 1760 | // this operation is TTI::TCC_Free. |
| 1761 | NumOfDests = |
| 1762 | getTypeLegalizationCost( |
| 1763 | Ty: FixedVectorType::get(ElementType: SrcTy->getElementType(), NumElts: Mask.size())) |
| 1764 | .first; |
| 1765 | unsigned E = NumOfDests.getValue(); |
| 1766 | unsigned NormalizedVF = |
| 1767 | LegalVT.getVectorNumElements() * std::max(a: NumOfSrcs, b: E); |
| 1768 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
| 1769 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
| 1770 | SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); |
| 1771 | copy(Range&: Mask, Out: NormalizedMask.begin()); |
| 1772 | unsigned PrevSrcReg = 0; |
| 1773 | ArrayRef<int> PrevRegMask; |
| 1774 | InstructionCost Cost = 0; |
| 1775 | processShuffleMasks( |
| 1776 | Mask: NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfUsedRegs: NumOfDestRegs, NoInputAction: []() {}, |
| 1777 | SingleInputAction: [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, |
| 1778 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { |
| 1779 | if (!ShuffleVectorInst::isIdentityMask(Mask: RegMask, NumSrcElts: RegMask.size())) { |
| 1780 | // Check if the previous register can be just copied to the next |
| 1781 | // one. |
| 1782 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || |
| 1783 | PrevRegMask != RegMask) |
| 1784 | Cost += |
| 1785 | getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: SingleOpTy, |
| 1786 | SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr); |
| 1787 | else |
| 1788 | // Just a copy of previous destination register. |
| 1789 | Cost += TTI::TCC_Basic; |
| 1790 | return; |
| 1791 | } |
| 1792 | if (SrcReg != DestReg && |
| 1793 | any_of(Range&: RegMask, P: [](int I) { return I != PoisonMaskElem; })) { |
| 1794 | // Just a copy of the source register. |
| 1795 | Cost += TTI::TCC_Free; |
| 1796 | } |
| 1797 | PrevSrcReg = SrcReg; |
| 1798 | PrevRegMask = RegMask; |
| 1799 | }, |
| 1800 | ManyInputsAction: [this, SingleOpTy, CostKind, |
| 1801 | &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/, |
| 1802 | unsigned /*Unused*/, bool /*Unused*/) { |
| 1803 | Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleOpTy, |
| 1804 | SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr); |
| 1805 | }); |
| 1806 | return Cost; |
| 1807 | } |
| 1808 | |
| 1809 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; |
| 1810 | return NumOfShuffles * getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleOpTy, |
| 1811 | SrcTy: SingleOpTy, Mask: {}, CostKind, Index: 0, |
| 1812 | SubTp: nullptr); |
| 1813 | } |
| 1814 | |
| 1815 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, |
| 1816 | SubTp); |
| 1817 | } |
| 1818 | |
| 1819 | // If we're just moving a single element around (probably as an alternative to |
| 1820 | // extracting it), we can assume this is cheap. |
| 1821 | if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask) |
| 1822 | return TTI::TCC_Basic; |
| 1823 | |
| 1824 | static const CostKindTblEntry AVX512VBMIShuffleTbl[] = { |
| 1825 | { .ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
| 1826 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
| 1827 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
| 1828 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
| 1829 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2b |
| 1830 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2b |
| 1831 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } } // vpermt2b |
| 1832 | }; |
| 1833 | |
| 1834 | if (ST->hasVBMI()) |
| 1835 | if (const auto *Entry = |
| 1836 | CostTableLookup(Table: AVX512VBMIShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 1837 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1838 | return LT.first * *KindCost; |
| 1839 | |
| 1840 | static const CostKindTblEntry AVX512BWShuffleTbl[] = { |
| 1841 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
| 1842 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
| 1843 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
| 1844 | |
| 1845 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1846 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1847 | { .ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1848 | { .ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // pshufb + vshufi64x2 |
| 1849 | |
| 1850 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1851 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1852 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1853 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
| 1854 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // extend to v32i16 |
| 1855 | |
| 1856 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
| 1857 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32f16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
| 1858 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
| 1859 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
| 1860 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 19, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, // 6 * v32i8 + 1 |
| 1861 | |
| 1862 | { .ISD: TTI::SK_Select, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmw |
| 1863 | { .ISD: TTI::SK_Select, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmb |
| 1864 | |
| 1865 | { .ISD: TTI::SK_Splice, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vshufi64x2 + palignr |
| 1866 | { .ISD: TTI::SK_Splice, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vshufi64x2 + palignr |
| 1867 | { .ISD: TTI::SK_Splice, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vshufi64x2 + palignr |
| 1868 | }; |
| 1869 | |
| 1870 | if (ST->hasBWI()) |
| 1871 | if (const auto *Entry = |
| 1872 | CostTableLookup(Table: AVX512BWShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 1873 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1874 | return LT.first * *KindCost; |
| 1875 | |
| 1876 | static const CostKindTblEntry AVX512ShuffleTbl[] = { |
| 1877 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastsd |
| 1878 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastss |
| 1879 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastq |
| 1880 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastd |
| 1881 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
| 1882 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
| 1883 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
| 1884 | |
| 1885 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
| 1886 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
| 1887 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
| 1888 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
| 1889 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
| 1890 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
| 1891 | {.ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
| 1892 | |
| 1893 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1894 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1895 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1896 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1897 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1898 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1899 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1900 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
| 1901 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
| 1902 | {.ISD: TTI::SK_Splice, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
| 1903 | {.ISD: TTI::SK_Splice, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
| 1904 | |
| 1905 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
| 1906 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
| 1907 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
| 1908 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
| 1909 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
| 1910 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
| 1911 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
| 1912 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
| 1913 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
| 1914 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
| 1915 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
| 1916 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
| 1917 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pshufb |
| 1918 | |
| 1919 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
| 1920 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
| 1921 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
| 1922 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
| 1923 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
| 1924 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
| 1925 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
| 1926 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
| 1927 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
| 1928 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
| 1929 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
| 1930 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
| 1931 | |
| 1932 | // FIXME: This just applies the type legalization cost rules above |
| 1933 | // assuming these completely split. |
| 1934 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 1935 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 1936 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 1937 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
| 1938 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
| 1939 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
| 1940 | |
| 1941 | {.ISD: TTI::SK_Select, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
| 1942 | {.ISD: TTI::SK_Select, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
| 1943 | {.ISD: TTI::SK_Select, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
| 1944 | {.ISD: TTI::SK_Select, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmpd |
| 1945 | {.ISD: TTI::SK_Select, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmps |
| 1946 | {.ISD: TTI::SK_Select, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmq |
| 1947 | {.ISD: TTI::SK_Select, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmd |
| 1948 | }; |
| 1949 | |
| 1950 | if (ST->hasAVX512()) |
| 1951 | if (const auto *Entry = CostTableLookup(Table: AVX512ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 1952 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1953 | return LT.first * *KindCost; |
| 1954 | |
| 1955 | static const CostKindTblEntry AVX2InLaneShuffleTbl[] = { |
| 1956 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
| 1957 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
| 1958 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
| 1959 | |
| 1960 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufpd + vblendpd |
| 1961 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufps + vblendps |
| 1962 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufd + vpblendd |
| 1963 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufd + vpblendd |
| 1964 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufb + vpor |
| 1965 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufb + vpor |
| 1966 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufb + vpor |
| 1967 | }; |
| 1968 | |
| 1969 | if (IsInLaneShuffle && ST->hasAVX2()) |
| 1970 | if (const auto *Entry = |
| 1971 | CostTableLookup(Table: AVX2InLaneShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 1972 | if (auto KindCost = Entry->Cost[CostKind]) |
| 1973 | return LT.first * *KindCost; |
| 1974 | |
| 1975 | static const CostKindTblEntry AVX2ShuffleTbl[] = { |
| 1976 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastpd |
| 1977 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastps |
| 1978 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastq |
| 1979 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastd |
| 1980 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
| 1981 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
| 1982 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
| 1983 | |
| 1984 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
| 1985 | { .ISD: TTI::SK_Reverse, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
| 1986 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
| 1987 | { .ISD: TTI::SK_Reverse, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
| 1988 | { .ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + pshufb |
| 1989 | { .ISD: TTI::SK_Reverse, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + pshufb |
| 1990 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + pshufb |
| 1991 | |
| 1992 | { .ISD: TTI::SK_Select, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpblendvb |
| 1993 | { .ISD: TTI::SK_Select, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpblendvb |
| 1994 | { .ISD: TTI::SK_Select, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpblendvb |
| 1995 | |
| 1996 | { .ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
| 1997 | { .ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
| 1998 | { .ISD: TTI::SK_Splice, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
| 1999 | { .ISD: TTI::SK_Splice, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
| 2000 | { .ISD: TTI::SK_Splice, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
| 2001 | |
| 2002 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
| 2003 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
| 2004 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
| 2005 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
| 2006 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 2007 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 2008 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 2009 | |
| 2010 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermpd + vblendpd |
| 2011 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermps + vblendps |
| 2012 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermq + vpblendd |
| 2013 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermd + vpblendd |
| 2014 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 2015 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 2016 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 2017 | }; |
| 2018 | |
| 2019 | if (ST->hasAVX2()) |
| 2020 | if (const auto *Entry = CostTableLookup(Table: AVX2ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2021 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2022 | return LT.first * *KindCost; |
| 2023 | |
| 2024 | static const CostKindTblEntry XOPShuffleTbl[] = { |
| 2025 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2pd |
| 2026 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2ps |
| 2027 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2pd |
| 2028 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2ps |
| 2029 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*vpperm |
| 2030 | // + vinsertf128 |
| 2031 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*vpperm |
| 2032 | // + vinsertf128 |
| 2033 | |
| 2034 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 6*vpperm |
| 2035 | // + vinsertf128 |
| 2036 | |
| 2037 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpperm |
| 2038 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 6*vpperm |
| 2039 | // + vinsertf128 |
| 2040 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpperm |
| 2041 | }; |
| 2042 | |
| 2043 | if (ST->hasXOP()) |
| 2044 | if (const auto *Entry = CostTableLookup(Table: XOPShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2045 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2046 | return LT.first * *KindCost; |
| 2047 | |
| 2048 | static const CostKindTblEntry AVX1InLaneShuffleTbl[] = { |
| 2049 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilpd |
| 2050 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilpd |
| 2051 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilps |
| 2052 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilps |
| 2053 | |
| 2054 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*pshufb |
| 2055 | // + vpor + vinsertf128 |
| 2056 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*pshufb |
| 2057 | // + vpor + vinsertf128 |
| 2058 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*pshufb |
| 2059 | // + vpor + vinsertf128 |
| 2060 | |
| 2061 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufpd + vblendpd |
| 2062 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufps + vblendps |
| 2063 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpermilpd + vblendpd |
| 2064 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpermilps + vblendps |
| 2065 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 4*pshufb |
| 2066 | // + 2*vpor + vinsertf128 |
| 2067 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 4*pshufb |
| 2068 | // + 2*vpor + vinsertf128 |
| 2069 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 4*pshufb |
| 2070 | // + 2*vpor + vinsertf128 |
| 2071 | }; |
| 2072 | |
| 2073 | if (IsInLaneShuffle && ST->hasAVX()) |
| 2074 | if (const auto *Entry = |
| 2075 | CostTableLookup(Table: AVX1InLaneShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2076 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2077 | return LT.first * *KindCost; |
| 2078 | |
| 2079 | static const CostKindTblEntry AVX1ShuffleTbl[] = { |
| 2080 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
| 2081 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
| 2082 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
| 2083 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
| 2084 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpshuflw + vpshufd + vinsertf128 |
| 2085 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpshuflw + vpshufd + vinsertf128 |
| 2086 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vpshufb + vinsertf128 |
| 2087 | |
| 2088 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
| 2089 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
| 2090 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
| 2091 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
| 2092 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // vextractf128 + 2*pshufb |
| 2093 | // + vinsertf128 |
| 2094 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // vextractf128 + 2*pshufb |
| 2095 | // + vinsertf128 |
| 2096 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // vextractf128 + 2*pshufb |
| 2097 | // + vinsertf128 |
| 2098 | |
| 2099 | {.ISD: TTI::SK_Select, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendpd |
| 2100 | {.ISD: TTI::SK_Select, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendpd |
| 2101 | {.ISD: TTI::SK_Select, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendps |
| 2102 | {.ISD: TTI::SK_Select, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendps |
| 2103 | {.ISD: TTI::SK_Select, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpand + vpandn + vpor |
| 2104 | {.ISD: TTI::SK_Select, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpand + vpandn + vpor |
| 2105 | {.ISD: TTI::SK_Select, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpand + vpandn + vpor |
| 2106 | |
| 2107 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + shufpd |
| 2108 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + shufpd |
| 2109 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
| 2110 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
| 2111 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
| 2112 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
| 2113 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
| 2114 | |
| 2115 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vshufpd |
| 2116 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vshufpd |
| 2117 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
| 2118 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
| 2119 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16,.Cost: {.RecipThroughputCost: 8,.LatencyCost: 8,.CodeSizeCost: 8,.SizeAndLatencyCost: 8}}, // vextractf128 + 4*pshufb |
| 2120 | // + 2*por + vinsertf128 |
| 2121 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16,.Cost: {.RecipThroughputCost: 8,.LatencyCost: 8,.CodeSizeCost: 8,.SizeAndLatencyCost: 8}}, // vextractf128 + 4*pshufb |
| 2122 | // + 2*por + vinsertf128 |
| 2123 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 8,.LatencyCost: 8,.CodeSizeCost: 8,.SizeAndLatencyCost: 8}}, // vextractf128 + 4*pshufb |
| 2124 | // + 2*por + vinsertf128 |
| 2125 | |
| 2126 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // 2*vperm2f128 + vshufpd |
| 2127 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // 2*vperm2f128 + vshufpd |
| 2128 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
| 2129 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
| 2130 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16,.Cost: {.RecipThroughputCost: 15,.LatencyCost: 15,.CodeSizeCost: 15,.SizeAndLatencyCost: 15}}, // 2*vextractf128 + 8*pshufb |
| 2131 | // + 4*por + vinsertf128 |
| 2132 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16,.Cost: {.RecipThroughputCost: 15,.LatencyCost: 15,.CodeSizeCost: 15,.SizeAndLatencyCost: 15}}, // 2*vextractf128 + 8*pshufb |
| 2133 | // + 4*por + vinsertf128 |
| 2134 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 15,.LatencyCost: 15,.CodeSizeCost: 15,.SizeAndLatencyCost: 15}}, // 2*vextractf128 + 8*pshufb |
| 2135 | // + 4*por + vinsertf128 |
| 2136 | }; |
| 2137 | |
| 2138 | if (ST->hasAVX()) |
| 2139 | if (const auto *Entry = CostTableLookup(Table: AVX1ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2140 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2141 | return LT.first * *KindCost; |
| 2142 | |
| 2143 | static const CostKindTblEntry SSE41ShuffleTbl[] = { |
| 2144 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
| 2145 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // movsd |
| 2146 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
| 2147 | {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // blendps |
| 2148 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
| 2149 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
| 2150 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}} // pblendvb |
| 2151 | }; |
| 2152 | |
| 2153 | if (ST->hasSSE41()) |
| 2154 | if (const auto *Entry = CostTableLookup(Table: SSE41ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2155 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2156 | return LT.first * *KindCost; |
| 2157 | |
| 2158 | static const CostKindTblEntry SSSE3ShuffleTbl[] = { |
| 2159 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2160 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2161 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2162 | |
| 2163 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2164 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2165 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2166 | |
| 2167 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
| 2168 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
| 2169 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
| 2170 | |
| 2171 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
| 2172 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
| 2173 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
| 2174 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
| 2175 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
| 2176 | |
| 2177 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2178 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2179 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
| 2180 | |
| 2181 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
| 2182 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
| 2183 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
| 2184 | }; |
| 2185 | |
| 2186 | if (ST->hasSSSE3()) |
| 2187 | if (const auto *Entry = CostTableLookup(Table: SSSE3ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2188 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2189 | return LT.first * *KindCost; |
| 2190 | |
| 2191 | static const CostKindTblEntry SSE2ShuffleTbl[] = { |
| 2192 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2193 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
| 2194 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
| 2195 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // pshuflw + pshufd |
| 2196 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // pshuflw + pshufd |
| 2197 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // unpck + pshuflw + pshufd |
| 2198 | |
| 2199 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2200 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
| 2201 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
| 2202 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pshuflw + pshufhw + pshufd |
| 2203 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pshuflw + pshufhw + pshufd |
| 2204 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9}}, // 2*pshuflw + 2*pshufhw |
| 2205 | // + 2*pshufd + 2*unpck + packus |
| 2206 | |
| 2207 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // movsd |
| 2208 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // movsd |
| 2209 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // 2*shufps |
| 2210 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pand + pandn + por |
| 2211 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pand + pandn + por |
| 2212 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pand + pandn + por |
| 2213 | |
| 2214 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2215 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2216 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // 2*{unpck,movsd,pshufd} |
| 2217 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // psrldq + pslldq + por
| 2218 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // psrldq + pslldq + por
| 2219 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // psrldq + pslldq + por
| 2220 | |
| 2221 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2222 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
| 2223 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
| 2224 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5}}, // 2*pshuflw + 2*pshufhw |
| 2225 | // + pshufd/unpck |
| 2226 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5}}, // 2*pshuflw + 2*pshufhw |
| 2227 | // + pshufd/unpck |
| 2228 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 8, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 10}}, // 2*pshuflw + 2*pshufhw |
| 2229 | // + 2*pshufd + 2*unpck + 2*packus |
| 2230 | |
| 2231 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2232 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
| 2233 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // 2*{unpck,movsd,pshufd} |
| 2234 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 8, .SizeAndLatencyCost: 8}}, // blend+permute |
| 2235 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 8, .SizeAndLatencyCost: 8}}, // blend+permute |
| 2236 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 13, .CodeSizeCost: 13, .SizeAndLatencyCost: 13}}, // blend+permute |
| 2237 | }; |
| 2238 | |
| 2239 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { |
| 2240 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 0}, // broadcast handled by movddup |
| 2241 | }; |
| 2242 | |
| 2243 | if (ST->hasSSE2()) { |
| 2244 | bool IsLoad = |
| 2245 | llvm::any_of(Range&: Args, P: [](const auto &V) { return isa<LoadInst>(V); }); |
| 2246 | if (ST->hasSSE3() && IsLoad) |
| 2247 | if (const auto *Entry = |
| 2248 | CostTableLookup(Table: SSE3BroadcastLoadTbl, ISD: Kind, Ty: LT.second)) { |
| 2249 | assert(isLegalBroadcastLoad(SrcTy->getElementType(), |
| 2250 | LT.second.getVectorElementCount()) && |
| 2251 | "Table entry missing from isLegalBroadcastLoad()" ); |
| 2252 | return LT.first * Entry->Cost; |
| 2253 | } |
| 2254 | |
| 2255 | if (const auto *Entry = CostTableLookup(Table: SSE2ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2256 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2257 | return LT.first * *KindCost; |
| 2258 | } |
| 2259 | |
| 2260 | static const CostKindTblEntry SSE1ShuffleTbl[] = { |
| 2261 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1} }, // shufps |
| 2262 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1} }, // shufps |
| 2263 | { .ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2} }, // 2*shufps |
| 2264 | { .ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2} }, // 2*shufps |
| 2265 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1} }, // shufps |
| 2266 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2} }, // 2*shufps |
| 2267 | }; |
| 2268 | |
| 2269 | if (ST->hasSSE1()) { |
| 2270 | if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) { |
| 2271 | // SHUFPS: both pairs must come from the same source register. |
| 2272 | auto MatchSHUFPS = [](int X, int Y) { |
| 2273 | return X < 0 || Y < 0 || ((X & 4) == (Y & 4)); |
| 2274 | }; |
| 2275 | if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3])) |
| 2276 | return 1; |
| 2277 | } |
| 2278 | if (const auto *Entry = CostTableLookup(Table: SSE1ShuffleTbl, ISD: Kind, Ty: LT.second)) |
| 2279 | if (auto KindCost = Entry->Cost[CostKind]) |
| 2280 | return LT.first * *KindCost; |
| 2281 | } |
| 2282 | |
| 2283 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, |
| 2284 | SubTp); |
| 2285 | } |
| 2286 | |
| 2287 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
| 2288 | Type *Src, |
| 2289 | TTI::CastContextHint CCH, |
| 2290 | TTI::TargetCostKind CostKind, |
| 2291 | const Instruction *I) const { |
| 2292 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 2293 | assert(ISD && "Invalid opcode" ); |
| 2294 | |
| 2295 | // The cost tables include both specific, custom (non-legal) src/dst type |
| 2296 | // conversions and generic, legalized types. We test for customs first, before |
| 2297 | // falling back to legalization. |
| 2298 | // FIXME: Need a better design of the cost table to handle non-simple types of |
| 2299 | // potential massive combinations (elem_num x src_type x dst_type). |
| 2300 | static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{ |
| 2301 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2302 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2303 | |
| 2304 | // Mask sign extend has an instruction. |
| 2305 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2306 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2307 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2308 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2309 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2310 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2311 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2312 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2313 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2314 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2315 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2316 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2317 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2318 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2319 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2320 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v64i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2321 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2322 | |
| 2323 | // Mask zero extend is a sext + shift. |
| 2324 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2325 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2326 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2327 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2328 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2329 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2330 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2331 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2332 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2333 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2334 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2335 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2336 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2337 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2338 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2339 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v64i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2340 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2341 | |
| 2342 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2343 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2344 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2345 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2346 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2347 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2348 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2349 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2350 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2351 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2352 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2353 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2354 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2355 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2356 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2357 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2358 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2359 | |
| 2360 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2361 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // widen to zmm |
| 2362 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
| 2363 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
| 2364 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
| 2365 | }; |
| 2366 | |
| 2367 | static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = { |
| 2368 | // Mask sign extend has an instruction. |
| 2369 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2370 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2371 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2372 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2373 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2374 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2375 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2376 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2377 | |
| 2378 | // Mask zero extend is a sext + shift. |
| 2379 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2380 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2381 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2382 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2383 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2384 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2385 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2386 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
| 2387 | |
| 2388 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2389 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2390 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2391 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2392 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2393 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2394 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2395 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2396 | |
| 2397 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2398 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2399 | |
| 2400 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2401 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2402 | |
| 2403 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2404 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i64, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2405 | |
| 2406 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2407 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i64, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2408 | }; |
| 2409 | |
| 2410 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and |
| 2411 | // 256-bit wide vectors. |
| 2412 | |
| 2413 | static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = { |
| 2414 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2415 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2416 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v16f64, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // 2*vcvtps2pd+vextractf64x4 |
| 2417 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v16f32, .Src: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps |
| 2418 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps+vcvtps2pd |
| 2419 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2420 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v16f16, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtps2ph |
| 2421 | |
| 2422 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
| 2423 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
| 2424 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
| 2425 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
| 2426 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
| 2427 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
| 2428 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
| 2429 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
| 2430 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd |
| 2431 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd |
| 2432 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd |
| 2433 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd |
| 2434 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpsllq+vptestmq |
| 2435 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpsllq+vptestmq |
| 2436 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq |
| 2437 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
| 2438 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
| 2439 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
| 2440 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
| 2441 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
| 2442 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdw |
| 2443 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdw |
| 2444 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
| 2445 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
| 2446 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
| 2447 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
| 2448 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
| 2449 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
| 2450 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
| 2451 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
| 2452 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
| 2453 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqd |
| 2454 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpmovqd |
| 2455 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },// 2*vpmovqd+concat+vpmovdb |
| 2456 | |
| 2457 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // extend to v16i32 |
| 2458 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2459 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2460 | |
| 2461 | // Sign extend is zmm vpternlogd+vptruncdb. |
| 2462 | // Zero extend is zmm broadcast load+vptruncdw. |
| 2463 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2464 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2465 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2466 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2467 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2468 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2469 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2470 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2471 | |
| 2472 | // Sign extend is zmm vpternlogd+vptruncdw. |
| 2473 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. |
| 2474 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2475 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2476 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2477 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2478 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2479 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2480 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2481 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2482 | |
| 2483 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd |
| 2484 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld |
| 2485 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd |
| 2486 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld |
| 2487 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd |
| 2488 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld |
| 2489 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq |
| 2490 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq+psrlq |
| 2491 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq |
| 2492 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq+psrlq |
| 2493 | |
| 2494 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd |
| 2495 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld |
| 2496 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
| 2497 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq |
| 2498 | |
| 2499 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2500 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2501 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2502 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2503 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2504 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2505 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2506 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2507 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2508 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2509 | |
| 2510 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // FIXME: May not be right |
| 2511 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // FIXME: May not be right |
| 2512 | |
| 2513 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2514 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2515 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2516 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2517 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2518 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2519 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2520 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2521 | |
| 2522 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2523 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2524 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2525 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2526 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2527 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2528 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2529 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2530 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: {.RecipThroughputCost: 26, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2531 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2532 | |
| 2533 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2534 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2535 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v32f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2536 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v64i8, .Src: MVT::v64f32, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2537 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v64i8, .Src: MVT::v64f64, .Cost: {.RecipThroughputCost: 31, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2538 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2539 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2540 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i16, .Src: MVT::v32f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2541 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i16, .Src: MVT::v32f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2542 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2543 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2544 | |
| 2545 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2546 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2547 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2548 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2549 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2550 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2551 | }; |
| 2552 | |
| 2553 | static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] { |
| 2554 | // Mask sign extend has an instruction. |
| 2555 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2556 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2557 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2558 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2559 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2560 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2561 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2562 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2563 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2564 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2565 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2566 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2567 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2568 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2569 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2570 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2571 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2572 | |
| 2573 | // Mask zero extend is a sext + shift. |
| 2574 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2575 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2576 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2577 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2578 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2579 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2580 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2581 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2582 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2583 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2584 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2585 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2586 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2587 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2588 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2589 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2590 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2591 | |
| 2592 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2593 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2594 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2595 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2596 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2597 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2598 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2599 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2600 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2601 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2602 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2603 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2604 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2605 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2606 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2607 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2608 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2609 | |
| 2610 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2611 | }; |
| 2612 | |
// Conversion costs that apply when both AVX512DQ and AVX512VL are available
// (DQ instructions usable on 128/256-bit vectors). Each entry maps an ISD
// conversion opcode plus a (Dst, Src) MVT pair onto a per-cost-kind tuple:
// { recip-throughput, latency, code-size, size-and-latency }.
// NOTE(review): table lookups of this kind typically return the first
// matching entry, so entry order can matter — confirm against the lookup
// helper before reordering.
static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
// Mask sign extend has an instruction.
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Mask zero extend is a sext + shift.
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Truncation of a vector to a vXi1 mask.
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Single-instruction i64 <-> fp conversions — presumably the direct
// AVX512DQ conversions (vcvtqq2ps/vcvtqq2pd and friends); confirm against
// the ISA reference before relying on this.
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i64, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i64, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
| 2663 | |
// Conversion costs that apply when AVX512VL is available (AVX-512
// instructions on 128/256-bit vectors), without assuming DQ/BW.
// Each entry maps an ISD conversion opcode plus a (Dst, Src) MVT pair onto a
// per-cost-kind tuple:
// { recip-throughput, latency, code-size, size-and-latency }.
// NOTE(review): where Src has more elements than Dst (e.g. v2f64 <- v16i8),
// the Src type presumably denotes the full legalized 128-bit vector the
// sub-vector lives in — confirm against the cost-table lookup conventions.
static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // split+2*v8i8
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // split+2*v8i16
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb (NOTE(review): i32->i8 would be vpmovdb; comment looks stale — confirm)

// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
// v16i1 sources roughly double: split + 2x the v8i1 sequence.
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// sign extend is vpcmpeq+maskedmove+vpmovdw
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld

{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq

// Integer widening — single-instruction extensions (presumably vpmovsx*/
// vpmovzx* forms; confirm against the ISA reference).
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
// i64 -> fp has no single VL-only instruction (DQ adds it) — hence the
// higher cost; presumably an expanded multi-instruction sequence. Confirm.
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v32f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
| 2764 | |
| 2765 | static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = { |
| 2766 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2767 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2768 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2769 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2770 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2771 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2772 | |
| 2773 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2774 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2775 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2776 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2777 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2778 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2779 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2780 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2781 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2782 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2783 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2784 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2785 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2786 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2787 | |
| 2788 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2789 | |
| 2790 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2791 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2792 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2793 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2794 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2795 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2796 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2797 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2798 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2799 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2800 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2801 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2802 | |
| 2803 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2804 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2805 | |
| 2806 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2807 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2808 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2809 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2810 | |
| 2811 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2812 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2813 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2814 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2815 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2816 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2817 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2818 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2819 | |
| 2820 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2821 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2822 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2823 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2824 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2825 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2826 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2827 | |
| 2828 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2829 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2830 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2831 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2832 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2833 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2834 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2835 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2836 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2837 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 2838 | }; |
| 2839 | |
// Conversion (cast) cost entries used when the subtarget has AVX.  Per the
// file header, AVX-level numbers are modelled on the first CPUs with that
// feature (Sandy Bridge / Jaguar / Bulldozer class).  Each row maps an
// (ISD opcode, Dst type, Src type) triple to a cost per TTI cost kind:
// {RecipThroughput, Latency, CodeSize, SizeAndLatency}.
// NOTE(review): the Latency/CodeSize/SizeAndLatency columns are uniformly 1
// in this table — presumably placeholders rather than tuned values; confirm
// before relying on the non-throughput cost kinds for these entries.
static const TypeConversionCostKindTblEntry AVXConversionTbl[] = {
  // sext/zext from vXi1 (i1-element) vectors.
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // sext/zext between integer vector element widths.
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Truncation down to vXi1.
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Integer vector truncations between element widths.
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // and+extract+packuswb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // and+extract+2*packusdw
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Signed integer -> FP conversions.
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Unsigned integer -> FP conversions.  Note the i64-source rows are the
  // most expensive entries in the table.
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 18, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // FP -> signed integer conversions.
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // FP -> unsigned integer conversions (i32 destinations cost more than the
  // narrow destinations above).
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v32i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v32i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // FP precision conversions (f32 <-> f64).
  { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
| 2936 | |
// Conversion (cast) cost entries used when the subtarget has SSE4.1
// (Penryn class, per the file header).  Same row layout as the other
// conversion tables: (ISD opcode, Dst type, Src type) -> costs per TTI
// cost kind {RecipThroughput, Latency, CodeSize, SizeAndLatency}.
// NOTE(review): as with the AVX table, the non-throughput columns are all
// 1 - presumably placeholder values; verify before using those cost kinds.
static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = {
  // sext/zext between integer vector element widths (single instruction
  // with SSE4.1's PMOVSX*/PMOVZX* family, hence throughput cost 1).
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // These truncates end up widening elements.
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZBQ
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZWQ
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZBD

  // Integer vector truncations.
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Signed integer -> FP conversions (scalar and vector).
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Unsigned integer -> FP conversions; the i64-source rows again carry
  // the highest throughput costs in this table.
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 22, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // FP -> signed integer conversions (scalar and vector).
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // FP -> unsigned integer conversions.
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
| 3009 | |
| 3010 | static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = { |
| 3011 | // These are somewhat magic numbers justified by comparing the |
| 3012 | // output of llvm-mca for our various supported scheduler models |
| 3013 | // and basing it off the worst case scenario. |
| 3014 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3015 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3016 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3017 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3018 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3019 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3020 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3021 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3022 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3023 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3024 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3025 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3026 | |
| 3027 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3028 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3029 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3030 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3031 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3032 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3033 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3034 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3035 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3036 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3037 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3038 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3039 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 18, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3040 | |
| 3041 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3042 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3043 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3044 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3045 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3046 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3047 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3048 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3049 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3050 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3051 | |
| 3052 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3053 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3054 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3055 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3056 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3057 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3058 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3059 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3060 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3061 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3062 | |
| 3063 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3064 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3065 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3066 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3067 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3068 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3069 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3070 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3071 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3072 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3073 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3074 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3075 | |
| 3076 | // These truncates are really widening elements. |
| 3077 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD |
| 3078 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLWD+DQ |
| 3079 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW+WD+PSHUFD |
| 3080 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLWD |
| 3081 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW+WD |
| 3082 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW |
| 3083 | |
| 3084 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+PACKUSWB |
| 3085 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3086 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+2*PACKUSWB |
| 3087 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3088 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3089 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3090 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3091 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3092 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+3*PACKUSWB |
| 3093 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD+PSHUFLW |
| 3094 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD |
| 3095 | }; |
| 3096 | |
  // Costs for half-float (f16) <-> f32/f64 conversions when the F16C
  // instruction set is available (this table is consulted under
  // ST->hasF16C() below). Cost tuples are
  // { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
  static const TypeConversionCostKindTblEntry F16ConversionTbl[] = {
    { .ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    // f16 -> f64 has no single instruction: go through f32 first.
    { .ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps+vcvtps2pd
    { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps+vcvtps2pd
  };
| 3107 | |
| 3108 | // Attempt to map directly to (simple) MVT types to let us match custom entries. |
| 3109 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
| 3110 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
| 3111 | |
| 3112 | // The function getSimpleVT only handles simple value types. |
| 3113 | if (SrcTy.isSimple() && DstTy.isSimple()) { |
| 3114 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); |
| 3115 | MVT SimpleDstTy = DstTy.getSimpleVT(); |
| 3116 | |
| 3117 | if (ST->useAVX512Regs()) { |
| 3118 | if (ST->hasBWI()) |
| 3119 | if (const auto *Entry = ConvertCostTableLookup( |
| 3120 | Table: AVX512BWConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3121 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3122 | return *KindCost; |
| 3123 | |
| 3124 | if (ST->hasDQI()) |
| 3125 | if (const auto *Entry = ConvertCostTableLookup( |
| 3126 | Table: AVX512DQConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3127 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3128 | return *KindCost; |
| 3129 | |
| 3130 | if (ST->hasAVX512()) |
| 3131 | if (const auto *Entry = ConvertCostTableLookup( |
| 3132 | Table: AVX512FConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3133 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3134 | return *KindCost; |
| 3135 | } |
| 3136 | |
| 3137 | if (ST->hasBWI()) |
| 3138 | if (const auto *Entry = ConvertCostTableLookup( |
| 3139 | Table: AVX512BWVLConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3140 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3141 | return *KindCost; |
| 3142 | |
| 3143 | if (ST->hasDQI()) |
| 3144 | if (const auto *Entry = ConvertCostTableLookup( |
| 3145 | Table: AVX512DQVLConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3146 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3147 | return *KindCost; |
| 3148 | |
| 3149 | if (ST->hasAVX512()) |
| 3150 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512VLConversionTbl, ISD, |
| 3151 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3152 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3153 | return *KindCost; |
| 3154 | |
| 3155 | if (ST->hasAVX2()) { |
| 3156 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX2ConversionTbl, ISD, |
| 3157 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3158 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3159 | return *KindCost; |
| 3160 | } |
| 3161 | |
| 3162 | if (ST->hasAVX()) { |
| 3163 | if (const auto *Entry = ConvertCostTableLookup(Table: AVXConversionTbl, ISD, |
| 3164 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3165 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3166 | return *KindCost; |
| 3167 | } |
| 3168 | |
| 3169 | if (ST->hasF16C()) { |
| 3170 | if (const auto *Entry = ConvertCostTableLookup(Table: F16ConversionTbl, ISD, |
| 3171 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3172 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3173 | return *KindCost; |
| 3174 | } |
| 3175 | |
| 3176 | if (ST->hasSSE41()) { |
| 3177 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE41ConversionTbl, ISD, |
| 3178 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3179 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3180 | return *KindCost; |
| 3181 | } |
| 3182 | |
| 3183 | if (ST->hasSSE2()) { |
| 3184 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE2ConversionTbl, ISD, |
| 3185 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
| 3186 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3187 | return *KindCost; |
| 3188 | } |
| 3189 | |
| 3190 | if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) || |
| 3191 | (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) { |
| 3192 | // fp16 conversions not covered by any table entries require a libcall. |
| 3193 | // Return a large (arbitrary) number to model this. |
| 3194 | return InstructionCost(64); |
| 3195 | } |
| 3196 | } |
| 3197 | |
| 3198 | // Fall back to legalized types. |
| 3199 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Ty: Src); |
| 3200 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Ty: Dst); |
| 3201 | |
| 3202 | // If we're truncating to the same legalized type - just assume its free. |
| 3203 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) |
| 3204 | return TTI::TCC_Free; |
| 3205 | |
| 3206 | if (ST->useAVX512Regs()) { |
| 3207 | if (ST->hasBWI()) |
| 3208 | if (const auto *Entry = ConvertCostTableLookup( |
| 3209 | Table: AVX512BWConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
| 3210 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3211 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3212 | |
| 3213 | if (ST->hasDQI()) |
| 3214 | if (const auto *Entry = ConvertCostTableLookup( |
| 3215 | Table: AVX512DQConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
| 3216 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3217 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3218 | |
| 3219 | if (ST->hasAVX512()) |
| 3220 | if (const auto *Entry = ConvertCostTableLookup( |
| 3221 | Table: AVX512FConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
| 3222 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3223 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3224 | } |
| 3225 | |
| 3226 | if (ST->hasBWI()) |
| 3227 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512BWVLConversionTbl, ISD, |
| 3228 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3229 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3230 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3231 | |
| 3232 | if (ST->hasDQI()) |
| 3233 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512DQVLConversionTbl, ISD, |
| 3234 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3235 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3236 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3237 | |
| 3238 | if (ST->hasAVX512()) |
| 3239 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512VLConversionTbl, ISD, |
| 3240 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3241 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3242 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3243 | |
| 3244 | if (ST->hasAVX2()) |
| 3245 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX2ConversionTbl, ISD, |
| 3246 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3247 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3248 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3249 | |
| 3250 | if (ST->hasAVX()) |
| 3251 | if (const auto *Entry = ConvertCostTableLookup(Table: AVXConversionTbl, ISD, |
| 3252 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3253 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3254 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3255 | |
| 3256 | if (ST->hasF16C()) { |
| 3257 | if (const auto *Entry = ConvertCostTableLookup(Table: F16ConversionTbl, ISD, |
| 3258 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3259 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3260 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3261 | } |
| 3262 | |
| 3263 | if (ST->hasSSE41()) |
| 3264 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE41ConversionTbl, ISD, |
| 3265 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3266 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3267 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3268 | |
| 3269 | if (ST->hasSSE2()) |
| 3270 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE2ConversionTbl, ISD, |
| 3271 | Dst: LTDest.second, Src: LTSrc.second)) |
| 3272 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3273 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
| 3274 | |
| 3275 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for |
| 3276 | // sitofp. |
| 3277 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
| 3278 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { |
| 3279 | Type *ExtSrc = Src->getWithNewBitWidth(NewBitWidth: 32); |
| 3280 | unsigned ExtOpc = |
| 3281 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; |
| 3282 | |
| 3283 | // For scalar loads the extend would be free. |
| 3284 | InstructionCost ExtCost = 0; |
| 3285 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(Val: I->getOperand(i: 0)))) |
| 3286 | ExtCost = getCastInstrCost(Opcode: ExtOpc, Dst: ExtSrc, Src, CCH, CostKind); |
| 3287 | |
| 3288 | return ExtCost + getCastInstrCost(Opcode: Instruction::SIToFP, Dst, Src: ExtSrc, |
| 3289 | CCH: TTI::CastContextHint::None, CostKind); |
| 3290 | } |
| 3291 | |
| 3292 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi |
| 3293 | // i32. |
| 3294 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && |
| 3295 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { |
| 3296 | Type *TruncDst = Dst->getWithNewBitWidth(NewBitWidth: 32); |
| 3297 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: TruncDst, Src, CCH, CostKind) + |
| 3298 | getCastInstrCost(Opcode: Instruction::Trunc, Dst, Src: TruncDst, |
| 3299 | CCH: TTI::CastContextHint::None, CostKind); |
| 3300 | } |
| 3301 | |
| 3302 | // TODO: Allow non-throughput costs that aren't binary. |
| 3303 | auto AdjustCost = [&CostKind](InstructionCost Cost, |
| 3304 | InstructionCost N = 1) -> InstructionCost { |
| 3305 | if (CostKind != TTI::TCK_RecipThroughput) |
| 3306 | return Cost == 0 ? 0 : N; |
| 3307 | return Cost * N; |
| 3308 | }; |
| 3309 | return AdjustCost( |
| 3310 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
| 3311 | } |
| 3312 | |
| 3313 | InstructionCost X86TTIImpl::getCmpSelInstrCost( |
| 3314 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
| 3315 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
| 3316 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
| 3317 | // Early out if this type isn't scalar/vector integer/float. |
| 3318 | if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) |
| 3319 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
| 3320 | Op1Info, Op2Info, I); |
| 3321 | |
| 3322 | // Legalize the type. |
| 3323 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
| 3324 | |
| 3325 | MVT MTy = LT.second; |
| 3326 | |
| 3327 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 3328 | assert(ISD && "Invalid opcode" ); |
| 3329 | |
| 3330 | InstructionCost = 0; |
| 3331 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { |
| 3332 | // Some vector comparison predicates cost extra instructions. |
| 3333 | // TODO: Adjust ExtraCost based on CostKind? |
| 3334 | // TODO: Should we invert this and assume worst case cmp costs |
| 3335 | // and reduce for particular predicates? |
| 3336 | if (MTy.isVector() && |
| 3337 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || |
| 3338 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || |
| 3339 | ST->hasBWI())) { |
| 3340 | // Fallback to I if a specific predicate wasn't specified. |
| 3341 | CmpInst::Predicate Pred = VecPred; |
| 3342 | if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || |
| 3343 | Pred == CmpInst::BAD_FCMP_PREDICATE)) |
| 3344 | Pred = cast<CmpInst>(Val: I)->getPredicate(); |
| 3345 | |
| 3346 | bool CmpWithConstant = false; |
| 3347 | if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(Val: I)) |
| 3348 | CmpWithConstant = isa<Constant>(Val: CmpInstr->getOperand(i_nocapture: 1)); |
| 3349 | |
| 3350 | switch (Pred) { |
| 3351 | case CmpInst::Predicate::ICMP_NE: |
| 3352 | // xor(cmpeq(x,y),-1) |
| 3353 | ExtraCost = CmpWithConstant ? 0 : 1; |
| 3354 | break; |
| 3355 | case CmpInst::Predicate::ICMP_SGE: |
| 3356 | case CmpInst::Predicate::ICMP_SLE: |
| 3357 | // xor(cmpgt(x,y),-1) |
| 3358 | ExtraCost = CmpWithConstant ? 0 : 1; |
| 3359 | break; |
| 3360 | case CmpInst::Predicate::ICMP_ULT: |
| 3361 | case CmpInst::Predicate::ICMP_UGT: |
| 3362 | // cmpgt(xor(x,signbit),xor(y,signbit)) |
| 3363 | // xor(cmpeq(pmaxu(x,y),x),-1) |
| 3364 | ExtraCost = CmpWithConstant ? 1 : 2; |
| 3365 | break; |
| 3366 | case CmpInst::Predicate::ICMP_ULE: |
| 3367 | case CmpInst::Predicate::ICMP_UGE: |
| 3368 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || |
| 3369 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { |
| 3370 | // cmpeq(psubus(x,y),0) |
| 3371 | // cmpeq(pminu(x,y),x) |
| 3372 | ExtraCost = 1; |
| 3373 | } else { |
| 3374 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) |
| 3375 | ExtraCost = CmpWithConstant ? 2 : 3; |
| 3376 | } |
| 3377 | break; |
| 3378 | case CmpInst::Predicate::FCMP_ONE: |
| 3379 | case CmpInst::Predicate::FCMP_UEQ: |
| 3380 | // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases. |
| 3381 | // Use FCMP_UEQ expansion - FCMP_ONE should be the same. |
| 3382 | if (CondTy && !ST->hasAVX()) |
| 3383 | return getCmpSelInstrCost(Opcode, ValTy, CondTy, |
| 3384 | VecPred: CmpInst::Predicate::FCMP_UNO, CostKind, |
| 3385 | Op1Info, Op2Info) + |
| 3386 | getCmpSelInstrCost(Opcode, ValTy, CondTy, |
| 3387 | VecPred: CmpInst::Predicate::FCMP_OEQ, CostKind, |
| 3388 | Op1Info, Op2Info) + |
| 3389 | getArithmeticInstrCost(Opcode: Instruction::Or, Ty: CondTy, CostKind); |
| 3390 | |
| 3391 | break; |
| 3392 | case CmpInst::Predicate::BAD_ICMP_PREDICATE: |
| 3393 | case CmpInst::Predicate::BAD_FCMP_PREDICATE: |
| 3394 | // Assume worst case scenario and add the maximum extra cost. |
| 3395 | ExtraCost = 3; |
| 3396 | break; |
| 3397 | default: |
| 3398 | break; |
| 3399 | } |
| 3400 | } |
| 3401 | } |
| 3402 | |
  // Silvermont (SLM) overrides for compare/select: its pcmpeq/pcmpgt and
  // pblendvb/blendvpd/blendvps are markedly slower than on the generic
  // SSE4.2 model. Cost tuples are
  // { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
  static const CostKindTblEntry SLMCostTbl[] = {
    // slm pcmpeq/pcmpgt throughput is 2
    { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    // slm pblendvb/blendvpd/blendvps throughput is 4
    { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vblendvpd
    { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vblendvps
    { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb
    // NOTE(review): v8i32 is a 256-bit type, which looks out of place in
    // this SLM (128-bit SSE) table -- verify it wasn't meant to be v4i32.
    { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb
  };
| 3414 | |
  // With AVX-512 BW, byte/word vector compares and selects are single-cost
  // operations for all the listed widths (contrast with the more expensive
  // v32i16/v64i8 entries in the plain AVX-512 table below). Cost tuples are
  // { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SELECT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  };
| 3424 | |
  // Base AVX-512 (F/VL) compare/select costs. Cost tuples are
  // { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
  static const CostKindTblEntry AVX512CostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SETCC, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    // 512-bit i16/i8 compares are more expensive without BWI (the cheap
    // single-cost entries live in AVX512BWCostTbl, which is checked first).
    { .ISD: ISD::SETCC, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } },
    { .ISD: ISD::SETCC, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } },

    { .ISD: ISD::SELECT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v8f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    // Likewise, 512-bit i16/i8 selects cost more without BWI; the narrower
    // (<= 256-bit) widths stay single-cost.
    { .ISD: ISD::SELECT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } },
    { .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } },
    { .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  };
| 3461 | |
  // AVX2 compare/select costs (scalar and up to 256-bit vectors). Cost
  // tuples are { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
  static const CostKindTblEntry AVX2CostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },

    // 256-bit selects lower to the variable-blend instructions noted below.
    { .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd
    { .ISD: ISD::SELECT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps
    { .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
  };
| 3482 | |
| 3483 | static const CostKindTblEntry XOPCostTbl[] = { |
| 3484 | { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3485 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3486 | }; |
| 3487 | |
| 3488 | static const CostKindTblEntry AVX1CostTbl[] = { |
| 3489 | { .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3490 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3491 | { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3492 | { .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3493 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3494 | { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3495 | |
| 3496 | // AVX1 does not support 8-wide integer compare. |
| 3497 | { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3498 | { .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3499 | { .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3500 | { .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3501 | |
| 3502 | { .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd |
| 3503 | { .ISD: ISD::SELECT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps |
| 3504 | { .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd |
| 3505 | { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps |
| 3506 | { .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // vandps + vandnps + vorps |
| 3507 | { .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // vandps + vandnps + vorps |
| 3508 | }; |
| 3509 | |
| 3510 | static const CostKindTblEntry SSE42CostTbl[] = { |
| 3511 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3512 | }; |
| 3513 | |
| 3514 | static const CostKindTblEntry SSE41CostTbl[] = { |
| 3515 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3516 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3517 | |
| 3518 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvpd |
| 3519 | { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvpd |
| 3520 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvps |
| 3521 | { .ISD: ISD::SELECT, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvps |
| 3522 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
| 3523 | { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
| 3524 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
| 3525 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
| 3526 | }; |
| 3527 | |
| 3528 | static const CostKindTblEntry SSE2CostTbl[] = { |
| 3529 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3530 | { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3531 | |
| 3532 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // pcmpeqd/pcmpgtd expansion |
| 3533 | { .ISD: ISD::SETCC, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3534 | { .ISD: ISD::SETCC, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3535 | { .ISD: ISD::SETCC, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3536 | |
| 3537 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andpd + andnpd + orpd |
| 3538 | { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andpd + andnpd + orpd |
| 3539 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
| 3540 | { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
| 3541 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
| 3542 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
| 3543 | }; |
| 3544 | |
| 3545 | static const CostKindTblEntry SSE1CostTbl[] = { |
| 3546 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3547 | { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3548 | |
| 3549 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andps + andnps + orps |
| 3550 | { .ISD: ISD::SELECT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andps + andnps + orps |
| 3551 | }; |
| 3552 | |
| 3553 | if (ST->useSLMArithCosts()) |
| 3554 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
| 3555 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3556 | return LT.first * (ExtraCost + *KindCost); |
| 3557 | |
| 3558 | if (ST->hasBWI()) |
| 3559 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
| 3560 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3561 | return LT.first * (ExtraCost + *KindCost); |
| 3562 | |
| 3563 | if (ST->hasAVX512()) |
| 3564 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTbl, ISD, Ty: MTy)) |
| 3565 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3566 | return LT.first * (ExtraCost + *KindCost); |
| 3567 | |
| 3568 | if (ST->hasAVX2()) |
| 3569 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTbl, ISD, Ty: MTy)) |
| 3570 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3571 | return LT.first * (ExtraCost + *KindCost); |
| 3572 | |
| 3573 | if (ST->hasXOP()) |
| 3574 | if (const auto *Entry = CostTableLookup(Table: XOPCostTbl, ISD, Ty: MTy)) |
| 3575 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3576 | return LT.first * (ExtraCost + *KindCost); |
| 3577 | |
| 3578 | if (ST->hasAVX()) |
| 3579 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
| 3580 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3581 | return LT.first * (ExtraCost + *KindCost); |
| 3582 | |
| 3583 | if (ST->hasSSE42()) |
| 3584 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTbl, ISD, Ty: MTy)) |
| 3585 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3586 | return LT.first * (ExtraCost + *KindCost); |
| 3587 | |
| 3588 | if (ST->hasSSE41()) |
| 3589 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
| 3590 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3591 | return LT.first * (ExtraCost + *KindCost); |
| 3592 | |
| 3593 | if (ST->hasSSE2()) |
| 3594 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
| 3595 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3596 | return LT.first * (ExtraCost + *KindCost); |
| 3597 | |
| 3598 | if (ST->hasSSE1()) |
| 3599 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTbl, ISD, Ty: MTy)) |
| 3600 | if (auto KindCost = Entry->Cost[CostKind]) |
| 3601 | return LT.first * (ExtraCost + *KindCost); |
| 3602 | |
| 3603 | // Assume a 3cy latency for fp select ops. |
| 3604 | if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select) |
| 3605 | if (ValTy->getScalarType()->isFloatingPointTy()) |
| 3606 | return 3; |
| 3607 | |
| 3608 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
| 3609 | Op1Info, Op2Info, I); |
| 3610 | } |
| 3611 | |
| 3612 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } |
| 3613 | |
| 3614 | InstructionCost |
| 3615 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
| 3616 | TTI::TargetCostKind CostKind) const { |
| 3617 | // Costs should match the codegen from: |
| 3618 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll |
| 3619 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll |
| 3620 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll |
| 3621 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll |
| 3622 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll |
| 3623 | |
| 3624 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not |
| 3625 | // specialized in these tables yet. |
| 3626 | static const CostKindTblEntry AVX512VBMI2CostTbl[] = { |
| 3627 | { .ISD: ISD::FSHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3628 | { .ISD: ISD::FSHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3629 | { .ISD: ISD::FSHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3630 | { .ISD: ISD::FSHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3631 | { .ISD: ISD::FSHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3632 | { .ISD: ISD::FSHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3633 | { .ISD: ISD::FSHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3634 | { .ISD: ISD::FSHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3635 | { .ISD: ISD::FSHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3636 | { .ISD: ISD::ROTL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3637 | { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3638 | { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3639 | { .ISD: ISD::ROTR, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3640 | { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3641 | { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3642 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3643 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3644 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3645 | }; |
| 3646 | static const CostKindTblEntry AVX512BITALGCostTbl[] = { |
| 3647 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3648 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3649 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3650 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3651 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3652 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3653 | }; |
| 3654 | static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { |
| 3655 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3656 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3657 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3658 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3659 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3660 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3661 | }; |
| 3662 | static const CostKindTblEntry AVX512CDCostTbl[] = { |
| 3663 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3664 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3665 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 27, .CodeSizeCost: 23, .SizeAndLatencyCost: 27 } }, |
| 3666 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 16, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } }, |
| 3667 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3668 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3669 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 19, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
| 3670 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
| 3671 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3672 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3673 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 15, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 3674 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
| 3675 | |
| 3676 | { .ISD: ISD::CTTZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3677 | { .ISD: ISD::CTTZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3678 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 3679 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 3680 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 3681 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 3682 | }; |
| 3683 | static const CostKindTblEntry AVX512BWCostTbl[] = { |
| 3684 | { .ISD: ISD::ABS, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3685 | { .ISD: ISD::ABS, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3686 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3687 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3688 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
| 3689 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3690 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3691 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
| 3692 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3693 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3694 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
| 3695 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
| 3696 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
| 3697 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 12 } }, |
| 3698 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3699 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3700 | { .ISD: ISD::BSWAP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3701 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3702 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3703 | { .ISD: ISD::BSWAP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3704 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3705 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3706 | { .ISD: ISD::BSWAP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3707 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 22, .CodeSizeCost: 23, .SizeAndLatencyCost: 23 } }, |
| 3708 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 23, .CodeSizeCost: 25, .SizeAndLatencyCost: 25 } }, |
| 3709 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 15, .SizeAndLatencyCost: 16 } }, |
| 3710 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 9 } }, |
| 3711 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } }, |
| 3712 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } }, |
| 3713 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 10, .SizeAndLatencyCost: 12 } }, |
| 3714 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 3715 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 3716 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
| 3717 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 3718 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 3719 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
| 3720 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 3721 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 3722 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } }, |
| 3723 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 3724 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 3725 | { .ISD: ISD::CTTZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
| 3726 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 3727 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 3728 | { .ISD: ISD::CTTZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
| 3729 | { .ISD: ISD::ROTL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, |
| 3730 | { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3731 | { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3732 | { .ISD: ISD::ROTL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
| 3733 | { .ISD: ISD::ROTL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
| 3734 | { .ISD: ISD::ROTL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
| 3735 | { .ISD: ISD::ROTR, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, |
| 3736 | { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3737 | { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3738 | { .ISD: ISD::ROTR, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 12, .SizeAndLatencyCost: 14 } }, |
| 3739 | { .ISD: ISD::ROTR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 14, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, |
| 3740 | { .ISD: ISD::ROTR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 14, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, |
| 3741 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3742 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3743 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3744 | { .ISD: X86ISD::VROTLI, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 9, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3745 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3746 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3747 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3748 | { .ISD: ISD::SADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3749 | { .ISD: ISD::SMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3750 | { .ISD: ISD::SMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3751 | { .ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3752 | { .ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3753 | { .ISD: ISD::SMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3754 | { .ISD: ISD::SMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 21, .CodeSizeCost: 17, .SizeAndLatencyCost: 18 } }, |
| 3755 | { .ISD: ISD::UMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3756 | { .ISD: ISD::UMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 15, .CodeSizeCost: 15, .SizeAndLatencyCost: 16 } }, |
| 3757 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3758 | { .ISD: ISD::SSUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3759 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3760 | { .ISD: ISD::UADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3761 | { .ISD: ISD::UMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3762 | { .ISD: ISD::UMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3763 | { .ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3764 | { .ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3765 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3766 | { .ISD: ISD::USUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3767 | }; |
| 3768 | static const CostKindTblEntry AVX512CostTbl[] = { |
| 3769 | { .ISD: ISD::ABS, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3770 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3771 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3772 | { .ISD: ISD::ABS, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3773 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3774 | { .ISD: ISD::ABS, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3775 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3776 | { .ISD: ISD::ABS, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3777 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3778 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
| 3779 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
| 3780 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
| 3781 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
| 3782 | { .ISD: ISD::BSWAP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3783 | { .ISD: ISD::BSWAP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3784 | { .ISD: ISD::BSWAP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3785 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 28, .CodeSizeCost: 32, .SizeAndLatencyCost: 32 } }, |
| 3786 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 30, .CodeSizeCost: 38, .SizeAndLatencyCost: 38 } }, |
| 3787 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 15, .CodeSizeCost: 29, .SizeAndLatencyCost: 29 } }, |
| 3788 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
| 3789 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
| 3790 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 19, .CodeSizeCost: 27, .SizeAndLatencyCost: 27 } }, |
| 3791 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 15, .CodeSizeCost: 22, .SizeAndLatencyCost: 22 } }, |
| 3792 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 11, .CodeSizeCost: 16, .SizeAndLatencyCost: 16 } }, |
| 3793 | { .ISD: ISD::CTTZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3794 | { .ISD: ISD::CTTZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3795 | { .ISD: ISD::CTTZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 17, .CodeSizeCost: 27, .SizeAndLatencyCost: 27 } }, |
| 3796 | { .ISD: ISD::CTTZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 21, .SizeAndLatencyCost: 21 } }, |
| 3797 | { .ISD: ISD::ROTL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3798 | { .ISD: ISD::ROTL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3799 | { .ISD: ISD::ROTL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3800 | { .ISD: ISD::ROTL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3801 | { .ISD: ISD::ROTL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3802 | { .ISD: ISD::ROTL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3803 | { .ISD: ISD::ROTR, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3804 | { .ISD: ISD::ROTR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3805 | { .ISD: ISD::ROTR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3806 | { .ISD: ISD::ROTR, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3807 | { .ISD: ISD::ROTR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3808 | { .ISD: ISD::ROTR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3809 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3810 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3811 | { .ISD: X86ISD::VROTLI, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3812 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3813 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3814 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3815 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
| 3816 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3817 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3818 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3819 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3820 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 3821 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3822 | { .ISD: ISD::SADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3823 | { .ISD: ISD::SMAX, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3824 | { .ISD: ISD::SMAX, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3825 | { .ISD: ISD::SMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3826 | { .ISD: ISD::SMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3827 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3828 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3829 | { .ISD: ISD::SMIN, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3830 | { .ISD: ISD::SMIN, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3831 | { .ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3832 | { .ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3833 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3834 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3835 | { .ISD: ISD::SMULO, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 44, .CodeSizeCost: 81, .SizeAndLatencyCost: 93 } }, |
| 3836 | { .ISD: ISD::SMULO, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } }, |
| 3837 | { .ISD: ISD::SMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
| 3838 | { .ISD: ISD::SMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 28, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
| 3839 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 13, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
| 3840 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
| 3841 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 14, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
| 3842 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 14, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
| 3843 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
| 3844 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 14, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
| 3845 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3846 | { .ISD: ISD::SSUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3847 | { .ISD: ISD::UMAX, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3848 | { .ISD: ISD::UMAX, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3849 | { .ISD: ISD::UMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3850 | { .ISD: ISD::UMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3851 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3852 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3853 | { .ISD: ISD::UMIN, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3854 | { .ISD: ISD::UMIN, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3855 | { .ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3856 | { .ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 3857 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3858 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3859 | { .ISD: ISD::UMULO, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 52, .LatencyCost: 52, .CodeSizeCost: 95, .SizeAndLatencyCost: 104} }, |
| 3860 | { .ISD: ISD::UMULO, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 12, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } }, |
| 3861 | { .ISD: ISD::UMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 13, .CodeSizeCost: 16, .SizeAndLatencyCost: 16 } }, |
| 3862 | { .ISD: ISD::UMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 30, .SizeAndLatencyCost: 30 } }, |
| 3863 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3864 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3865 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3866 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3867 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3868 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 3869 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3870 | { .ISD: ISD::UADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3871 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3872 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3873 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3874 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3875 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3876 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3877 | { .ISD: ISD::USUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 3878 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3879 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3880 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3881 | { .ISD: ISD::FMAXNUM, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3882 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3883 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3884 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3885 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3886 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
| 3887 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
| 3888 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
| 3889 | { .ISD: ISD::FSQRT, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
| 3890 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
| 3891 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
| 3892 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
| 3893 | { .ISD: ISD::FSQRT, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
| 3894 | }; |
| 3895 | static const CostKindTblEntry XOPCostTbl[] = { |
| 3896 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3897 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3898 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3899 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3900 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3901 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3902 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3903 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3904 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3905 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3906 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3907 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
| 3908 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) |
| 3909 | { .ISD: ISD::ROTL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3910 | { .ISD: ISD::ROTL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3911 | { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3912 | { .ISD: ISD::ROTL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3913 | { .ISD: ISD::ROTL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3914 | { .ISD: ISD::ROTL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3915 | { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3916 | { .ISD: ISD::ROTL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3917 | { .ISD: ISD::ROTR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
| 3918 | { .ISD: ISD::ROTR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
| 3919 | { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
| 3920 | { .ISD: ISD::ROTR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
| 3921 | { .ISD: ISD::ROTR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3922 | { .ISD: ISD::ROTR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3923 | { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3924 | { .ISD: ISD::ROTR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 3925 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3926 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3927 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3928 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 3929 | { .ISD: X86ISD::VROTLI, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3930 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3931 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3932 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3933 | }; |
| 3934 | static const CostKindTblEntry AVX2CostTbl[] = { |
| 3935 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
| 3936 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
| 3937 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3938 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3939 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3940 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3941 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 3942 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3943 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3944 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } }, |
| 3945 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3946 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } }, |
| 3947 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
| 3948 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } }, |
| 3949 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
| 3950 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 15 } }, |
| 3951 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3952 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3953 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3954 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3955 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3956 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3957 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 18, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } }, |
| 3958 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 18, .CodeSizeCost: 24, .SizeAndLatencyCost: 44 } }, |
| 3959 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 20 } }, |
| 3960 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 34 } }, |
| 3961 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, |
| 3962 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } }, |
| 3963 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
| 3964 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 14 } }, |
| 3965 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } }, |
| 3966 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
| 3967 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 3968 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
| 3969 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 3970 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 11, .SizeAndLatencyCost: 18 } }, |
| 3971 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 3972 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
| 3973 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 13 } }, |
| 3974 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 20 } }, |
| 3975 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 14, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
| 3976 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 17, .SizeAndLatencyCost: 24 } }, |
| 3977 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 3978 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } }, |
| 3979 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 3980 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 18 } }, |
| 3981 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
| 3982 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
| 3983 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, |
| 3984 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 7, .SizeAndLatencyCost: 13 } }, |
| 3985 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3986 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3987 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 3988 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 3989 | { .ISD: ISD::SMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3990 | { .ISD: ISD::SMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3991 | { .ISD: ISD::SMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3992 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 3993 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 3994 | { .ISD: ISD::SMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3995 | { .ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3996 | { .ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 3997 | { .ISD: ISD::SMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 33, .SizeAndLatencyCost: 37 } }, |
| 3998 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 13, .SizeAndLatencyCost: 15 } }, |
| 3999 | { .ISD: ISD::SMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 20, .CodeSizeCost: 13, .SizeAndLatencyCost: 24 } }, |
| 4000 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
| 4001 | { .ISD: ISD::SMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 8, .SizeAndLatencyCost: 14 } }, |
| 4002 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4003 | { .ISD: ISD::SMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 15, .CodeSizeCost: 18, .SizeAndLatencyCost: 35 } }, |
| 4004 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 22, .CodeSizeCost: 14, .SizeAndLatencyCost: 21 } }, |
| 4005 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
| 4006 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
| 4007 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 14, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } }, |
| 4008 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 9, .SizeAndLatencyCost: 16 } }, |
| 4009 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4010 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4011 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4012 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 10 } }, |
| 4013 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 8 } }, |
| 4014 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4015 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4016 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 4017 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
| 4018 | { .ISD: ISD::UMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4019 | { .ISD: ISD::UMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4020 | { .ISD: ISD::UMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4021 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
| 4022 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
| 4023 | { .ISD: ISD::UMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4024 | { .ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4025 | { .ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4026 | { .ISD: ISD::UMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 24, .CodeSizeCost: 39, .SizeAndLatencyCost: 43 } }, |
| 4027 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 10, .CodeSizeCost: 15, .SizeAndLatencyCost: 19 } }, |
| 4028 | { .ISD: ISD::UMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 23 } }, |
| 4029 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 12, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
| 4030 | { .ISD: ISD::UMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 8, .SizeAndLatencyCost: 13 } }, |
| 4031 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4032 | { .ISD: ISD::UMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 17, .SizeAndLatencyCost: 33 } }, |
| 4033 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 19, .CodeSizeCost: 13, .SizeAndLatencyCost: 20 } }, |
| 4034 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4035 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 10 } }, |
| 4036 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4037 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4038 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4039 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
| 4040 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
| 4041 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
| 4042 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
| 4043 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
| 4044 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
| 4045 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtss |
| 4046 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtps |
| 4047 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtps |
| 4048 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtsd |
| 4049 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtpd |
| 4050 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtpd |
| 4051 | }; |
| 4052 | static const CostKindTblEntry AVX1CostTbl[] = { |
| 4053 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
| 4054 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
| 4055 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
| 4056 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
| 4057 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
| 4058 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
| 4059 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
| 4060 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
| 4061 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
| 4062 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
| 4063 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 15, .CodeSizeCost: 17, .SizeAndLatencyCost: 26 } }, // 2 x 128-bit Op + extract/insert |
| 4064 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
| 4065 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
| 4066 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4067 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
| 4068 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4069 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
| 4070 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4071 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 29, .LatencyCost: 33, .CodeSizeCost: 49, .SizeAndLatencyCost: 58 } }, // 2 x 128-bit Op + extract/insert |
| 4072 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 24, .CodeSizeCost: 24, .SizeAndLatencyCost: 28 } }, |
| 4073 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 28, .CodeSizeCost: 39, .SizeAndLatencyCost: 48 } }, // 2 x 128-bit Op + extract/insert |
| 4074 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 20, .CodeSizeCost: 19, .SizeAndLatencyCost: 23 } }, |
| 4075 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 29, .SizeAndLatencyCost: 38 } }, // 2 x 128-bit Op + extract/insert |
| 4076 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 16, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
| 4077 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 15, .CodeSizeCost: 19, .SizeAndLatencyCost: 28 } }, // 2 x 128-bit Op + extract/insert |
| 4078 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
| 4079 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 18, .CodeSizeCost: 19, .SizeAndLatencyCost: 28 } }, // 2 x 128-bit Op + extract/insert |
| 4080 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 14, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
| 4081 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 27, .SizeAndLatencyCost: 36 } }, // 2 x 128-bit Op + extract/insert |
| 4082 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 20, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
| 4083 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 21, .CodeSizeCost: 22, .SizeAndLatencyCost: 31 } }, // 2 x 128-bit Op + extract/insert |
| 4084 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 18, .CodeSizeCost: 11, .SizeAndLatencyCost: 15 } }, |
| 4085 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 15, .CodeSizeCost: 16, .SizeAndLatencyCost: 25 } }, // 2 x 128-bit Op + extract/insert |
| 4086 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
| 4087 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 22, .CodeSizeCost: 24, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
| 4088 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 19, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
| 4089 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 27, .CodeSizeCost: 32, .SizeAndLatencyCost: 41 } }, // 2 x 128-bit Op + extract/insert |
| 4090 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 24, .CodeSizeCost: 17, .SizeAndLatencyCost: 21 } }, |
| 4091 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 27, .SizeAndLatencyCost: 36 } }, // 2 x 128-bit Op + extract/insert |
| 4092 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 21, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
| 4093 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 18, .CodeSizeCost: 21, .SizeAndLatencyCost: 30 } }, // 2 x 128-bit Op + extract/insert |
| 4094 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 16, .CodeSizeCost: 11, .SizeAndLatencyCost: 15 } }, |
| 4095 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
| 4096 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 20, .CodeSizeCost: 15, .SizeAndLatencyCost: 25 } }, // 2 x 128-bit Op + extract/insert |
| 4097 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 18, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } }, // 2 x 128-bit Op + extract/insert |
| 4098 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4099 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4100 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // 2 x 128-bit Op + extract/insert |
| 4101 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4102 | { .ISD: ISD::SMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4103 | { .ISD: ISD::SMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4104 | { .ISD: ISD::SMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4105 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // 2 x 128-bit Op + extract/insert |
| 4106 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4107 | { .ISD: ISD::SMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4108 | { .ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4109 | { .ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4110 | { .ISD: ISD::SMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 33, .SizeAndLatencyCost: 37 } }, |
| 4111 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
| 4112 | { .ISD: ISD::SMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 20, .CodeSizeCost: 24, .SizeAndLatencyCost: 29 } }, |
| 4113 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
| 4114 | { .ISD: ISD::SMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, |
| 4115 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4116 | { .ISD: ISD::SMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 37, .SizeAndLatencyCost: 39 } }, |
| 4117 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 22, .CodeSizeCost: 18, .SizeAndLatencyCost: 21 } }, |
| 4118 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
| 4119 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 21, .CodeSizeCost: 18, .SizeAndLatencyCost: 29 } }, // 2 x 128-bit Op + extract/insert |
| 4120 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19, .CodeSizeCost: 18, .SizeAndLatencyCost: 29 } }, // 2 x 128-bit Op + extract/insert |
| 4121 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4122 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4123 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4124 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, // 2 x 128-bit Op + extract/insert |
| 4125 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, // 2 x 128-bit Op + extract/insert |
| 4126 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4127 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4128 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 11, .SizeAndLatencyCost: 17 } }, // 2 x 128-bit Op + extract/insert |
| 4129 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, |
| 4130 | { .ISD: ISD::UMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4131 | { .ISD: ISD::UMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4132 | { .ISD: ISD::UMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4133 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 11, .SizeAndLatencyCost: 17 } }, // 2 x 128-bit Op + extract/insert |
| 4134 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, |
| 4135 | { .ISD: ISD::UMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4136 | { .ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4137 | { .ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4138 | { .ISD: ISD::UMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 26, .CodeSizeCost: 39, .SizeAndLatencyCost: 45 } }, |
| 4139 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 12, .CodeSizeCost: 15, .SizeAndLatencyCost: 20 } }, |
| 4140 | { .ISD: ISD::UMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 15, .CodeSizeCost: 23, .SizeAndLatencyCost: 28 } }, |
| 4141 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
| 4142 | { .ISD: ISD::UMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 14 } }, |
| 4143 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4144 | { .ISD: ISD::UMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 19, .CodeSizeCost: 35, .SizeAndLatencyCost: 37 } }, |
| 4145 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 19, .CodeSizeCost: 17, .SizeAndLatencyCost: 20 } }, |
| 4146 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4147 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, // 2 x 128-bit Op + extract/insert |
| 4148 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2 x 128-bit Op + extract/insert |
| 4149 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4150 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4151 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
| 4152 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
| 4153 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
| 4154 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
| 4155 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
| 4156 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
| 4157 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
| 4158 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtss |
| 4159 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtps |
| 4160 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtps |
| 4161 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 27, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtsd |
| 4162 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 27, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtpd |
| 4163 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 54, .LatencyCost: 54, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtpd |
| 4164 | }; |
// Costs for targets with GFNI: GF2P8AFFINEQB performs an arbitrary per-byte
// bit permutation, making BITREVERSE (all widths) and byte rotates
// (X86ISD::VROTLI on vXi8) a single cheap instruction. Cost tuple order is
// { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
static const CostKindTblEntry GFNICostTbl[] = {
// Scalar bitreverse: move to vector, gf2p8affineqb, move back — hence the
// higher scalar costs below vs the single-op vector entries.
{ .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb
// Wider element types need an extra byte-swap within each element, which
// shows up as higher latency/codesize than the vXi8 entries.
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb
// Per-byte rotate-by-immediate can also be expressed as a single affine op.
{ .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb
{ .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb
{ .ISD: X86ISD::VROTLI, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb
};
| 4186 | static const CostKindTblEntry GLMCostTbl[] = { |
| 4187 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtss |
| 4188 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 37, .LatencyCost: 41, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtps |
| 4189 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 34, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtsd |
| 4190 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 67, .LatencyCost: 71, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtpd |
| 4191 | }; |
| 4192 | static const CostKindTblEntry SLMCostTbl[] = { |
| 4193 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
| 4194 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
| 4195 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
| 4196 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtss |
| 4197 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 40, .LatencyCost: 41, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtps |
| 4198 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 35, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtsd |
| 4199 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 70, .LatencyCost: 71, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtpd |
| 4200 | }; |
| 4201 | static const CostKindTblEntry SSE42CostTbl[] = { |
| 4202 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
| 4203 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
| 4204 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
| 4205 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
| 4206 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
| 4207 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
| 4208 | }; |
| 4209 | static const CostKindTblEntry SSE41CostTbl[] = { |
| 4210 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) |
| 4211 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 14, .CodeSizeCost: 17, .SizeAndLatencyCost: 21 } }, |
| 4212 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } }, |
| 4213 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 19, .CodeSizeCost: 25, .SizeAndLatencyCost: 29 } }, |
| 4214 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 10, .SizeAndLatencyCost: 12 } }, |
| 4215 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4216 | { .ISD: ISD::SMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4217 | { .ISD: ISD::SMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4218 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4219 | { .ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4220 | { .ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4221 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
| 4222 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 24, .CodeSizeCost: 13, .SizeAndLatencyCost: 19 } }, |
| 4223 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 9, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 4224 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 22, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } }, |
| 4225 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 4226 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 4227 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 4228 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
| 4229 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 4230 | { .ISD: ISD::UMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4231 | { .ISD: ISD::UMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4232 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
| 4233 | { .ISD: ISD::UMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4234 | { .ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4235 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 15, .SizeAndLatencyCost: 20 } }, |
| 4236 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 12, .SizeAndLatencyCost: 18 } }, |
| 4237 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 4238 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 18, .SizeAndLatencyCost: 20 } }, |
| 4239 | }; |
| 4240 | static const CostKindTblEntry SSSE3CostTbl[] = { |
| 4241 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4242 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4243 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4244 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
| 4245 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
| 4246 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
| 4247 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
| 4248 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
| 4249 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
| 4250 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
| 4251 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 28, .CodeSizeCost: 28, .SizeAndLatencyCost: 35 } }, |
| 4252 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 20, .CodeSizeCost: 22, .SizeAndLatencyCost: 28 } }, |
| 4253 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 17, .CodeSizeCost: 16, .SizeAndLatencyCost: 22 } }, |
| 4254 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 15, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
| 4255 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 12, .SizeAndLatencyCost: 18 } }, |
| 4256 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 16, .SizeAndLatencyCost: 22 } }, |
| 4257 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 18, .CodeSizeCost: 14, .SizeAndLatencyCost: 20 } }, |
| 4258 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
| 4259 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 25, .CodeSizeCost: 15, .SizeAndLatencyCost: 22 } }, |
| 4260 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 26, .CodeSizeCost: 19, .SizeAndLatencyCost: 25 } }, |
| 4261 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 20, .CodeSizeCost: 17, .SizeAndLatencyCost: 23 } }, |
| 4262 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16, .CodeSizeCost: 13, .SizeAndLatencyCost: 19 } } |
| 4263 | }; |
| 4264 | static const CostKindTblEntry SSE2CostTbl[] = { |
| 4265 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 4266 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
| 4267 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 4268 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 4269 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 32, .SizeAndLatencyCost: 32 } }, |
| 4270 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 30, .SizeAndLatencyCost: 30 } }, |
| 4271 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 25, .SizeAndLatencyCost: 25 } }, |
| 4272 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 21, .SizeAndLatencyCost: 21 } }, |
| 4273 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
| 4274 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
| 4275 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
| 4276 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 45, .CodeSizeCost: 36, .SizeAndLatencyCost: 38 } }, |
| 4277 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 45, .CodeSizeCost: 38, .SizeAndLatencyCost: 40 } }, |
| 4278 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 38, .CodeSizeCost: 32, .SizeAndLatencyCost: 34 } }, |
| 4279 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 39, .CodeSizeCost: 29, .SizeAndLatencyCost: 32 } }, |
| 4280 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 26, .CodeSizeCost: 16, .SizeAndLatencyCost: 18 } }, |
| 4281 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 29, .CodeSizeCost: 21, .SizeAndLatencyCost: 23 } }, |
| 4282 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 25, .CodeSizeCost: 18, .SizeAndLatencyCost: 20 } }, |
| 4283 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 21, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
| 4284 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 28, .CodeSizeCost: 19, .SizeAndLatencyCost: 21 } }, |
| 4285 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 31, .CodeSizeCost: 24, .SizeAndLatencyCost: 26 } }, |
| 4286 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 27, .CodeSizeCost: 21, .SizeAndLatencyCost: 23 } }, |
| 4287 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 23, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
| 4288 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 14, .CodeSizeCost: 24, .SizeAndLatencyCost: 24 } }, |
| 4289 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
| 4290 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4291 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4292 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
| 4293 | { .ISD: ISD::SMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 4294 | { .ISD: ISD::SMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4295 | { .ISD: ISD::SMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 4296 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
| 4297 | { .ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 4298 | { .ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4299 | { .ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
| 4300 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 30, .LatencyCost: 33, .CodeSizeCost: 13, .SizeAndLatencyCost: 23 } }, |
| 4301 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 24, .CodeSizeCost: 23, .SizeAndLatencyCost: 23 } }, |
| 4302 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 4303 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 23, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } }, |
| 4304 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19, .CodeSizeCost: 31, .SizeAndLatencyCost: 31 } }, |
| 4305 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 12, .SizeAndLatencyCost: 13 } }, |
| 4306 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4307 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4308 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 4309 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 4310 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4311 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4312 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
| 4313 | { .ISD: ISD::UMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 4314 | { .ISD: ISD::UMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 4315 | { .ISD: ISD::UMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4316 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
| 4317 | { .ISD: ISD::UMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
| 4318 | { .ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
| 4319 | { .ISD: ISD::UMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4320 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 30, .LatencyCost: 33, .CodeSizeCost: 15, .SizeAndLatencyCost: 29 } }, |
| 4321 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
| 4322 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 4323 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
| 4324 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
| 4325 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 4326 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4327 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4328 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 4329 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4330 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
| 4331 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
| 4332 | }; |
| 4333 | static const CostKindTblEntry SSE1CostTbl[] = { |
| 4334 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
| 4335 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4336 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
| 4337 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 56, .LatencyCost: 56, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
| 4338 | }; |
| 4339 | static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets |
| 4340 | { .ISD: ISD::CTTZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4341 | }; |
| 4342 | static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets |
| 4343 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4344 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4345 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4346 | }; |
| 4347 | static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets |
| 4348 | { .ISD: ISD::CTLZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4349 | }; |
| 4350 | static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets |
| 4351 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4352 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4353 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4354 | }; |
| 4355 | static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets |
| 4356 | { .ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // popcnt |
| 4357 | }; |
| 4358 | static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets |
| 4359 | { .ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // popcnt |
| 4360 | { .ISD: ISD::CTPOP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // popcnt(zext()) |
| 4361 | { .ISD: ISD::CTPOP, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // popcnt(zext()) |
| 4362 | }; |
| 4363 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
| 4364 | { .ISD: ISD::ABS, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+CMOV |
| 4365 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 12, .CodeSizeCost: 20, .SizeAndLatencyCost: 22 } }, |
| 4366 | { .ISD: ISD::BSWAP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
| 4367 | { .ISD: ISD::CTLZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
| 4368 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
| 4369 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
| 4370 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
| 4371 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i64,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // BSR+XOR |
| 4372 | { .ISD: ISD::CTTZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
| 4373 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
| 4374 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
| 4375 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
| 4376 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i64,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
| 4377 | { .ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 6, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
| 4378 | { .ISD: ISD::ROTL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4379 | { .ISD: ISD::ROTR, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4380 | { .ISD: X86ISD::VROTLI, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4381 | { .ISD: ISD::FSHL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 4 } }, |
| 4382 | { .ISD: ISD::SADDSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
| 4383 | { .ISD: ISD::SSUBSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
| 4384 | { .ISD: ISD::UADDSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4385 | { .ISD: ISD::USUBSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4386 | { .ISD: ISD::SMAX, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4387 | { .ISD: ISD::SMIN, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4388 | { .ISD: ISD::UMAX, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4389 | { .ISD: ISD::UMIN, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4390 | { .ISD: ISD::SADDO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4391 | { .ISD: ISD::UADDO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4392 | { .ISD: ISD::SMULO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4393 | { .ISD: ISD::UMULO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4394 | }; |
| 4395 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
| 4396 | { .ISD: ISD::ABS, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA or SUB+CMOV |
| 4397 | { .ISD: ISD::ABS, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA or SUB+CMOV |
| 4398 | { .ISD: ISD::ABS, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA |
| 4399 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
| 4400 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
| 4401 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 9, .CodeSizeCost: 13, .SizeAndLatencyCost: 14 } }, |
| 4402 | { .ISD: ISD::BSWAP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4403 | { .ISD: ISD::BSWAP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // ROL |
| 4404 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // BSR+XOR or BSR+XOR+CMOV |
| 4405 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // BSR+XOR or BSR+XOR+CMOV |
| 4406 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // BSR+XOR or BSR+XOR+CMOV |
| 4407 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i32,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // BSR+XOR |
| 4408 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // BSR+XOR |
| 4409 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // BSR+XOR |
| 4410 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
| 4411 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
| 4412 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
| 4413 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i32,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
| 4414 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
| 4415 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
| 4416 | { .ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 7, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
| 4417 | { .ISD: ISD::CTPOP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 8, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
| 4418 | { .ISD: ISD::CTPOP, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
| 4419 | { .ISD: ISD::ROTL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4420 | { .ISD: ISD::ROTL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4421 | { .ISD: ISD::ROTL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4422 | { .ISD: ISD::ROTR, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4423 | { .ISD: ISD::ROTR, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4424 | { .ISD: ISD::ROTR, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
| 4425 | { .ISD: X86ISD::VROTLI, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4426 | { .ISD: X86ISD::VROTLI, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4427 | { .ISD: X86ISD::VROTLI, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
| 4428 | { .ISD: ISD::FSHL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 4 } }, |
| 4429 | { .ISD: ISD::FSHL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 5 } }, |
| 4430 | { .ISD: ISD::FSHL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 5 } }, |
| 4431 | { .ISD: ISD::SADDSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, |
| 4432 | { .ISD: ISD::SADDSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
| 4433 | { .ISD: ISD::SADDSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
| 4434 | { .ISD: ISD::SSUBSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
| 4435 | { .ISD: ISD::SSUBSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
| 4436 | { .ISD: ISD::SSUBSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
| 4437 | { .ISD: ISD::UADDSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4438 | { .ISD: ISD::UADDSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4439 | { .ISD: ISD::UADDSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
| 4440 | { .ISD: ISD::USUBSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4441 | { .ISD: ISD::USUBSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
| 4442 | { .ISD: ISD::USUBSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
| 4443 | { .ISD: ISD::SMAX, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4444 | { .ISD: ISD::SMAX, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4445 | { .ISD: ISD::SMAX, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4446 | { .ISD: ISD::SMIN, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4447 | { .ISD: ISD::SMIN, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4448 | { .ISD: ISD::SMIN, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4449 | { .ISD: ISD::UMAX, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4450 | { .ISD: ISD::UMAX, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4451 | { .ISD: ISD::UMAX, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4452 | { .ISD: ISD::UMIN, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
| 4453 | { .ISD: ISD::UMIN, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4454 | { .ISD: ISD::UMIN, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
| 4455 | { .ISD: ISD::SADDO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4456 | { .ISD: ISD::SADDO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4457 | { .ISD: ISD::SADDO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4458 | { .ISD: ISD::UADDO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4459 | { .ISD: ISD::UADDO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4460 | { .ISD: ISD::UADDO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4461 | { .ISD: ISD::SMULO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4462 | { .ISD: ISD::SMULO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4463 | { .ISD: ISD::SMULO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4464 | { .ISD: ISD::UMULO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 8 } }, |
| 4465 | { .ISD: ISD::UMULO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 9 } }, |
| 4466 | { .ISD: ISD::UMULO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
| 4467 | }; |
| 4468 | |
| 4469 | Type *RetTy = ICA.getReturnType(); |
| 4470 | Type *OpTy = RetTy; |
| 4471 | Intrinsic::ID IID = ICA.getID(); |
| 4472 | unsigned ISD = ISD::DELETED_NODE; |
| 4473 | switch (IID) { |
| 4474 | default: |
| 4475 | break; |
| 4476 | case Intrinsic::abs: |
| 4477 | ISD = ISD::ABS; |
| 4478 | break; |
| 4479 | case Intrinsic::bitreverse: |
| 4480 | ISD = ISD::BITREVERSE; |
| 4481 | break; |
| 4482 | case Intrinsic::bswap: |
| 4483 | ISD = ISD::BSWAP; |
| 4484 | break; |
| 4485 | case Intrinsic::ctlz: |
| 4486 | ISD = ISD::CTLZ; |
| 4487 | break; |
| 4488 | case Intrinsic::ctpop: |
| 4489 | ISD = ISD::CTPOP; |
| 4490 | break; |
| 4491 | case Intrinsic::cttz: |
| 4492 | ISD = ISD::CTTZ; |
| 4493 | break; |
| 4494 | case Intrinsic::fshl: |
| 4495 | ISD = ISD::FSHL; |
| 4496 | if (!ICA.isTypeBasedOnly()) { |
| 4497 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
| 4498 | if (Args[0] == Args[1]) { |
| 4499 | ISD = ISD::ROTL; |
| 4500 | // Handle uniform constant rotation amounts. |
| 4501 | // TODO: Handle funnel-shift cases. |
| 4502 | const APInt *Amt; |
| 4503 | if (Args[2] && |
| 4504 | PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))) |
| 4505 | ISD = X86ISD::VROTLI; |
| 4506 | } |
| 4507 | } |
| 4508 | break; |
| 4509 | case Intrinsic::fshr: |
| 4510 | // FSHR has same costs so don't duplicate. |
| 4511 | ISD = ISD::FSHL; |
| 4512 | if (!ICA.isTypeBasedOnly()) { |
| 4513 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
| 4514 | if (Args[0] == Args[1]) { |
| 4515 | ISD = ISD::ROTR; |
| 4516 | // Handle uniform constant rotation amount. |
| 4517 | // TODO: Handle funnel-shift cases. |
| 4518 | const APInt *Amt; |
| 4519 | if (Args[2] && |
| 4520 | PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))) |
| 4521 | ISD = X86ISD::VROTLI; |
| 4522 | } |
| 4523 | } |
| 4524 | break; |
| 4525 | case Intrinsic::lrint: |
| 4526 | case Intrinsic::llrint: { |
| 4527 | // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which |
| 4528 | // have the same costs as the CVTTP2SI (fptosi) instructions |
| 4529 | const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes(); |
| 4530 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: RetTy, Src: ArgTys[0], |
| 4531 | CCH: TTI::CastContextHint::None, CostKind); |
| 4532 | } |
| 4533 | case Intrinsic::maxnum: |
| 4534 | case Intrinsic::minnum: |
| 4535 | // FMINNUM has same costs so don't duplicate. |
| 4536 | ISD = ISD::FMAXNUM; |
| 4537 | break; |
| 4538 | case Intrinsic::sadd_sat: |
| 4539 | ISD = ISD::SADDSAT; |
| 4540 | break; |
| 4541 | case Intrinsic::smax: |
| 4542 | ISD = ISD::SMAX; |
| 4543 | break; |
| 4544 | case Intrinsic::smin: |
| 4545 | ISD = ISD::SMIN; |
| 4546 | break; |
| 4547 | case Intrinsic::ssub_sat: |
| 4548 | ISD = ISD::SSUBSAT; |
| 4549 | break; |
| 4550 | case Intrinsic::uadd_sat: |
| 4551 | ISD = ISD::UADDSAT; |
| 4552 | break; |
| 4553 | case Intrinsic::umax: |
| 4554 | ISD = ISD::UMAX; |
| 4555 | break; |
| 4556 | case Intrinsic::umin: |
| 4557 | ISD = ISD::UMIN; |
| 4558 | break; |
| 4559 | case Intrinsic::usub_sat: |
| 4560 | ISD = ISD::USUBSAT; |
| 4561 | break; |
| 4562 | case Intrinsic::sqrt: |
| 4563 | ISD = ISD::FSQRT; |
| 4564 | break; |
| 4565 | case Intrinsic::sadd_with_overflow: |
| 4566 | case Intrinsic::ssub_with_overflow: |
| 4567 | // SSUBO has same costs so don't duplicate. |
| 4568 | ISD = ISD::SADDO; |
| 4569 | OpTy = RetTy->getContainedType(i: 0); |
| 4570 | break; |
| 4571 | case Intrinsic::uadd_with_overflow: |
| 4572 | case Intrinsic::usub_with_overflow: |
| 4573 | // USUBO has same costs so don't duplicate. |
| 4574 | ISD = ISD::UADDO; |
| 4575 | OpTy = RetTy->getContainedType(i: 0); |
| 4576 | break; |
| 4577 | case Intrinsic::smul_with_overflow: |
| 4578 | ISD = ISD::SMULO; |
| 4579 | OpTy = RetTy->getContainedType(i: 0); |
| 4580 | break; |
| 4581 | case Intrinsic::umul_with_overflow: |
| 4582 | ISD = ISD::UMULO; |
| 4583 | OpTy = RetTy->getContainedType(i: 0); |
| 4584 | break; |
| 4585 | } |
| 4586 | |
| 4587 | if (ISD != ISD::DELETED_NODE) { |
| 4588 | auto adjustTableCost = [&](int ISD, unsigned Cost, |
| 4589 | std::pair<InstructionCost, MVT> LT, |
| 4590 | FastMathFlags FMF) -> InstructionCost { |
| 4591 | InstructionCost LegalizationCost = LT.first; |
| 4592 | MVT MTy = LT.second; |
| 4593 | |
| 4594 | // If there are no NANs to deal with, then these are reduced to a |
| 4595 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we |
| 4596 | // assume is used in the non-fast case. |
| 4597 | if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { |
| 4598 | if (FMF.noNaNs()) |
| 4599 | return LegalizationCost * 1; |
| 4600 | } |
| 4601 | |
| 4602 | // For cases where some ops can be folded into a load/store, assume free. |
| 4603 | if (MTy.isScalarInteger()) { |
| 4604 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { |
| 4605 | if (const Instruction *II = ICA.getInst()) { |
| 4606 | if (II->hasOneUse() && isa<StoreInst>(Val: II->user_back())) |
| 4607 | return TTI::TCC_Free; |
| 4608 | if (auto *LI = dyn_cast<LoadInst>(Val: II->getOperand(i: 0))) { |
| 4609 | if (LI->hasOneUse()) |
| 4610 | return TTI::TCC_Free; |
| 4611 | } |
| 4612 | } |
| 4613 | } |
| 4614 | } |
| 4615 | |
| 4616 | return LegalizationCost * (int)Cost; |
| 4617 | }; |
| 4618 | |
| 4619 | // Legalize the type. |
| 4620 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: OpTy); |
| 4621 | MVT MTy = LT.second; |
| 4622 | |
| 4623 | // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. |
| 4624 | if (((ISD == ISD::CTTZ && !ST->hasBMI()) || |
| 4625 | (ISD == ISD::CTLZ && !ST->hasLZCNT())) && |
| 4626 | !MTy.isVector() && !ICA.isTypeBasedOnly()) { |
| 4627 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
| 4628 | if (auto *Cst = dyn_cast<ConstantInt>(Val: Args[1])) |
| 4629 | if (Cst->isAllOnesValue()) |
| 4630 | ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; |
| 4631 | } |
| 4632 | |
| 4633 | // FSQRT is a single instruction. |
| 4634 | if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) |
| 4635 | return LT.first; |
| 4636 | |
| 4637 | if (ST->useGLMDivSqrtCosts()) |
| 4638 | if (const auto *Entry = CostTableLookup(Table: GLMCostTbl, ISD, Ty: MTy)) |
| 4639 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4640 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4641 | |
| 4642 | if (ST->useSLMArithCosts()) |
| 4643 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
| 4644 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4645 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4646 | |
| 4647 | if (ST->hasVBMI2()) |
| 4648 | if (const auto *Entry = CostTableLookup(Table: AVX512VBMI2CostTbl, ISD, Ty: MTy)) |
| 4649 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4650 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4651 | |
| 4652 | if (ST->hasBITALG()) |
| 4653 | if (const auto *Entry = CostTableLookup(Table: AVX512BITALGCostTbl, ISD, Ty: MTy)) |
| 4654 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4655 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4656 | |
| 4657 | if (ST->hasVPOPCNTDQ()) |
| 4658 | if (const auto *Entry = CostTableLookup(Table: AVX512VPOPCNTDQCostTbl, ISD, Ty: MTy)) |
| 4659 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4660 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4661 | |
| 4662 | if (ST->hasGFNI()) |
| 4663 | if (const auto *Entry = CostTableLookup(Table: GFNICostTbl, ISD, Ty: MTy)) |
| 4664 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4665 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4666 | |
| 4667 | if (ST->hasCDI()) |
| 4668 | if (const auto *Entry = CostTableLookup(Table: AVX512CDCostTbl, ISD, Ty: MTy)) |
| 4669 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4670 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4671 | |
| 4672 | if (ST->hasBWI()) |
| 4673 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
| 4674 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4675 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4676 | |
| 4677 | if (ST->hasAVX512()) |
| 4678 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTbl, ISD, Ty: MTy)) |
| 4679 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4680 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4681 | |
| 4682 | if (ST->hasXOP()) |
| 4683 | if (const auto *Entry = CostTableLookup(Table: XOPCostTbl, ISD, Ty: MTy)) |
| 4684 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4685 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4686 | |
| 4687 | if (ST->hasAVX2()) |
| 4688 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTbl, ISD, Ty: MTy)) |
| 4689 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4690 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4691 | |
| 4692 | if (ST->hasAVX()) |
| 4693 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
| 4694 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4695 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4696 | |
| 4697 | if (ST->hasSSE42()) |
| 4698 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTbl, ISD, Ty: MTy)) |
| 4699 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4700 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4701 | |
| 4702 | if (ST->hasSSE41()) |
| 4703 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
| 4704 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4705 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4706 | |
| 4707 | if (ST->hasSSSE3()) |
| 4708 | if (const auto *Entry = CostTableLookup(Table: SSSE3CostTbl, ISD, Ty: MTy)) |
| 4709 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4710 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4711 | |
| 4712 | if (ST->hasSSE2()) |
| 4713 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
| 4714 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4715 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4716 | |
| 4717 | if (ST->hasSSE1()) |
| 4718 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTbl, ISD, Ty: MTy)) |
| 4719 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4720 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4721 | |
| 4722 | if (ST->hasBMI()) { |
| 4723 | if (ST->is64Bit()) |
| 4724 | if (const auto *Entry = CostTableLookup(Table: BMI64CostTbl, ISD, Ty: MTy)) |
| 4725 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4726 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4727 | |
| 4728 | if (const auto *Entry = CostTableLookup(Table: BMI32CostTbl, ISD, Ty: MTy)) |
| 4729 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4730 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4731 | } |
| 4732 | |
| 4733 | if (ST->hasLZCNT()) { |
| 4734 | if (ST->is64Bit()) |
| 4735 | if (const auto *Entry = CostTableLookup(Table: LZCNT64CostTbl, ISD, Ty: MTy)) |
| 4736 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4737 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4738 | |
| 4739 | if (const auto *Entry = CostTableLookup(Table: LZCNT32CostTbl, ISD, Ty: MTy)) |
| 4740 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4741 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4742 | } |
| 4743 | |
| 4744 | if (ST->hasPOPCNT()) { |
| 4745 | if (ST->is64Bit()) |
| 4746 | if (const auto *Entry = CostTableLookup(Table: POPCNT64CostTbl, ISD, Ty: MTy)) |
| 4747 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4748 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4749 | |
| 4750 | if (const auto *Entry = CostTableLookup(Table: POPCNT32CostTbl, ISD, Ty: MTy)) |
| 4751 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4752 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4753 | } |
| 4754 | |
| 4755 | if (ST->is64Bit()) |
| 4756 | if (const auto *Entry = CostTableLookup(Table: X64CostTbl, ISD, Ty: MTy)) |
| 4757 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4758 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4759 | |
| 4760 | if (const auto *Entry = CostTableLookup(Table: X86CostTbl, ISD, Ty: MTy)) |
| 4761 | if (auto KindCost = Entry->Cost[CostKind]) |
| 4762 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
| 4763 | |
| 4764 | // Without arg data, we need to compute the expanded costs of custom lowered |
| 4765 | // intrinsics to prevent use of the (very low) default costs. |
| 4766 | if (ICA.isTypeBasedOnly() && |
| 4767 | (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) { |
| 4768 | Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1); |
| 4769 | InstructionCost Cost = 0; |
| 4770 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::Or, Ty: RetTy, CostKind); |
| 4771 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::Sub, Ty: RetTy, CostKind); |
| 4772 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::Shl, Ty: RetTy, CostKind); |
| 4773 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::LShr, Ty: RetTy, CostKind); |
| 4774 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::And, Ty: RetTy, CostKind); |
| 4775 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::ICmp, ValTy: RetTy, CondTy, |
| 4776 | VecPred: CmpInst::ICMP_EQ, CostKind); |
| 4777 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, |
| 4778 | VecPred: CmpInst::ICMP_EQ, CostKind); |
| 4779 | return Cost; |
| 4780 | } |
| 4781 | } |
| 4782 | |
| 4783 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
| 4784 | } |
| 4785 | |
// Return the cost of a single extractelement/insertelement for the X86 target.
// Three regimes are handled:
//  1) Unknown (variable) index: lowered via aliased stack loads/stores.
//  2) Constant index: target-specific fast paths (MOVMSK for vXi1, SLM table,
//     cheap pinsr/pextr/insertps cases), with RegisterFileMoveCost tracking
//     the extra 128-bit subvector extract/insert needed for wide vectors.
//  3) Anything else: defer to the base implementation.
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, const Value *Op0,
                                               const Value *Op1) const {
  // Silvermont-specific pextr/pinsr costs — element moves between XMM and GPR
  // are notably more expensive there than the generic model below assumes
  // (i64 worst of all).
  static const CostTblEntry SLMCostTbl[] = {
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i8, .Cost: 4 },
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i16, .Cost: 4 },
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i32, .Cost: 4 },
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i64, .Cost: 7 }
  };

  assert(Val->isVectorTy() && "This must be a vector type" );
  Type *ScalarType = Val->getScalarType();
  // Accumulates extra cross-lane subvector moves for types that legalize to
  // >128-bit vectors; added to every return path in the constant-index case.
  InstructionCost RegisterFileMoveCost = 0;

  // Non-immediate extraction/insertion can be handled as a sequence of
  // aliased loads+stores via the stack.
  if (Index == -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.

    // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
    assert(isa<FixedVectorType>(Val) && "Fixed vector type expected" );
    Align VecAlign = DL.getPrefTypeAlign(Ty: Val);
    Align SclAlign = DL.getPrefTypeAlign(Ty: ScalarType);

    // Extract - store vector to stack, load scalar.
    if (Opcode == Instruction::ExtractElement) {
      return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
             getMemoryOpCost(Opcode: Instruction::Load, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
                             CostKind);
    }
    // Insert - store vector to stack, store scalar, load vector.
    if (Opcode == Instruction::InsertElement) {
      return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
             getMemoryOpCost(Opcode: Instruction::Store, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
                             CostKind) +
             getMemoryOpCost(Opcode: Instruction::Load, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind);
    }
  }

  // Constant-index extraction/insertion: model the actual instruction mix.
  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
    if (Opcode == Instruction::ExtractElement &&
        ScalarType->getScalarSizeInBits() == 1 &&
        cast<FixedVectorType>(Val)->getNumElements() > 1)
      return 1;

    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return TTI::TCC_Free;

    // The type may be split. Normalize the index to the new type.
    unsigned SizeInBits = LT.second.getSizeInBits();
    unsigned NumElts = LT.second.getVectorNumElements();
    unsigned SubNumElts = NumElts;
    Index = Index % NumElts;

    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
    // For inserts, we also need to insert the subvector back.
    if (SizeInBits > 128) {
      assert((SizeInBits % 128) == 0 && "Illegal vector" );
      unsigned NumSubVecs = SizeInBits / 128;
      SubNumElts = NumElts / NumSubVecs;
      if (SubNumElts <= Index) {
        // Element lives in an upper 128-bit lane: pay for the subvector
        // extract (plus re-insert when inserting), then index within the lane.
        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
        Index %= SubNumElts;
      }
    }

    MVT MScalarTy = LT.second.getScalarType();
    auto IsCheapPInsrPExtrInsertPS = [&]() {
      // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
      // Inserting f32 into index0 is just movss.
      // Also, assume insertps is relatively cheap on all >= SSE41 targets.
      return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
             (MScalarTy.isInteger() && ST->hasSSE41()) ||
             (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 &&
              Opcode == Instruction::InsertElement) ||
             (MScalarTy == MVT::f32 && ST->hasSSE41() &&
              Opcode == Instruction::InsertElement);
    };

    if (Index == 0) {
      // Floating point scalars are already located in index #0.
      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
      // true for all.
      if (ScalarType->isFloatingPointTy() &&
          (Opcode != Instruction::InsertElement || !Op0 ||
           isa<UndefValue>(Val: Op0)))
        return RegisterFileMoveCost;

      // Insertion into an undef base at lane 0 is just a scalar-to-vector
      // materialization (movd/movq), possibly preceded by a constant load.
      if (Opcode == Instruction::InsertElement &&
          isa_and_nonnull<UndefValue>(Val: Op0)) {
        // Consider the gather cost to be cheap.
        if (isa_and_nonnull<LoadInst>(Val: Op1))
          return RegisterFileMoveCost;
        if (!IsCheapPInsrPExtrInsertPS()) {
          // mov constant-to-GPR + movd/movq GPR -> XMM.
          if (isa_and_nonnull<Constant>(Val: Op1) && Op1->getType()->isIntegerTy())
            return 2 + RegisterFileMoveCost;
          // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
          return 1 + RegisterFileMoveCost;
        }
      }

      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
        return 1 + RegisterFileMoveCost;
    }

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Unexpected vector opcode" );
    // Silvermont overrides the generic per-element cost (see SLMCostTbl above).
    if (ST->useSLMArithCosts())
      if (auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MScalarTy))
        return Entry->Cost + RegisterFileMoveCost;

    // Consider cheap cases.
    if (IsCheapPInsrPExtrInsertPS())
      return 1 + RegisterFileMoveCost;

    // For extractions we just need to shuffle the element to index 0, which
    // should be very cheap (assume cost = 1). For insertions we need to shuffle
    // the elements to its destination. In both cases we must handle the
    // subvector move(s).
    // If the vector type is already less than 128-bits then don't reduce it.
    // TODO: Under what circumstances should we shuffle using the full width?
    InstructionCost ShuffleCost = 1;
    if (Opcode == Instruction::InsertElement) {
      auto *SubTy = cast<VectorType>(Val);
      EVT VT = TLI->getValueType(DL, Ty: Val);
      if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
        SubTy = FixedVectorType::get(ElementType: ScalarType, NumElts: SubNumElts);
      ShuffleCost = getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SubTy, SrcTy: SubTy, Mask: {},
                                   CostKind, Index: 0, SubTp: SubTy);
    }
    // Integer elements pay one extra GPR<->XMM transfer; fp stays in XMM.
    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
  }

  // Anything else (e.g. other opcodes): use the generic cost model.
  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
         RegisterFileMoveCost;
}
| 4934 | |
| 4935 | InstructionCost X86TTIImpl::getScalarizationOverhead( |
| 4936 | VectorType *Ty, const APInt &DemandedElts, bool Insert, bool , |
| 4937 | TTI::TargetCostKind CostKind, bool ForPoisonSrc, |
| 4938 | ArrayRef<Value *> VL) const { |
| 4939 | assert(DemandedElts.getBitWidth() == |
| 4940 | cast<FixedVectorType>(Ty)->getNumElements() && |
| 4941 | "Vector size mismatch" ); |
| 4942 | |
| 4943 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
| 4944 | MVT MScalarTy = LT.second.getScalarType(); |
| 4945 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); |
| 4946 | InstructionCost Cost = 0; |
| 4947 | |
| 4948 | constexpr unsigned LaneBitWidth = 128; |
| 4949 | assert((LegalVectorBitWidth < LaneBitWidth || |
| 4950 | (LegalVectorBitWidth % LaneBitWidth) == 0) && |
| 4951 | "Illegal vector" ); |
| 4952 | |
| 4953 | const int NumLegalVectors = LT.first.getValue(); |
| 4954 | assert(NumLegalVectors >= 0 && "Negative cost!" ); |
| 4955 | |
| 4956 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much |
| 4957 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has |
| 4958 | // a special heuristic regarding poison input which is passed here in |
| 4959 | // ForPoisonSrc. |
| 4960 | if (Insert && !ForPoisonSrc) { |
| 4961 | // This is nearly identical to BaseT::getScalarizationOverhead(), except |
| 4962 | // it is passing nullptr to getVectorInstrCost() for Op0 (instead of |
| 4963 | // Constant::getNullValue()), which makes the X86TTIImpl |
| 4964 | // getVectorInstrCost() return 0 instead of 1. |
| 4965 | for (unsigned I : seq(Size: DemandedElts.getBitWidth())) { |
| 4966 | if (!DemandedElts[I]) |
| 4967 | continue; |
| 4968 | Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: I, |
| 4969 | Op0: Constant::getNullValue(Ty), |
| 4970 | Op1: VL.empty() ? nullptr : VL[I]); |
| 4971 | } |
| 4972 | return Cost; |
| 4973 | } |
| 4974 | |
| 4975 | if (Insert) { |
| 4976 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || |
| 4977 | (MScalarTy.isInteger() && ST->hasSSE41()) || |
| 4978 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { |
| 4979 | // For types we can insert directly, insertion into 128-bit sub vectors is |
| 4980 | // cheap, followed by a cheap chain of concatenations. |
| 4981 | if (LegalVectorBitWidth <= LaneBitWidth) { |
| 4982 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, |
| 4983 | /*Extract*/ false, CostKind); |
| 4984 | } else { |
| 4985 | // In each 128-lane, if at least one index is demanded but not all |
| 4986 | // indices are demanded and this 128-lane is not the first 128-lane of |
| 4987 | // the legalized-vector, then this 128-lane needs a extracti128; If in |
| 4988 | // each 128-lane, there is at least one demanded index, this 128-lane |
| 4989 | // needs a inserti128. |
| 4990 | |
| 4991 | // The following cases will help you build a better understanding: |
| 4992 | // Assume we insert several elements into a v8i32 vector in avx2, |
| 4993 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. |
| 4994 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + |
| 4995 | // inserti128. |
| 4996 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. |
| 4997 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector" ); |
| 4998 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
| 4999 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
| 5000 | unsigned NumLegalElts = |
| 5001 | LT.second.getVectorNumElements() * NumLegalVectors; |
| 5002 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
| 5003 | "Vector has been legalized to smaller element count" ); |
| 5004 | assert((NumLegalElts % NumLanesTotal) == 0 && |
| 5005 | "Unexpected elts per lane" ); |
| 5006 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
| 5007 | |
| 5008 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
| 5009 | auto *LaneTy = |
| 5010 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
| 5011 | |
| 5012 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
| 5013 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
| 5014 | numBits: NumEltsPerLane, bitPosition: NumEltsPerLane * I); |
| 5015 | if (LaneEltMask.isZero()) |
| 5016 | continue; |
| 5017 | // FIXME: we don't need to extract if all non-demanded elements |
| 5018 | // are legalization-inserted padding. |
| 5019 | if (!LaneEltMask.isAllOnes()) |
| 5020 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
| 5021 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
| 5022 | Cost += BaseT::getScalarizationOverhead(InTy: LaneTy, DemandedElts: LaneEltMask, Insert, |
| 5023 | /*Extract*/ false, CostKind); |
| 5024 | } |
| 5025 | |
| 5026 | APInt AffectedLanes = |
| 5027 | APIntOps::ScaleBitMask(A: WidenedDemandedElts, NewBitWidth: NumLanesTotal); |
| 5028 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( |
| 5029 | A: AffectedLanes, NewBitWidth: NumLegalVectors, /*MatchAllBits=*/true); |
| 5030 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { |
| 5031 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { |
| 5032 | unsigned I = NumLegalLanes * LegalVec + Lane; |
| 5033 | // No need to insert unaffected lane; or lane 0 of each legal vector |
| 5034 | // iff ALL lanes of that vector were affected and will be inserted. |
| 5035 | if (!AffectedLanes[I] || |
| 5036 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) |
| 5037 | continue; |
| 5038 | Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
| 5039 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
| 5040 | } |
| 5041 | } |
| 5042 | } |
| 5043 | } else if (LT.second.isVector()) { |
| 5044 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded |
| 5045 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a |
| 5046 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be |
| 5047 | // considered cheap. |
| 5048 | if (Ty->isIntOrIntVectorTy()) |
| 5049 | Cost += DemandedElts.popcount(); |
| 5050 | |
| 5051 | // Get the smaller of the legalized or original pow2-extended number of |
| 5052 | // vector elements, which represents the number of unpacks we'll end up |
| 5053 | // performing. |
| 5054 | unsigned NumElts = LT.second.getVectorNumElements(); |
| 5055 | unsigned Pow2Elts = |
| 5056 | PowerOf2Ceil(A: cast<FixedVectorType>(Val: Ty)->getNumElements()); |
| 5057 | Cost += (std::min<unsigned>(a: NumElts, b: Pow2Elts) - 1) * LT.first; |
| 5058 | } |
| 5059 | } |
| 5060 | |
| 5061 | if (Extract) { |
| 5062 | // vXi1 can be efficiently extracted with MOVMSK. |
| 5063 | // TODO: AVX512 predicate mask handling. |
| 5064 | // NOTE: This doesn't work well for roundtrip scalarization. |
| 5065 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { |
| 5066 | unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements(); |
| 5067 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; |
| 5068 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; |
| 5069 | return MOVMSKCost; |
| 5070 | } |
| 5071 | |
| 5072 | if (LT.second.isVector()) { |
| 5073 | unsigned NumLegalElts = |
| 5074 | LT.second.getVectorNumElements() * NumLegalVectors; |
| 5075 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
| 5076 | "Vector has been legalized to smaller element count" ); |
| 5077 | |
| 5078 | // If we're extracting elements from a 128-bit subvector lane, |
| 5079 | // we only need to extract each lane once, not for every element. |
| 5080 | if (LegalVectorBitWidth > LaneBitWidth) { |
| 5081 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
| 5082 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
| 5083 | assert((NumLegalElts % NumLanesTotal) == 0 && |
| 5084 | "Unexpected elts per lane" ); |
| 5085 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
| 5086 | |
| 5087 | // Add cost for each demanded 128-bit subvector extraction. |
| 5088 | // Luckily this is a lot easier than for insertion. |
| 5089 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
| 5090 | auto *LaneTy = |
| 5091 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
| 5092 | |
| 5093 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
| 5094 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
| 5095 | numBits: NumEltsPerLane, bitPosition: I * NumEltsPerLane); |
| 5096 | if (LaneEltMask.isZero()) |
| 5097 | continue; |
| 5098 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, CostKind, |
| 5099 | Index: I * NumEltsPerLane, SubTp: LaneTy); |
| 5100 | Cost += BaseT::getScalarizationOverhead( |
| 5101 | InTy: LaneTy, DemandedElts: LaneEltMask, /*Insert*/ false, Extract, CostKind); |
| 5102 | } |
| 5103 | |
| 5104 | return Cost; |
| 5105 | } |
| 5106 | } |
| 5107 | |
| 5108 | // Fallback to default extraction. |
| 5109 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, /*Insert*/ false, |
| 5110 | Extract, CostKind); |
| 5111 | } |
| 5112 | |
| 5113 | return Cost; |
| 5114 | } |
| 5115 | |
/// Cost of a replication shuffle: each of the VF source elements is repeated
/// ReplicationFactor times, e.g. <0,0,..,1,1,..>. Only AVX512 targets get a
/// cheap lowering here (one cross-lane permute per demanded legal Dst
/// vector, plus ext/trunc when the element type must be promoted to be
/// shuffled); everything else falls back to the base implementation.
InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) const {
  const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: EltTyBits);

  // Shared fallback: defer to the target-independent cost model.
  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    PromEltTyBits = 32; // promote to i32, AVX512F.
    break;
  default:
    // Unhandled element width (e.g. non-power-of-two) — no cheap lowering.
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: PromEltTyBits);

  auto *SrcVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: VF);
  auto *PromSrcVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: VF);

  int NumDstElements = VF * ReplicationFactor;
  auto *PromDstVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: NumDstElements);
  auto *DstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = getTypeLegalizationCost(Ty: SrcVecTy).second;
  MVT LegalPromSrcVecTy = getTypeLegalizationCost(Ty: PromSrcVecTy).second;
  MVT LegalPromDstVecTy = getTypeLegalizationCost(Ty: PromDstVecTy).second;
  MVT LegalDstVecTy = getTypeLegalizationCost(Ty: DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with wider elt type than our data type,
    // then we will first need to anyext (we don't care about the new bits)
    // the source elements, and then truncate Dst elements.
    // NOTE(review): the extension is costed as SExt below even though the
    // comment above only requires an anyext — presumably used as a
    // conservative stand-in; confirm against the lowering.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Opcode: Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        CCH: TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Opcode: Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         CCH: TargetTransformInfo::CastContextHint::None, CostKind);
    // Recurse with the promoted element type; only one level of recursion is
    // possible since PromEltTyBits == EltTyBits on the recursive call.
    return PromotionCost + getReplicationShuffleCost(EltTy: PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements.");

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      A: DemandedDstElts.zext(width: NumDstVectors * NumEltsPerDstVec), NewBitWidth: NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();

  // One arbitrary single-source permute per demanded legal Dst vector.
  InstructionCost SingleShuffleCost =
      getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: SingleDstVecTy, SrcTy: SingleDstVecTy,
                     /*Mask=*/{}, CostKind,
                     /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}
| 5222 | |
/// Cost of a (possibly vector) load or store.
/// Scalars and directly-legal vectors cost LT.first; other vectors are costed
/// by simulating the legalization split into progressively smaller ops (down
/// to sub-XMM sizes), charging for any subvector insert/extract glue needed.
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) const {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(Val: I)) {
      // Store instruction with index and scale costs 2 Uops.
      // Check the preceding GEP to identify non-const indices.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: SI->getPointerOperand())) {
        if (!all_of(Range: GEP->indices(), P: [](Value *V) { return isa<Constant>(Val: V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);

  auto *VTy = dyn_cast<FixedVectorType>(Val: Src);

  InstructionCost Cost = 0;

  // Add a cost for constant load to vector.
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getMemoryOpCost(Opcode: Instruction::Load, Src, Alignment: DL.getABITypeAlign(Ty: Src),
                            /*AddressSpace=*/0, CostKind, OpInfo);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vector from scalars!
  if (!VTy || !LT.second.isVector()) {
    // Each load/store unit costs 1.
    // Note the constant-load cost above is only kept for floating-point.
    return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
  }

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(Ty: EltTy);

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by-reference, NumEltRemaining changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(Numerator: LT.second.getSizeInBits(), Denominator: 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltPerXMM);

  // Outer loop: try the widest legal op size first, halving it each time the
  // remaining elements no longer justify (or allow) the current width.
  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind, OpInfo, I);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left.");

    // Sub-XMM ops still operate on (parts of) a full XMM register.
    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(ElementType: EltTy, NumElts: CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of number of elements per operation?");
    // For sub-element-width ops, view the XMM register as a vector of wider
    // integers so a single insert/extract moves CurrNumEltPerOp elements.
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  ElementType: IntegerType::get(C&: Src->getContext(),
                                   NumBits: EltTyBits * CurrNumEltPerOp),
                  NumElts: CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "coalesciing elements doesn't change vector width.");

    // Inner loop: emit as many ops of the current width as possible.
    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1)
        break; // Try smaller vector size.

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
      // will be scalarized.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else if (CurrOpSizeBytes < 4)
        Cost += 2;
      else
        Cost += 1;

      // If we're loading a uniform value, then we don't need to split the load,
      // loading just a single (widest) vector can be reused by all splits.
      if (IsLoad && OpInfo.isUniform())
        return Cost;

      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost +=
              getShuffleCost(Kind: IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                  : TTI::ShuffleKind::SK_ExtractSubvector,
                             DstTy: VTy, SrcTy: VTy, Mask: {}, CostKind, Index: NumEltDone(), SubTp: CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        APInt DemandedElts =
            APInt::getBitsSet(numBits: CoalescedVecTy->getNumElements(),
                              loBit: CoalescedVecEltIdx, hiBit: CoalescedVecEltIdx + 1);
        assert(DemandedElts.popcount() == 1 && "Inserting single value");
        Cost += getScalarizationOverhead(Ty: CoalescedVecTy, DemandedElts, Insert: IsLoad,
                                         Extract: !IsLoad, CostKind);
      }

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      // After the first op, the access offset is at least CurrOpSizeBytes
      // aligned relative to the original pointer.
      Alignment = commonAlignment(A: Alignment, Offset: CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");

  return Cost;
}
| 5392 | |
/// Cost of a masked load/store.
/// Illegal masked ops are costed as full scalarization (per-element mask
/// extract + compare + branch + scalar memop); legal ones cost the legalized
/// op count, with AVX512 masked ops cheaper than pre-AVX512 maskmov.
InstructionCost
X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) const {
  bool IsLoad = (Instruction::Load == Opcode);
  bool IsStore = (Instruction::Store == Opcode);

  auto *SrcVTy = dyn_cast<FixedVectorType>(Val: SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, Src: SrcTy, Alignment, AddressSpace, CostKind);

  unsigned NumElem = SrcVTy->getNumElements();
  // Mask is modeled as one i8 lane per data element.
  auto *MaskTy =
      FixedVectorType::get(ElementType: Type::getInt8Ty(C&: SrcVTy->getContext()), NumElts: NumElem);
  if ((IsLoad && !isLegalMaskedLoad(DataType: SrcVTy, Alignment, AddressSpace)) ||
      (IsStore && !isLegalMaskedStore(DataType: SrcVTy, Alignment, AddressSpace))) {
    // Scalarization: extract each mask lane, test it, branch, and do a
    // scalar load/store per demanded element.
    APInt DemandedElts = APInt::getAllOnes(numBits: NumElem);
    InstructionCost MaskSplitCost = getScalarizationOverhead(
        Ty: MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Opcode: Instruction::ICmp, ValTy: Type::getInt8Ty(C&: SrcVTy->getContext()), CondTy: nullptr,
        VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Opcode: Instruction::Br, CostKind);
    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
    // Loads also need inserts into the result vector; stores need extracts.
    InstructionCost ValueSplitCost = getScalarizationOverhead(
        Ty: SrcVTy, DemandedElts, Insert: IsLoad, Extract: IsStore, CostKind);
    InstructionCost MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(),
                                         Alignment, AddressSpace, CostKind);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcVTy);
  auto VT = TLI->getValueType(DL, Ty: SrcVTy);
  InstructionCost Cost = 0;
  MVT Ty = LT.second;
  if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
    // APX masked load/store for scalar is cheap.
    return Cost + LT.first;

  if (VT.isSimple() && Ty != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires extend/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SrcVTy, SrcTy: SrcVTy, Mask: {}, CostKind,
                           Index: 0, SubTp: nullptr) +
            getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: MaskTy, SrcTy: MaskTy, Mask: {}, CostKind,
                           Index: 0, SubTp: nullptr);

  else if (LT.first * Ty.getVectorNumElements() > NumElem) {
    // Type was widened by legalization; the mask must be widened to match.
    auto *NewMaskTy = FixedVectorType::get(ElementType: MaskTy->getElementType(),
                                           NumElts: (unsigned)LT.first.getValue() *
                                               Ty.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy: NewMaskTy, SrcTy: NewMaskTy, Mask: {},
                           CostKind, Index: 0, SubTp: MaskTy);
  }

  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
  if (!ST->hasAVX512())
    return Cost + LT.first * (IsLoad ? 2 : 8);

  // AVX-512 masked load/store is cheaper
  return Cost + LT.first;
}
| 5460 | |
| 5461 | InstructionCost X86TTIImpl::getPointersChainCost( |
| 5462 | ArrayRef<const Value *> Ptrs, const Value *Base, |
| 5463 | const TTI::PointersChainInfo &Info, Type *AccessTy, |
| 5464 | TTI::TargetCostKind CostKind) const { |
| 5465 | if (Info.isSameBase() && Info.isKnownStride()) { |
| 5466 | // If all the pointers have known stride all the differences are translated |
| 5467 | // into constants. X86 memory addressing allows encoding it into |
| 5468 | // displacement. So we just need to take the base GEP cost. |
| 5469 | if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Val: Base)) { |
| 5470 | SmallVector<const Value *> Indices(BaseGEP->indices()); |
| 5471 | return getGEPCost(PointeeType: BaseGEP->getSourceElementType(), |
| 5472 | Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: nullptr, |
| 5473 | CostKind); |
| 5474 | } |
| 5475 | return TTI::TCC_Free; |
| 5476 | } |
| 5477 | return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); |
| 5478 | } |
| 5479 | |
| 5480 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, |
| 5481 | ScalarEvolution *SE, |
| 5482 | const SCEV *Ptr) const { |
| 5483 | // Address computations in vectorized code with non-consecutive addresses will |
| 5484 | // likely result in more instructions compared to scalar code where the |
| 5485 | // computation can more often be merged into the index mode. The resulting |
| 5486 | // extra micro-ops can significantly decrease throughput. |
| 5487 | const unsigned NumVectorInstToHideOverhead = 10; |
| 5488 | |
| 5489 | // Cost modeling of Strided Access Computation is hidden by the indexing |
| 5490 | // modes of X86 regardless of the stride value. We dont believe that there |
| 5491 | // is a difference between constant strided access in gerenal and constant |
| 5492 | // strided value which is less than or equal to 64. |
| 5493 | // Even in the case of (loop invariant) stride whose value is not known at |
| 5494 | // compile time, the address computation will not incur more than one extra |
| 5495 | // ADD instruction. |
| 5496 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { |
| 5497 | // TODO: AVX2 is the current cut-off because we don't have correct |
| 5498 | // interleaving costs for prior ISA's. |
| 5499 | if (!BaseT::isStridedAccess(Ptr)) |
| 5500 | return NumVectorInstToHideOverhead; |
| 5501 | if (!BaseT::getConstantStrideStep(SE, Ptr)) |
| 5502 | return 1; |
| 5503 | } |
| 5504 | |
| 5505 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); |
| 5506 | } |
| 5507 | |
| 5508 | InstructionCost |
| 5509 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
| 5510 | std::optional<FastMathFlags> FMF, |
| 5511 | TTI::TargetCostKind CostKind) const { |
| 5512 | if (TTI::requiresOrderedReduction(FMF)) |
| 5513 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
| 5514 | |
| 5515 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
| 5516 | // and make it as the cost. |
| 5517 | |
| 5518 | static const CostTblEntry SLMCostTbl[] = { |
| 5519 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: 3 }, |
| 5520 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 5 }, |
| 5521 | }; |
| 5522 | |
| 5523 | static const CostTblEntry SSE2CostTbl[] = { |
| 5524 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: 2 }, |
| 5525 | { .ISD: ISD::FADD, .Type: MVT::v2f32, .Cost: 2 }, |
| 5526 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: 4 }, |
| 5527 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2 }, // The data reported by the IACA tool is "1.6". |
| 5528 | { .ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: 2 }, // FIXME: chosen to be less than v4i32 |
| 5529 | { .ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 3 }, // The data reported by the IACA tool is "3.3". |
| 5530 | { .ISD: ISD::ADD, .Type: MVT::v2i16, .Cost: 2 }, // The data reported by the IACA tool is "4.3". |
| 5531 | { .ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 3 }, // The data reported by the IACA tool is "4.3". |
| 5532 | { .ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 4 }, // The data reported by the IACA tool is "4.3". |
| 5533 | { .ISD: ISD::ADD, .Type: MVT::v2i8, .Cost: 2 }, |
| 5534 | { .ISD: ISD::ADD, .Type: MVT::v4i8, .Cost: 2 }, |
| 5535 | { .ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2 }, |
| 5536 | { .ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 3 }, |
| 5537 | }; |
| 5538 | |
| 5539 | static const CostTblEntry AVX1CostTbl[] = { |
| 5540 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: 3 }, |
| 5541 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: 3 }, |
| 5542 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: 4 }, |
| 5543 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 1 }, // The data reported by the IACA tool is "1.5". |
| 5544 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: 3 }, |
| 5545 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: 5 }, |
| 5546 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: 5 }, |
| 5547 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: 4 }, |
| 5548 | }; |
| 5549 | |
| 5550 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| 5551 | assert(ISD && "Invalid opcode" ); |
| 5552 | |
| 5553 | // Before legalizing the type, give a chance to look up illegal narrow types |
| 5554 | // in the table. |
| 5555 | // FIXME: Is there a better way to do this? |
| 5556 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
| 5557 | if (VT.isSimple()) { |
| 5558 | MVT MTy = VT.getSimpleVT(); |
| 5559 | if (ST->useSLMArithCosts()) |
| 5560 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
| 5561 | return Entry->Cost; |
| 5562 | |
| 5563 | if (ST->hasAVX()) |
| 5564 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
| 5565 | return Entry->Cost; |
| 5566 | |
| 5567 | if (ST->hasSSE2()) |
| 5568 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
| 5569 | return Entry->Cost; |
| 5570 | } |
| 5571 | |
| 5572 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
| 5573 | |
| 5574 | MVT MTy = LT.second; |
| 5575 | |
| 5576 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
| 5577 | |
| 5578 | // Special case: vXi8 mul reductions are performed as vXi16. |
| 5579 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { |
| 5580 | auto *WideSclTy = IntegerType::get(C&: ValVTy->getContext(), NumBits: 16); |
| 5581 | auto *WideVecTy = FixedVectorType::get(ElementType: WideSclTy, NumElts: ValVTy->getNumElements()); |
| 5582 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: ValTy, |
| 5583 | CCH: TargetTransformInfo::CastContextHint::None, |
| 5584 | CostKind) + |
| 5585 | getArithmeticReductionCost(Opcode, ValTy: WideVecTy, FMF, CostKind); |
| 5586 | } |
| 5587 | |
| 5588 | InstructionCost ArithmeticCost = 0; |
| 5589 | if (LT.first != 1 && MTy.isVector() && |
| 5590 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
| 5591 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
| 5592 | auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
| 5593 | NumElts: MTy.getVectorNumElements()); |
| 5594 | ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind); |
| 5595 | ArithmeticCost *= LT.first - 1; |
| 5596 | } |
| 5597 | |
| 5598 | if (ST->useSLMArithCosts()) |
| 5599 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
| 5600 | return ArithmeticCost + Entry->Cost; |
| 5601 | |
| 5602 | if (ST->hasAVX()) |
| 5603 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
| 5604 | return ArithmeticCost + Entry->Cost; |
| 5605 | |
| 5606 | if (ST->hasSSE2()) |
| 5607 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
| 5608 | return ArithmeticCost + Entry->Cost; |
| 5609 | |
| 5610 | // FIXME: These assume a naive kshift+binop lowering, which is probably |
| 5611 | // conservative in most cases. |
| 5612 | static const CostTblEntry AVX512BoolReduction[] = { |
| 5613 | { .ISD: ISD::AND, .Type: MVT::v2i1, .Cost: 3 }, |
| 5614 | { .ISD: ISD::AND, .Type: MVT::v4i1, .Cost: 5 }, |
| 5615 | { .ISD: ISD::AND, .Type: MVT::v8i1, .Cost: 7 }, |
| 5616 | { .ISD: ISD::AND, .Type: MVT::v16i1, .Cost: 9 }, |
| 5617 | { .ISD: ISD::AND, .Type: MVT::v32i1, .Cost: 11 }, |
| 5618 | { .ISD: ISD::AND, .Type: MVT::v64i1, .Cost: 13 }, |
| 5619 | { .ISD: ISD::OR, .Type: MVT::v2i1, .Cost: 3 }, |
| 5620 | { .ISD: ISD::OR, .Type: MVT::v4i1, .Cost: 5 }, |
| 5621 | { .ISD: ISD::OR, .Type: MVT::v8i1, .Cost: 7 }, |
| 5622 | { .ISD: ISD::OR, .Type: MVT::v16i1, .Cost: 9 }, |
| 5623 | { .ISD: ISD::OR, .Type: MVT::v32i1, .Cost: 11 }, |
| 5624 | { .ISD: ISD::OR, .Type: MVT::v64i1, .Cost: 13 }, |
| 5625 | }; |
| 5626 | |
| 5627 | static const CostTblEntry AVX2BoolReduction[] = { |
| 5628 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: 2 }, // vpmovmskb + cmp |
| 5629 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: 2 }, // vpmovmskb + cmp |
| 5630 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: 2 }, // vpmovmskb + cmp |
| 5631 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: 2 }, // vpmovmskb + cmp |
| 5632 | }; |
| 5633 | |
| 5634 | static const CostTblEntry AVX1BoolReduction[] = { |
| 5635 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: 2 }, // vmovmskpd + cmp |
| 5636 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: 2 }, // vmovmskps + cmp |
| 5637 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: 4 }, // vextractf128 + vpand + vpmovmskb + cmp |
| 5638 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: 4 }, // vextractf128 + vpand + vpmovmskb + cmp |
| 5639 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: 2 }, // vmovmskpd + cmp |
| 5640 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: 2 }, // vmovmskps + cmp |
| 5641 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: 4 }, // vextractf128 + vpor + vpmovmskb + cmp |
| 5642 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: 4 }, // vextractf128 + vpor + vpmovmskb + cmp |
| 5643 | }; |
| 5644 | |
| 5645 | static const CostTblEntry SSE2BoolReduction[] = { |
| 5646 | { .ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 2 }, // movmskpd + cmp |
| 5647 | { .ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 2 }, // movmskps + cmp |
| 5648 | { .ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 2 }, // pmovmskb + cmp |
| 5649 | { .ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 2 }, // pmovmskb + cmp |
| 5650 | { .ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 2 }, // movmskpd + cmp |
| 5651 | { .ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 2 }, // movmskps + cmp |
| 5652 | { .ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 2 }, // pmovmskb + cmp |
| 5653 | { .ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 2 }, // pmovmskb + cmp |
| 5654 | }; |
| 5655 | |
| 5656 | // Handle bool allof/anyof patterns. |
| 5657 | if (ValVTy->getElementType()->isIntegerTy(Bitwidth: 1)) { |
| 5658 | InstructionCost ArithmeticCost = 0; |
| 5659 | if (LT.first != 1 && MTy.isVector() && |
| 5660 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
| 5661 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
| 5662 | auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
| 5663 | NumElts: MTy.getVectorNumElements()); |
| 5664 | ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind); |
| 5665 | ArithmeticCost *= LT.first - 1; |
| 5666 | } |
| 5667 | |
| 5668 | if (ST->hasAVX512()) |
| 5669 | if (const auto *Entry = CostTableLookup(Table: AVX512BoolReduction, ISD, Ty: MTy)) |
| 5670 | return ArithmeticCost + Entry->Cost; |
| 5671 | if (ST->hasAVX2()) |
| 5672 | if (const auto *Entry = CostTableLookup(Table: AVX2BoolReduction, ISD, Ty: MTy)) |
| 5673 | return ArithmeticCost + Entry->Cost; |
| 5674 | if (ST->hasAVX()) |
| 5675 | if (const auto *Entry = CostTableLookup(Table: AVX1BoolReduction, ISD, Ty: MTy)) |
| 5676 | return ArithmeticCost + Entry->Cost; |
| 5677 | if (ST->hasSSE2()) |
| 5678 | if (const auto *Entry = CostTableLookup(Table: SSE2BoolReduction, ISD, Ty: MTy)) |
| 5679 | return ArithmeticCost + Entry->Cost; |
| 5680 | |
| 5681 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind); |
| 5682 | } |
| 5683 | |
| 5684 | unsigned NumVecElts = ValVTy->getNumElements(); |
| 5685 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); |
| 5686 | |
| 5687 | // Special case power of 2 reductions where the scalar type isn't changed |
| 5688 | // by type legalization. |
| 5689 | if (!isPowerOf2_32(Value: NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) |
| 5690 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind); |
| 5691 | |
| 5692 | InstructionCost ReductionCost = 0; |
| 5693 | |
| 5694 | auto *Ty = ValVTy; |
| 5695 | if (LT.first != 1 && MTy.isVector() && |
| 5696 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
| 5697 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
| 5698 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
| 5699 | NumElts: MTy.getVectorNumElements()); |
| 5700 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
| 5701 | ReductionCost *= LT.first - 1; |
| 5702 | NumVecElts = MTy.getVectorNumElements(); |
| 5703 | } |
| 5704 | |
| 5705 | // Now handle reduction with the legal type, taking into account size changes |
| 5706 | // at each level. |
| 5707 | while (NumVecElts > 1) { |
| 5708 | // Determine the size of the remaining vector we need to reduce. |
| 5709 | unsigned Size = NumVecElts * ScalarSize; |
| 5710 | NumVecElts /= 2; |
| 5711 | // If we're reducing from 256/512 bits, use an extract_subvector. |
| 5712 | if (Size > 128) { |
| 5713 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
| 5714 | ReductionCost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
| 5715 | CostKind, Index: NumVecElts, SubTp: SubTy); |
| 5716 | Ty = SubTy; |
| 5717 | } else if (Size == 128) { |
| 5718 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
| 5719 | FixedVectorType *ShufTy; |
| 5720 | if (ValVTy->isFloatingPointTy()) |
| 5721 | ShufTy = |
| 5722 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValVTy->getContext()), NumElts: 2); |
| 5723 | else |
| 5724 | ShufTy = |
| 5725 | FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValVTy->getContext()), NumElts: 2); |
| 5726 | ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, |
| 5727 | Mask: {}, CostKind, Index: 0, SubTp: nullptr); |
| 5728 | } else if (Size == 64) { |
| 5729 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
| 5730 | FixedVectorType *ShufTy; |
| 5731 | if (ValVTy->isFloatingPointTy()) |
| 5732 | ShufTy = |
| 5733 | FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValVTy->getContext()), NumElts: 4); |
| 5734 | else |
| 5735 | ShufTy = |
| 5736 | FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValVTy->getContext()), NumElts: 4); |
| 5737 | ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, |
| 5738 | Mask: {}, CostKind, Index: 0, SubTp: nullptr); |
| 5739 | } else { |
| 5740 | // Reducing from smaller size is a shift by immediate. |
| 5741 | auto *ShiftTy = FixedVectorType::get( |
| 5742 | ElementType: Type::getIntNTy(C&: ValVTy->getContext(), N: Size), NumElts: 128 / Size); |
| 5743 | ReductionCost += getArithmeticInstrCost( |
| 5744 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind, |
| 5745 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 5746 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
| 5747 | } |
| 5748 | |
| 5749 | // Add the arithmetic op for this level. |
| 5750 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); |
| 5751 | } |
| 5752 | |
| 5753 | // Add the final extract element to the cost. |
| 5754 | return ReductionCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
| 5755 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
| 5756 | } |
| 5757 | |
| 5758 | InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty, |
| 5759 | TTI::TargetCostKind CostKind, |
| 5760 | FastMathFlags FMF) const { |
| 5761 | IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF); |
| 5762 | return getIntrinsicInstrCost(ICA, CostKind); |
| 5763 | } |
| 5764 | |
| 5765 | InstructionCost |
| 5766 | X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy, |
| 5767 | FastMathFlags FMF, |
| 5768 | TTI::TargetCostKind CostKind) const { |
| 5769 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
| 5770 | |
| 5771 | MVT MTy = LT.second; |
| 5772 | |
| 5773 | int ISD; |
| 5774 | if (ValTy->isIntOrIntVectorTy()) { |
| 5775 | ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN |
| 5776 | : ISD::SMIN; |
| 5777 | } else { |
| 5778 | assert(ValTy->isFPOrFPVectorTy() && |
| 5779 | "Expected float point or integer vector type." ); |
| 5780 | ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum) |
| 5781 | ? ISD::FMINNUM |
| 5782 | : ISD::FMINIMUM; |
| 5783 | } |
| 5784 | |
| 5785 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
| 5786 | // and make it as the cost. |
| 5787 | |
| 5788 | static const CostTblEntry SSE2CostTbl[] = { |
| 5789 | {.ISD: ISD::UMIN, .Type: MVT::v2i16, .Cost: 5}, // need pxors to use pminsw/pmaxsw |
| 5790 | {.ISD: ISD::UMIN, .Type: MVT::v4i16, .Cost: 7}, // need pxors to use pminsw/pmaxsw |
| 5791 | {.ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: 9}, // need pxors to use pminsw/pmaxsw |
| 5792 | }; |
| 5793 | |
| 5794 | static const CostTblEntry SSE41CostTbl[] = { |
| 5795 | {.ISD: ISD::SMIN, .Type: MVT::v2i16, .Cost: 3}, // same as sse2 |
| 5796 | {.ISD: ISD::SMIN, .Type: MVT::v4i16, .Cost: 5}, // same as sse2 |
| 5797 | {.ISD: ISD::UMIN, .Type: MVT::v2i16, .Cost: 5}, // same as sse2 |
| 5798 | {.ISD: ISD::UMIN, .Type: MVT::v4i16, .Cost: 7}, // same as sse2 |
| 5799 | {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 4}, // phminposuw+xor |
| 5800 | {.ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: 4}, // FIXME: umin is cheaper than umax |
| 5801 | {.ISD: ISD::SMIN, .Type: MVT::v2i8, .Cost: 3}, // pminsb |
| 5802 | {.ISD: ISD::SMIN, .Type: MVT::v4i8, .Cost: 5}, // pminsb |
| 5803 | {.ISD: ISD::SMIN, .Type: MVT::v8i8, .Cost: 7}, // pminsb |
| 5804 | {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 6}, |
| 5805 | {.ISD: ISD::UMIN, .Type: MVT::v2i8, .Cost: 3}, // same as sse2 |
| 5806 | {.ISD: ISD::UMIN, .Type: MVT::v4i8, .Cost: 5}, // same as sse2 |
| 5807 | {.ISD: ISD::UMIN, .Type: MVT::v8i8, .Cost: 7}, // same as sse2 |
| 5808 | {.ISD: ISD::UMIN, .Type: MVT::v16i8, .Cost: 6}, // FIXME: umin is cheaper than umax |
| 5809 | }; |
| 5810 | |
| 5811 | static const CostTblEntry AVX1CostTbl[] = { |
| 5812 | {.ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: 6}, |
| 5813 | {.ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: 6}, // FIXME: umin is cheaper than umax |
| 5814 | {.ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: 8}, |
| 5815 | {.ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: 8}, |
| 5816 | }; |
| 5817 | |
| 5818 | static const CostTblEntry AVX512BWCostTbl[] = { |
| 5819 | {.ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: 8}, |
| 5820 | {.ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: 8}, // FIXME: umin is cheaper than umax |
| 5821 | {.ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: 10}, |
| 5822 | {.ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: 10}, |
| 5823 | }; |
| 5824 | |
| 5825 | // Before legalizing the type, give a chance to look up illegal narrow types |
| 5826 | // in the table. |
| 5827 | // FIXME: Is there a better way to do this? |
| 5828 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
| 5829 | if (VT.isSimple()) { |
| 5830 | MVT MTy = VT.getSimpleVT(); |
| 5831 | if (ST->hasBWI()) |
| 5832 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
| 5833 | return Entry->Cost; |
| 5834 | |
| 5835 | if (ST->hasAVX()) |
| 5836 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
| 5837 | return Entry->Cost; |
| 5838 | |
| 5839 | if (ST->hasSSE41()) |
| 5840 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
| 5841 | return Entry->Cost; |
| 5842 | |
| 5843 | if (ST->hasSSE2()) |
| 5844 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
| 5845 | return Entry->Cost; |
| 5846 | } |
| 5847 | |
| 5848 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
| 5849 | unsigned NumVecElts = ValVTy->getNumElements(); |
| 5850 | |
| 5851 | auto *Ty = ValVTy; |
| 5852 | InstructionCost MinMaxCost = 0; |
| 5853 | if (LT.first != 1 && MTy.isVector() && |
| 5854 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
| 5855 | // Type needs to be split. We need LT.first - 1 operations ops. |
| 5856 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
| 5857 | NumElts: MTy.getVectorNumElements()); |
| 5858 | MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF); |
| 5859 | MinMaxCost *= LT.first - 1; |
| 5860 | NumVecElts = MTy.getVectorNumElements(); |
| 5861 | } |
| 5862 | |
| 5863 | if (ST->hasBWI()) |
| 5864 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
| 5865 | return MinMaxCost + Entry->Cost; |
| 5866 | |
| 5867 | if (ST->hasAVX()) |
| 5868 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
| 5869 | return MinMaxCost + Entry->Cost; |
| 5870 | |
| 5871 | if (ST->hasSSE41()) |
| 5872 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
| 5873 | return MinMaxCost + Entry->Cost; |
| 5874 | |
| 5875 | if (ST->hasSSE2()) |
| 5876 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
| 5877 | return MinMaxCost + Entry->Cost; |
| 5878 | |
| 5879 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); |
| 5880 | |
| 5881 | // Special case power of 2 reductions where the scalar type isn't changed |
| 5882 | // by type legalization. |
| 5883 | if (!isPowerOf2_32(Value: ValVTy->getNumElements()) || |
| 5884 | ScalarSize != MTy.getScalarSizeInBits()) |
| 5885 | return BaseT::getMinMaxReductionCost(IID, Ty: ValTy, FMF, CostKind); |
| 5886 | |
| 5887 | // Now handle reduction with the legal type, taking into account size changes |
| 5888 | // at each level. |
| 5889 | while (NumVecElts > 1) { |
| 5890 | // Determine the size of the remaining vector we need to reduce. |
| 5891 | unsigned Size = NumVecElts * ScalarSize; |
| 5892 | NumVecElts /= 2; |
| 5893 | // If we're reducing from 256/512 bits, use an extract_subvector. |
| 5894 | if (Size > 128) { |
| 5895 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
| 5896 | MinMaxCost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
| 5897 | CostKind, Index: NumVecElts, SubTp: SubTy); |
| 5898 | Ty = SubTy; |
| 5899 | } else if (Size == 128) { |
| 5900 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
| 5901 | VectorType *ShufTy; |
| 5902 | if (ValTy->isFloatingPointTy()) |
| 5903 | ShufTy = |
| 5904 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValTy->getContext()), NumElts: 2); |
| 5905 | else |
| 5906 | ShufTy = FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValTy->getContext()), NumElts: 2); |
| 5907 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, Mask: {}, |
| 5908 | CostKind, Index: 0, SubTp: nullptr); |
| 5909 | } else if (Size == 64) { |
| 5910 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
| 5911 | FixedVectorType *ShufTy; |
| 5912 | if (ValTy->isFloatingPointTy()) |
| 5913 | ShufTy = FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), NumElts: 4); |
| 5914 | else |
| 5915 | ShufTy = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValTy->getContext()), NumElts: 4); |
| 5916 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, Mask: {}, |
| 5917 | CostKind, Index: 0, SubTp: nullptr); |
| 5918 | } else { |
| 5919 | // Reducing from smaller size is a shift by immediate. |
| 5920 | auto *ShiftTy = FixedVectorType::get( |
| 5921 | ElementType: Type::getIntNTy(C&: ValTy->getContext(), N: Size), NumElts: 128 / Size); |
| 5922 | MinMaxCost += getArithmeticInstrCost( |
| 5923 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind: TTI::TCK_RecipThroughput, |
| 5924 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
| 5925 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
| 5926 | } |
| 5927 | |
| 5928 | // Add the arithmetic op for this level. |
| 5929 | MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF); |
| 5930 | } |
| 5931 | |
| 5932 | // Add the final extract element to the cost. |
| 5933 | return MinMaxCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
| 5934 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
| 5935 | } |
| 5936 | |
| 5937 | /// Calculate the cost of materializing a 64-bit value. This helper |
| 5938 | /// method might only calculate a fraction of a larger immediate. Therefore it |
| 5939 | /// is valid to return a cost of ZERO. |
| 5940 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const { |
| 5941 | if (Val == 0) |
| 5942 | return TTI::TCC_Free; |
| 5943 | |
| 5944 | if (isInt<32>(x: Val)) |
| 5945 | return TTI::TCC_Basic; |
| 5946 | |
| 5947 | return 2 * TTI::TCC_Basic; |
| 5948 | } |
| 5949 | |
| 5950 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
| 5951 | TTI::TargetCostKind CostKind) const { |
| 5952 | assert(Ty->isIntegerTy()); |
| 5953 | |
| 5954 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
| 5955 | if (BitSize == 0) |
| 5956 | return ~0U; |
| 5957 | |
| 5958 | // Never hoist constants larger than 128bit, because this might lead to |
| 5959 | // incorrect code generation or assertions in codegen. |
| 5960 | // Fixme: Create a cost model for types larger than i128 once the codegen |
| 5961 | // issues have been fixed. |
| 5962 | if (BitSize > 128) |
| 5963 | return TTI::TCC_Free; |
| 5964 | |
| 5965 | if (Imm == 0) |
| 5966 | return TTI::TCC_Free; |
| 5967 | |
| 5968 | // Sign-extend all constants to a multiple of 64-bit. |
| 5969 | APInt ImmVal = Imm; |
| 5970 | if (BitSize % 64 != 0) |
| 5971 | ImmVal = Imm.sext(width: alignTo(Value: BitSize, Align: 64)); |
| 5972 | |
| 5973 | // Split the constant into 64-bit chunks and calculate the cost for each |
| 5974 | // chunk. |
| 5975 | InstructionCost Cost = 0; |
| 5976 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
| 5977 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
| 5978 | int64_t Val = Tmp.getSExtValue(); |
| 5979 | Cost += getIntImmCost(Val); |
| 5980 | } |
| 5981 | // We need at least one instruction to materialize the constant. |
| 5982 | return std::max<InstructionCost>(a: 1, b: Cost); |
| 5983 | } |
| 5984 | |
/// Compute the cost of immediate \p Imm appearing as operand \p Idx of an
/// instruction with opcode \p Opcode. Used by ConstantHoisting: returning
/// TCC_Free means the immediate can be encoded directly by the instruction
/// (or is otherwise handled by the backend) and must not be hoisted.
InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  unsigned ImmBitWidth = Imm.getBitWidth();

  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // ImmIdx records which operand position of this opcode can encode an
  // immediate directly; ~0U means "no operand" so the generic cost applies.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // There are other predicates and immediates the backend can use shifts for.
    if (Idx == 1 && ImmBitWidth == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;

      // An equality compare whose other operand has at least 32 known
      // trailing zero bits can also be lowered with a shift, so keep the
      // immediate in place.
      if (auto *Cmp = dyn_cast_or_null<CmpInst>(Val: Inst)) {
        if (Cmp->isEquality()) {
          KnownBits Known = computeKnownBits(V: Cmp->getOperand(i_nocapture: 0), DL);
          if (Known.countMinTrailingZeros() >= 32)
            return TTI::TCC_Free;
        }
      }
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(N: 32))
      return TTI::TCC_Free;
    // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits.
    // The 255/65535 surrogate immediates price the BEXTR/BZHI control operand
    // (an 8- or 16-bit control value) rather than the original mask.
    if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() &&
        Imm.isMask())
      return X86TTIImpl::getIntImmCost(Val: ST->hasBMI2() ? 255 : 65535);
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    // These opcodes never encode an immediate operand directly; fall through
    // to the generic materialization cost below (ImmIdx stays ~0U).
    break;
  }

  // The operand sits where this opcode can encode an immediate: treat it as
  // free when its materialization cost is no worse than one basic
  // instruction per 64-bit chunk, otherwise charge the full cost so the
  // constant gets hoisted.
  if (Idx == ImmIdx) {
    uint64_t NumConstants = divideCeil(Numerator: BitSize, Denominator: 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
| 6096 | |
/// Compute the cost of immediate \p Imm appearing as operand \p Idx of
/// intrinsic \p IID, for ConstantHoisting. TCC_Free means the intrinsic can
/// encode the immediate directly, so it must not be hoisted.
InstructionCost
X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                const APInt &Imm, Type *Ty,
                                TTI::TargetCostKind CostKind) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  // The arithmetic-with-overflow intrinsics fold a second operand that fits
  // in a signed 32-bit immediate, like the underlying add/sub/mul would.
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 32))
      return TTI::TCC_Free;
    break;
  // Stackmaps encode their leading meta-operands (ID, shadow size) and any
  // 64-bit immediate arguments directly in the stackmap record.
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64)))
      return TTI::TCC_Free;
    break;
  // Patchpoints similarly encode their first four meta-operands and any
  // 64-bit immediates.
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64)))
      return TTI::TCC_Free;
    break;
  }
  // Otherwise charge the plain materialization cost of the constant.
  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
| 6133 | |
| 6134 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, |
| 6135 | TTI::TargetCostKind CostKind, |
| 6136 | const Instruction *I) const { |
| 6137 | if (CostKind != TTI::TCK_RecipThroughput) |
| 6138 | return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic; |
| 6139 | // Branches are assumed to be predicted. |
| 6140 | return TTI::TCC_Free; |
| 6141 | } |
| 6142 | |
| 6143 | int X86TTIImpl::getGatherOverhead() const { |
| 6144 | // Some CPUs have more overhead for gather. The specified overhead is relative |
| 6145 | // to the Load operation. "2" is the number provided by Intel architects. This |
| 6146 | // parameter is used for cost estimation of Gather Op and comparison with |
| 6147 | // other alternatives. |
| 6148 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only |
| 6149 | // enable gather with a -march. |
| 6150 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) |
| 6151 | return 2; |
| 6152 | |
| 6153 | return 1024; |
| 6154 | } |
| 6155 | |
| 6156 | int X86TTIImpl::getScatterOverhead() const { |
| 6157 | if (ST->hasAVX512()) |
| 6158 | return 2; |
| 6159 | |
| 6160 | return 1024; |
| 6161 | } |
| 6162 | |
// Return an average cost of Gather / Scatter instruction, maybe improved later.
// Recurses on itself when legalization would split either the data vector or
// the index vector, charging the split factor times the cost of one part.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                            TTI::TargetCostKind CostKind,
                                            Type *SrcVTy, const Value *Ptr,
                                            Align Alignment,
                                            unsigned AddressSpace) const {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost" );
  unsigned VF = cast<FixedVectorType>(Val: SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of distinct base pointers cannot use a narrowed index.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(V: Ptrs))
      return IndexSize;
    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
      if (isa<Constant>(Val: GEP->getOperand(i_nocapture: I)))
        continue;
      Type *IndxTy = GEP->getOperand(i_nocapture: I)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(Val: IndxTy))
        IndxTy = IndexVTy->getElementType();
      // A 64-bit index that isn't a sign-extension of a narrower value, or
      // more than one variable index, forces the full 64-bit index width.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(Val: GEP->getOperand(i_nocapture: I))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      ElementType: IntegerType::get(C&: SrcVTy->getContext(), NumBits: IndexSize), NumElts: VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(Ty: IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcVTy);
  // The operation is split by the larger of the two legalization factors
  // (data vector vs. index vector).
  InstructionCost::CostType SplitFactor =
      std::max(a: IdxsLT.first, b: SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    auto *SplitSrcTy =
        FixedVectorType::get(ElementType: SrcVTy->getScalarType(), NumElts: VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, CostKind, SrcVTy: SplitSrcTy, Ptr,
                                         Alignment, AddressSpace);
  }

  // If we didn't split, this will be a single gather/scatter instruction.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
                                                       : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(),
                                           Alignment, AddressSpace, CostKind);
}
| 6233 | |
/// Calculate the cost of Gather / Scatter operation
/// Falls back to the base scalarized model when the access cannot (or should
/// not) be lowered as a native gather/scatter; otherwise delegates to
/// getGSVectorCost for the native-instruction estimate.
InstructionCost X86TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind,
    const Instruction *I = nullptr) const {
  // Use the generic scalarization cost if a masked gather/scatter is not
  // legal for this type, or the target prefers to scalarize it anyway.
  if ((Opcode == Instruction::Load &&
       (!isLegalMaskedGather(DataType: SrcVTy, Alignment: Align(Alignment)) ||
        forceScalarizeMaskedGather(VTy: cast<VectorType>(Val: SrcVTy),
                                   Alignment: Align(Alignment)))) ||
      (Opcode == Instruction::Store &&
       (!isLegalMaskedScatter(DataType: SrcVTy, Alignment: Align(Alignment)) ||
        forceScalarizeMaskedScatter(VTy: cast<VectorType>(Val: SrcVTy),
                                    Alignment: Align(Alignment)))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy: SrcVTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter" );
  // Ptr is either a scalar pointer or a vector of pointers; extract the
  // pointee address space either way.
  PointerType *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType());
  if (!PtrTy && Ptr->getType()->isVectorTy())
    PtrTy = dyn_cast<PointerType>(
        Val: cast<VectorType>(Val: Ptr->getType())->getElementType());
  assert(PtrTy && "Unexpected type for Ptr argument" );
  unsigned AddressSpace = PtrTy->getAddressSpace();
  return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment,
                         AddressSpace);
}
| 6260 | |
/// Compare two candidate Loop Strength Reduction solutions.
/// Returns true if \p C1 should be preferred over \p C2.
bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) const {
  // X86 specific here are "instruction number 1st priority".
  // Lexicographic comparison: total instruction count dominates, then
  // register count, with the remaining fields as successive tie-breakers.
  return std::tie(args: C1.Insns, args: C1.NumRegs, args: C1.AddRecCost, args: C1.NumIVMuls,
                  args: C1.NumBaseAdds, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
         std::tie(args: C2.Insns, args: C2.NumRegs, args: C2.AddRecCost, args: C2.NumIVMuls,
                  args: C2.NumBaseAdds, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
}
| 6269 | |
| 6270 | bool X86TTIImpl::canMacroFuseCmp() const { |
| 6271 | return ST->hasMacroFusion() || ST->hasBranchFusion(); |
| 6272 | } |
| 6273 | |
| 6274 | static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) { |
| 6275 | if (!ST->hasAVX()) |
| 6276 | return false; |
| 6277 | |
| 6278 | if (ScalarTy->isPointerTy()) |
| 6279 | return true; |
| 6280 | |
| 6281 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
| 6282 | return true; |
| 6283 | |
| 6284 | if (ScalarTy->isHalfTy() && ST->hasBWI()) |
| 6285 | return true; |
| 6286 | |
| 6287 | if (ScalarTy->isBFloatTy() && ST->hasBF16()) |
| 6288 | return true; |
| 6289 | |
| 6290 | if (!ScalarTy->isIntegerTy()) |
| 6291 | return false; |
| 6292 | |
| 6293 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
| 6294 | return IntWidth == 32 || IntWidth == 64 || |
| 6295 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); |
| 6296 | } |
| 6297 | |
| 6298 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, |
| 6299 | unsigned AddressSpace) const { |
| 6300 | Type *ScalarTy = DataTy->getScalarType(); |
| 6301 | |
| 6302 | // The backend can't handle a single element vector w/o CFCMOV. |
| 6303 | if (isa<VectorType>(Val: DataTy) && |
| 6304 | cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
| 6305 | return ST->hasCF() && |
| 6306 | hasConditionalLoadStoreForType(Ty: ScalarTy, /*IsStore=*/false); |
| 6307 | |
| 6308 | return isLegalMaskedLoadStore(ScalarTy, ST); |
| 6309 | } |
| 6310 | |
| 6311 | bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment, |
| 6312 | unsigned AddressSpace) const { |
| 6313 | Type *ScalarTy = DataTy->getScalarType(); |
| 6314 | |
| 6315 | // The backend can't handle a single element vector w/o CFCMOV. |
| 6316 | if (isa<VectorType>(Val: DataTy) && |
| 6317 | cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
| 6318 | return ST->hasCF() && |
| 6319 | hasConditionalLoadStoreForType(Ty: ScalarTy, /*IsStore=*/true); |
| 6320 | |
| 6321 | return isLegalMaskedLoadStore(ScalarTy, ST); |
| 6322 | } |
| 6323 | |
| 6324 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const { |
| 6325 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
| 6326 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 |
| 6327 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 |
| 6328 | // (the equivalent stores only require AVX). |
| 6329 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) |
| 6330 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); |
| 6331 | |
| 6332 | return false; |
| 6333 | } |
| 6334 | |
| 6335 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const { |
| 6336 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
| 6337 | |
| 6338 | // SSE4A supports nontemporal stores of float and double at arbitrary |
| 6339 | // alignment. |
| 6340 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) |
| 6341 | return true; |
| 6342 | |
| 6343 | // Besides the SSE4A subtarget exception above, only aligned stores are |
| 6344 | // available nontemporaly on any other subtarget. And only stores with a size |
| 6345 | // of 4..32 bytes (powers of 2, only) are permitted. |
| 6346 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || |
| 6347 | !isPowerOf2_32(Value: DataSize)) |
| 6348 | return false; |
| 6349 | |
| 6350 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent |
| 6351 | // loads require AVX2). |
| 6352 | if (DataSize == 32) |
| 6353 | return ST->hasAVX(); |
| 6354 | if (DataSize == 16) |
| 6355 | return ST->hasSSE1(); |
| 6356 | return true; |
| 6357 | } |
| 6358 | |
| 6359 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, |
| 6360 | ElementCount NumElements) const { |
| 6361 | // movddup |
| 6362 | return ST->hasSSE3() && !NumElements.isScalable() && |
| 6363 | NumElements.getFixedValue() == 2 && |
| 6364 | ElementTy == Type::getDoubleTy(C&: ElementTy->getContext()); |
| 6365 | } |
| 6366 | |
| 6367 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const { |
| 6368 | if (!isa<VectorType>(Val: DataTy)) |
| 6369 | return false; |
| 6370 | |
| 6371 | if (!ST->hasAVX512()) |
| 6372 | return false; |
| 6373 | |
| 6374 | // The backend can't handle a single element vector. |
| 6375 | if (cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
| 6376 | return false; |
| 6377 | |
| 6378 | Type *ScalarTy = cast<VectorType>(Val: DataTy)->getElementType(); |
| 6379 | |
| 6380 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
| 6381 | return true; |
| 6382 | |
| 6383 | if (!ScalarTy->isIntegerTy()) |
| 6384 | return false; |
| 6385 | |
| 6386 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
| 6387 | return IntWidth == 32 || IntWidth == 64 || |
| 6388 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); |
| 6389 | } |
| 6390 | |
// Compress-store legality mirrors expand-load: both follow the same element
// type and subtarget feature rules, so forward to that check.
bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy,
                                            Align Alignment) const {
  return isLegalMaskedExpandLoad(DataTy, Alignment);
}
| 6395 | |
| 6396 | bool X86TTIImpl::supportsGather() const { |
| 6397 | // Some CPUs have better gather performance than others. |
| 6398 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only |
| 6399 | // enable gather with a -march. |
| 6400 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); |
| 6401 | } |
| 6402 | |
| 6403 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, |
| 6404 | Align Alignment) const { |
| 6405 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX |
| 6406 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend |
| 6407 | // it to 8 elements, but zeroing upper bits of the mask vector will add more |
| 6408 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: |
| 6409 | // Check, maybe the gather/scatter instruction is better in the VariableMask |
| 6410 | // case. |
| 6411 | unsigned NumElts = cast<FixedVectorType>(Val: VTy)->getNumElements(); |
| 6412 | return NumElts == 1 || |
| 6413 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); |
| 6414 | } |
| 6415 | |
| 6416 | bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, |
| 6417 | Align Alignment) const { |
| 6418 | Type *ScalarTy = DataTy->getScalarType(); |
| 6419 | if (ScalarTy->isPointerTy()) |
| 6420 | return true; |
| 6421 | |
| 6422 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
| 6423 | return true; |
| 6424 | |
| 6425 | if (!ScalarTy->isIntegerTy()) |
| 6426 | return false; |
| 6427 | |
| 6428 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
| 6429 | return IntWidth == 32 || IntWidth == 64; |
| 6430 | } |
| 6431 | |
| 6432 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const { |
| 6433 | if (!supportsGather() || !ST->preferGather()) |
| 6434 | return false; |
| 6435 | return isLegalMaskedGatherScatter(DataTy, Alignment); |
| 6436 | } |
| 6437 | |
| 6438 | bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, |
| 6439 | unsigned Opcode1, |
| 6440 | const SmallBitVector &OpcodeMask) const { |
| 6441 | // ADDSUBPS 4xf32 SSE3 |
| 6442 | // VADDSUBPS 4xf32 AVX |
| 6443 | // VADDSUBPS 8xf32 AVX2 |
| 6444 | // ADDSUBPD 2xf64 SSE3 |
| 6445 | // VADDSUBPD 2xf64 AVX |
| 6446 | // VADDSUBPD 4xf64 AVX2 |
| 6447 | |
| 6448 | unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements(); |
| 6449 | assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible" ); |
| 6450 | if (!isPowerOf2_32(Value: NumElements)) |
| 6451 | return false; |
| 6452 | // Check the opcode pattern. We apply the mask on the opcode arguments and |
| 6453 | // then check if it is what we expect. |
| 6454 | for (int Lane : seq<int>(Begin: 0, End: NumElements)) { |
| 6455 | unsigned Opc = OpcodeMask.test(Idx: Lane) ? Opcode1 : Opcode0; |
| 6456 | // We expect FSub for even lanes and FAdd for odd lanes. |
| 6457 | if (Lane % 2 == 0 && Opc != Instruction::FSub) |
| 6458 | return false; |
| 6459 | if (Lane % 2 == 1 && Opc != Instruction::FAdd) |
| 6460 | return false; |
| 6461 | } |
| 6462 | // Now check that the pattern is supported by the target ISA. |
| 6463 | Type *ElemTy = cast<VectorType>(Val: VecTy)->getElementType(); |
| 6464 | if (ElemTy->isFloatTy()) |
| 6465 | return ST->hasSSE3() && NumElements % 4 == 0; |
| 6466 | if (ElemTy->isDoubleTy()) |
| 6467 | return ST->hasSSE3() && NumElements % 2 == 0; |
| 6468 | return false; |
| 6469 | } |
| 6470 | |
| 6471 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const { |
| 6472 | // AVX2 doesn't support scatter |
| 6473 | if (!ST->hasAVX512() || !ST->preferScatter()) |
| 6474 | return false; |
| 6475 | return isLegalMaskedGatherScatter(DataTy: DataType, Alignment); |
| 6476 | } |
| 6477 | |
| 6478 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const { |
| 6479 | EVT VT = TLI->getValueType(DL, Ty: DataType); |
| 6480 | return TLI->isOperationLegal(Op: IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); |
| 6481 | } |
| 6482 | |
| 6483 | bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) const { |
| 6484 | // FDIV is always expensive, even if it has a very low uop count. |
| 6485 | // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? |
| 6486 | if (I->getOpcode() == Instruction::FDiv) |
| 6487 | return true; |
| 6488 | |
| 6489 | return BaseT::isExpensiveToSpeculativelyExecute(I); |
| 6490 | } |
| 6491 | |
// An ordered FP compare is never reported as cheaper than a compare against
// zero on X86.
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; }
| 6493 | |
// Decide whether \p Callee may be inlined into \p Caller based on subtarget
// features: identical features are always compatible; a callee whose features
// are a subset of the caller's is compatible only if no call inside it could
// become ABI-incompatible after inlining.
bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Check whether features are the same (apart from the ignore list).
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if (RealCallerBits == RealCalleeBits)
    return true;

  // If the features are a subset, we need to additionally check for calls
  // that may become ABI-incompatible as a result of inlining.
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // Scan every call site in the callee for argument/return types whose ABI
  // could change once the callee's body runs under the caller's features.
  for (const Instruction &I : instructions(F: Callee)) {
    if (const auto *CB = dyn_cast<CallBase>(Val: &I)) {
      // Having more target features is fine for inline ASM.
      if (CB->isInlineAsm())
        continue;

      // Collect all argument types plus a non-void return type.
      SmallVector<Type *, 8> Types;
      for (Value *Arg : CB->args())
        Types.push_back(Elt: Arg->getType());
      if (!CB->getType()->isVoidTy())
        Types.push_back(Elt: CB->getType());

      // Simple types are always ABI compatible.
      auto IsSimpleTy = [](Type *Ty) {
        return !Ty->isVectorTy() && !Ty->isAggregateType();
      };
      if (all_of(Range&: Types, P: IsSimpleTy))
        continue;

      if (Function *NestedCallee = CB->getCalledFunction()) {
        // Assume that intrinsics are always ABI compatible.
        if (NestedCallee->isIntrinsic())
          continue;

        // Do a precise compatibility check.
        if (!areTypesABICompatible(Caller, Callee: NestedCallee, Type: Types))
          return false;
      } else {
        // We don't know the target features of the callee,
        // assume it is incompatible.
        return false;
      }
    }
  }
  return true;
}
| 6551 | |
| 6552 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, |
| 6553 | const Function *Callee, |
| 6554 | const ArrayRef<Type *> &Types) const { |
| 6555 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
| 6556 | return false; |
| 6557 | |
| 6558 | // If we get here, we know the target features match. If one function |
| 6559 | // considers 512-bit vectors legal and the other does not, consider them |
| 6560 | // incompatible. |
| 6561 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
| 6562 | |
| 6563 | if (TM.getSubtarget<X86Subtarget>(F: *Caller).useAVX512Regs() == |
| 6564 | TM.getSubtarget<X86Subtarget>(F: *Callee).useAVX512Regs()) |
| 6565 | return true; |
| 6566 | |
| 6567 | // Consider the arguments compatible if they aren't vectors or aggregates. |
| 6568 | // FIXME: Look at the size of vectors. |
| 6569 | // FIXME: Look at the element types of aggregates to see if there are vectors. |
| 6570 | return llvm::none_of(Range: Types, |
| 6571 | P: [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); |
| 6572 | } |
| 6573 | |
| 6574 | X86TTIImpl::TTI::MemCmpExpansionOptions |
| 6575 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
| 6576 | TTI::MemCmpExpansionOptions Options; |
| 6577 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
| 6578 | Options.NumLoadsPerBlock = 2; |
| 6579 | // All GPR and vector loads can be unaligned. |
| 6580 | Options.AllowOverlappingLoads = true; |
| 6581 | if (IsZeroCmp) { |
| 6582 | // Only enable vector loads for equality comparison. Right now the vector |
| 6583 | // version is not as fast for three way compare (see #33329). |
| 6584 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); |
| 6585 | if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) |
| 6586 | Options.LoadSizes.push_back(Elt: 64); |
| 6587 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(Elt: 32); |
| 6588 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(Elt: 16); |
| 6589 | } |
| 6590 | if (ST->is64Bit()) { |
| 6591 | Options.LoadSizes.push_back(Elt: 8); |
| 6592 | } |
| 6593 | Options.LoadSizes.push_back(Elt: 4); |
| 6594 | Options.LoadSizes.push_back(Elt: 2); |
| 6595 | Options.LoadSizes.push_back(Elt: 1); |
| 6596 | return Options; |
| 6597 | } |
| 6598 | |
// Prefer vectorized address computation exactly when gather is supported,
// since gather is what consumes vector addresses on X86.
bool X86TTIImpl::prefersVectorizedAddressing() const {
  return supportsGather();
}
| 6602 | |
// X86 reports no especially efficient per-element vector load/store
// sequence, so callers fall back to the generic scalarization cost model.
bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
  return false;
}
| 6606 | |
| 6607 | bool X86TTIImpl::enableInterleavedAccessVectorization() const { |
| 6608 | // TODO: We expect this to be beneficial regardless of arch, |
| 6609 | // but there are currently some unexplained performance artifacts on Atom. |
| 6610 | // As a temporary solution, disable on Atom. |
| 6611 | return !(ST->isAtom()); |
| 6612 | } |
| 6613 | |
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
// The total returned cost = memory ops + mask handling + shuffle sequence.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond,
    bool UseMaskForGaps) const {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(Ty: VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  // Round up: a partially-filled final register still costs a full memop.
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
                                             NumElts: LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp)
    MemOpCost = getMaskedMemoryOpCost(Opcode, SrcTy: SingleMemOpTy, Alignment,
                                      AddressSpace, CostKind);
  else
    MemOpCost = getMemoryOpCost(Opcode, Src: SingleMemOpTy, Alignment, AddressSpace,
                                CostKind);

  // VT is the per-result (one strided member, VF elements) machine type used
  // to index the cost tables below.
  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT =
      MVT::getVectorVT(VT: TLI->getSimpleValueType(DL, Ty: VecTy->getScalarType()), NumElements: VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Mark the lanes actually touched by the requested strided members.
    APInt DemandedLoadStoreElts = APInt::getZero(numBits: VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op" );
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(C&: VecTy->getContext());

    // The per-member i1 mask must be replicated Factor times across the
    // interleave group; only demanded lanes matter when masking gaps.
    MaskCost = getReplicationShuffleCost(
        EltTy: I1Type, ReplicationFactor: Factor, VF,
        DemandedDstElts: UseMaskForGaps ? DemandedLoadStoreElts
                       : APInt::getAllOnes(numBits: VecTy->getNumElements()),
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(ElementType: I1Type, NumElts: VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(Opcode: BinaryOperator::And, Ty: MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {.ISD: 3, .Type: MVT::v16i8, .Cost: 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {.ISD: 3, .Type: MVT::v64i8, .Cost: 22}, //(load 96i8 and) deinterleave into 3 x 32i8
    };

    if (const auto *Entry =
            CostTableLookup(Table: AVX512InterleavedLoadTbl, ISD: Factor, Ty: VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        Kind: ShuffleKind, DstTy: SingleMemOpTy, SrcTy: SingleMemOpTy, Mask: {}, CostKind, Index: 0, SubTp: nullptr);

    // If no specific members were requested, all Factor members are produced.
    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
                                          NumElts: VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(Ty: ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max(a: (unsigned)1, b: (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point" );
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {.ISD: 3, .Type: MVT::v16i8, .Cost: 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {.ISD: 3, .Type: MVT::v64i8, .Cost: 26}, // interleave 3 x 64i8 into 96i8 (and store)

      {.ISD: 4, .Type: MVT::v8i8, .Cost: 10}, // interleave 4 x 8i8 into 32i8 (and store)
      {.ISD: 4, .Type: MVT::v16i8, .Cost: 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {.ISD: 4, .Type: MVT::v32i8, .Cost: 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {.ISD: 4, .Type: MVT::v64i8, .Cost: 24} // interleave 4 x 32i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(Table: AVX512InterleavedStoreTbl, ISD: Factor, Ty: VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost =
      getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleMemOpTy, SrcTy: SingleMemOpTy, Mask: {},
                     CostKind, Index: 0, SubTp: nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
| 6773 | |
| 6774 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( |
| 6775 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, |
| 6776 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
| 6777 | bool UseMaskForCond, bool UseMaskForGaps) const { |
| 6778 | auto *VecTy = cast<FixedVectorType>(Val: BaseTy); |
| 6779 | |
| 6780 | auto isSupportedOnAVX512 = [&](Type *VecTy) { |
| 6781 | Type *EltTy = cast<VectorType>(Val: VecTy)->getElementType(); |
| 6782 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(Bitwidth: 64) || |
| 6783 | EltTy->isIntegerTy(Bitwidth: 32) || EltTy->isPointerTy()) |
| 6784 | return true; |
| 6785 | if (EltTy->isIntegerTy(Bitwidth: 16) || EltTy->isIntegerTy(Bitwidth: 8) || EltTy->isHalfTy()) |
| 6786 | return ST->hasBWI(); |
| 6787 | if (EltTy->isBFloatTy()) |
| 6788 | return ST->hasBF16(); |
| 6789 | return false; |
| 6790 | }; |
| 6791 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy)) |
| 6792 | return getInterleavedMemoryOpCostAVX512( |
| 6793 | Opcode, VecTy, Factor, Indices, Alignment, |
| 6794 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); |
| 6795 | |
| 6796 | if (UseMaskForCond || UseMaskForGaps) |
| 6797 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
| 6798 | Alignment, AddressSpace, CostKind, |
| 6799 | UseMaskForCond, UseMaskForGaps); |
| 6800 | |
| 6801 | // Get estimation for interleaved load/store operations for SSE-AVX2. |
| 6802 | // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow |
| 6803 | // computing the cost using a generic formula as a function of generic |
| 6804 | // shuffles. We therefore use a lookup table instead, filled according to |
| 6805 | // the instruction sequences that codegen currently generates. |
| 6806 | |
| 6807 | // VecTy for interleave memop is <VF*Factor x Elt>. |
| 6808 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
| 6809 | // VecTy = <12 x i32>. |
| 6810 | MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second; |
| 6811 | |
| 6812 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case |
| 6813 | // the VF=2, while v2i128 is an unsupported MVT vector type |
| 6814 | // (see MachineValueType.h::getVectorVT()). |
| 6815 | if (!LegalVT.isVector()) |
| 6816 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
| 6817 | Alignment, AddressSpace, CostKind); |
| 6818 | |
| 6819 | unsigned VF = VecTy->getNumElements() / Factor; |
| 6820 | Type *ScalarTy = VecTy->getElementType(); |
| 6821 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. |
| 6822 | if (!ScalarTy->isIntegerTy()) |
| 6823 | ScalarTy = |
| 6824 | Type::getIntNTy(C&: ScalarTy->getContext(), N: DL.getTypeSizeInBits(Ty: ScalarTy)); |
| 6825 | |
| 6826 | // Get the cost of all the memory operations. |
| 6827 | // FIXME: discount dead loads. |
| 6828 | InstructionCost MemOpCosts = |
| 6829 | getMemoryOpCost(Opcode, Src: VecTy, Alignment, AddressSpace, CostKind); |
| 6830 | |
| 6831 | auto *VT = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF); |
| 6832 | EVT ETy = TLI->getValueType(DL, Ty: VT); |
| 6833 | if (!ETy.isSimple()) |
| 6834 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
| 6835 | Alignment, AddressSpace, CostKind); |
| 6836 | |
| 6837 | // TODO: Complete for other data-types and strides. |
| 6838 | // Each combination of Stride, element bit width and VF results in a different |
| 6839 | // sequence; The cost tables are therefore accessed with: |
| 6840 | // Factor (stride) and VectorType=VFxiN. |
| 6841 | // The Cost accounts only for the shuffle sequence; |
| 6842 | // The cost of the loads/stores is accounted for separately. |
| 6843 | // |
| 6844 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { |
| 6845 | {.ISD: 2, .Type: MVT::v2i8, .Cost: 2}, // (load 4i8 and) deinterleave into 2 x 2i8 |
| 6846 | {.ISD: 2, .Type: MVT::v4i8, .Cost: 2}, // (load 8i8 and) deinterleave into 2 x 4i8 |
| 6847 | {.ISD: 2, .Type: MVT::v8i8, .Cost: 2}, // (load 16i8 and) deinterleave into 2 x 8i8 |
| 6848 | {.ISD: 2, .Type: MVT::v16i8, .Cost: 4}, // (load 32i8 and) deinterleave into 2 x 16i8 |
| 6849 | {.ISD: 2, .Type: MVT::v32i8, .Cost: 6}, // (load 64i8 and) deinterleave into 2 x 32i8 |
| 6850 | |
| 6851 | {.ISD: 2, .Type: MVT::v8i16, .Cost: 6}, // (load 16i16 and) deinterleave into 2 x 8i16 |
| 6852 | {.ISD: 2, .Type: MVT::v16i16, .Cost: 9}, // (load 32i16 and) deinterleave into 2 x 16i16 |
| 6853 | {.ISD: 2, .Type: MVT::v32i16, .Cost: 18}, // (load 64i16 and) deinterleave into 2 x 32i16 |
| 6854 | |
| 6855 | {.ISD: 2, .Type: MVT::v8i32, .Cost: 4}, // (load 16i32 and) deinterleave into 2 x 8i32 |
| 6856 | {.ISD: 2, .Type: MVT::v16i32, .Cost: 8}, // (load 32i32 and) deinterleave into 2 x 16i32 |
| 6857 | {.ISD: 2, .Type: MVT::v32i32, .Cost: 16}, // (load 64i32 and) deinterleave into 2 x 32i32 |
| 6858 | |
| 6859 | {.ISD: 2, .Type: MVT::v4i64, .Cost: 4}, // (load 8i64 and) deinterleave into 2 x 4i64 |
| 6860 | {.ISD: 2, .Type: MVT::v8i64, .Cost: 8}, // (load 16i64 and) deinterleave into 2 x 8i64 |
| 6861 | {.ISD: 2, .Type: MVT::v16i64, .Cost: 16}, // (load 32i64 and) deinterleave into 2 x 16i64 |
| 6862 | {.ISD: 2, .Type: MVT::v32i64, .Cost: 32}, // (load 64i64 and) deinterleave into 2 x 32i64 |
| 6863 | |
| 6864 | {.ISD: 3, .Type: MVT::v2i8, .Cost: 3}, // (load 6i8 and) deinterleave into 3 x 2i8 |
| 6865 | {.ISD: 3, .Type: MVT::v4i8, .Cost: 3}, // (load 12i8 and) deinterleave into 3 x 4i8 |
| 6866 | {.ISD: 3, .Type: MVT::v8i8, .Cost: 6}, // (load 24i8 and) deinterleave into 3 x 8i8 |
| 6867 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 11}, // (load 48i8 and) deinterleave into 3 x 16i8 |
| 6868 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, // (load 96i8 and) deinterleave into 3 x 32i8 |
| 6869 | |
| 6870 | {.ISD: 3, .Type: MVT::v2i16, .Cost: 5}, // (load 6i16 and) deinterleave into 3 x 2i16 |
| 6871 | {.ISD: 3, .Type: MVT::v4i16, .Cost: 7}, // (load 12i16 and) deinterleave into 3 x 4i16 |
| 6872 | {.ISD: 3, .Type: MVT::v8i16, .Cost: 9}, // (load 24i16 and) deinterleave into 3 x 8i16 |
| 6873 | {.ISD: 3, .Type: MVT::v16i16, .Cost: 28}, // (load 48i16 and) deinterleave into 3 x 16i16 |
| 6874 | {.ISD: 3, .Type: MVT::v32i16, .Cost: 56}, // (load 96i16 and) deinterleave into 3 x 32i16 |
| 6875 | |
| 6876 | {.ISD: 3, .Type: MVT::v2i32, .Cost: 3}, // (load 6i32 and) deinterleave into 3 x 2i32 |
| 6877 | {.ISD: 3, .Type: MVT::v4i32, .Cost: 3}, // (load 12i32 and) deinterleave into 3 x 4i32 |
| 6878 | {.ISD: 3, .Type: MVT::v8i32, .Cost: 7}, // (load 24i32 and) deinterleave into 3 x 8i32 |
| 6879 | {.ISD: 3, .Type: MVT::v16i32, .Cost: 14}, // (load 48i32 and) deinterleave into 3 x 16i32 |
| 6880 | {.ISD: 3, .Type: MVT::v32i32, .Cost: 32}, // (load 96i32 and) deinterleave into 3 x 32i32 |
| 6881 | |
| 6882 | {.ISD: 3, .Type: MVT::v2i64, .Cost: 1}, // (load 6i64 and) deinterleave into 3 x 2i64 |
| 6883 | {.ISD: 3, .Type: MVT::v4i64, .Cost: 5}, // (load 12i64 and) deinterleave into 3 x 4i64 |
| 6884 | {.ISD: 3, .Type: MVT::v8i64, .Cost: 10}, // (load 24i64 and) deinterleave into 3 x 8i64 |
| 6885 | {.ISD: 3, .Type: MVT::v16i64, .Cost: 20}, // (load 48i64 and) deinterleave into 3 x 16i64 |
| 6886 | |
| 6887 | {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // (load 8i8 and) deinterleave into 4 x 2i8 |
| 6888 | {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // (load 16i8 and) deinterleave into 4 x 4i8 |
| 6889 | {.ISD: 4, .Type: MVT::v8i8, .Cost: 12}, // (load 32i8 and) deinterleave into 4 x 8i8 |
| 6890 | {.ISD: 4, .Type: MVT::v16i8, .Cost: 24}, // (load 64i8 and) deinterleave into 4 x 16i8 |
| 6891 | {.ISD: 4, .Type: MVT::v32i8, .Cost: 56}, // (load 128i8 and) deinterleave into 4 x 32i8 |
| 6892 | |
| 6893 | {.ISD: 4, .Type: MVT::v2i16, .Cost: 6}, // (load 8i16 and) deinterleave into 4 x 2i16 |
| 6894 | {.ISD: 4, .Type: MVT::v4i16, .Cost: 17}, // (load 16i16 and) deinterleave into 4 x 4i16 |
| 6895 | {.ISD: 4, .Type: MVT::v8i16, .Cost: 33}, // (load 32i16 and) deinterleave into 4 x 8i16 |
| 6896 | {.ISD: 4, .Type: MVT::v16i16, .Cost: 75}, // (load 64i16 and) deinterleave into 4 x 16i16 |
| 6897 | {.ISD: 4, .Type: MVT::v32i16, .Cost: 150}, // (load 128i16 and) deinterleave into 4 x 32i16 |
| 6898 | |
| 6899 | {.ISD: 4, .Type: MVT::v2i32, .Cost: 4}, // (load 8i32 and) deinterleave into 4 x 2i32 |
| 6900 | {.ISD: 4, .Type: MVT::v4i32, .Cost: 8}, // (load 16i32 and) deinterleave into 4 x 4i32 |
| 6901 | {.ISD: 4, .Type: MVT::v8i32, .Cost: 16}, // (load 32i32 and) deinterleave into 4 x 8i32 |
| 6902 | {.ISD: 4, .Type: MVT::v16i32, .Cost: 32}, // (load 64i32 and) deinterleave into 4 x 16i32 |
| 6903 | {.ISD: 4, .Type: MVT::v32i32, .Cost: 68}, // (load 128i32 and) deinterleave into 4 x 32i32 |
| 6904 | |
| 6905 | {.ISD: 4, .Type: MVT::v2i64, .Cost: 6}, // (load 8i64 and) deinterleave into 4 x 2i64 |
| 6906 | {.ISD: 4, .Type: MVT::v4i64, .Cost: 8}, // (load 16i64 and) deinterleave into 4 x 4i64 |
| 6907 | {.ISD: 4, .Type: MVT::v8i64, .Cost: 20}, // (load 32i64 and) deinterleave into 4 x 8i64 |
| 6908 | {.ISD: 4, .Type: MVT::v16i64, .Cost: 40}, // (load 64i64 and) deinterleave into 4 x 16i64 |
| 6909 | |
| 6910 | {.ISD: 6, .Type: MVT::v2i8, .Cost: 6}, // (load 12i8 and) deinterleave into 6 x 2i8 |
| 6911 | {.ISD: 6, .Type: MVT::v4i8, .Cost: 14}, // (load 24i8 and) deinterleave into 6 x 4i8 |
| 6912 | {.ISD: 6, .Type: MVT::v8i8, .Cost: 18}, // (load 48i8 and) deinterleave into 6 x 8i8 |
| 6913 | {.ISD: 6, .Type: MVT::v16i8, .Cost: 43}, // (load 96i8 and) deinterleave into 6 x 16i8 |
| 6914 | {.ISD: 6, .Type: MVT::v32i8, .Cost: 82}, // (load 192i8 and) deinterleave into 6 x 32i8 |
| 6915 | |
| 6916 | {.ISD: 6, .Type: MVT::v2i16, .Cost: 13}, // (load 12i16 and) deinterleave into 6 x 2i16 |
| 6917 | {.ISD: 6, .Type: MVT::v4i16, .Cost: 9}, // (load 24i16 and) deinterleave into 6 x 4i16 |
| 6918 | {.ISD: 6, .Type: MVT::v8i16, .Cost: 39}, // (load 48i16 and) deinterleave into 6 x 8i16 |
| 6919 | {.ISD: 6, .Type: MVT::v16i16, .Cost: 106}, // (load 96i16 and) deinterleave into 6 x 16i16 |
| 6920 | {.ISD: 6, .Type: MVT::v32i16, .Cost: 212}, // (load 192i16 and) deinterleave into 6 x 32i16 |
| 6921 | |
| 6922 | {.ISD: 6, .Type: MVT::v2i32, .Cost: 6}, // (load 12i32 and) deinterleave into 6 x 2i32 |
| 6923 | {.ISD: 6, .Type: MVT::v4i32, .Cost: 15}, // (load 24i32 and) deinterleave into 6 x 4i32 |
| 6924 | {.ISD: 6, .Type: MVT::v8i32, .Cost: 31}, // (load 48i32 and) deinterleave into 6 x 8i32 |
| 6925 | {.ISD: 6, .Type: MVT::v16i32, .Cost: 64}, // (load 96i32 and) deinterleave into 6 x 16i32 |
| 6926 | |
| 6927 | {.ISD: 6, .Type: MVT::v2i64, .Cost: 6}, // (load 12i64 and) deinterleave into 6 x 2i64 |
| 6928 | {.ISD: 6, .Type: MVT::v4i64, .Cost: 18}, // (load 24i64 and) deinterleave into 6 x 4i64 |
| 6929 | {.ISD: 6, .Type: MVT::v8i64, .Cost: 36}, // (load 48i64 and) deinterleave into 6 x 8i64 |
| 6930 | |
| 6931 | {.ISD: 8, .Type: MVT::v8i32, .Cost: 40} // (load 64i32 and) deinterleave into 8 x 8i32 |
| 6932 | }; |
| 6933 | |
  // Cost of (load +) deinterleave when only SSSE3 (pshufb) is available;
  // entries are keyed on {interleave factor, per-member vector type}.
  static const CostTblEntry SSSE3InterleavedLoadTbl[] = {
      {.ISD: 2, .Type: MVT::v4i16, .Cost: 2}, // (load 8i16 and) deinterleave into 2 x 4i16
  };
| 6937 | |
  // Cost of (load +) deinterleave with only baseline SSE2 shuffles;
  // entries are keyed on {interleave factor, per-member vector type}.
  static const CostTblEntry SSE2InterleavedLoadTbl[] = {
      {.ISD: 2, .Type: MVT::v2i16, .Cost: 2}, // (load 4i16 and) deinterleave into 2 x 2i16
      {.ISD: 2, .Type: MVT::v4i16, .Cost: 7}, // (load 8i16 and) deinterleave into 2 x 4i16

      {.ISD: 2, .Type: MVT::v2i32, .Cost: 2}, // (load 4i32 and) deinterleave into 2 x 2i32
      {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // (load 8i32 and) deinterleave into 2 x 4i32

      {.ISD: 2, .Type: MVT::v2i64, .Cost: 2}, // (load 4i64 and) deinterleave into 2 x 2i64
  };
| 6947 | |
  // Cost of interleave (+ store) on AVX2-capable targets; entries are keyed
  // on {interleave factor, per-member vector type}. Wider-than-legal types
  // scale roughly linearly with the number of legal-width pieces.
  static const CostTblEntry AVX2InterleavedStoreTbl[] = {
      {.ISD: 2, .Type: MVT::v16i8, .Cost: 3}, // interleave 2 x 16i8 into 32i8 (and store)
      {.ISD: 2, .Type: MVT::v32i8, .Cost: 4}, // interleave 2 x 32i8 into 64i8 (and store)

      {.ISD: 2, .Type: MVT::v8i16, .Cost: 3},  // interleave 2 x 8i16 into 16i16 (and store)
      {.ISD: 2, .Type: MVT::v16i16, .Cost: 4}, // interleave 2 x 16i16 into 32i16 (and store)
      {.ISD: 2, .Type: MVT::v32i16, .Cost: 8}, // interleave 2 x 32i16 into 64i16 (and store)

      {.ISD: 2, .Type: MVT::v4i32, .Cost: 2},   // interleave 2 x 4i32 into 8i32 (and store)
      {.ISD: 2, .Type: MVT::v8i32, .Cost: 4},   // interleave 2 x 8i32 into 16i32 (and store)
      {.ISD: 2, .Type: MVT::v16i32, .Cost: 8},  // interleave 2 x 16i32 into 32i32 (and store)
      {.ISD: 2, .Type: MVT::v32i32, .Cost: 16}, // interleave 2 x 32i32 into 64i32 (and store)

      {.ISD: 2, .Type: MVT::v2i64, .Cost: 2},   // interleave 2 x 2i64 into 4i64 (and store)
      {.ISD: 2, .Type: MVT::v4i64, .Cost: 4},   // interleave 2 x 4i64 into 8i64 (and store)
      {.ISD: 2, .Type: MVT::v8i64, .Cost: 8},   // interleave 2 x 8i64 into 16i64 (and store)
      {.ISD: 2, .Type: MVT::v16i64, .Cost: 16}, // interleave 2 x 16i64 into 32i64 (and store)
      {.ISD: 2, .Type: MVT::v32i64, .Cost: 32}, // interleave 2 x 32i64 into 64i64 (and store)

      {.ISD: 3, .Type: MVT::v2i8, .Cost: 4},   // interleave 3 x 2i8 into 6i8 (and store)
      {.ISD: 3, .Type: MVT::v4i8, .Cost: 4},   // interleave 3 x 4i8 into 12i8 (and store)
      {.ISD: 3, .Type: MVT::v8i8, .Cost: 6},   // interleave 3 x 8i8 into 24i8 (and store)
      {.ISD: 3, .Type: MVT::v16i8, .Cost: 11}, // interleave 3 x 16i8 into 48i8 (and store)
      {.ISD: 3, .Type: MVT::v32i8, .Cost: 13}, // interleave 3 x 32i8 into 96i8 (and store)

      {.ISD: 3, .Type: MVT::v2i16, .Cost: 4},   // interleave 3 x 2i16 into 6i16 (and store)
      {.ISD: 3, .Type: MVT::v4i16, .Cost: 6},   // interleave 3 x 4i16 into 12i16 (and store)
      {.ISD: 3, .Type: MVT::v8i16, .Cost: 12},  // interleave 3 x 8i16 into 24i16 (and store)
      {.ISD: 3, .Type: MVT::v16i16, .Cost: 27}, // interleave 3 x 16i16 into 48i16 (and store)
      {.ISD: 3, .Type: MVT::v32i16, .Cost: 54}, // interleave 3 x 32i16 into 96i16 (and store)

      {.ISD: 3, .Type: MVT::v2i32, .Cost: 4},   // interleave 3 x 2i32 into 6i32 (and store)
      {.ISD: 3, .Type: MVT::v4i32, .Cost: 5},   // interleave 3 x 4i32 into 12i32 (and store)
      {.ISD: 3, .Type: MVT::v8i32, .Cost: 11},  // interleave 3 x 8i32 into 24i32 (and store)
      {.ISD: 3, .Type: MVT::v16i32, .Cost: 22}, // interleave 3 x 16i32 into 48i32 (and store)
      {.ISD: 3, .Type: MVT::v32i32, .Cost: 48}, // interleave 3 x 32i32 into 96i32 (and store)

      {.ISD: 3, .Type: MVT::v2i64, .Cost: 4},   // interleave 3 x 2i64 into 6i64 (and store)
      {.ISD: 3, .Type: MVT::v4i64, .Cost: 6},   // interleave 3 x 4i64 into 12i64 (and store)
      {.ISD: 3, .Type: MVT::v8i64, .Cost: 12},  // interleave 3 x 8i64 into 24i64 (and store)
      {.ISD: 3, .Type: MVT::v16i64, .Cost: 24}, // interleave 3 x 16i64 into 48i64 (and store)

      {.ISD: 4, .Type: MVT::v2i8, .Cost: 4},   // interleave 4 x 2i8 into 8i8 (and store)
      {.ISD: 4, .Type: MVT::v4i8, .Cost: 4},   // interleave 4 x 4i8 into 16i8 (and store)
      {.ISD: 4, .Type: MVT::v8i8, .Cost: 4},   // interleave 4 x 8i8 into 32i8 (and store)
      {.ISD: 4, .Type: MVT::v16i8, .Cost: 8},  // interleave 4 x 16i8 into 64i8 (and store)
      {.ISD: 4, .Type: MVT::v32i8, .Cost: 12}, // interleave 4 x 32i8 into 128i8 (and store)

      {.ISD: 4, .Type: MVT::v2i16, .Cost: 2},   // interleave 4 x 2i16 into 8i16 (and store)
      {.ISD: 4, .Type: MVT::v4i16, .Cost: 6},   // interleave 4 x 4i16 into 16i16 (and store)
      {.ISD: 4, .Type: MVT::v8i16, .Cost: 10},  // interleave 4 x 8i16 into 32i16 (and store)
      {.ISD: 4, .Type: MVT::v16i16, .Cost: 32}, // interleave 4 x 16i16 into 64i16 (and store)
      {.ISD: 4, .Type: MVT::v32i16, .Cost: 64}, // interleave 4 x 32i16 into 128i16 (and store)

      {.ISD: 4, .Type: MVT::v2i32, .Cost: 5},   // interleave 4 x 2i32 into 8i32 (and store)
      {.ISD: 4, .Type: MVT::v4i32, .Cost: 6},   // interleave 4 x 4i32 into 16i32 (and store)
      {.ISD: 4, .Type: MVT::v8i32, .Cost: 16},  // interleave 4 x 8i32 into 32i32 (and store)
      {.ISD: 4, .Type: MVT::v16i32, .Cost: 32}, // interleave 4 x 16i32 into 64i32 (and store)
      {.ISD: 4, .Type: MVT::v32i32, .Cost: 64}, // interleave 4 x 32i32 into 128i32 (and store)

      {.ISD: 4, .Type: MVT::v2i64, .Cost: 6},   // interleave 4 x 2i64 into 8i64 (and store)
      {.ISD: 4, .Type: MVT::v4i64, .Cost: 8},   // interleave 4 x 4i64 into 16i64 (and store)
      {.ISD: 4, .Type: MVT::v8i64, .Cost: 20},  // interleave 4 x 8i64 into 32i64 (and store)
      {.ISD: 4, .Type: MVT::v16i64, .Cost: 40}, // interleave 4 x 16i64 into 64i64 (and store)

      {.ISD: 6, .Type: MVT::v2i8, .Cost: 7},   // interleave 6 x 2i8 into 12i8 (and store)
      {.ISD: 6, .Type: MVT::v4i8, .Cost: 9},   // interleave 6 x 4i8 into 24i8 (and store)
      {.ISD: 6, .Type: MVT::v8i8, .Cost: 16},  // interleave 6 x 8i8 into 48i8 (and store)
      {.ISD: 6, .Type: MVT::v16i8, .Cost: 27}, // interleave 6 x 16i8 into 96i8 (and store)
      {.ISD: 6, .Type: MVT::v32i8, .Cost: 90}, // interleave 6 x 32i8 into 192i8 (and store)

      {.ISD: 6, .Type: MVT::v2i16, .Cost: 10},  // interleave 6 x 2i16 into 12i16 (and store)
      {.ISD: 6, .Type: MVT::v4i16, .Cost: 15},  // interleave 6 x 4i16 into 24i16 (and store)
      {.ISD: 6, .Type: MVT::v8i16, .Cost: 21},  // interleave 6 x 8i16 into 48i16 (and store)
      {.ISD: 6, .Type: MVT::v16i16, .Cost: 58}, // interleave 6 x 16i16 into 96i16 (and store)
      {.ISD: 6, .Type: MVT::v32i16, .Cost: 90}, // interleave 6 x 32i16 into 192i16 (and store)

      {.ISD: 6, .Type: MVT::v2i32, .Cost: 9},   // interleave 6 x 2i32 into 12i32 (and store)
      {.ISD: 6, .Type: MVT::v4i32, .Cost: 12},  // interleave 6 x 4i32 into 24i32 (and store)
      {.ISD: 6, .Type: MVT::v8i32, .Cost: 33},  // interleave 6 x 8i32 into 48i32 (and store)
      {.ISD: 6, .Type: MVT::v16i32, .Cost: 66}, // interleave 6 x 16i32 into 96i32 (and store)

      {.ISD: 6, .Type: MVT::v2i64, .Cost: 8},  // interleave 6 x 2i64 into 12i64 (and store)
      {.ISD: 6, .Type: MVT::v4i64, .Cost: 15}, // interleave 6 x 4i64 into 24i64 (and store)
      {.ISD: 6, .Type: MVT::v8i64, .Cost: 30}, // interleave 6 x 8i64 into 48i64 (and store)
  };
| 7034 | |
  // Cost of interleave (+ store) with only baseline SSE2 shuffles;
  // entries are keyed on {interleave factor, per-member vector type}.
  static const CostTblEntry SSE2InterleavedStoreTbl[] = {
      {.ISD: 2, .Type: MVT::v2i8, .Cost: 1}, // interleave 2 x 2i8 into 4i8 (and store)
      {.ISD: 2, .Type: MVT::v4i8, .Cost: 1}, // interleave 2 x 4i8 into 8i8 (and store)
      {.ISD: 2, .Type: MVT::v8i8, .Cost: 1}, // interleave 2 x 8i8 into 16i8 (and store)

      {.ISD: 2, .Type: MVT::v2i16, .Cost: 1}, // interleave 2 x 2i16 into 4i16 (and store)
      {.ISD: 2, .Type: MVT::v4i16, .Cost: 1}, // interleave 2 x 4i16 into 8i16 (and store)

      {.ISD: 2, .Type: MVT::v2i32, .Cost: 1}, // interleave 2 x 2i32 into 4i32 (and store)
  };
| 7045 | |
  if (Opcode == Instruction::Load) {
    // Loads may request only a subset of the group's members (Indices), so
    // scale the full-group table cost by NumMembers/Factor, rounding up.
    auto GetDiscountedCost = [Factor, NumMembers = Indices.size(),
                              MemOpCosts](const CostTblEntry *Entry) {
      // NOTE: this is just an approximation!
      // It can over/under -estimate the cost!
      return MemOpCosts + divideCeil(Numerator: NumMembers * Entry->Cost, Denominator: Factor);
    };

    // Probe the tables from the newest feature level down so the most
    // capable matching table wins.
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(Table: AVX2InterleavedLoadTbl, ISD: Factor,
                                              Ty: ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(Table: SSSE3InterleavedLoadTbl, ISD: Factor,
                                              Ty: ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(Table: SSE2InterleavedLoadTbl, ISD: Factor,
                                              Ty: ETy.getSimpleVT()))
        return GetDiscountedCost(Entry);
  } else {
    assert(Opcode == Instruction::Store &&
           "Expected Store Instruction at this point" );
    assert((!Indices.size() || Indices.size() == Factor) &&
           "Interleaved store only supports fully-interleaved groups." );
    // Stores always write the whole group (asserted above), so the table
    // cost is used as-is with no per-member discount.
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(Table: AVX2InterleavedStoreTbl, ISD: Factor,
                                              Ty: ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(Table: SSE2InterleavedStoreTbl, ISD: Factor,
                                              Ty: ETy.getSimpleVT()))
        return MemOpCosts + Entry->Cost;
  }

  // No table entry matched: fall back to the target-independent estimate.
  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}
| 7088 | |
| 7089 | InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
| 7090 | StackOffset BaseOffset, |
| 7091 | bool HasBaseReg, int64_t Scale, |
| 7092 | unsigned AddrSpace) const { |
| 7093 | // Scaling factors are not free at all. |
| 7094 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), |
| 7095 | // will take 2 allocations in the out of order engine instead of 1 |
| 7096 | // for plain addressing mode, i.e. inst (reg1). |
| 7097 | // E.g., |
| 7098 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 |
| 7099 | // Requires two allocations (one for the load, one for the computation) |
| 7100 | // whereas: |
| 7101 | // vaddps (%rsi), %ymm0, %ymm1 |
| 7102 | // Requires just 1 allocation, i.e., freeing allocations for other operations |
| 7103 | // and having less micro operations to execute. |
| 7104 | // |
| 7105 | // For some X86 architectures, this is even worse because for instance for |
| 7106 | // stores, the complex addressing mode forces the instruction to use the |
| 7107 | // "load" ports instead of the dedicated "store" port. |
| 7108 | // E.g., on Haswell: |
| 7109 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. |
| 7110 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. |
| 7111 | TargetLoweringBase::AddrMode AM; |
| 7112 | AM.BaseGV = BaseGV; |
| 7113 | AM.BaseOffs = BaseOffset.getFixed(); |
| 7114 | AM.HasBaseReg = HasBaseReg; |
| 7115 | AM.Scale = Scale; |
| 7116 | AM.ScalableOffset = BaseOffset.getScalable(); |
| 7117 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
| 7118 | // Scale represents reg2 * scale, thus account for 1 |
| 7119 | // as soon as we use a second register. |
| 7120 | return AM.Scale != 0; |
| 7121 | return InstructionCost::getInvalid(); |
| 7122 | } |
| 7123 | |
| 7124 | InstructionCost X86TTIImpl::getBranchMispredictPenalty() const { |
| 7125 | // TODO: Hook MispredictPenalty of SchedMachineModel into this. |
| 7126 | return 14; |
| 7127 | } |
| 7128 | |
| 7129 | bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const { |
| 7130 | unsigned Bits = Ty->getScalarSizeInBits(); |
| 7131 | |
| 7132 | // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. |
| 7133 | // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. |
| 7134 | if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) |
| 7135 | return false; |
| 7136 | |
| 7137 | // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable |
| 7138 | // shifts just as cheap as scalar ones. |
| 7139 | if (ST->hasAVX2() && (Bits == 32 || Bits == 64)) |
| 7140 | return false; |
| 7141 | |
| 7142 | // AVX512BW has shifts such as vpsllvw. |
| 7143 | if (ST->hasBWI() && Bits == 16) |
| 7144 | return false; |
| 7145 | |
| 7146 | // Otherwise, it's significantly cheaper to shift by a scalar amount than by a |
| 7147 | // fully general vector. |
| 7148 | return true; |
| 7149 | } |
| 7150 | |
| 7151 | unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, |
| 7152 | Type *ScalarValTy) const { |
| 7153 | if (ST->hasF16C() && ScalarMemTy->isHalfTy()) { |
| 7154 | return 4; |
| 7155 | } |
| 7156 | return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); |
| 7157 | } |
| 7158 | |
/// Return true when one or more operands of \p I should be sunk into I's
/// block so instruction selection can fold the matched pattern; the sunk
/// uses are appended to \p Ops.
bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I,
                                            SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  // Only fixed-width vector instructions are of interest here.
  FixedVectorType *VTy = dyn_cast<FixedVectorType>(Val: I->getType());
  if (!VTy)
    return false;

  // vXi64 multiplies whose operands are extended from i32 can be selected as
  // PMULDQ/PMULUDQ when the extension is visible next to the multiply.
  if (I->getOpcode() == Instruction::Mul &&
      VTy->getElementType()->isIntegerTy(Bitwidth: 64)) {
    for (auto &Op : I->operands()) {
      // Make sure we are not already sinking this operand
      if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
        continue;

      // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
      // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
      if (ST->hasSSE41() &&
          match(V: Op.get(), P: m_AShr(L: m_Shl(L: m_Value(), R: m_SpecificInt(V: 32)),
                                R: m_SpecificInt(V: 32)))) {
        // sext_inreg spelled as (ashr (shl x, 32), 32): sink both the inner
        // shl and the ashr so both fold with the multiply.
        Ops.push_back(Elt: &cast<Instruction>(Val&: Op)->getOperandUse(i: 0));
        Ops.push_back(Elt: &Op);
      } else if (ST->hasSSE2() &&
                 match(V: Op.get(),
                       P: m_And(L: m_Value(), R: m_SpecificInt(UINT64_C(0xffffffff))))) {
        // zext_inreg spelled as (and x, 0xffffffff) for PMULUDQ.
        Ops.push_back(Elt: &Op);
      }
    }

    // Profitable iff at least one operand was queued for sinking.
    return !Ops.empty();
  }

  // A uniform shift amount in a vector shift or funnel shift may be much
  // cheaper than a generic variable vector shift, so make that pattern visible
  // to SDAG by sinking the shuffle instruction next to the shift.
  int ShiftAmountOpNum = -1;
  if (I->isShift())
    ShiftAmountOpNum = 1;
  else if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
    // For funnel shifts, the shift amount is the third operand.
    if (II->getIntrinsicID() == Intrinsic::fshl ||
        II->getIntrinsicID() == Intrinsic::fshr)
      ShiftAmountOpNum = 2;
  }

  // Not a (funnel) shift: nothing to sink.
  if (ShiftAmountOpNum == -1)
    return false;

  // Sink the splat shuffle feeding the shift amount, but only when a scalar
  // shift amount actually is cheaper for this type on this subtarget.
  auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: I->getOperand(i: ShiftAmountOpNum));
  if (Shuf && getSplatIndex(Mask: Shuf->getShuffleMask()) >= 0 &&
      isVectorShiftByScalarCheap(Ty: I->getType())) {
    Ops.push_back(Elt: &I->getOperandUse(i: ShiftAmountOpNum));
    return true;
  }

  return false;
}
| 7215 | |