1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This file implements a TargetTransformInfo analysis pass specific to the |
10 | /// X86 target machine. It uses the target's detailed information to provide |
11 | /// more precise answers to certain TTI queries, while letting the target |
12 | /// independent and default TTI implementations handle the rest. |
13 | /// |
14 | //===----------------------------------------------------------------------===// |
15 | /// A note on the cost model numbers used below: |
16 | /// the numbers correspond to some "generic" X86 CPU rather than to a |
17 | /// specific CPU model. Usually the numbers correspond to the CPU where the |
18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in |
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU |
20 | /// to support that feature level and thus has most likely the worst case cost, |
21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). |
22 | /// |
23 | /// Some examples of other technologies/CPUs: |
24 | /// SSE 3 - Pentium4 / Athlon64 |
25 | /// SSE 4.1 - Penryn |
26 | /// SSE 4.2 - Nehalem / Silvermont |
27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer |
28 | /// AVX2 - Haswell / Ryzen |
29 | /// AVX-512 - Xeon Phi / Skylake |
30 | /// |
31 | /// And some examples of instruction target dependent costs (latency) |
32 | /// divss sqrtss rsqrtss |
33 | /// AMD K7 11-16 19 3 |
34 | /// Piledriver 9-24 13-15 5 |
35 | /// Jaguar 14 16 2 |
36 | /// Pentium II,III 18 30 2 |
37 | /// Nehalem 7-14 7-18 3 |
38 | /// Haswell 10-13 11 5 |
39 | /// |
40 | /// Interpreting the 4 TargetCostKind types: |
41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case |
42 | /// values reported by the CPU scheduler models (and llvm-mca). |
43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the |
44 | /// actual encoding size of the instruction. |
45 | /// TCK_SizeAndLatency should match the worst case micro-op counts reported |
46 | /// by the CPU scheduler models (and llvm-mca), to ensure that they are |
47 | /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are |
48 | /// often used as the cost thresholds where TCK_SizeAndLatency is requested. |
49 | //===----------------------------------------------------------------------===// |
50 | |
51 | #include "X86TargetTransformInfo.h" |
52 | #include "llvm/ADT/SmallBitVector.h" |
53 | #include "llvm/Analysis/TargetTransformInfo.h" |
54 | #include "llvm/CodeGen/BasicTTIImpl.h" |
55 | #include "llvm/CodeGen/CostTable.h" |
56 | #include "llvm/CodeGen/TargetLowering.h" |
57 | #include "llvm/IR/InstIterator.h" |
58 | #include "llvm/IR/IntrinsicInst.h" |
59 | #include <optional> |
60 | |
61 | using namespace llvm; |
62 | |
63 | #define DEBUG_TYPE "x86tti" |
64 | |
65 | //===----------------------------------------------------------------------===// |
66 | // |
67 | // X86 cost model. |
68 | // |
69 | //===----------------------------------------------------------------------===// |
70 | |
71 | // Helper struct to store/access costs for each cost kind. |
72 | // TODO: Move this to allow other targets to use it? |
73 | struct CostKindCosts { |
74 | unsigned RecipThroughputCost = ~0U; |
75 | unsigned LatencyCost = ~0U; |
76 | unsigned CodeSizeCost = ~0U; |
77 | unsigned SizeAndLatencyCost = ~0U; |
78 | |
79 | std::optional<unsigned> |
80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { |
81 | unsigned Cost = ~0U; |
82 | switch (Kind) { |
83 | case TargetTransformInfo::TCK_RecipThroughput: |
84 | Cost = RecipThroughputCost; |
85 | break; |
86 | case TargetTransformInfo::TCK_Latency: |
87 | Cost = LatencyCost; |
88 | break; |
89 | case TargetTransformInfo::TCK_CodeSize: |
90 | Cost = CodeSizeCost; |
91 | break; |
92 | case TargetTransformInfo::TCK_SizeAndLatency: |
93 | Cost = SizeAndLatencyCost; |
94 | break; |
95 | } |
96 | if (Cost == ~0U) |
97 | return std::nullopt; |
98 | return Cost; |
99 | } |
100 | }; |
101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; |
102 | using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>; |
103 | |
104 | TargetTransformInfo::PopcntSupportKind |
105 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) const { |
106 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
107 | // TODO: Currently the __builtin_popcount() implementation using SSE3 |
108 | // instructions is inefficient. Once the problem is fixed, we should |
109 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). |
110 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; |
111 | } |
112 | |
113 | std::optional<unsigned> X86TTIImpl::getCacheSize( |
114 | TargetTransformInfo::CacheLevel Level) const { |
115 | switch (Level) { |
116 | case TargetTransformInfo::CacheLevel::L1D: |
117 | // - Penryn |
118 | // - Nehalem |
119 | // - Westmere |
120 | // - Sandy Bridge |
121 | // - Ivy Bridge |
122 | // - Haswell |
123 | // - Broadwell |
124 | // - Skylake |
125 | // - Kabylake |
126 | return 32 * 1024; // 32 KiB |
127 | case TargetTransformInfo::CacheLevel::L2D: |
128 | // - Penryn |
129 | // - Nehalem |
130 | // - Westmere |
131 | // - Sandy Bridge |
132 | // - Ivy Bridge |
133 | // - Haswell |
134 | // - Broadwell |
135 | // - Skylake |
136 | // - Kabylake |
137 | return 256 * 1024; // 256 KiB |
138 | } |
139 | |
140 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
141 | } |
142 | |
143 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( |
144 | TargetTransformInfo::CacheLevel Level) const { |
145 | // - Penryn |
146 | // - Nehalem |
147 | // - Westmere |
148 | // - Sandy Bridge |
149 | // - Ivy Bridge |
150 | // - Haswell |
151 | // - Broadwell |
152 | // - Skylake |
153 | // - Kabylake |
154 | switch (Level) { |
155 | case TargetTransformInfo::CacheLevel::L1D: |
156 | [[fallthrough]]; |
157 | case TargetTransformInfo::CacheLevel::L2D: |
158 | return 8; |
159 | } |
160 | |
161 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
162 | } |
163 | |
164 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
165 | bool Vector = (ClassID == 1); |
166 | if (Vector && !ST->hasSSE1()) |
167 | return 0; |
168 | |
169 | if (ST->is64Bit()) { |
170 | if (Vector && ST->hasAVX512()) |
171 | return 32; |
172 | if (!Vector && ST->hasEGPR()) |
173 | return 32; |
174 | return 16; |
175 | } |
176 | return 8; |
177 | } |
178 | |
179 | bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty, bool IsStore) const { |
180 | if (!ST->hasCF()) |
181 | return false; |
182 | if (!Ty) |
183 | return true; |
184 | // Conditional faulting is supported by CFCMOV, which only accepts |
185 | // 16/32/64-bit operands. |
186 | // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's |
187 | // profitable. |
188 | auto *VTy = dyn_cast<FixedVectorType>(Val: Ty); |
189 | if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1)) |
190 | return false; |
191 | auto *ScalarTy = Ty->getScalarType(); |
192 | switch (cast<IntegerType>(Val: ScalarTy)->getBitWidth()) { |
193 | default: |
194 | return false; |
195 | case 16: |
196 | case 32: |
197 | case 64: |
198 | return true; |
199 | } |
200 | } |
201 | |
202 | TypeSize |
203 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
204 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); |
205 | switch (K) { |
206 | case TargetTransformInfo::RGK_Scalar: |
207 | return TypeSize::getFixed(ExactSize: ST->is64Bit() ? 64 : 32); |
208 | case TargetTransformInfo::RGK_FixedWidthVector: |
209 | if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) |
210 | return TypeSize::getFixed(ExactSize: 512); |
211 | if (ST->hasAVX() && PreferVectorWidth >= 256) |
212 | return TypeSize::getFixed(ExactSize: 256); |
213 | if (ST->hasSSE1() && PreferVectorWidth >= 128) |
214 | return TypeSize::getFixed(ExactSize: 128); |
215 | return TypeSize::getFixed(ExactSize: 0); |
216 | case TargetTransformInfo::RGK_ScalableVector: |
217 | return TypeSize::getScalable(MinimumSize: 0); |
218 | } |
219 | |
220 | llvm_unreachable("Unsupported register kind" ); |
221 | } |
222 | |
223 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { |
224 | return getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
225 | .getFixedValue(); |
226 | } |
227 | |
228 | unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) const { |
229 | // If the loop will not be vectorized, don't interleave the loop. |
230 | // Let regular unroll to unroll the loop, which saves the overflow |
231 | // check and memory check cost. |
232 | if (VF.isScalar()) |
233 | return 1; |
234 | |
235 | if (ST->isAtom()) |
236 | return 1; |
237 | |
238 | // Sandybridge and Haswell have multiple execution ports and pipelined |
239 | // vector units. |
240 | if (ST->hasAVX()) |
241 | return 4; |
242 | |
243 | return 2; |
244 | } |
245 | |
246 | InstructionCost X86TTIImpl::getArithmeticInstrCost( |
247 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
248 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
249 | ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
250 | |
251 | // vXi8 multiplications are always promoted to vXi16. |
252 | // Sub-128-bit types can be extended/packed more efficiently. |
253 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && |
254 | Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) { |
255 | Type *WideVecTy = |
256 | VectorType::getExtendedElementVectorType(VTy: cast<VectorType>(Val: Ty)); |
257 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: Ty, |
258 | CCH: TargetTransformInfo::CastContextHint::None, |
259 | CostKind) + |
260 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: Ty, Src: WideVecTy, |
261 | CCH: TargetTransformInfo::CastContextHint::None, |
262 | CostKind) + |
263 | getArithmeticInstrCost(Opcode, Ty: WideVecTy, CostKind, Op1Info, Op2Info); |
264 | } |
265 | |
266 | // Legalize the type. |
267 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
268 | |
269 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
270 | assert(ISD && "Invalid opcode" ); |
271 | |
272 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && |
273 | (LT.second.getScalarType() == MVT::i32 || |
274 | LT.second.getScalarType() == MVT::i64)) { |
275 | // Check if the operands can be represented as a smaller datatype. |
276 | bool Op1Signed = false, Op2Signed = false; |
277 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Val: Args[0], isSigned&: Op1Signed); |
278 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Val: Args[1], isSigned&: Op2Signed); |
279 | unsigned OpMinSize = std::max(a: Op1MinSize, b: Op2MinSize); |
280 | bool SignedMode = Op1Signed || Op2Signed; |
281 | |
282 | // If both vXi32 are representable as i15 and at least one is constant, |
283 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we |
284 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. |
285 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow() && |
286 | LT.second.getScalarType() == MVT::i32) { |
287 | bool Op1Constant = |
288 | isa<ConstantDataVector>(Val: Args[0]) || isa<ConstantVector>(Val: Args[0]); |
289 | bool Op2Constant = |
290 | isa<ConstantDataVector>(Val: Args[1]) || isa<ConstantVector>(Val: Args[1]); |
291 | bool Op1Sext = isa<SExtInst>(Val: Args[0]) && |
292 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); |
293 | bool Op2Sext = isa<SExtInst>(Val: Args[1]) && |
294 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); |
295 | |
296 | bool IsZeroExtended = !Op1Signed || !Op2Signed; |
297 | bool IsConstant = Op1Constant || Op2Constant; |
298 | bool IsSext = Op1Sext || Op2Sext; |
299 | if (IsConstant || IsZeroExtended || IsSext) |
300 | LT.second = |
301 | MVT::getVectorVT(VT: MVT::i16, NumElements: 2 * LT.second.getVectorNumElements()); |
302 | } |
303 | |
304 | // Check if the vXi32 operands can be shrunk into a smaller datatype. |
305 | // This should match the codegen from reduceVMULWidth. |
306 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). |
307 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { |
308 | if (OpMinSize <= 7) |
309 | return LT.first * 3; // pmullw/sext |
310 | if (!SignedMode && OpMinSize <= 8) |
311 | return LT.first * 3; // pmullw/zext |
312 | if (OpMinSize <= 15) |
313 | return LT.first * 5; // pmullw/pmulhw/pshuf |
314 | if (!SignedMode && OpMinSize <= 16) |
315 | return LT.first * 5; // pmullw/pmulhw/pshuf |
316 | } |
317 | |
318 | // If both vXi64 are representable as (unsigned) i32, then we can perform |
319 | // the multiply with a single PMULUDQ instruction. |
320 | // TODO: Add (SSE41+) PMULDQ handling for signed extensions. |
321 | if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64) |
322 | ISD = X86ISD::PMULUDQ; |
323 | } |
324 | |
325 | // Vector multiply by pow2 will be simplified to shifts. |
326 | // Vector multiply by -pow2 will be simplified to shifts/negates. |
327 | if (ISD == ISD::MUL && Op2Info.isConstant() && |
328 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { |
329 | InstructionCost Cost = |
330 | getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind, |
331 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
332 | if (Op2Info.isNegatedPowerOf2()) |
333 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind); |
334 | return Cost; |
335 | } |
336 | |
337 | // On X86, vector signed division by constants power-of-two are |
338 | // normally expanded to the sequence SRA + SRL + ADD + SRA. |
339 | // The OperandValue properties may not be the same as that of the previous |
340 | // operation; conservatively assume OP_None. |
341 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && |
342 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
343 | InstructionCost Cost = |
344 | 2 * getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
345 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
346 | Cost += getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
347 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
348 | Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
349 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
350 | |
351 | if (ISD == ISD::SREM) { |
352 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) |
353 | Cost += getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
354 | Op2Info: Op2Info.getNoProps()); |
355 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
356 | Op2Info: Op2Info.getNoProps()); |
357 | } |
358 | |
359 | return Cost; |
360 | } |
361 | |
362 | // Vector unsigned division/remainder will be simplified to shifts/masks. |
363 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && |
364 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
365 | if (ISD == ISD::UDIV) |
366 | return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
367 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
368 | // UREM |
369 | return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, |
370 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
371 | } |
372 | |
373 | static const CostKindTblEntry GFNIUniformConstCostTable[] = { |
374 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
375 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
376 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
377 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
378 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
379 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
380 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
381 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
382 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
383 | }; |
384 | |
385 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI()) |
386 | if (const auto *Entry = |
387 | CostTableLookup(Table: GFNIUniformConstCostTable, ISD, Ty: LT.second)) |
388 | if (auto KindCost = Entry->Cost[CostKind]) |
389 | return LT.first * *KindCost; |
390 | |
391 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { |
392 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
393 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
394 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw, pand, pxor, psubb. |
395 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
396 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
397 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw, pand, pxor, psubb. |
398 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
399 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
400 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
401 | |
402 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
403 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
404 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
405 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
406 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
407 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
408 | }; |
409 | |
410 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) |
411 | if (const auto *Entry = |
412 | CostTableLookup(Table: AVX512BWUniformConstCostTable, ISD, Ty: LT.second)) |
413 | if (auto KindCost = Entry->Cost[CostKind]) |
414 | return LT.first * *KindCost; |
415 | |
416 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { |
417 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 12, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psllw + pand. |
418 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 12, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw + pand. |
419 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 12, .SizeAndLatencyCost: 12 } }, // psrlw, pand, pxor, psubb. |
420 | |
421 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psllw + split. |
422 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psrlw + split. |
423 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psraw + split. |
424 | |
425 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
426 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
427 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
428 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
429 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
430 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
431 | |
432 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
433 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
434 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
435 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
436 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
437 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
438 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
439 | |
440 | { .ISD: ISD::SDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
441 | { .ISD: ISD::SREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
442 | { .ISD: ISD::UDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
443 | { .ISD: ISD::UREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
444 | }; |
445 | |
446 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) |
447 | if (const auto *Entry = |
448 | CostTableLookup(Table: AVX512UniformConstCostTable, ISD, Ty: LT.second)) |
449 | if (auto KindCost = Entry->Cost[CostKind]) |
450 | return LT.first * *KindCost; |
451 | |
452 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { |
453 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
454 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
455 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
456 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // psllw + pand. |
457 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // psrlw + pand. |
458 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // psrlw, pand, pxor, psubb. |
459 | |
460 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
461 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
462 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw |
463 | { .ISD: ISD::SHL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw |
464 | { .ISD: ISD::SRL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw |
465 | { .ISD: ISD::SRA, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw |
466 | |
467 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
468 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
469 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
470 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
471 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld |
472 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad |
473 | |
474 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
475 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
476 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // psrad + shuffle. |
477 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
478 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
479 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // psrad + shuffle + split. |
480 | |
481 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
482 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
483 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
484 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
485 | }; |
486 | |
487 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) |
488 | if (const auto *Entry = |
489 | CostTableLookup(Table: AVX2UniformConstCostTable, ISD, Ty: LT.second)) |
490 | if (auto KindCost = Entry->Cost[CostKind]) |
491 | return LT.first * *KindCost; |
492 | |
493 | static const CostKindTblEntry AVXUniformConstCostTable[] = { |
494 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
495 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
496 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
497 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2*(psllw + pand) + split. |
498 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2*(psrlw + pand) + split. |
499 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 12, .SizeAndLatencyCost: 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. |
500 | |
501 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw. |
502 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw. |
503 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw. |
504 | { .ISD: ISD::SHL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psllw + split. |
505 | { .ISD: ISD::SRL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw + split. |
506 | { .ISD: ISD::SRA, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psraw + split. |
507 | |
508 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld. |
509 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld. |
510 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad. |
511 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // pslld + split. |
512 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrld + split. |
513 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrad + split. |
514 | |
515 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq. |
516 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq. |
517 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // psrad + shuffle. |
518 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // 2 x psllq + split. |
519 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // 2 x psrlq + split. |
520 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, // 2 x psrad + shuffle + split. |
521 | |
522 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmuludq sequence + split. |
523 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmuludq+mul+sub sequence + split. |
524 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 12 } }, // 2*pmuludq sequence + split. |
525 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 16 } }, // 2*pmuludq+mul+sub sequence + split. |
526 | }; |
527 | |
// Look up the cost in AVXUniformConstCostTable. The lookup is attempted only
// when the second operand is both uniform and constant and AVX is available.
528 | // XOP has faster vXi8 shifts. |
// Hence 8-bit element types are excluded from this lookup on XOP targets.
529 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && |
530 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
531 | if (const auto *Entry = |
532 | CostTableLookup(Table: AVXUniformConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
533 | if (auto KindCost = Entry->Cost[CostKind]) |
534 | return LT.first * *KindCost; |
535 | |
// Per-cost-kind costs for 128-bit vector shifts and v4i32 division/remainder,
// consulted below when the second operand is a uniform constant and SSE2 is
// available. Each row is keyed by (ISD opcode, MVT); the trailing comment
// names the instruction sequence the cost is based on.
536 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { |
537 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
538 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
539 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
540 | |
541 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw. |
542 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw. |
543 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw. |
544 | |
545 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld. |
546 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld. |
547 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad. |
548 | |
549 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq. |
550 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq. |
551 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, // 2 x psrad + shuffle. |
552 | |
// The division/remainder rows specify only a reciprocal-throughput cost.
// NOTE(review): presumably the other cost kinds are left unset so those
// queries fall through to later tables - confirm against CostKindTblEntry.
553 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
554 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
555 | { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
556 | { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
557 | }; |
558 | |
559 | // XOP has faster vXi8 shifts. |
// Hence 8-bit element types are excluded from this lookup on XOP targets.
560 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && |
561 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
562 | if (const auto *Entry = |
563 | CostTableLookup(Table: SSE2UniformConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
564 | if (auto KindCost = Entry->Cost[CostKind]) |
565 | return LT.first * *KindCost; |
566 | |
// Costs for v64i8 and v32i16 integer division/remainder, consulted below when
// the second operand is constant (uniformity is not required) and the
// subtarget has BWI. Only a reciprocal-throughput cost is specified per row.
567 | static const CostKindTblEntry AVX512BWConstCostTable[] = { |
568 | { .ISD: ISD::SDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
569 | { .ISD: ISD::SREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
570 | { .ISD: ISD::UDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
571 | { .ISD: ISD::UREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
572 | |
573 | { .ISD: ISD::SDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhw sequence |
574 | { .ISD: ISD::SREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhw+mul+sub sequence |
575 | { .ISD: ISD::UDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhuw sequence |
576 | { .ISD: ISD::UREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhuw+mul+sub sequence |
577 | }; |
578 | |
579 | if (Op2Info.isConstant() && ST->hasBWI()) |
580 | if (const auto *Entry = |
581 | CostTableLookup(Table: AVX512BWConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
582 | if (auto KindCost = Entry->Cost[CostKind]) |
583 | return LT.first * *KindCost; |
584 | |
// Costs for 512-bit integer division/remainder (v64i8, v32i16, v16i32),
// consulted below when the second operand is constant and the subtarget has
// AVX512. This lookup is reached only after the BWI-specific constant lookup
// above did not return. Only a reciprocal-throughput cost is specified per row.
585 | static const CostKindTblEntry AVX512ConstCostTable[] = { |
586 | { .ISD: ISD::SDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 28 } }, // 4*ext+4*pmulhw sequence |
587 | { .ISD: ISD::SREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
588 | { .ISD: ISD::UDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 28 } }, // 4*ext+4*pmulhw sequence |
589 | { .ISD: ISD::UREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
590 | |
591 | { .ISD: ISD::SDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 12 } }, // 2*vpmulhw sequence |
592 | { .ISD: ISD::SREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 16 } }, // 2*vpmulhw+mul+sub sequence |
593 | { .ISD: ISD::UDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 12 } }, // 2*vpmulhuw sequence |
594 | { .ISD: ISD::UREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 16 } }, // 2*vpmulhuw+mul+sub sequence |
595 | |
596 | { .ISD: ISD::SDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuldq sequence |
597 | { .ISD: ISD::SREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 17 } }, // vpmuldq+mul+sub sequence |
598 | { .ISD: ISD::UDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
599 | { .ISD: ISD::UREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 17 } }, // vpmuludq+mul+sub sequence |
600 | }; |
601 | |
602 | if (Op2Info.isConstant() && ST->hasAVX512()) |
603 | if (const auto *Entry = |
604 | CostTableLookup(Table: AVX512ConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
605 | if (auto KindCost = Entry->Cost[CostKind]) |
606 | return LT.first * *KindCost; |
607 | |
// Costs for 256-bit integer division/remainder (v32i8, v16i16, v8i32),
// consulted below when the second operand is constant and the subtarget has
// AVX2. Only a reciprocal-throughput cost is specified per row.
608 | static const CostKindTblEntry AVX2ConstCostTable[] = { |
609 | { .ISD: ISD::SDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
610 | { .ISD: ISD::SREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
611 | { .ISD: ISD::UDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
612 | { .ISD: ISD::UREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
613 | |
614 | { .ISD: ISD::SDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhw sequence |
615 | { .ISD: ISD::SREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhw+mul+sub sequence |
616 | { .ISD: ISD::UDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhuw sequence |
617 | { .ISD: ISD::UREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhuw+mul+sub sequence |
618 | |
619 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuldq sequence |
620 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 19 } }, // vpmuldq+mul+sub sequence |
621 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
622 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 19 } }, // vpmuludq+mul+sub sequence |
623 | }; |
624 | |
625 | if (Op2Info.isConstant() && ST->hasAVX2()) |
626 | if (const auto *Entry = CostTableLookup(Table: AVX2ConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
627 | if (auto KindCost = Entry->Cost[CostKind]) |
628 | return LT.first * *KindCost; |
629 | |
// Costs for 256-bit integer division/remainder (v32i8, v16i16, v8i32),
// consulted below when the second operand is constant and the subtarget has
// AVX. This lookup is reached only after the AVX2 constant lookup above did
// not return. Only a reciprocal-throughput cost is specified per row.
630 | static const CostKindTblEntry AVXConstCostTable[] = { |
631 | { .ISD: ISD::SDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 30 } }, // 4*ext+4*pmulhw sequence + split. |
632 | { .ISD: ISD::SREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
633 | { .ISD: ISD::UDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 30 } }, // 4*ext+4*pmulhw sequence + split. |
634 | { .ISD: ISD::UREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
635 | |
636 | { .ISD: ISD::SDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmulhw sequence + split. |
637 | { .ISD: ISD::SREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmulhw+mul+sub sequence + split. |
638 | { .ISD: ISD::UDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmulhuw sequence + split. |
639 | { .ISD: ISD::UREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmulhuw+mul+sub sequence + split. |
640 | |
// NOTE(review): the SDIV/SREM v8i32 comments below omit the "2* ... + split"
// wording that the UDIV/UREM v8i32 rows and every other row of this table
// use - confirm which lowering these two costs are based on.
641 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 32 } }, // vpmuludq sequence |
642 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 38 } }, // vpmuludq+mul+sub sequence |
643 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 32 } }, // 2*pmuludq sequence + split. |
644 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 42 } }, // 2*pmuludq+mul+sub sequence + split. |
645 | }; |
646 | |
647 | if (Op2Info.isConstant() && ST->hasAVX()) |
648 | if (const auto *Entry = CostTableLookup(Table: AVXConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
649 | if (auto KindCost = Entry->Cost[CostKind]) |
650 | return LT.first * *KindCost; |
651 | |
// Costs for signed v4i32 division/remainder, consulted below when the second
// operand is constant and the subtarget has SSE4.1. Unsigned v4i32 and the
// narrower element types have no rows here, so they fall through to the SSE2
// constant table. Only a reciprocal-throughput cost is specified per row.
// NOTE(review): the row comments name "vpmuludq", while the signed v8i32 and
// v16i32 rows in the AVX2/AVX512 constant tables name "vpmuldq" - presumably
// the signed multiply is meant here as well; confirm.
652 | static const CostKindTblEntry SSE41ConstCostTable[] = { |
653 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
654 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20 } }, // vpmuludq+mul+sub sequence |
655 | }; |
656 | |
657 | if (Op2Info.isConstant() && ST->hasSSE41()) |
658 | if (const auto *Entry = |
659 | CostTableLookup(Table: SSE41ConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
660 | if (auto KindCost = Entry->Cost[CostKind]) |
661 | return LT.first * *KindCost; |
662 | |
// Costs for 128-bit integer division/remainder (v16i8, v8i16, v4i32),
// consulted below when the second operand is constant and the subtarget has
// SSE2. This is the last of the constant-operand lookups in this sequence.
// Only a reciprocal-throughput cost is specified per row.
663 | static const CostKindTblEntry SSE2ConstCostTable[] = { |
664 | { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
665 | { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
666 | { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
667 | { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
668 | |
669 | { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6 } }, // pmulhw sequence |
670 | { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8 } }, // pmulhw+mul+sub sequence |
671 | { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6 } }, // pmulhuw sequence |
672 | { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8 } }, // pmulhuw+mul+sub sequence |
673 | |
674 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19 } }, // pmuludq sequence |
675 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 24 } }, // pmuludq+mul+sub sequence |
676 | { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15 } }, // pmuludq sequence |
677 | { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20 } }, // pmuludq+mul+sub sequence |
678 | }; |
679 | |
680 | if (Op2Info.isConstant() && ST->hasSSE2()) |
681 | if (const auto *Entry = CostTableLookup(Table: SSE2ConstCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
682 | if (auto KindCost = Entry->Cost[CostKind]) |
683 | return LT.first * *KindCost; |
684 | |
// Per-cost-kind costs for vXi8 and v32i16 shifts, consulted below when the
// second operand is uniform (it need not be constant) and the subtarget has
// BWI. Each row is keyed by (ISD opcode, MVT); the trailing comment names the
// instruction sequence the cost is based on.
685 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { |
686 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + pand. |
687 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
688 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4,.LatencyCost: 12, .CodeSizeCost: 8,.SizeAndLatencyCost: 12 } }, // psrlw, pand, pxor, psubb. |
689 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
690 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
691 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
692 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
693 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // psrlw + pand. |
694 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 15 } }, // psrlw, pand, pxor, psubb. |
695 | |
696 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw |
697 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw |
698 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psraw |
699 | }; |
700 | |
701 | if (ST->hasBWI() && Op2Info.isUniform()) |
702 | if (const auto *Entry = |
703 | CostTableLookup(Table: AVX512BWUniformCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
704 | if (auto KindCost = Entry->Cost[CostKind]) |
705 | return LT.first * *KindCost; |
706 | |
// Per-cost-kind costs for v32i16, v16i32 and 64-bit element shifts, consulted
// below when the second operand is uniform and the subtarget has AVX512. This
// lookup is reached only after the BWI uniform lookup above did not return,
// which is why v32i16 is costed here as a split sequence.
707 | static const CostKindTblEntry AVX512UniformCostTable[] = { |
708 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + split. |
709 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrlw + split. |
710 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psraw + split. |
711 | |
712 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // pslld |
713 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrld |
714 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrad |
715 | |
// For v2i64 only SRA has a row here; SHL/SRL v2i64 fall through to the later
// uniform tables.
716 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
717 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
718 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
719 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
720 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
721 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
722 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
723 | }; |
724 | |
725 | if (ST->hasAVX512() && Op2Info.isUniform()) |
726 | if (const auto *Entry = |
727 | CostTableLookup(Table: AVX512UniformCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
728 | if (auto KindCost = Entry->Cost[CostKind]) |
729 | return LT.first * *KindCost; |
730 | |
// Per-cost-kind costs for 128-bit and 256-bit vector shifts, consulted below
// when the second operand is uniform (it need not be constant) and the
// subtarget has AVX2. Each row is keyed by (ISD opcode, MVT).
731 | static const CostKindTblEntry AVX2UniformCostTable[] = { |
732 | // Uniform splats are cheaper for the following instructions. |
733 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + pand. |
734 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
735 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
736 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
737 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
738 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // psrlw, pand, pxor, psubb. |
739 | |
740 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
741 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
742 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
743 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw. |
744 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw. |
745 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psraw. |
746 | |
747 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
748 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld |
749 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad |
750 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // pslld |
751 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrld |
752 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrad |
753 | |
754 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
755 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
756 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2 x psrad + shuffle. |
757 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
758 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
759 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // 2 x psrad + shuffle. |
760 | }; |
761 | |
762 | if (ST->hasAVX2() && Op2Info.isUniform()) |
763 | if (const auto *Entry = |
764 | CostTableLookup(Table: AVX2UniformCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
765 | if (auto KindCost = Entry->Cost[CostKind]) |
766 | return LT.first * *KindCost; |
767 | |
// Per-cost-kind costs for 128-bit and 256-bit vector shifts, consulted below
// when the second operand is uniform and the subtarget has AVX. This lookup
// is reached only after the AVX2 uniform lookup above did not return; the
// 256-bit rows are costed as split sequences per their comments.
768 | static const CostKindTblEntry AVXUniformCostTable[] = { |
769 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
770 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
771 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
772 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 8,.CodeSizeCost: 11,.SizeAndLatencyCost: 14 } }, // psllw + pand + split. |
773 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 9,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // psrlw + pand + split. |
774 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 10,.LatencyCost: 11,.CodeSizeCost: 16,.SizeAndLatencyCost: 21 } }, // psrlw, pand, pxor, psubb + split. |
775 | |
776 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
777 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
778 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
779 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + split. |
780 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrlw + split. |
781 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psraw + split. |
782 | |
783 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld. |
784 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld. |
785 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad. |
786 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // pslld + split. |
787 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrld + split. |
788 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrad + split. |
789 | |
790 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq. |
791 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq. |
792 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2 x psrad + shuffle. |
793 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psllq + split. |
794 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psrlq + split. |
795 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 10,.SizeAndLatencyCost: 13 } }, // 2 x (2 x psrad + shuffle) + split. |
796 | }; |
797 | |
798 | // XOP has faster vXi8 shifts. |
// Hence 8-bit element types are excluded from this lookup on XOP targets.
799 | if (ST->hasAVX() && Op2Info.isUniform() && |
800 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
801 | if (const auto *Entry = |
802 | CostTableLookup(Table: AVXUniformCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
803 | if (auto KindCost = Entry->Cost[CostKind]) |
804 | return LT.first * *KindCost; |
805 | |
// Per-cost-kind costs for 128-bit vector shifts, consulted below when the
// second operand is uniform (it need not be constant) and the subtarget has
// SSE2. This is the last of the uniform-operand lookups in this sequence.
806 | static const CostKindTblEntry SSE2UniformCostTable[] = { |
807 | // Uniform splats are cheaper for the following instructions. |
808 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, // psllw + pand. |
809 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
810 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 15, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // pcmpgtb sequence. |
811 | |
812 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
813 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
814 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
815 | |
816 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld. |
817 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld. |
818 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad. |
819 | |
820 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq. |
821 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq. |
822 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2*psrlq + xor + sub. |
823 | }; |
824 | |
// XOP has faster vXi8 shifts, so 8-bit element types are excluded from this
// lookup on XOP targets (same guard as the AVX uniform lookup above).
825 | if (ST->hasSSE2() && Op2Info.isUniform() && |
826 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
827 | if (const auto *Entry = |
828 | CostTableLookup(Table: SSE2UniformCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
829 | if (auto KindCost = Entry->Cost[CostKind]) |
830 | return LT.first * *KindCost; |
831 | |
// Per-cost-kind costs for 64-bit element vector multiplies (v2i64, v4i64,
// v8i64), consulted below when the subtarget has DQI. Unlike the lookups
// above, this one places no condition on the second operand.
832 | static const CostKindTblEntry AVX512DQCostTable[] = { |
833 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmullq |
834 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmullq |
835 | { .ISD: ISD::MUL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } } // pmullq |
836 | }; |
837 | |
838 | // Look for AVX512DQ lowering tricks for custom cases. |
839 | if (ST->hasDQI()) |
840 | if (const auto *Entry = CostTableLookup(Table: AVX512DQCostTable, ISD, Ty: LT.second)) |
// Return only if the entry yields a cost for the requested CostKind;
// otherwise control falls through to the lookups that follow.
841 | if (auto KindCost = Entry->Cost[CostKind]) |
842 | return LT.first * *KindCost; |
843 | |
844 | static const CostKindTblEntry AVX512BWCostTable[] = { |
845 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsllvw/pack sequence. |
846 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsrlvw/pack sequence. |
847 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsravw/pack sequence. |
848 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 23,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // extend/vpsllvw/pack sequence. |
849 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 30,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // extend/vpsrlvw/pack sequence. |
850 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13,.CodeSizeCost: 24,.SizeAndLatencyCost: 30 } }, // extend/vpsravw/pack sequence. |
851 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 19,.CodeSizeCost: 13,.SizeAndLatencyCost: 15 } }, // extend/vpsllvw/pack sequence. |
852 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 27,.CodeSizeCost: 15,.SizeAndLatencyCost: 18 } }, // extend/vpsrlvw/pack sequence. |
853 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 15,.CodeSizeCost: 30,.SizeAndLatencyCost: 30 } }, // extend/vpsravw/pack sequence. |
854 | |
855 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw |
856 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw |
857 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw |
858 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw |
859 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw |
860 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw |
861 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw |
862 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw |
863 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw |
864 | |
865 | { .ISD: ISD::ADD, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddb |
866 | { .ISD: ISD::ADD, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddw |
867 | |
868 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddb |
869 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddw |
870 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddd |
871 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddq |
872 | |
873 | { .ISD: ISD::SUB, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubb |
874 | { .ISD: ISD::SUB, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubw |
875 | |
876 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 12, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/pmullw/trunc |
877 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // pmaddubsw |
878 | { .ISD: ISD::MUL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // pmaddubsw |
879 | { .ISD: ISD::MUL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw |
880 | |
881 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubb |
882 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubw |
883 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubd |
884 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubq |
885 | }; |
886 | |
887 | // Look for AVX512BW lowering tricks for custom cases. |
// The table is consulted only when the subtarget reports BWI. On a hit whose
// Cost[CostKind] slot tests true, the result is that cost scaled by LT.first;
// a miss, or a slot that tests false, falls through to the checks below.
// NOTE(review): LT is defined above this chunk - presumably the type
// legalization result (count, legalized MVT); confirm.
888 | if (ST->hasBWI()) |
889 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTable, ISD, Ty: LT.second)) |
890 | if (auto KindCost = Entry->Cost[CostKind]) |
891 | return LT.first * *KindCost; |
892 | |
// Cost table used when the subtarget reports AVX512, keyed by (ISD opcode,
// MVT). Each entry lists its costs in the order RecipThroughput, Latency,
// CodeSize, SizeAndLatency. The lookup that uses it follows the table.
893 | static const CostKindTblEntry AVX512CostTable[] = { |
894 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19,.CodeSizeCost: 27,.SizeAndLatencyCost: 33 } }, // vpblendv+split sequence. |
895 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19,.CodeSizeCost: 30,.SizeAndLatencyCost: 36 } }, // vpblendv+split sequence. |
896 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 37, .LatencyCost: 37,.CodeSizeCost: 51,.SizeAndLatencyCost: 63 } }, // vpblendv+split sequence. |
897 | |
898 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsllvd/pack sequence. |
899 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsrlvd/pack sequence. |
900 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsravd/pack sequence. |
901 | |
// 32-bit element shifts: unit cost for every cost kind at all three widths.
902 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
903 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
904 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
905 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
906 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
907 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
908 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
909 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
910 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
911 | |
// 64-bit element shifts: unit cost as well, including SRA.
912 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
913 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
914 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
915 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
916 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
917 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
918 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
919 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
920 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
921 | |
922 | { .ISD: ISD::ADD, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*paddb + split |
923 | { .ISD: ISD::ADD, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*paddw + split |
924 | |
925 | { .ISD: ISD::SUB, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*psubb + split |
926 | { .ISD: ISD::SUB, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*psubw + split |
927 | |
// 256-bit bitwise logic ops: unit cost for every element type.
928 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
929 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
930 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
931 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
932 | |
933 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
934 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
935 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
936 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
937 | |
938 | { .ISD: ISD::XOR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
939 | { .ISD: ISD::XOR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
940 | { .ISD: ISD::XOR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
941 | { .ISD: ISD::XOR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
942 | |
943 | { .ISD: ISD::MUL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org) |
944 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org) |
945 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org) |
946 | { .ISD: ISD::MUL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // 3*pmuludq/3*shift/2*add |
// Only RecipThroughput is given for scalar i64 MUL.
// NOTE(review): for the other cost kinds Cost[CostKind] presumably tests
// false so the lookup falls through - confirm against CostKindTblEntry.
947 | { .ISD: ISD::MUL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Skylake from http://www.agner.org/ |
948 | |
949 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
950 | |
951 | { .ISD: ISD::FNEG, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Skylake from http://www.agner.org/ |
952 | { .ISD: ISD::FADD, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
953 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
954 | { .ISD: ISD::FSUB, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
955 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
956 | { .ISD: ISD::FMUL, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
957 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
958 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
959 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
960 | |
961 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
962 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
963 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
964 | { .ISD: ISD::FDIV, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 23, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
965 | |
966 | { .ISD: ISD::FNEG, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Skylake from http://www.agner.org/ |
967 | { .ISD: ISD::FADD, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
968 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
969 | { .ISD: ISD::FSUB, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
970 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
971 | { .ISD: ISD::FMUL, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
972 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
973 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
974 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
975 | |
976 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
977 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
978 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
979 | { .ISD: ISD::FDIV, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
980 | }; |
981 | |
// Use the table only on AVX512 subtargets. A hit whose Cost[CostKind] slot
// tests true returns that cost scaled by LT.first; otherwise fall through.
982 | if (ST->hasAVX512()) |
983 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTable, ISD, Ty: LT.second)) |
984 | if (auto KindCost = Entry->Cost[CostKind]) |
985 | return LT.first * *KindCost; |
986 | |
// Costs for the AVX2 32-bit and 64-bit element shifts, keyed by (ISD opcode,
// MVT), in the order RecipThroughput, Latency, CodeSize, SizeAndLatency.
// There are no SRA entries for v2i64/v4i64 in this table.
987 | static const CostKindTblEntry AVX2ShiftCostTable[] = { |
988 | // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare to |
989 | // customize them to detect the cases where shift amount is a scalar one. |
990 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsllvd (Haswell from agner.org) |
991 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsrlvd (Haswell from agner.org) |
992 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsravd (Haswell from agner.org) |
993 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsllvd (Haswell from agner.org) |
994 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsrlvd (Haswell from agner.org) |
995 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsravd (Haswell from agner.org) |
996 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvq (Haswell from agner.org) |
997 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvq (Haswell from agner.org) |
998 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vpsllvq (Haswell from agner.org) |
999 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vpsrlvq (Haswell from agner.org) |
1000 | }; |
1001 | |
// Special case: cost a v32i16 SHL by a constant as the equivalent multiply by
// re-querying this cost function for Instruction::Mul, passing both operand
// infos through getNoProps().
// NOTE(review): AVX512CostTable has an SHL/v32i16 entry with all four cost
// kinds populated, and the hasAVX512() lookup that precedes this point
// returns on a hit, so this branch looks unreachable as ordered - confirm
// whether it was meant to run before that lookup.
1002 | if (ST->hasAVX512()) { |
1003 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) |
1004 | // On AVX512, a packed v32i16 shift left by a constant build_vector |
1005 | // is lowered into a vector multiply (vpmullw). |
1006 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
1007 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1008 | } |
1009 | |
1010 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). |
// The guard skips this whole block for v4i32 when XOP is also available, so
// that case is left to the XOP handling further down.
1011 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { |
1012 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && |
1013 | Op2Info.isConstant()) |
1014 | // On AVX2, a packed v16i16 shift left by a constant build_vector |
1015 | // is lowered into a vector multiply (vpmullw). |
1016 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
1017 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1018 | |
// Otherwise try the AVX2 variable-shift table; a hit whose Cost[CostKind]
// slot tests true returns that cost scaled by LT.first.
1019 | if (const auto *Entry = CostTableLookup(Table: AVX2ShiftCostTable, ISD, Ty: LT.second)) |
1020 | if (auto KindCost = Entry->Cost[CostKind]) |
1021 | return LT.first * *KindCost; |
1022 | } |
1023 | |
// Shift costs for XOP subtargets, keyed by (ISD opcode, MVT), in the order
// RecipThroughput, Latency, CodeSize, SizeAndLatency. Within each type the
// SRL/SRA rows differ from the SHL row only in RecipThroughput (2 vs 1 for
// 128-bit types, 6 vs 4 for 256-bit types).
1024 | static const CostKindTblEntry XOPShiftCostTable[] = { |
1025 | // 128bit shifts take 1cy, but right shifts require negation beforehand. |
1026 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1027 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1028 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1029 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1030 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1031 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1032 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1033 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1034 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1035 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1036 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1037 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1038 | // 256bit shifts require splitting if AVX2 didn't catch them above. |
1039 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1040 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1041 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1042 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1043 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1044 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1045 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1046 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1047 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1048 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1049 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1050 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1051 | }; |
1052 | |
1053 | // Look for XOP lowering tricks. |
1054 | if (ST->hasXOP()) { |
1055 | // If the right shift is constant then we'll fold the negation so |
1056 | // it's as cheap as a left shift. |
// ShiftISD is a local copy: the substitution affects only this table lookup
// and leaves ISD itself unchanged for the code that follows.
1057 | int ShiftISD = ISD; |
1058 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) |
1059 | ShiftISD = ISD::SHL; |
1060 | if (const auto *Entry = |
1061 | CostTableLookup(Table: XOPShiftCostTable, ISD: ShiftISD, Ty: LT.second)) |
1062 | if (auto KindCost = Entry->Cost[CostKind]) |
1063 | return LT.first * *KindCost; |
1064 | } |
1065 | |
// Re-classify a shift-left by a constant, non-uniform amount as a multiply
// for the listed types. This overwrites ISD itself, so every table lookup
// after this point is made with ISD::MUL instead of ISD::SHL.
1066 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { |
1067 | MVT VT = LT.second; |
1068 | // Vector shift left by non uniform constant can be lowered |
1069 | // into vector multiply. |
// 128-bit v8i16/v4i32 need SSE2; 256-bit v16i16/v8i32 need AVX.
1070 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || |
1071 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) |
1072 | ISD = ISD::MUL; |
1073 | } |
1074 | |
// FDIV-only cost overrides, keyed by (ISD opcode, MVT), in the order
// RecipThroughput, Latency, CodeSize, SizeAndLatency. Used when the subtarget
// sets useGLMDivSqrtCosts().
// NOTE(review): "GLM" presumably refers to Goldmont - confirm.
1075 | static const CostKindTblEntry GLMCostTable[] = { |
1076 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 19, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divss |
1077 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 35, .LatencyCost: 36, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divps |
1078 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 33, .LatencyCost: 34, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divsd |
1079 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 65, .LatencyCost: 66, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divpd |
1080 | }; |
1081 | |
// A hit whose Cost[CostKind] slot tests true returns that cost scaled by
// LT.first; otherwise fall through to the tables below.
1082 | if (ST->useGLMDivSqrtCosts()) |
1083 | if (const auto *Entry = CostTableLookup(Table: GLMCostTable, ISD, Ty: LT.second)) |
1084 | if (auto KindCost = Entry->Cost[CostKind]) |
1085 | return LT.first * *KindCost; |
1086 | |
// Arithmetic cost overrides, keyed by (ISD opcode, MVT), in the order
// RecipThroughput, Latency, CodeSize, SizeAndLatency. Used when the subtarget
// sets useSLMArithCosts(). Only scalar and 128-bit types are listed.
// NOTE(review): "SLM" presumably refers to Silvermont - confirm.
1087 | static const CostKindTblEntry SLMCostTable[] = { |
1088 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 7 } }, // pmulld |
1089 | { .ISD: ISD::MUL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw |
1090 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulsd |
1091 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulss |
1092 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulpd |
1093 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulps |
1094 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 19, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divss |
1095 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 39, .LatencyCost: 39, .CodeSizeCost: 1, .SizeAndLatencyCost: 6 } }, // divps |
1096 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 34, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divsd |
1097 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 69, .LatencyCost: 69, .CodeSizeCost: 1, .SizeAndLatencyCost: 6 } }, // divpd |
1098 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // addpd |
1099 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // subpd |
1100 | // v2i64/v4i64 mul is custom lowered as a series of long: |
1101 | // multiplies(3), shifts(3) and adds(2) |
1102 | // slm muldq version throughput is 2 and addq throughput 4 |
1103 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + |
1104 | // 2X4 (addq throughput) = 17 |
// i.e. 6 + 3 + 8 = 17, which is the RecipThroughput value in the entry below.
1105 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 22, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
1106 | // slm addq\subq throughput is 4 |
1107 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
1108 | { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
1109 | }; |
1110 | |
// A hit whose Cost[CostKind] slot tests true returns that cost scaled by
// LT.first; otherwise fall through to the tables below.
1111 | if (ST->useSLMArithCosts()) |
1112 | if (const auto *Entry = CostTableLookup(Table: SLMCostTable, ISD, Ty: LT.second)) |
1113 | if (auto KindCost = Entry->Cost[CostKind]) |
1114 | return LT.first * *KindCost; |
1115 | |
1116 | static const CostKindTblEntry AVX2CostTable[] = { |
1117 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 21,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // vpblendvb sequence. |
1118 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 23,.CodeSizeCost: 11,.SizeAndLatencyCost: 22 } }, // vpblendvb sequence. |
1119 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsrlvd/pack sequence. |
1120 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsrlvd/pack sequence. |
1121 | |
1122 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 27,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // vpblendvb sequence. |
1123 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 30,.CodeSizeCost: 12,.SizeAndLatencyCost: 24 } }, // vpblendvb sequence. |
1124 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsrlvd/pack sequence. |
1125 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsrlvd/pack sequence. |
1126 | |
1127 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 17,.CodeSizeCost: 24,.SizeAndLatencyCost: 30 } }, // vpblendvb sequence. |
1128 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 20,.CodeSizeCost: 24,.SizeAndLatencyCost: 43 } }, // vpblendvb sequence. |
1129 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsravd/pack sequence. |
1130 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsravd/pack sequence. |
1131 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // srl/xor/sub sequence. |
1132 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // srl/xor/sub sequence. |
1133 | |
1134 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubb |
1135 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddb |
1136 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubw |
1137 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddw |
1138 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubd |
1139 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddd |
1140 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubq |
1141 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddq |
1142 | |
1143 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18, .CodeSizeCost: 6,.SizeAndLatencyCost: 12 } }, // extend/pmullw/pack |
1144 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 8,.SizeAndLatencyCost: 16 } }, // pmaddubsw |
1145 | { .ISD: ISD::MUL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmullw |
1146 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld |
1147 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld |
1148 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 8,.SizeAndLatencyCost: 13 } }, // 3*pmuludq/3*shift/2*add |
1149 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // 3*pmuludq/3*shift/2*add |
1150 | |
1151 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1152 | |
1153 | { .ISD: ISD::FNEG, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorpd |
1154 | { .ISD: ISD::FNEG, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
1155 | |
1156 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddsd |
1157 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddss |
1158 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddpd |
1159 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddps |
1160 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vaddpd |
1161 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vaddps |
1162 | |
1163 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubsd |
1164 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubss |
1165 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubpd |
1166 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubps |
1167 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vsubpd |
1168 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vsubps |
1169 | |
1170 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulsd |
1171 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulss |
1172 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulpd |
1173 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulps |
1174 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vmulpd |
1175 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vmulps |
1176 | |
1177 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivss |
1178 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivps |
1179 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vdivps |
1180 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivsd |
1181 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivpd |
1182 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vdivpd |
1183 | }; |
1184 | |
1185 | // Look for AVX2 lowering tricks for custom cases. |
// Only taken when the subtarget reports AVX2: AVX2CostTable is searched for an
// entry matching (ISD, LT.second). If an entry is found and it holds a cost for
// the requested CostKind, that cost multiplied by LT.first is returned.
// Otherwise nothing is returned here and the later tables are tried.
// NOTE(review): LT.first is presumably the legalization split count - confirm.
1186 | if (ST->hasAVX2()) |
1187 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTable, ISD, Ty: LT.second)) |
1188 | if (auto KindCost = Entry->Cost[CostKind]) |
1189 | return LT.first * *KindCost; |
1190 | |
// Cost entries keyed by (opcode, type) for subtargets with AVX. Each row gives,
// in order, the RecipThroughput, Latency, CodeSize and SizeAndLatency costs.
// The trailing comment on a row names either the instruction sequence the cost
// models ("+ split" rows cover the 256-bit types) or the CPU the numbers were
// taken from.
1191 | static const CostKindTblEntry AVX1CostTable[] = { |
1192 | // We don't have to scalarize unsupported ops. We can issue two half-sized |
1193 | // operations and we only need to extract the upper YMM half. |
1194 | // Two ops + 1 extract + 1 insert = 4. |
1195 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 11, .CodeSizeCost: 18, .SizeAndLatencyCost: 19 } }, // pmaddubsw + split |
1196 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, // 2*pmaddubsw/3*and/psllw/or |
1197 | { .ISD: ISD::MUL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // pmullw + split |
1198 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, // pmulld + split |
1199 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmulld |
1200 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 15, .CodeSizeCost: 19, .SizeAndLatencyCost: 20 } }, |
1201 | |
1202 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps |
1203 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps |
1204 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps |
1205 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps |
1206 | |
1207 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps |
1208 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps |
1209 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps |
1210 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps |
1211 | |
1212 | { .ISD: ISD::XOR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
1213 | { .ISD: ISD::XOR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
1214 | { .ISD: ISD::XOR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
1215 | { .ISD: ISD::XOR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
1216 | |
1217 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubb + split |
1218 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddb + split |
1219 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubw + split |
1220 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddw + split |
1221 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubd + split |
1222 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddd + split |
1223 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubq + split |
1224 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddq + split |
1225 | { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubq |
1226 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddq |
1227 | |
1228 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 21,.CodeSizeCost: 11,.SizeAndLatencyCost: 17 } }, // pblendvb sequence. |
1229 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22,.CodeSizeCost: 27,.SizeAndLatencyCost: 40 } }, // pblendvb sequence + split. |
1230 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9,.CodeSizeCost: 11,.SizeAndLatencyCost: 11 } }, // pblendvb sequence. |
1231 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 24,.SizeAndLatencyCost: 25 } }, // pblendvb sequence + split. |
1232 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // pslld/paddd/cvttps2dq/pmulld |
1233 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 11,.CodeSizeCost: 12,.SizeAndLatencyCost: 17 } }, // pslld/paddd/cvttps2dq/pmulld + split |
1234 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // Shift each lane + blend. |
1235 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // Shift each lane + blend + split. |
1236 | |
1237 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 27,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // pblendvb sequence. |
1238 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 23, .LatencyCost: 23,.CodeSizeCost: 30,.SizeAndLatencyCost: 43 } }, // pblendvb sequence + split. |
1239 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 14,.SizeAndLatencyCost: 22 } }, // pblendvb sequence. |
1240 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30,.CodeSizeCost: 31,.SizeAndLatencyCost: 48 } }, // pblendvb sequence + split. |
1241 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // Shift each lane + blend. |
1242 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14,.CodeSizeCost: 26,.SizeAndLatencyCost: 34 } }, // Shift each lane + blend + split. |
1243 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // Shift each lane + blend. |
1244 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // Shift each lane + blend + split. |
1245 | |
1246 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 22,.CodeSizeCost: 24,.SizeAndLatencyCost: 36 } }, // pblendvb sequence. |
1247 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 45,.CodeSizeCost: 51,.SizeAndLatencyCost: 76 } }, // pblendvb sequence + split. |
1248 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 14,.SizeAndLatencyCost: 22 } }, // pblendvb sequence. |
1249 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30,.CodeSizeCost: 31,.SizeAndLatencyCost: 48 } }, // pblendvb sequence + split. |
1250 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // Shift each lane + blend. |
1251 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14,.CodeSizeCost: 26,.SizeAndLatencyCost: 34 } }, // Shift each lane + blend + split. |
1252 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // Shift each lane + blend. |
1253 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 22,.SizeAndLatencyCost: 30 } }, // Shift each lane + blend + split. |
1254 | |
1255 | { .ISD: ISD::FNEG, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/ |
1256 | { .ISD: ISD::FNEG, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/ |
1257 | |
1258 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1259 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1260 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1261 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1262 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/ |
1263 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/ |
1264 | |
1265 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1266 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1267 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1268 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/ |
1269 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/ |
1270 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/ |
1271 | |
1272 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/ |
1273 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/ |
1274 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/ |
1275 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/ |
1276 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/ |
1277 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/ |
1278 | |
1279 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/ |
1280 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/ |
1281 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 29, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // SNB from http://www.agner.org/ |
1282 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/ |
1283 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/ |
1284 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 45, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // SNB from http://www.agner.org/ |
1285 | }; |
1286 | |
// When the subtarget has AVX, search AVX1CostTable for (ISD, LT.second). A hit
// that holds a cost for CostKind returns that cost multiplied by LT.first; a
// miss, or an entry with no cost for this kind, continues to the later tables.
1287 | if (ST->hasAVX()) |
1288 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTable, ISD, Ty: LT.second)) |
1289 | if (auto KindCost = Entry->Cost[CostKind]) |
1290 | return LT.first * *KindCost; |
1291 | |
// Cost entries keyed by (opcode, type) for subtargets with SSE4.2. Each row
// gives, in order, the RecipThroughput, Latency, CodeSize and SizeAndLatency
// costs. The scalar and 128-bit FP rows are annotated as Nehalem figures; the
// final row covers the v2i64 multiply sequence.
1292 | static const CostKindTblEntry SSE42CostTable[] = { |
1293 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1294 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1295 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1296 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1297 | |
1298 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1299 | { .ISD: ISD::FSUB, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1300 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1301 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1302 | |
1303 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1304 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1305 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1306 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1307 | |
1308 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1309 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1310 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1311 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
1312 | |
1313 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 10 } } // 3*pmuludq/3*shift/2*add |
1314 | }; |
1315 | |
// When the subtarget has SSE4.2, search SSE42CostTable for (ISD, LT.second). A
// hit that holds a cost for CostKind returns that cost multiplied by LT.first;
// otherwise the later tables are tried.
1316 | if (ST->hasSSE42()) |
1317 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTable, ISD, Ty: LT.second)) |
1318 | if (auto KindCost = Entry->Cost[CostKind]) |
1319 | return LT.first * *KindCost; |
1320 | |
// Cost entries keyed by (opcode, type) for subtargets with SSE4.1. Each row
// gives, in order, the RecipThroughput, Latency, CodeSize and SizeAndLatency
// costs. The rows cover 128-bit vector shifts (SHL/SRL/SRA) and the v4i32
// multiply; trailing comments name the instruction sequence being costed.
1321 | static const CostKindTblEntry SSE41CostTable[] = { |
1322 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 24,.CodeSizeCost: 17,.SizeAndLatencyCost: 22 } }, // pblendvb sequence. |
1323 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 14,.CodeSizeCost: 11,.SizeAndLatencyCost: 11 } }, // pblendvb sequence. |
1324 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 4,.SizeAndLatencyCost: 10 } }, // pslld/paddd/cvttps2dq/pmulld |
1325 | |
1326 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 27,.CodeSizeCost: 18,.SizeAndLatencyCost: 24 } }, // pblendvb sequence. |
1327 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 26,.CodeSizeCost: 23,.SizeAndLatencyCost: 27 } }, // pblendvb sequence. |
1328 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 17,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend. |
1329 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence. |
1330 | |
1331 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 41,.CodeSizeCost: 30,.SizeAndLatencyCost: 36 } }, // pblendvb sequence. |
1332 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 26,.CodeSizeCost: 23,.SizeAndLatencyCost: 27 } }, // pblendvb sequence. |
1333 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 17,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend. |
1334 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 17, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence. |
1335 | |
1336 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } } // pmulld (Nehalem from agner.org) |
1337 | }; |
1338 | |
// When the subtarget has SSE4.1, search SSE41CostTable for (ISD, LT.second). A
// hit that holds a cost for CostKind returns that cost multiplied by LT.first;
// otherwise the later tables are tried.
1339 | if (ST->hasSSE41()) |
1340 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTable, ISD, Ty: LT.second)) |
1341 | if (auto KindCost = Entry->Cost[CostKind]) |
1342 | return LT.first * *KindCost; |
1343 | |
// Cost entries for subtargets with SSSE3. The table holds a single row: the
// v16i8 multiply, with its RecipThroughput, Latency, CodeSize and
// SizeAndLatency costs in that order.
1344 | static const CostKindTblEntry SSSE3CostTable[] = { |
1345 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18,.CodeSizeCost: 10,.SizeAndLatencyCost: 12 } }, // 2*pmaddubsw/3*and/psllw/or |
1346 | }; |
1347 | |
// When the subtarget has SSSE3, search SSSE3CostTable for (ISD, LT.second). A
// hit that holds a cost for CostKind returns that cost multiplied by LT.first;
// otherwise the later tables are tried.
1348 | if (ST->hasSSSE3()) |
1349 | if (const auto *Entry = CostTableLookup(Table: SSSE3CostTable, ISD, Ty: LT.second)) |
1350 | if (auto KindCost = Entry->Cost[CostKind]) |
1351 | return LT.first * *KindCost; |
1352 | |
// Cost entries keyed by (opcode, type) for subtargets with SSE2. Each row
// gives, in order, the RecipThroughput, Latency, CodeSize and SizeAndLatency
// costs. Integer rows are annotated with the instruction sequence being
// costed; the floating-point rows are annotated as Pentium IV figures.
1353 | static const CostKindTblEntry SSE2CostTable[] = { |
1354 | // We don't correctly identify costs of casts because they are marked as |
1355 | // custom. |
1356 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 21,.CodeSizeCost: 26,.SizeAndLatencyCost: 28 } }, // cmpgtb sequence. |
1357 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 27,.CodeSizeCost: 16,.SizeAndLatencyCost: 20 } }, // cmpgtw sequence. |
1358 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 19,.CodeSizeCost: 10,.SizeAndLatencyCost: 12 } }, // pslld/paddd/cvttps2dq/pmuludq. |
1359 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence. |
1360 | |
1361 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 28,.CodeSizeCost: 27,.SizeAndLatencyCost: 30 } }, // cmpgtb sequence. |
1362 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19,.CodeSizeCost: 31,.SizeAndLatencyCost: 31 } }, // cmpgtw sequence. |
1363 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend. |
1364 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence. |
1365 | |
1366 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 30,.CodeSizeCost: 54,.SizeAndLatencyCost: 54 } }, // unpacked cmpgtb sequence. |
1367 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19,.CodeSizeCost: 31,.SizeAndLatencyCost: 31 } }, // cmpgtw sequence. |
1368 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend. |
1369 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // srl/xor/sub splat+shuffle sequence. |
1370 | |
1371 | { .ISD: ISD::AND, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
1372 | { .ISD: ISD::AND, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
1373 | { .ISD: ISD::AND, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
1374 | { .ISD: ISD::AND, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand |
1375 | |
1376 | { .ISD: ISD::OR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
1377 | { .ISD: ISD::OR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
1378 | { .ISD: ISD::OR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
1379 | { .ISD: ISD::OR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por |
1380 | |
1381 | { .ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
1382 | { .ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
1383 | { .ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
1384 | { .ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor |
1385 | |
1386 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddq |
1387 | { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubq |
1388 | |
1389 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18,.CodeSizeCost: 12,.SizeAndLatencyCost: 12 } }, // 2*unpack/2*pmullw/2*and/pack |
1390 | { .ISD: ISD::MUL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw |
1391 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // 3*pmuludq/4*shuffle |
1392 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 10 } }, // 3*pmuludq/3*shift/2*add |
1393 | |
1394 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1395 | |
1396 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 23, .LatencyCost: 23, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1397 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 39, .LatencyCost: 39, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1398 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 38, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1399 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 69, .LatencyCost: 69, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1400 | |
1401 | { .ISD: ISD::FNEG, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1402 | { .ISD: ISD::FNEG, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1403 | { .ISD: ISD::FNEG, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1404 | { .ISD: ISD::FNEG, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1405 | |
1406 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1407 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1408 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1409 | |
1410 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1411 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1412 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1413 | |
1414 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1415 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/ |
1416 | }; |
1417 | |
1418 | if (ST->hasSSE2()) |
1419 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTable, ISD, Ty: LT.second)) |
1420 | if (auto KindCost = Entry->Cost[CostKind]) |
1421 | return LT.first * *KindCost; |
1422 | |
1423 | static const CostKindTblEntry SSE1CostTable[] = { |
1424 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1425 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 34, .LatencyCost: 48, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1426 | |
1427 | { .ISD: ISD::FNEG, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
1428 | { .ISD: ISD::FNEG, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
1429 | |
1430 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1431 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1432 | |
1433 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1434 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1435 | |
1436 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1437 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1438 | }; |
1439 | |
1440 | if (ST->hasSSE1()) |
1441 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTable, ISD, Ty: LT.second)) |
1442 | if (auto KindCost = Entry->Cost[CostKind]) |
1443 | return LT.first * *KindCost; |
1444 | |
1445 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
1446 | { .ISD: ISD::ADD, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Core (Merom) from http://www.agner.org/ |
1447 | { .ISD: ISD::SUB, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Core (Merom) from http://www.agner.org/ |
1448 | { .ISD: ISD::MUL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
1449 | }; |
1450 | |
1451 | if (ST->is64Bit()) |
1452 | if (const auto *Entry = CostTableLookup(Table: X64CostTbl, ISD, Ty: LT.second)) |
1453 | if (auto KindCost = Entry->Cost[CostKind]) |
1454 | return LT.first * *KindCost; |
1455 | |
1456 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
1457 | { .ISD: ISD::ADD, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1458 | { .ISD: ISD::ADD, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1459 | { .ISD: ISD::ADD, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1460 | |
1461 | { .ISD: ISD::SUB, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1462 | { .ISD: ISD::SUB, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1463 | { .ISD: ISD::SUB, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1464 | |
1465 | { .ISD: ISD::MUL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1466 | { .ISD: ISD::MUL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1467 | { .ISD: ISD::MUL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1468 | |
1469 | { .ISD: ISD::FNEG, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // (x87) |
1470 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1471 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1472 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1473 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 38, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1474 | }; |
1475 | |
1476 | if (const auto *Entry = CostTableLookup(Table: X86CostTbl, ISD, Ty: LT.second)) |
1477 | if (auto KindCost = Entry->Cost[CostKind]) |
1478 | return LT.first * *KindCost; |
1479 | |
1480 | // It is not a good idea to vectorize division. We have to scalarize it and |
1481 | // in the process we will often end up having to spill regular |
1482 | // registers. The overhead of division is going to dominate most kernels |
1483 | // anyways so try hard to prevent vectorization of division - it is |
1484 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able |
1485 | // to hide "20 cycles" for each lane. |
1486 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && |
1487 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || |
1488 | ISD == ISD::UREM)) { |
1489 | InstructionCost ScalarCost = |
1490 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind, |
1491 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1492 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; |
1493 | } |
1494 | |
1495 | // Handle some basic single instruction code size cases. |
1496 | if (CostKind == TTI::TCK_CodeSize) { |
1497 | switch (ISD) { |
1498 | case ISD::FADD: |
1499 | case ISD::FSUB: |
1500 | case ISD::FMUL: |
1501 | case ISD::FDIV: |
1502 | case ISD::FNEG: |
1503 | case ISD::AND: |
1504 | case ISD::OR: |
1505 | case ISD::XOR: |
1506 | return LT.first; |
1507 | break; |
1508 | } |
1509 | } |
1510 | |
1511 | // Fallback to the default implementation. |
1512 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1513 | Args, CxtI); |
1514 | } |
1515 | |
1516 | InstructionCost |
1517 | X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, |
1518 | unsigned Opcode1, const SmallBitVector &OpcodeMask, |
1519 | TTI::TargetCostKind CostKind) const { |
1520 | if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) |
1521 | return TTI::TCC_Basic; |
1522 | return InstructionCost::getInvalid(); |
1523 | } |
1524 | |
1525 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
1526 | VectorType *DstTy, VectorType *SrcTy, |
1527 | ArrayRef<int> Mask, |
1528 | TTI::TargetCostKind CostKind, |
1529 | int Index, VectorType *SubTp, |
1530 | ArrayRef<const Value *> Args, |
1531 | const Instruction *CxtI) const { |
1532 | assert((Mask.empty() || DstTy->isScalableTy() || |
1533 | Mask.size() == DstTy->getElementCount().getKnownMinValue()) && |
1534 | "Expected the Mask to match the return size if given" ); |
1535 | assert(SrcTy->getScalarType() == DstTy->getScalarType() && |
1536 | "Expected the same scalar types" ); |
1537 | |
1538 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. |
1539 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. |
1540 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy); |
1541 | |
1542 | Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp); |
1543 | |
1544 | // If all args are constant then this will be constant folded away. |
1545 | if (!Args.empty() && |
1546 | all_of(Range&: Args, P: [](const Value *Arg) { return isa<Constant>(Val: Arg); })) |
1547 | return TTI::TCC_Free; |
1548 | |
1549 | // Recognize a basic concat_vector shuffle. |
1550 | if (Kind == TTI::SK_PermuteTwoSrc && |
1551 | Mask.size() == (2 * SrcTy->getElementCount().getKnownMinValue()) && |
1552 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size())) |
1553 | return getShuffleCost(Kind: TTI::SK_InsertSubvector, |
1554 | DstTy: VectorType::getDoubleElementsVectorType(VTy: SrcTy), |
1555 | SrcTy: VectorType::getDoubleElementsVectorType(VTy: SrcTy), Mask, |
1556 | CostKind, Index: Mask.size() / 2, SubTp: SrcTy); |
1557 | |
1558 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. |
1559 | if (Kind == TTI::SK_Transpose) |
1560 | Kind = TTI::SK_PermuteTwoSrc; |
1561 | |
1562 | if (Kind == TTI::SK_Broadcast) { |
1563 | // For Broadcasts we are splatting the first element from the first input |
1564 | // register, so only need to reference that input and all the output |
1565 | // registers are the same. |
1566 | LT.first = 1; |
1567 | |
1568 | // If we're broadcasting a load then AVX/AVX2 can do this for free. |
1569 | using namespace PatternMatch; |
1570 | if (!Args.empty() && match(V: Args[0], P: m_OneUse(SubPattern: m_Load(Op: m_Value()))) && |
1571 | (ST->hasAVX2() || |
1572 | (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32))) |
1573 | return TTI::TCC_Free; |
1574 | } |
1575 | |
1576 | // Attempt to detect a cheaper inlane shuffle, avoiding 128-bit subvector |
1577 | // permutation. |
1578 | // Attempt to detect a shuffle mask with a single defined element. |
1579 | bool IsInLaneShuffle = false; |
1580 | bool IsSingleElementMask = false; |
1581 | if (SrcTy->getPrimitiveSizeInBits() > 0 && |
1582 | (SrcTy->getPrimitiveSizeInBits() % 128) == 0 && |
1583 | SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
1584 | Mask.size() == SrcTy->getElementCount().getKnownMinValue()) { |
1585 | unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128; |
1586 | unsigned NumEltsPerLane = Mask.size() / NumLanes; |
1587 | if ((Mask.size() % NumLanes) == 0) { |
1588 | IsInLaneShuffle = all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) { |
1589 | return P.value() == PoisonMaskElem || |
1590 | ((P.value() % Mask.size()) / NumEltsPerLane) == |
1591 | (P.index() / NumEltsPerLane); |
1592 | }); |
1593 | IsSingleElementMask = |
1594 | (Mask.size() - 1) == static_cast<unsigned>(count_if(Range&: Mask, P: [](int M) { |
1595 | return M == PoisonMaskElem; |
1596 | })); |
1597 | } |
1598 | } |
1599 | |
1600 | // Treat <X x bfloat> shuffles as <X x half>. |
1601 | if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16) |
1602 | LT.second = LT.second.changeVectorElementType(EltVT: MVT::f16); |
1603 | |
1604 | // Subvector extractions are free if they start at the beginning of a |
1605 | // vector and cheap if the subvectors are aligned. |
1606 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { |
1607 | int NumElts = LT.second.getVectorNumElements(); |
1608 | if ((Index % NumElts) == 0) |
1609 | return TTI::TCC_Free; |
1610 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
1611 | if (SubLT.second.isVector()) { |
1612 | int NumSubElts = SubLT.second.getVectorNumElements(); |
1613 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
1614 | return SubLT.first; |
1615 | // Handle some cases for widening legalization. For now we only handle |
1616 | // cases where the original subvector was naturally aligned and evenly |
1617 | // fit in its legalized subvector type. |
1618 | // FIXME: Remove some of the alignment restrictions. |
1619 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit |
1620 | // vectors. |
1621 | int OrigSubElts = cast<FixedVectorType>(Val: SubTp)->getNumElements(); |
1622 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && |
1623 | (NumSubElts % OrigSubElts) == 0 && |
1624 | LT.second.getVectorElementType() == |
1625 | SubLT.second.getVectorElementType() && |
1626 | LT.second.getVectorElementType().getSizeInBits() == |
1627 | SrcTy->getElementType()->getPrimitiveSizeInBits()) { |
1628 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts && |
1629 | "Unexpected number of elements!" ); |
1630 | auto *VecTy = FixedVectorType::get(ElementType: SrcTy->getElementType(), |
1631 | NumElts: LT.second.getVectorNumElements()); |
1632 | auto *SubTy = FixedVectorType::get(ElementType: SrcTy->getElementType(), |
1633 | NumElts: SubLT.second.getVectorNumElements()); |
1634 | int = alignDown(Value: (Index % NumElts), Align: NumSubElts); |
1635 | InstructionCost = |
1636 | getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind, |
1637 | Index: ExtractIndex, SubTp: SubTy); |
1638 | |
1639 | // If the original size is 32-bits or more, we can use pshufd. Otherwise |
1640 | // if we have SSSE3 we can use pshufb. |
1641 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) |
1642 | return ExtractCost + 1; // pshufd or pshufb |
1643 | |
1644 | assert(SubTp->getPrimitiveSizeInBits() == 16 && |
1645 | "Unexpected vector size" ); |
1646 | |
1647 | return ExtractCost + 2; // worst case pshufhw + pshufd |
1648 | } |
1649 | } |
1650 | // If the extract subvector is not optimal, treat it as single op shuffle. |
1651 | Kind = TTI::SK_PermuteSingleSrc; |
1652 | } |
1653 | |
1654 | // Subvector insertions are cheap if the subvectors are aligned. |
1655 | // Note that in general, the insertion starting at the beginning of a vector |
1656 | // isn't free, because we need to preserve the rest of the wide vector, |
1657 | // but if the destination vector legalizes to the same width as the subvector |
1658 | // then the insertion will simplify to a (free) register copy. |
1659 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { |
1660 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: DstTy); |
1661 | int NumElts = DstLT.second.getVectorNumElements(); |
1662 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
1663 | if (SubLT.second.isVector()) { |
1664 | int NumSubElts = SubLT.second.getVectorNumElements(); |
1665 | bool MatchingTypes = |
1666 | NumElts == NumSubElts && |
1667 | (SubTp->getElementCount().getKnownMinValue() % NumSubElts) == 0; |
1668 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
1669 | return MatchingTypes ? TTI::TCC_Free : SubLT.first; |
1670 | } |
1671 | |
1672 | // Attempt to match MOVSS (Idx == 0) or INSERTPS pattern. This will have |
1673 | // been matched by improveShuffleKindFromMask as a SK_InsertSubvector of |
1674 | // v1f32 (legalised to f32) into a v4f32. |
1675 | if (LT.first == 1 && LT.second == MVT::v4f32 && SubLT.first == 1 && |
1676 | SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41())) |
1677 | return 1; |
1678 | |
1679 | // If the insertion is the lowest subvector then it will be blended |
1680 | // otherwise treat it like a 2-op shuffle. |
1681 | Kind = |
1682 | (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc; |
1683 | } |
1684 | |
1685 | // Handle some common (illegal) sub-vector types as they are often very cheap |
1686 | // to shuffle even on targets without PSHUFB. |
1687 | EVT VT = TLI->getValueType(DL, Ty: SrcTy); |
1688 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && |
1689 | !ST->hasSSSE3()) { |
1690 | static const CostKindTblEntry SSE2SubVectorShuffleTbl[] = { |
1691 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
1692 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
1693 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
1694 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
1695 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // punpck |
1696 | |
1697 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
1698 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
1699 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // punpck/pshuflw/packus |
1700 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // punpck |
1701 | |
1702 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
1703 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
1704 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
1705 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck+psrldq |
1706 | |
1707 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
1708 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck/pshuflw |
1709 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i8, .Cost: {.RecipThroughputCost: 7,.LatencyCost: 7,.CodeSizeCost: 7,.SizeAndLatencyCost: 7}}, // punpck/pshuflw |
1710 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // punpck/pshuflw |
1711 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // punpck |
1712 | |
1713 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
1714 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pshuflw |
1715 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // punpck/pshuflw |
1716 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i8, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // punpck/pshuflw |
1717 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // punpck |
1718 | }; |
1719 | |
1720 | if (ST->hasSSE2()) |
1721 | if (const auto *Entry = |
1722 | CostTableLookup(Table: SSE2SubVectorShuffleTbl, ISD: Kind, Ty: VT.getSimpleVT())) |
1723 | if (auto KindCost = Entry->Cost[CostKind]) |
1724 | return LT.first * *KindCost; |
1725 | } |
1726 | |
1727 | // We are going to permute multiple sources and the result will be in multiple |
1728 | // destinations. Providing an accurate cost only for splits where the element |
1729 | // type remains the same. |
1730 | if (LT.first != 1) { |
1731 | MVT LegalVT = LT.second; |
1732 | if (LegalVT.isVector() && |
1733 | LegalVT.getVectorElementType().getSizeInBits() == |
1734 | SrcTy->getElementType()->getPrimitiveSizeInBits() && |
1735 | LegalVT.getVectorNumElements() < |
1736 | cast<FixedVectorType>(Val: SrcTy)->getNumElements()) { |
1737 | unsigned VecTySize = DL.getTypeStoreSize(Ty: SrcTy); |
1738 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
1739 | // Number of source vectors after legalization: |
1740 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
1741 | // Number of destination vectors after legalization: |
1742 | InstructionCost NumOfDests = LT.first; |
1743 | |
1744 | auto *SingleOpTy = FixedVectorType::get(ElementType: SrcTy->getElementType(), |
1745 | NumElts: LegalVT.getVectorNumElements()); |
1746 | |
1747 | if (!Mask.empty() && NumOfDests.isValid()) { |
1748 | // Try to perform better estimation of the permutation. |
1749 | // 1. Split the source/destination vectors into real registers. |
1750 | // 2. Do the mask analysis to identify which real registers are |
1751 | // permuted. If more than 1 source registers are used for the |
1752 | // destination register building, the cost for this destination register |
1753 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one |
1754 | // source register is used, build mask and calculate the cost as a cost |
1755 | // of PermuteSingleSrc. |
1756 | // Also, for the single register permute we try to identify if the |
1757 | // destination register is just a copy of the source register or the |
1758 | // copy of the previous destination register (the cost is |
1759 | // TTI::TCC_Basic). If the source register is just reused, the cost for |
1760 | // this operation is TTI::TCC_Free. |
1761 | NumOfDests = |
1762 | getTypeLegalizationCost( |
1763 | Ty: FixedVectorType::get(ElementType: SrcTy->getElementType(), NumElts: Mask.size())) |
1764 | .first; |
1765 | unsigned E = NumOfDests.getValue(); |
1766 | unsigned NormalizedVF = |
1767 | LegalVT.getVectorNumElements() * std::max(a: NumOfSrcs, b: E); |
1768 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
1769 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
1770 | SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); |
1771 | copy(Range&: Mask, Out: NormalizedMask.begin()); |
1772 | unsigned PrevSrcReg = 0; |
1773 | ArrayRef<int> PrevRegMask; |
1774 | InstructionCost Cost = 0; |
1775 | processShuffleMasks( |
1776 | Mask: NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfUsedRegs: NumOfDestRegs, NoInputAction: []() {}, |
1777 | SingleInputAction: [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, |
1778 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { |
1779 | if (!ShuffleVectorInst::isIdentityMask(Mask: RegMask, NumSrcElts: RegMask.size())) { |
1780 | // Check if the previous register can be just copied to the next |
1781 | // one. |
1782 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || |
1783 | PrevRegMask != RegMask) |
1784 | Cost += |
1785 | getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: SingleOpTy, |
1786 | SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr); |
1787 | else |
1788 | // Just a copy of previous destination register. |
1789 | Cost += TTI::TCC_Basic; |
1790 | return; |
1791 | } |
1792 | if (SrcReg != DestReg && |
1793 | any_of(Range&: RegMask, P: [](int I) { return I != PoisonMaskElem; })) { |
1794 | // Just a copy of the source register. |
1795 | Cost += TTI::TCC_Free; |
1796 | } |
1797 | PrevSrcReg = SrcReg; |
1798 | PrevRegMask = RegMask; |
1799 | }, |
1800 | ManyInputsAction: [this, SingleOpTy, CostKind, |
1801 | &Cost](ArrayRef<int> RegMask, unsigned /*Unused*/, |
1802 | unsigned /*Unused*/, bool /*Unused*/) { |
1803 | Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleOpTy, |
1804 | SrcTy: SingleOpTy, Mask: RegMask, CostKind, Index: 0, SubTp: nullptr); |
1805 | }); |
1806 | return Cost; |
1807 | } |
1808 | |
1809 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; |
1810 | return NumOfShuffles * getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleOpTy, |
1811 | SrcTy: SingleOpTy, Mask: {}, CostKind, Index: 0, |
1812 | SubTp: nullptr); |
1813 | } |
1814 | |
1815 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, |
1816 | SubTp); |
1817 | } |
1818 | |
1819 | // If we're just moving a single element around (probably as an alternative to |
1820 | // extracting it), we can assume this is cheap. |
1821 | if (LT.first == 1 && IsInLaneShuffle && IsSingleElementMask) |
1822 | return TTI::TCC_Basic; |
1823 | |
1824 | static const CostKindTblEntry AVX512VBMIShuffleTbl[] = { |
1825 | { .ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
1826 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
1827 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
1828 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermb |
1829 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2b |
1830 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2b |
1831 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } } // vpermt2b |
1832 | }; |
1833 | |
1834 | if (ST->hasVBMI()) |
1835 | if (const auto *Entry = |
1836 | CostTableLookup(Table: AVX512VBMIShuffleTbl, ISD: Kind, Ty: LT.second)) |
1837 | if (auto KindCost = Entry->Cost[CostKind]) |
1838 | return LT.first * *KindCost; |
1839 | |
1840 | static const CostKindTblEntry AVX512BWShuffleTbl[] = { |
1841 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1842 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1843 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
1844 | |
1845 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1846 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1847 | { .ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1848 | { .ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // pshufb + vshufi64x2 |
1849 | |
1850 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1851 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1852 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1853 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermw |
1854 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // extend to v32i16 |
1855 | |
1856 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
1857 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32f16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
1858 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
1859 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vpermt2w |
1860 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 19, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, // 6 * v32i8 + 1 |
1861 | |
1862 | { .ISD: TTI::SK_Select, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmw |
1863 | { .ISD: TTI::SK_Select, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmb |
1864 | |
1865 | { .ISD: TTI::SK_Splice, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vshufi64x2 + palignr |
1866 | { .ISD: TTI::SK_Splice, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vshufi64x2 + palignr |
1867 | { .ISD: TTI::SK_Splice, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vshufi64x2 + palignr |
1868 | }; |
1869 | |
1870 | if (ST->hasBWI()) |
1871 | if (const auto *Entry = |
1872 | CostTableLookup(Table: AVX512BWShuffleTbl, ISD: Kind, Ty: LT.second)) |
1873 | if (auto KindCost = Entry->Cost[CostKind]) |
1874 | return LT.first * *KindCost; |
1875 | |
// Shuffle costs consulted when ST->hasAVX512() is true. Each entry maps a
// (shuffle kind, vector type) pair to four costs: reciprocal throughput,
// latency, code size, and size-and-latency. Trailing comments name the
// instruction(s) the cost is modelled on ("per mca" rows carry no mnemonic).
// This table is searched after the AVX512BW lookup above it, so its rows are
// only reached when that lookup did not return.
// NOTE(review): splice mnemonics are given as valignd/valignq (the AVX-512F
// VALIGND/VALIGNQ instructions) -- confirm this matches the actual lowering.
1876 | static const CostKindTblEntry AVX512ShuffleTbl[] = { |
1877 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastsd |
1878 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastss |
1879 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastq |
1880 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastd |
1881 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1882 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1883 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
1884 | 
1885 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1886 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1887 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1888 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1889 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
1890 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
1891 | {.ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
1892 | 
1893 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1894 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1895 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1896 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1897 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1898 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1899 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1900 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // valignd/valignq |
1901 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
1902 | {.ISD: TTI::SK_Splice, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
1903 | {.ISD: TTI::SK_Splice, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
1904 | 
1905 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1906 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1907 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1908 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1909 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1910 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1911 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1912 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1913 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1914 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1915 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1916 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1917 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pshufb |
1918 | 
1919 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
1920 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
1921 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
1922 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
1923 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
1924 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
1925 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
1926 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
1927 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
1928 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
1929 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
1930 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
1931 | 
1932 | // FIXME: This just applies the type legalization cost rules above |
1933 | // assuming these completely split. |
1934 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
1935 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
1936 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
1937 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
1938 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
1939 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
1940 | 
1941 | {.ISD: TTI::SK_Select, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
1942 | {.ISD: TTI::SK_Select, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
1943 | {.ISD: TTI::SK_Select, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
1944 | {.ISD: TTI::SK_Select, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmpd |
1945 | {.ISD: TTI::SK_Select, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmps |
1946 | {.ISD: TTI::SK_Select, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmq |
1947 | {.ISD: TTI::SK_Select, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmd |
1948 | }; |
1949 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
1950 | if (ST->hasAVX512()) |
1951 | if (const auto *Entry = CostTableLookup(Table: AVX512ShuffleTbl, ISD: Kind, Ty: LT.second)) |
1952 | if (auto KindCost = Entry->Cost[CostKind]) |
1953 | return LT.first * *KindCost; |
1954 | |
// Costs for permutes on 256-bit types, consulted only when IsInLaneShuffle
// is set and ST->hasAVX2() is true. Each entry maps a (shuffle kind, vector
// type) pair to four costs: reciprocal throughput, latency, code size, and
// size-and-latency. This table is searched before AVX2ShuffleTbl, so an
// in-lane hit here takes precedence over the general AVX2 cost.
// NOTE(review): the SK_PermuteTwoSrc rows are costed at 2 while their
// comments list three instructions -- confirm which one is intended.
1955 | static const CostKindTblEntry AVX2InLaneShuffleTbl[] = { |
1956 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
1957 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
1958 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
1959 | 
1960 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufpd + vblendpd |
1961 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufps + vblendps |
1962 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufd + vpblendd |
1963 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufd + vpblendd |
1964 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufb + vpor |
1965 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufb + vpor |
1966 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpshufb + vpor |
1967 | }; |
1968 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
1969 | if (IsInLaneShuffle && ST->hasAVX2()) |
1970 | if (const auto *Entry = |
1971 | CostTableLookup(Table: AVX2InLaneShuffleTbl, ISD: Kind, Ty: LT.second)) |
1972 | if (auto KindCost = Entry->Cost[CostKind]) |
1973 | return LT.first * *KindCost; |
1974 | |
// Shuffle costs for 256-bit types, consulted when ST->hasAVX2() is true.
// Each entry maps a (shuffle kind, vector type) pair to four costs:
// reciprocal throughput, latency, code size, and size-and-latency. Trailing
// comments name the instruction sequence the cost is modelled on; the
// 16-bit and 8-bit element permute rows carry no such comment.
1975 | static const CostKindTblEntry AVX2ShuffleTbl[] = { |
1976 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastsd |
1977 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastss |
1978 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastq |
1979 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastd |
1980 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1981 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1982 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
1983 | 
1984 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1985 | { .ISD: TTI::SK_Reverse, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1986 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1987 | { .ISD: TTI::SK_Reverse, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1988 | { .ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + pshufb |
1989 | { .ISD: TTI::SK_Reverse, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + pshufb |
1990 | { .ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + pshufb |
1991 | 
1992 | { .ISD: TTI::SK_Select, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpblendvb |
1993 | { .ISD: TTI::SK_Select, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpblendvb |
1994 | { .ISD: TTI::SK_Select, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpblendvb |
1995 | 
1996 | { .ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
1997 | { .ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
1998 | { .ISD: TTI::SK_Splice, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
1999 | { .ISD: TTI::SK_Splice, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
2000 | { .ISD: TTI::SK_Splice, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2i128 + vpalignr |
2001 | 
2002 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
2003 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
2004 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
2005 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
2006 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
2007 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
2008 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
2009 | 
2010 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermpd + vblendpd |
2011 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermps + vblendps |
2012 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermq + vpblendd |
2013 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // 2*vpermd + vpblendd |
2014 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
2015 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
2016 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
2017 | }; |
2018 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
2019 | if (ST->hasAVX2()) |
2020 | if (const auto *Entry = CostTableLookup(Table: AVX2ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2021 | if (auto KindCost = Entry->Cost[CostKind]) |
2022 | return LT.first * *KindCost; |
2023 | |
// Permute costs consulted when ST->hasXOP() is true. Each entry maps a
// (shuffle kind, vector type) pair to four costs: reciprocal throughput,
// latency, code size, and size-and-latency. Only SK_PermuteSingleSrc and
// SK_PermuteTwoSrc are listed; every other kind, and any type not listed
// here, falls through to the tables that follow.
2024 | static const CostKindTblEntry XOPShuffleTbl[] = { |
2025 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2pd |
2026 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2ps |
2027 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2pd |
2028 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // vperm2f128 + vpermil2ps |
2029 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*vpperm |
2030 | // + vinsertf128 |
2031 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*vpperm |
2032 | // + vinsertf128 |
2033 | 
2034 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 6*vpperm |
2035 | // + vinsertf128 |
2036 | 
2037 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpperm |
2038 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 6*vpperm |
2039 | // + vinsertf128 |
2040 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpperm |
2041 | }; |
2042 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
2043 | if (ST->hasXOP()) |
2044 | if (const auto *Entry = CostTableLookup(Table: XOPShuffleTbl, ISD: Kind, Ty: LT.second)) |
2045 | if (auto KindCost = Entry->Cost[CostKind]) |
2046 | return LT.first * *KindCost; |
2047 | |
// Costs for permutes on 256-bit types, consulted only when IsInLaneShuffle
// is set and ST->hasAVX() is true. Each entry maps a (shuffle kind, vector
// type) pair to four costs: reciprocal throughput, latency, code size, and
// size-and-latency. This table is searched before AVX1ShuffleTbl, so an
// in-lane hit here takes precedence over the general AVX1 cost.
// NOTE(review): some costs do not match the instruction count in their
// comment: SK_PermuteSingleSrc on v16i16/v16f16/v32i8 is costed at 4 with
// five instructions listed, and SK_PermuteTwoSrc on the 64-bit and 32-bit
// element types is costed at 2 with three listed -- confirm which is intended.
2048 | static const CostKindTblEntry AVX1InLaneShuffleTbl[] = { |
2049 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilpd |
2050 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilpd |
2051 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilps |
2052 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermilps |
2053 | 
2054 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*pshufb |
2055 | // + vpor + vinsertf128 |
2056 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*pshufb |
2057 | // + vpor + vinsertf128 |
2058 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // vextractf128 + 2*pshufb |
2059 | // + vpor + vinsertf128 |
2060 | 
2061 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufpd + vblendpd |
2062 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vshufps + vblendps |
2063 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpermilpd + vblendpd |
2064 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // 2*vpermilps + vblendps |
2065 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 4*pshufb |
2066 | // + 2*vpor + vinsertf128 |
2067 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 4*pshufb |
2068 | // + 2*vpor + vinsertf128 |
2069 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, // 2*vextractf128 + 4*pshufb |
2070 | // + 2*vpor + vinsertf128 |
2071 | }; |
2072 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
2073 | if (IsInLaneShuffle && ST->hasAVX()) |
2074 | if (const auto *Entry = |
2075 | CostTableLookup(Table: AVX1InLaneShuffleTbl, ISD: Kind, Ty: LT.second)) |
2076 | if (auto KindCost = Entry->Cost[CostKind]) |
2077 | return LT.first * *KindCost; |
2078 | |
// Shuffle costs for 256-bit types, consulted when ST->hasAVX() is true.
// Each entry maps a (shuffle kind, vector type) pair to four costs:
// reciprocal throughput, latency, code size, and size-and-latency. Trailing
// comments spell out the instruction sequence being counted; in this table
// every listed cost equals the number of instructions in its comment.
2079 | static const CostKindTblEntry AVX1ShuffleTbl[] = { |
2080 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
2081 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
2082 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
2083 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
2084 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpshuflw + vpshufd + vinsertf128 |
2085 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpshuflw + vpshufd + vinsertf128 |
2086 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vpshufb + vinsertf128 |
2087 | 
2088 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
2089 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
2090 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilpd |
2091 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vpermilps |
2092 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // vextractf128 + 2*pshufb |
2093 | // + vinsertf128 |
2094 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // vextractf128 + 2*pshufb |
2095 | // + vinsertf128 |
2096 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // vextractf128 + 2*pshufb |
2097 | // + vinsertf128 |
2098 | 
2099 | {.ISD: TTI::SK_Select, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendpd |
2100 | {.ISD: TTI::SK_Select, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendpd |
2101 | {.ISD: TTI::SK_Select, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendps |
2102 | {.ISD: TTI::SK_Select, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // vblendps |
2103 | {.ISD: TTI::SK_Select, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpand + vpandn + vpor |
2104 | {.ISD: TTI::SK_Select, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpand + vpandn + vpor |
2105 | {.ISD: TTI::SK_Select, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // vpand + vpandn + vpor |
2106 | 
2107 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vshufpd |
2108 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vshufpd |
2109 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
2110 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
2111 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i16, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
2112 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f16, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
2113 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 5,.LatencyCost: 5,.CodeSizeCost: 5,.SizeAndLatencyCost: 5}}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
2114 | 
2115 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vshufpd |
2116 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2}}, // vperm2f128 + vshufpd |
2117 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
2118 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
2119 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16,.Cost: {.RecipThroughputCost: 8,.LatencyCost: 8,.CodeSizeCost: 8,.SizeAndLatencyCost: 8}}, // vextractf128 + 4*pshufb |
2120 | // + 2*vpor + vinsertf128 |
2121 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16,.Cost: {.RecipThroughputCost: 8,.LatencyCost: 8,.CodeSizeCost: 8,.SizeAndLatencyCost: 8}}, // vextractf128 + 4*pshufb |
2122 | // + 2*vpor + vinsertf128 |
2123 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 8,.LatencyCost: 8,.CodeSizeCost: 8,.SizeAndLatencyCost: 8}}, // vextractf128 + 4*pshufb |
2124 | // + 2*vpor + vinsertf128 |
2125 | 
2126 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // 2*vperm2f128 + vshufpd |
2127 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: {.RecipThroughputCost: 3,.LatencyCost: 3,.CodeSizeCost: 3,.SizeAndLatencyCost: 3}}, // 2*vperm2f128 + vshufpd |
2128 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
2129 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: {.RecipThroughputCost: 4,.LatencyCost: 4,.CodeSizeCost: 4,.SizeAndLatencyCost: 4}}, // 2*vperm2f128 + 2*vshufps |
2130 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16,.Cost: {.RecipThroughputCost: 15,.LatencyCost: 15,.CodeSizeCost: 15,.SizeAndLatencyCost: 15}}, // 2*vextractf128 + 8*pshufb |
2131 | // + 4*vpor + vinsertf128 |
2132 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16,.Cost: {.RecipThroughputCost: 15,.LatencyCost: 15,.CodeSizeCost: 15,.SizeAndLatencyCost: 15}}, // 2*vextractf128 + 8*pshufb |
2133 | // + 4*vpor + vinsertf128 |
2134 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: {.RecipThroughputCost: 15,.LatencyCost: 15,.CodeSizeCost: 15,.SizeAndLatencyCost: 15}}, // 2*vextractf128 + 8*pshufb |
2135 | // + 4*vpor + vinsertf128 |
2136 | }; |
2137 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
2138 | if (ST->hasAVX()) |
2139 | if (const auto *Entry = CostTableLookup(Table: AVX1ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2140 | if (auto KindCost = Entry->Cost[CostKind]) |
2141 | return LT.first * *KindCost; |
2142 | |
// Select (blend) costs for 128-bit types, consulted when ST->hasSSE41() is
// true. Each entry maps a (shuffle kind, vector type) pair to four costs:
// reciprocal throughput, latency, code size, and size-and-latency. Only
// SK_Select is listed, each row at cost 1; all other kinds fall through to
// the tables that follow.
2143 | static const CostKindTblEntry SSE41ShuffleTbl[] = { |
2144 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
2145 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // movsd |
2146 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
2147 | {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // blendps |
2148 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
2149 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}}, // pblendw |
2150 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1}} // pblendvb |
2151 | }; |
2152 | 
// On a hit that carries a cost for CostKind, return that cost multiplied by
// LT.first; otherwise fall through to the next table.
2153 | if (ST->hasSSE41()) |
2154 | if (const auto *Entry = CostTableLookup(Table: SSE41ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2155 | if (auto KindCost = Entry->Cost[CostKind]) |
2156 | return LT.first * *KindCost; |
2157 | |
2158 | static const CostKindTblEntry SSSE3ShuffleTbl[] = { |
2159 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2160 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2161 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2162 | |
2163 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2164 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2165 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2166 | |
2167 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
2168 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
2169 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
2170 | |
2171 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
2172 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
2173 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
2174 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
2175 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // palignr |
2176 | |
2177 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2178 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2179 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufb |
2180 | |
2181 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
2182 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
2183 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // 2*pshufb + por |
2184 | }; |
2185 | |
2186 | if (ST->hasSSSE3()) |
2187 | if (const auto *Entry = CostTableLookup(Table: SSSE3ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2188 | if (auto KindCost = Entry->Cost[CostKind]) |
2189 | return LT.first * *KindCost; |
2190 | |
2191 | static const CostKindTblEntry SSE2ShuffleTbl[] = { |
2192 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2193 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
2194 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
2195 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // pshuflw + pshufd |
2196 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // pshuflw + pshufd |
2197 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // unpck + pshuflw + pshufd |
2198 | |
2199 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2200 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
2201 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
2202 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pshuflw + pshufhw + pshufd |
2203 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pshuflw + pshufhw + pshufd |
2204 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 9, .SizeAndLatencyCost: 9}}, // 2*pshuflw + 2*pshufhw |
2205 | // + 2*pshufd + 2*unpck + packus |
2206 | |
2207 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // movsd |
2208 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // movsd |
2209 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // 2*shufps |
2210 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pand + pandn + por |
2211 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pand + pandn + por |
2212 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // pand + pandn + por |
2213 | |
2214 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2215 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2216 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // 2*{unpck,movsd,pshufd} |
2217 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // psrldq + pslldq + por |
2218 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // psrldq + pslldq + por |
2219 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3}}, // psrldq + pslldq + por |
2220 | |
2221 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2222 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
2223 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // pshufd |
2224 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5}}, // 2*pshuflw + 2*pshufhw |
2225 | // + pshufd/unpck |
2226 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5}}, // 2*pshuflw + 2*pshufhw |
2227 | // + pshufd/unpck |
2228 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 8, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 10}}, // 2*pshuflw + 2*pshufhw |
2229 | // + 2*pshufd + 2*unpck + 2*packus |
2230 | |
2231 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2f64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2232 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i64, .Cost: {.RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1}}, // shufpd |
2233 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i32, .Cost: {.RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2}}, // 2*{unpck,movsd,pshufd} |
2234 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: {.RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 8, .SizeAndLatencyCost: 8}}, // blend+permute |
2235 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f16, .Cost: {.RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 8, .SizeAndLatencyCost: 8}}, // blend+permute |
2236 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 13, .CodeSizeCost: 13, .SizeAndLatencyCost: 13}}, // blend+permute |
2237 | }; |
2238 | |
2239 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { |
2240 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 0}, // broadcast handled by movddup |
2241 | }; |
2242 | |
2243 | if (ST->hasSSE2()) { |
2244 | bool IsLoad = |
2245 | llvm::any_of(Range&: Args, P: [](const auto &V) { return isa<LoadInst>(V); }); |
2246 | if (ST->hasSSE3() && IsLoad) |
2247 | if (const auto *Entry = |
2248 | CostTableLookup(Table: SSE3BroadcastLoadTbl, ISD: Kind, Ty: LT.second)) { |
2249 | assert(isLegalBroadcastLoad(SrcTy->getElementType(), |
2250 | LT.second.getVectorElementCount()) && |
2251 | "Table entry missing from isLegalBroadcastLoad()" ); |
2252 | return LT.first * Entry->Cost; |
2253 | } |
2254 | |
2255 | if (const auto *Entry = CostTableLookup(Table: SSE2ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2256 | if (auto KindCost = Entry->Cost[CostKind]) |
2257 | return LT.first * *KindCost; |
2258 | } |
2259 | |
2260 | static const CostKindTblEntry SSE1ShuffleTbl[] = { |
2261 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1} }, // shufps |
2262 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1} }, // shufps |
2263 | { .ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2} }, // 2*shufps |
2264 | { .ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2} }, // 2*shufps |
2265 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 1,.LatencyCost: 1,.CodeSizeCost: 1,.SizeAndLatencyCost: 1} }, // shufps |
2266 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f32, .Cost: {.RecipThroughputCost: 2,.LatencyCost: 2,.CodeSizeCost: 2,.SizeAndLatencyCost: 2} }, // 2*shufps |
2267 | }; |
2268 | |
2269 | if (ST->hasSSE1()) { |
2270 | if (LT.first == 1 && LT.second == MVT::v4f32 && Mask.size() == 4) { |
2271 | // SHUFPS: each pair of elements must come from a single source register. |
2272 | auto MatchSHUFPS = [](int X, int Y) { |
2273 | return X < 0 || Y < 0 || ((X & 4) == (Y & 4)); |
2274 | }; |
2275 | if (MatchSHUFPS(Mask[0], Mask[1]) && MatchSHUFPS(Mask[2], Mask[3])) |
2276 | return 1; |
2277 | } |
2278 | if (const auto *Entry = CostTableLookup(Table: SSE1ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2279 | if (auto KindCost = Entry->Cost[CostKind]) |
2280 | return LT.first * *KindCost; |
2281 | } |
2282 | |
2283 | return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, |
2284 | SubTp); |
2285 | } |
2286 | |
2287 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
2288 | Type *Src, |
2289 | TTI::CastContextHint CCH, |
2290 | TTI::TargetCostKind CostKind, |
2291 | const Instruction *I) const { |
2292 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
2293 | assert(ISD && "Invalid opcode" ); |
2294 | |
2295 | // The cost tables include both specific, custom (non-legal) src/dst type |
2296 | // conversions and generic, legalized types. We test for the custom entries |
2297 | // first, before falling back to legalization. |
2298 | // FIXME: Need a better cost table design to handle non-simple types, which |
2299 | // can form massive numbers of combinations (elem_num x src_type x dst_type). |
2300 | static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{ |
2301 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2302 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2303 | |
2304 | // Mask sign extend has an instruction. |
2305 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2306 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2307 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2308 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2309 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2310 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2311 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2312 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2313 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2314 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2315 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2316 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2317 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2318 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2319 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2320 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v64i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2321 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2322 | |
2323 | // Mask zero extend is a sext + shift. |
2324 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2325 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2326 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2327 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2328 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2329 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2330 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2331 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2332 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2333 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2334 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2335 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2336 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2337 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2338 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2339 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v64i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2340 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2341 | |
2342 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2343 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2344 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2345 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2346 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2347 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2348 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2349 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2350 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2351 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2352 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2353 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2354 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2355 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2356 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2357 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2358 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2359 | |
2360 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2361 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // widen to zmm |
2362 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
2363 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
2364 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
2365 | }; |
2366 | |
2367 | static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = { |
2368 | // Mask sign extend has an instruction. |
2369 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2370 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2371 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2372 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2373 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2374 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2375 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2376 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2377 | |
2378 | // Mask zero extend is a sext + shift. |
2379 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2380 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2381 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2382 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2383 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2384 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2385 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2386 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2387 | |
2388 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2389 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2390 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2391 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2392 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2393 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2394 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2395 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2396 | |
2397 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2398 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2399 | |
2400 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2401 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2402 | |
2403 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2404 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i64, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2405 | |
2406 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2407 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i64, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2408 | }; |
2409 | |
2410 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and |
2411 | // 256-bit wide vectors. |
2412 | |
2413 | static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = { |
2414 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2415 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2416 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v16f64, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // 2*vcvtps2pd+vextractf64x4 |
2417 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v16f32, .Src: MVT::v16f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps |
2418 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps+vcvtps2pd |
2419 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2420 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v16f16, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtps2ph |
2421 | |
2422 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2423 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2424 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2425 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2426 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
2427 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
2428 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
2429 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2430 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd |
2431 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd |
2432 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd |
2433 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd |
2434 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpsllq+vptestmq |
2435 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpsllq+vptestmq |
2436 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq |
2437 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
2438 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
2439 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
2440 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
2441 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
2442 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdw |
2443 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdw |
2444 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
2445 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb |
2446 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
2447 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
2448 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
2449 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
2450 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
2451 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
2452 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
2453 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqd |
2454 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpmovqd |
2455 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },// 2*vpmovqd+concat+vpmovdb |
2456 | |
2457 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // extend to v16i32 |
2458 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2459 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2460 | |
2461 | // Sign extend is zmm vpternlogd+vptruncdb. |
2462 | // Zero extend is zmm broadcast load+vptruncdw. |
2463 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2464 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2465 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2466 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2467 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2468 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2469 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2470 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2471 | |
2472 | // Sign extend is zmm vpternlogd+vptruncdw. |
2473 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. |
2474 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2475 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2476 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2477 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2478 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2479 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2480 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2481 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2482 | |
2483 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd |
2484 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld |
2485 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd |
2486 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld |
2487 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd |
2488 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld |
2489 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq |
2490 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq+psrlq |
2491 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq |
2492 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq+psrlq |
2493 | |
2494 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd |
2495 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld |
2496 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
2497 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq |
2498 | |
2499 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2500 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2501 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2502 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2503 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2504 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2505 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2506 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2507 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2508 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2509 | |
2510 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // FIXME: May not be right |
2511 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // FIXME: May not be right |
2512 | |
2513 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2514 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2515 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2516 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2517 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2518 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2519 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2520 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2521 | |
2522 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2523 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2524 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2525 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2526 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2527 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2528 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2529 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2530 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: {.RecipThroughputCost: 26, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2531 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2532 | |
2533 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2534 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2535 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v32f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2536 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v64i8, .Src: MVT::v64f32, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2537 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v64i8, .Src: MVT::v64f64, .Cost: {.RecipThroughputCost: 31, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2538 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2539 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2540 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i16, .Src: MVT::v32f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2541 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i16, .Src: MVT::v32f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2542 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2543 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2544 | |
2545 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2546 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2547 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2548 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2549 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2550 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2551 | }; |
2552 | |
2553 | static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] = { // Conversion costs per (ISD opcode, Dst type, Src type); name suggests AVX512BW+VL, lookup site is outside this view. Only RecipThroughputCost varies here; the other three cost kinds are 1 for every entry.
2554 | // Mask (vXi1) sign extend to i8/i16 elements has an instruction (presumably vpmovm2b/vpmovm2w - confirm), hence a throughput cost of 1.
2555 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2556 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Dst has more lanes than Src: presumably the widened (legalized) result type - TODO confirm against the lookup code
2557 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2558 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2559 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2560 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2561 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2562 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2563 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2564 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2565 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2566 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2567 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2568 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2569 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2570 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2571 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2572 | 
2573 | // Mask zero extend is a sext + shift, so each entry costs one more than the matching sign extend above (2 vs 1).
2574 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2575 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2576 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2577 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2578 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2579 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2580 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2581 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2582 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2583 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2584 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2585 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2586 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2587 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2588 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2589 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2590 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2591 | // Truncate of i8/i16 vectors down to a mask (vXi1): throughput cost 2 for every entry. NOTE(review): the instruction sequence behind this cost is not stated here - confirm.
2592 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2593 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2594 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2595 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2596 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2597 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2598 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2599 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2600 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2601 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2602 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2603 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2604 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2605 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2606 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2607 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2608 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2609 | // Integer narrowing v16i16 -> v16i8 (not a mask conversion), also throughput cost 2.
2610 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
2611 | };
2612 | |
2613 | static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = { |
2614 | // Mask sign extend has an instruction. |
2615 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2616 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2617 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2618 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2619 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2620 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2621 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2622 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2623 | |
2624 | // Mask zero extend is a sext + shift. |
2625 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2626 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2627 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2628 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2629 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2630 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2631 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2632 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2633 | |
2634 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2635 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2636 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2637 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2638 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2639 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2640 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2641 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2642 | |
2643 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2644 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2645 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2646 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2647 | |
2648 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2649 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2650 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2651 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2652 | |
2653 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2654 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2655 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2656 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i64, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2657 | |
2658 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2659 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2660 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2661 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i64, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2662 | }; |
2663 | |
// Conversion cost table named for the AVX512VL feature level. Each row pairs
// an ISD conversion opcode with a destination MVT and a source MVT, followed
// by four costs: reciprocal throughput, latency, code size, and
// size-and-latency. Only the reciprocal-throughput value varies in this
// table; every row carries 1 for the other three costs.
// NOTE(review): the lookup that consumes this table is outside this chunk;
// presumably it is only consulted when the subtarget has AVX512VL - confirm.
2664 | static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = { |
// Truncations to vXi1 mask vectors. The trailing comments name the
// instruction sequence being costed.
2665 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2666 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2667 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd |
2668 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // split+2*v8i8 |
2669 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
2670 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
2671 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq |
2672 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // split+2*v8i16 |
2673 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd |
2674 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd |
2675 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd |
2676 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd |
2677 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq |
2678 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq |
// Truncations between integer element widths.
2679 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqd |
2680 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb |
2681 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw |
2682 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb |
2683 | |
// Extensions from vXi1 masks to i8 elements.
2684 | // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb |
2685 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb |
2686 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2687 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2688 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2689 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2690 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2691 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2692 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2693 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2694 | |
// Extensions from vXi1 masks to i16 elements.
2695 | // sign extend is vpcmpeq+maskedmove+vpmovdw |
2696 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw |
2697 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2698 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2699 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2700 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2701 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2702 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2703 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2704 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2705 | |
// Extensions from vXi1 masks to i32 elements.
2706 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd |
2707 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld |
2708 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd |
2709 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld |
2710 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd |
2711 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld |
2712 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd |
2713 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld |
2714 | |
// Extensions from vXi1 masks to i64 elements.
2715 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
2716 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq |
2717 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
2718 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq |
2719 | |
// Integer-to-wider-integer extensions; every row in this group costs 1.
2720 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2721 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2722 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2723 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2724 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2725 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2726 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2727 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2728 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2729 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2730 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2731 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2732 | |
// Signed integer to floating point.
2733 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2734 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2735 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2736 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2737 | |
// Unsigned integer to floating point (scalar i64 and vector sources).
2738 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2739 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2740 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2741 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2742 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2743 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2744 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2745 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2746 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2747 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2748 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2749 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2750 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2751 | |
// Floating point to signed integer (i8 element destinations).
2752 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2753 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2754 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v32f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2755 | |
// Floating point to unsigned integer (scalar i64 and i32-element vectors).
2756 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2757 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2758 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2759 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2760 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2761 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2762 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2763 | }; |
2764 | |
// Conversion cost table named for the AVX2 feature level. Each row pairs an
// ISD conversion opcode with a destination MVT and a source MVT, followed by
// four costs: reciprocal throughput, latency, code size, and
// size-and-latency. Only the reciprocal-throughput value varies in this
// table; every row carries 1 for the other three costs.
// The file header lists Haswell / Ryzen as the example CPUs for AVX2 and says
// cost numbers usually correspond to the CPU where a feature first appeared.
// NOTE(review): the lookup that consumes this table is outside this chunk;
// presumably it is only consulted when the subtarget has AVX2 - confirm.
2765 | static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = { |
// Extensions from vXi1 mask vectors.
2766 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2767 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2768 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2769 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2770 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2771 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2772 | |
// Integer-to-wider-integer extensions.
2773 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2774 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2775 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2776 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2777 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2778 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2779 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2780 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2781 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2782 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2783 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2784 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2785 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2786 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2787 | |
// Truncation to a vXi1 mask vector.
2788 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2789 | |
// Truncations between integer element widths.
2790 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2791 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2792 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2793 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2794 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2795 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2796 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2797 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2798 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2799 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2800 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2801 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2802 | |
// Floating-point extend and round between v8f32 and v8f64.
2803 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2804 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2805 | |
// Floating point to signed integer.
2806 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2807 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2808 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2809 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2810 | |
// Floating point to unsigned integer (scalar i64 and vector destinations).
2811 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2812 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2813 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2814 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2815 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2816 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2817 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2818 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2819 | |
// Signed integer to floating point.
2820 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2821 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2822 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2823 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2824 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2825 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2826 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2827 | |
// Unsigned integer to floating point.
2828 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2829 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2830 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2831 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2832 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2833 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2834 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2835 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2836 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2837 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2838 | }; |
2839 | |
2840 | static const TypeConversionCostKindTblEntry AVXConversionTbl[] = { |
2841 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2842 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2843 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2844 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2845 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2846 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2847 | |
2848 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2849 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2850 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2851 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2852 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2853 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2854 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2855 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2856 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2857 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2858 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2859 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2860 | |
2861 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2862 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2863 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2864 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2865 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2866 | |
2867 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2868 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2869 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // and+extract+packuswb |
2870 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2871 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2872 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2873 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // and+extract+2*packusdw |
2874 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2875 | |
2876 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2877 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2878 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2879 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2880 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2881 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2882 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2883 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2884 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2885 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2886 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2887 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2888 | |
2889 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2890 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2891 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2892 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2893 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2894 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2895 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2896 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2897 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2898 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2899 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2900 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2901 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2902 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2903 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 18, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2904 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2905 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2906 | |
2907 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2908 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2909 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2910 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2911 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2912 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2913 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2914 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2915 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2916 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2917 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2918 | |
2919 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2920 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2921 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v32i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2922 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v32i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2923 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2924 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2925 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2926 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2927 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2928 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2929 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2930 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2931 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2932 | |
2933 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2934 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2935 | }; |
2936 | |
// Conversion cost table named for the SSE4.1 feature level. Each entry keys on
// an ISD opcode plus a destination (Dst) and source (Src) machine value type,
// and carries four costs: reciprocal throughput, latency, code size, and
// combined size-and-latency.
// In this table only the reciprocal-throughput figure varies; the latency,
// code-size and size-and-latency costs are 1 for every entry.
// The file header lists Penryn as the reference CPU for SSE 4.1.
// NOTE(review): the lookup site is outside this chunk -- presumably consulted
// when the subtarget reports SSE4.1; confirm against the caller.
2937 | static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = { |
  // Integer zero/sign extensions between 128-bit vector types: all cost 1.
2938 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2939 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2940 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2941 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2942 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2943 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2944 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2945 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2946 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2947 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2948 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2949 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2950 | |
2951 | // These truncates end up widening elements. |
2952 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZBQ |
2953 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZWQ |
2954 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZBD |
2955 | |
  // Truncates producing v16i8 / v8i16 results: cost 2 each.
2956 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2957 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2958 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2959 | |
  // Signed integer -> FP (scalar and vector): cost 1, except
  // v4i32 -> v4f64 which costs 2.
2960 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2961 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2962 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2963 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2964 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2965 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2966 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2967 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2968 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2969 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2970 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2971 | |
  // Unsigned integer -> FP: i32-element vector sources cost 2-3, while
  // i64 / v2i64 / v4i64 sources are the expensive cases (4 up to 22).
2972 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2973 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2974 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2975 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2976 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2977 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2978 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2979 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2980 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2981 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2982 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2983 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2984 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 22, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2985 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2986 | |
  // FP -> signed integer: cost 1, except v16i8 destinations which cost 2.
2987 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2988 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2989 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2990 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2991 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2992 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2993 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2994 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2995 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2996 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2997 | |
  // FP -> unsigned integer: i64 and v4i32 destinations cost 4, v16i8
  // destinations cost 2, and the remaining entries cost 1.
2998 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2999 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3000 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3001 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3002 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3003 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3004 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3005 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3006 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3007 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3008 | }; |
3009 | |
3010 | static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = { |
3011 | // These are somewhat magic numbers justified by comparing the |
3012 | // output of llvm-mca for our various supported scheduler models |
3013 | // and basing it off the worst case scenario. |
3014 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3015 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3016 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3017 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3018 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3019 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3020 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3021 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3022 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3023 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3024 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3025 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3026 | |
3027 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3028 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3029 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3030 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3031 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3032 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3033 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3034 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3035 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3036 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3037 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3038 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3039 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 18, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3040 | |
3041 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3042 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3043 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3044 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3045 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3046 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3047 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3048 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3049 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3050 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3051 | |
3052 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3053 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3054 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3055 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3056 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3057 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3058 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3059 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3060 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3061 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3062 | |
3063 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3064 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3065 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3066 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3067 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3068 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3069 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3070 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3071 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3072 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3073 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3074 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3075 | |
3076 | // These truncates are really widening elements. |
3077 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD |
3078 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLWD+DQ |
3079 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW+WD+PSHUFD |
3080 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLWD |
3081 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW+WD |
3082 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW |
3083 | |
3084 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+PACKUSWB |
3085 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3086 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+2*PACKUSWB |
3087 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3088 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3089 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3090 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3091 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3092 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+3*PACKUSWB |
3093 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD+PSHUFLW |
3094 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD |
3095 | }; |
3096 | |
3097 | static const TypeConversionCostKindTblEntry F16ConversionTbl[] = { |
3098 | { .ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3099 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3100 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3101 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3102 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps+vcvtps2pd |
3103 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3104 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3105 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vcvtph2ps+vcvtps2pd |
3106 | }; |
3107 | |
3108 | // Attempt to map directly to (simple) MVT types to let us match custom entries. |
3109 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
3110 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
3111 | |
3112 | // The function getSimpleVT only handles simple value types. |
3113 | if (SrcTy.isSimple() && DstTy.isSimple()) { |
3114 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); |
3115 | MVT SimpleDstTy = DstTy.getSimpleVT(); |
3116 | |
3117 | if (ST->useAVX512Regs()) { |
3118 | if (ST->hasBWI()) |
3119 | if (const auto *Entry = ConvertCostTableLookup( |
3120 | Table: AVX512BWConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3121 | if (auto KindCost = Entry->Cost[CostKind]) |
3122 | return *KindCost; |
3123 | |
3124 | if (ST->hasDQI()) |
3125 | if (const auto *Entry = ConvertCostTableLookup( |
3126 | Table: AVX512DQConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3127 | if (auto KindCost = Entry->Cost[CostKind]) |
3128 | return *KindCost; |
3129 | |
3130 | if (ST->hasAVX512()) |
3131 | if (const auto *Entry = ConvertCostTableLookup( |
3132 | Table: AVX512FConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3133 | if (auto KindCost = Entry->Cost[CostKind]) |
3134 | return *KindCost; |
3135 | } |
3136 | |
3137 | if (ST->hasBWI()) |
3138 | if (const auto *Entry = ConvertCostTableLookup( |
3139 | Table: AVX512BWVLConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3140 | if (auto KindCost = Entry->Cost[CostKind]) |
3141 | return *KindCost; |
3142 | |
3143 | if (ST->hasDQI()) |
3144 | if (const auto *Entry = ConvertCostTableLookup( |
3145 | Table: AVX512DQVLConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3146 | if (auto KindCost = Entry->Cost[CostKind]) |
3147 | return *KindCost; |
3148 | |
3149 | if (ST->hasAVX512()) |
3150 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512VLConversionTbl, ISD, |
3151 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3152 | if (auto KindCost = Entry->Cost[CostKind]) |
3153 | return *KindCost; |
3154 | |
3155 | if (ST->hasAVX2()) { |
3156 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX2ConversionTbl, ISD, |
3157 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3158 | if (auto KindCost = Entry->Cost[CostKind]) |
3159 | return *KindCost; |
3160 | } |
3161 | |
3162 | if (ST->hasAVX()) { |
3163 | if (const auto *Entry = ConvertCostTableLookup(Table: AVXConversionTbl, ISD, |
3164 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3165 | if (auto KindCost = Entry->Cost[CostKind]) |
3166 | return *KindCost; |
3167 | } |
3168 | |
3169 | if (ST->hasF16C()) { |
3170 | if (const auto *Entry = ConvertCostTableLookup(Table: F16ConversionTbl, ISD, |
3171 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3172 | if (auto KindCost = Entry->Cost[CostKind]) |
3173 | return *KindCost; |
3174 | } |
3175 | |
3176 | if (ST->hasSSE41()) { |
3177 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE41ConversionTbl, ISD, |
3178 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3179 | if (auto KindCost = Entry->Cost[CostKind]) |
3180 | return *KindCost; |
3181 | } |
3182 | |
3183 | if (ST->hasSSE2()) { |
3184 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE2ConversionTbl, ISD, |
3185 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3186 | if (auto KindCost = Entry->Cost[CostKind]) |
3187 | return *KindCost; |
3188 | } |
3189 | |
3190 | if ((ISD == ISD::FP_ROUND && SimpleDstTy == MVT::f16) || |
3191 | (ISD == ISD::FP_EXTEND && SimpleSrcTy == MVT::f16)) { |
3192 | // fp16 conversions not covered by any table entries require a libcall. |
3193 | // Return a large (arbitrary) number to model this. |
3194 | return InstructionCost(64); |
3195 | } |
3196 | } |
3197 | |
3198 | // Fall back to legalized types. |
3199 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Ty: Src); |
3200 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Ty: Dst); |
3201 | |
3202 | // If we're truncating to the same legalized type - just assume it's free. |
3203 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) |
3204 | return TTI::TCC_Free; |
3205 | |
3206 | if (ST->useAVX512Regs()) { |
3207 | if (ST->hasBWI()) |
3208 | if (const auto *Entry = ConvertCostTableLookup( |
3209 | Table: AVX512BWConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
3210 | if (auto KindCost = Entry->Cost[CostKind]) |
3211 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3212 | |
3213 | if (ST->hasDQI()) |
3214 | if (const auto *Entry = ConvertCostTableLookup( |
3215 | Table: AVX512DQConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
3216 | if (auto KindCost = Entry->Cost[CostKind]) |
3217 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3218 | |
3219 | if (ST->hasAVX512()) |
3220 | if (const auto *Entry = ConvertCostTableLookup( |
3221 | Table: AVX512FConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
3222 | if (auto KindCost = Entry->Cost[CostKind]) |
3223 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3224 | } |
3225 | |
3226 | if (ST->hasBWI()) |
3227 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512BWVLConversionTbl, ISD, |
3228 | Dst: LTDest.second, Src: LTSrc.second)) |
3229 | if (auto KindCost = Entry->Cost[CostKind]) |
3230 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3231 | |
3232 | if (ST->hasDQI()) |
3233 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512DQVLConversionTbl, ISD, |
3234 | Dst: LTDest.second, Src: LTSrc.second)) |
3235 | if (auto KindCost = Entry->Cost[CostKind]) |
3236 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3237 | |
3238 | if (ST->hasAVX512()) |
3239 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512VLConversionTbl, ISD, |
3240 | Dst: LTDest.second, Src: LTSrc.second)) |
3241 | if (auto KindCost = Entry->Cost[CostKind]) |
3242 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3243 | |
3244 | if (ST->hasAVX2()) |
3245 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX2ConversionTbl, ISD, |
3246 | Dst: LTDest.second, Src: LTSrc.second)) |
3247 | if (auto KindCost = Entry->Cost[CostKind]) |
3248 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3249 | |
3250 | if (ST->hasAVX()) |
3251 | if (const auto *Entry = ConvertCostTableLookup(Table: AVXConversionTbl, ISD, |
3252 | Dst: LTDest.second, Src: LTSrc.second)) |
3253 | if (auto KindCost = Entry->Cost[CostKind]) |
3254 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3255 | |
3256 | if (ST->hasF16C()) { |
3257 | if (const auto *Entry = ConvertCostTableLookup(Table: F16ConversionTbl, ISD, |
3258 | Dst: LTDest.second, Src: LTSrc.second)) |
3259 | if (auto KindCost = Entry->Cost[CostKind]) |
3260 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3261 | } |
3262 | |
3263 | if (ST->hasSSE41()) |
3264 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE41ConversionTbl, ISD, |
3265 | Dst: LTDest.second, Src: LTSrc.second)) |
3266 | if (auto KindCost = Entry->Cost[CostKind]) |
3267 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3268 | |
3269 | if (ST->hasSSE2()) |
3270 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE2ConversionTbl, ISD, |
3271 | Dst: LTDest.second, Src: LTSrc.second)) |
3272 | if (auto KindCost = Entry->Cost[CostKind]) |
3273 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3274 | |
3275 | // Fallback: for sitofp/uitofp from integers narrower than 32 bits but wider than i1 (e.g. i8/i16), |
3276 | // cost the conversion as an extend to i32 followed by an i32 sitofp. |
3277 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
3278 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { |
3279 | Type *ExtSrc = Src->getWithNewBitWidth(NewBitWidth: 32); |
3280 | unsigned ExtOpc = |
3281 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; |
3282 | |
3283 | // For scalar loads the extend would be free. |
3284 | InstructionCost ExtCost = 0; |
3285 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(Val: I->getOperand(i: 0)))) |
3286 | ExtCost = getCastInstrCost(Opcode: ExtOpc, Dst: ExtSrc, Src, CCH, CostKind); |
3287 | |
3288 | return ExtCost + getCastInstrCost(Opcode: Instruction::SIToFP, Dst, Src: ExtSrc, |
3289 | CCH: TTI::CastContextHint::None, CostKind); |
3290 | } |
3291 | |
3292 | // Fallback: for fptosi/fptoui to integers narrower than 32 bits but wider than i1 (e.g. i8/i16), |
3293 | // cost the conversion as an fptosi to i32 followed by a truncate. |
3294 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && |
3295 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { |
3296 | Type *TruncDst = Dst->getWithNewBitWidth(NewBitWidth: 32); |
3297 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: TruncDst, Src, CCH, CostKind) + |
3298 | getCastInstrCost(Opcode: Instruction::Trunc, Dst, Src: TruncDst, |
3299 | CCH: TTI::CastContextHint::None, CostKind); |
3300 | } |
3301 | |
3302 | // For reciprocal throughput the cost is scaled by Scale; for every other cost kind it is reduced to a binary value (0 stays 0, anything else becomes Scale). TODO: Allow non-throughput costs that aren't binary. |
3303 | auto AdjustCost = [&CostKind](InstructionCost BaseCost, |
3304 | InstructionCost Scale = 1) -> InstructionCost { |
3305 | if (CostKind == TTI::TCK_RecipThroughput) |
3306 | return BaseCost * Scale; |
3307 | return BaseCost == 0 ? 0 : Scale; |
3308 | }; |
3309 | return AdjustCost( |
3310 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
3311 | } |
3312 | |
3313 | InstructionCost X86TTIImpl::getCmpSelInstrCost( |
3314 | unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, |
3315 | TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, |
3316 | TTI::OperandValueInfo Op2Info, const Instruction *I) const { |
3317 | // Early out if this type isn't scalar/vector integer/float. |
3318 | if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) |
3319 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
3320 | Op1Info, Op2Info, I); |
3321 | |
3322 | // Legalize the type. |
3323 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3324 | |
3325 | MVT MTy = LT.second; |
3326 | |
3327 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3328 | assert(ISD && "Invalid opcode" ); |
3329 | |
3330 | InstructionCost ExtraCost = 0; // Predicate-dependent extra cost; added to the table cost in the lookups below. |
3331 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { |
3332 | // Some vector comparison predicates cost extra instructions. |
3333 | // TODO: Adjust ExtraCost based on CostKind? |
3334 | // TODO: Should we invert this and assume worst case cmp costs |
3335 | // and reduce for particular predicates? |
3336 | if (MTy.isVector() && |
3337 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || |
3338 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || |
3339 | ST->hasBWI())) { |
3340 | // Fall back to the predicate of I if no specific predicate was given. |
3341 | CmpInst::Predicate Pred = VecPred; |
3342 | if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || |
3343 | Pred == CmpInst::BAD_FCMP_PREDICATE)) |
3344 | Pred = cast<CmpInst>(Val: I)->getPredicate(); |
3345 | |
3346 | bool CmpWithConstant = false; |
3347 | if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(Val: I)) |
3348 | CmpWithConstant = isa<Constant>(Val: CmpInstr->getOperand(i_nocapture: 1)); |
3349 | |
3350 | switch (Pred) { |
3351 | case CmpInst::Predicate::ICMP_NE: |
3352 | // xor(cmpeq(x,y),-1) |
3353 | ExtraCost = CmpWithConstant ? 0 : 1; |
3354 | break; |
3355 | case CmpInst::Predicate::ICMP_SGE: |
3356 | case CmpInst::Predicate::ICMP_SLE: |
3357 | // xor(cmpgt(x,y),-1) |
3358 | ExtraCost = CmpWithConstant ? 0 : 1; |
3359 | break; |
3360 | case CmpInst::Predicate::ICMP_ULT: |
3361 | case CmpInst::Predicate::ICMP_UGT: |
3362 | // cmpgt(xor(x,signbit),xor(y,signbit)) |
3363 | // xor(cmpeq(pmaxu(x,y),x),-1) |
3364 | ExtraCost = CmpWithConstant ? 1 : 2; |
3365 | break; |
3366 | case CmpInst::Predicate::ICMP_ULE: |
3367 | case CmpInst::Predicate::ICMP_UGE: |
3368 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || |
3369 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { |
3370 | // cmpeq(psubus(x,y),0) |
3371 | // cmpeq(pminu(x,y),x) |
3372 | ExtraCost = 1; |
3373 | } else { |
3374 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) |
3375 | ExtraCost = CmpWithConstant ? 2 : 3; |
3376 | } |
3377 | break; |
3378 | case CmpInst::Predicate::FCMP_ONE: |
3379 | case CmpInst::Predicate::FCMP_UEQ: |
3380 | // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases. |
3381 | // Use FCMP_UEQ expansion - FCMP_ONE should be the same. |
3382 | if (CondTy && !ST->hasAVX()) |
3383 | return getCmpSelInstrCost(Opcode, ValTy, CondTy, |
3384 | VecPred: CmpInst::Predicate::FCMP_UNO, CostKind, |
3385 | Op1Info, Op2Info) + |
3386 | getCmpSelInstrCost(Opcode, ValTy, CondTy, |
3387 | VecPred: CmpInst::Predicate::FCMP_OEQ, CostKind, |
3388 | Op1Info, Op2Info) + |
3389 | getArithmeticInstrCost(Opcode: Instruction::Or, Ty: CondTy, CostKind); |
3390 | |
3391 | break; |
3392 | case CmpInst::Predicate::BAD_ICMP_PREDICATE: |
3393 | case CmpInst::Predicate::BAD_FCMP_PREDICATE: |
3394 | // Assume worst case scenario and add the maximum extra cost. |
3395 | ExtraCost = 3; |
3396 | break; |
3397 | default: |
3398 | break; |
3399 | } |
3400 | } |
3401 | } |
3402 | |
3403 | static const CostKindTblEntry SLMCostTbl[] = { |
3404 | // (Table is only consulted when ST->useSLMArithCosts() is set.) slm pcmpeq/pcmpgt throughput is 2 |
3405 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3406 | // slm pblendvb/blendvpd/blendvps throughput is 4 |
3407 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // blendvpd |
3408 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // blendvps |
3409 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3410 | { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb - NOTE(review): only 256-bit type in this table and there is no v4i32 entry; possibly meant v4i32 - confirm |
3411 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3412 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3413 | }; |
3414 | |
3415 | static const CostKindTblEntry AVX512BWCostTbl[] = { |
3416 | { .ISD: ISD::SETCC, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3417 | { .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3418 | { .ISD: ISD::SETCC, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3419 | { .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3420 | |
3421 | { .ISD: ISD::SELECT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3422 | { .ISD: ISD::SELECT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3423 | }; |
3424 | |
3425 | static const CostKindTblEntry AVX512CostTbl[] = { |
3426 | { .ISD: ISD::SETCC, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3427 | { .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3428 | { .ISD: ISD::SETCC, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3429 | { .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3430 | |
3431 | { .ISD: ISD::SETCC, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3432 | { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3433 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3434 | { .ISD: ISD::SETCC, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3435 | { .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3436 | { .ISD: ISD::SETCC, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3437 | { .ISD: ISD::SETCC, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3438 | |
3439 | { .ISD: ISD::SELECT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3440 | { .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3441 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3442 | { .ISD: ISD::SELECT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3443 | { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3444 | { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3445 | { .ISD: ISD::SELECT, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3446 | { .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3447 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3448 | { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3449 | { .ISD: ISD::SELECT, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3450 | { .ISD: ISD::SELECT, .Type: MVT::v8f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3451 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3452 | { .ISD: ISD::SELECT, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3453 | |
3454 | { .ISD: ISD::SELECT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3455 | { .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3456 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3457 | { .ISD: ISD::SELECT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3458 | { .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3459 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3460 | }; |
3461 | |
3462 | static const CostKindTblEntry AVX2CostTbl[] = { |
3463 | { .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3464 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3465 | { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3466 | { .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3467 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3468 | { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3469 | |
3470 | { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3471 | { .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3472 | { .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3473 | { .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3474 | |
3475 | { .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd |
3476 | { .ISD: ISD::SELECT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps |
3477 | { .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3478 | { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3479 | { .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3480 | { .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3481 | }; |
3482 | |
3483 | static const CostKindTblEntry XOPCostTbl[] = { |
3484 | { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3485 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3486 | }; |
3487 | |
3488 | static const CostKindTblEntry AVX1CostTbl[] = { |
3489 | { .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3490 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3491 | { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3492 | { .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3493 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3494 | { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3495 | |
3496 | // AVX1 does not support 256-bit integer compares (v4i64/v8i32/v16i16/v32i8). |
3497 | { .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3498 | { .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3499 | { .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3500 | { .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3501 | |
3502 | { .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd |
3503 | { .ISD: ISD::SELECT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps |
3504 | { .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd |
3505 | { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps |
3506 | { .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // vandps + vandnps + vorps |
3507 | { .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // vandps + vandnps + vorps |
3508 | }; |
3509 | |
3510 | static const CostKindTblEntry SSE42CostTbl[] = { |
3511 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3512 | }; |
3513 | |
3514 | static const CostKindTblEntry SSE41CostTbl[] = { |
3515 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3516 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3517 | |
3518 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvpd |
3519 | { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvpd |
3520 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvps |
3521 | { .ISD: ISD::SELECT, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvps |
3522 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3523 | { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3524 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3525 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb |
3526 | }; |
3527 | |
3528 | static const CostKindTblEntry SSE2CostTbl[] = { |
3529 | { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3530 | { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3531 | |
3532 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // pcmpeqd/pcmpgtd expansion |
3533 | { .ISD: ISD::SETCC, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3534 | { .ISD: ISD::SETCC, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3535 | { .ISD: ISD::SETCC, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3536 | |
3537 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andpd + andnpd + orpd |
3538 | { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andpd + andnpd + orpd |
3539 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
3540 | { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
3541 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
3542 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por |
3543 | }; |
3544 | |
3545 | static const CostKindTblEntry SSE1CostTbl[] = { |
3546 | { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3547 | { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3548 | |
3549 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andps + andnps + orps |
3550 | { .ISD: ISD::SELECT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andps + andnps + orps |
3551 | }; |
3552 | |
3553 | if (ST->useSLMArithCosts()) |
3554 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
3555 | if (auto KindCost = Entry->Cost[CostKind]) |
3556 | return LT.first * (ExtraCost + *KindCost); |
3557 | |
3558 | if (ST->hasBWI()) |
3559 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
3560 | if (auto KindCost = Entry->Cost[CostKind]) |
3561 | return LT.first * (ExtraCost + *KindCost); |
3562 | |
3563 | if (ST->hasAVX512()) |
3564 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTbl, ISD, Ty: MTy)) |
3565 | if (auto KindCost = Entry->Cost[CostKind]) |
3566 | return LT.first * (ExtraCost + *KindCost); |
3567 | |
3568 | if (ST->hasAVX2()) |
3569 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTbl, ISD, Ty: MTy)) |
3570 | if (auto KindCost = Entry->Cost[CostKind]) |
3571 | return LT.first * (ExtraCost + *KindCost); |
3572 | |
3573 | if (ST->hasXOP()) |
3574 | if (const auto *Entry = CostTableLookup(Table: XOPCostTbl, ISD, Ty: MTy)) |
3575 | if (auto KindCost = Entry->Cost[CostKind]) |
3576 | return LT.first * (ExtraCost + *KindCost); |
3577 | |
3578 | if (ST->hasAVX()) |
3579 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
3580 | if (auto KindCost = Entry->Cost[CostKind]) |
3581 | return LT.first * (ExtraCost + *KindCost); |
3582 | |
3583 | if (ST->hasSSE42()) |
3584 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTbl, ISD, Ty: MTy)) |
3585 | if (auto KindCost = Entry->Cost[CostKind]) |
3586 | return LT.first * (ExtraCost + *KindCost); |
3587 | |
3588 | if (ST->hasSSE41()) |
3589 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
3590 | if (auto KindCost = Entry->Cost[CostKind]) |
3591 | return LT.first * (ExtraCost + *KindCost); |
3592 | |
3593 | if (ST->hasSSE2()) |
3594 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
3595 | if (auto KindCost = Entry->Cost[CostKind]) |
3596 | return LT.first * (ExtraCost + *KindCost); |
3597 | |
3598 | if (ST->hasSSE1()) |
3599 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTbl, ISD, Ty: MTy)) |
3600 | if (auto KindCost = Entry->Cost[CostKind]) |
3601 | return LT.first * (ExtraCost + *KindCost); |
3602 | |
3603 | // Assume a 3cy latency for fp select ops. |
3604 | if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select) |
3605 | if (ValTy->getScalarType()->isFloatingPointTy()) |
3606 | return 3; |
3607 | |
3608 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
3609 | Op1Info, Op2Info, I); |
3610 | } |
3611 | |
3612 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } |
3613 | |
3614 | InstructionCost |
3615 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
3616 | TTI::TargetCostKind CostKind) const { |
3617 | // Costs should match the codegen from: |
3618 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll |
3619 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll |
3620 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll |
3621 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll |
3622 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll |
3623 | |
3624 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not |
3625 | // specialized in these tables yet. |
3626 | static const CostKindTblEntry AVX512VBMI2CostTbl[] = { |
3627 | { .ISD: ISD::FSHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3628 | { .ISD: ISD::FSHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3629 | { .ISD: ISD::FSHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3630 | { .ISD: ISD::FSHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3631 | { .ISD: ISD::FSHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3632 | { .ISD: ISD::FSHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3633 | { .ISD: ISD::FSHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3634 | { .ISD: ISD::FSHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3635 | { .ISD: ISD::FSHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3636 | { .ISD: ISD::ROTL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3637 | { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3638 | { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3639 | { .ISD: ISD::ROTR, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3640 | { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3641 | { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3642 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3643 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3644 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3645 | }; |
3646 | static const CostKindTblEntry AVX512BITALGCostTbl[] = { |
3647 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3648 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3649 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3650 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3651 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3652 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3653 | }; |
3654 | static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { |
3655 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3656 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3657 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3658 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3659 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3660 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3661 | }; |
3662 | static const CostKindTblEntry AVX512CDCostTbl[] = { |
3663 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3664 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3665 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 27, .CodeSizeCost: 23, .SizeAndLatencyCost: 27 } }, |
3666 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 16, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } }, |
3667 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3668 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3669 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 19, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
3670 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
3671 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3672 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3673 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 15, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
3674 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
3675 | |
3676 | { .ISD: ISD::CTTZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3677 | { .ISD: ISD::CTTZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3678 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
3679 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
3680 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
3681 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
3682 | }; |
3683 | static const CostKindTblEntry AVX512BWCostTbl[] = { |
3684 | { .ISD: ISD::ABS, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3685 | { .ISD: ISD::ABS, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3686 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3687 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3688 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
3689 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3690 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3691 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
3692 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3693 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3694 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
3695 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
3696 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
3697 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 12 } }, |
3698 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3699 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3700 | { .ISD: ISD::BSWAP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3701 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3702 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3703 | { .ISD: ISD::BSWAP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3704 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3705 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3706 | { .ISD: ISD::BSWAP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3707 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 22, .CodeSizeCost: 23, .SizeAndLatencyCost: 23 } }, |
3708 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 23, .CodeSizeCost: 25, .SizeAndLatencyCost: 25 } }, |
3709 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 15, .SizeAndLatencyCost: 16 } }, |
3710 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 9 } }, |
3711 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } }, |
3712 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } }, |
3713 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 10, .SizeAndLatencyCost: 12 } }, |
3714 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
3715 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
3716 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
3717 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
3718 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
3719 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
3720 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
3721 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
3722 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } }, |
3723 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
3724 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
3725 | { .ISD: ISD::CTTZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
3726 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
3727 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
3728 | { .ISD: ISD::CTTZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
3729 | { .ISD: ISD::ROTL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, |
3730 | { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3731 | { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3732 | { .ISD: ISD::ROTL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
3733 | { .ISD: ISD::ROTL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
3734 | { .ISD: ISD::ROTL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
3735 | { .ISD: ISD::ROTR, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, |
3736 | { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3737 | { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3738 | { .ISD: ISD::ROTR, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 12, .SizeAndLatencyCost: 14 } }, |
3739 | { .ISD: ISD::ROTR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 14, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, |
3740 | { .ISD: ISD::ROTR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 14, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, |
3741 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3742 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3743 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3744 | { .ISD: X86ISD::VROTLI, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 9, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
3745 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
3746 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
3747 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3748 | { .ISD: ISD::SADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3749 | { .ISD: ISD::SMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3750 | { .ISD: ISD::SMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3751 | { .ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3752 | { .ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3753 | { .ISD: ISD::SMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3754 | { .ISD: ISD::SMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 21, .CodeSizeCost: 17, .SizeAndLatencyCost: 18 } }, |
3755 | { .ISD: ISD::UMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3756 | { .ISD: ISD::UMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 15, .CodeSizeCost: 15, .SizeAndLatencyCost: 16 } }, |
3757 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3758 | { .ISD: ISD::SSUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3759 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3760 | { .ISD: ISD::UADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3761 | { .ISD: ISD::UMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3762 | { .ISD: ISD::UMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3763 | { .ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3764 | { .ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3765 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3766 | { .ISD: ISD::USUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3767 | }; |
3768 | static const CostKindTblEntry AVX512CostTbl[] = { |
3769 | { .ISD: ISD::ABS, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3770 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3771 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3772 | { .ISD: ISD::ABS, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3773 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3774 | { .ISD: ISD::ABS, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3775 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3776 | { .ISD: ISD::ABS, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3777 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3778 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
3779 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
3780 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
3781 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
3782 | { .ISD: ISD::BSWAP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3783 | { .ISD: ISD::BSWAP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3784 | { .ISD: ISD::BSWAP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3785 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 28, .CodeSizeCost: 32, .SizeAndLatencyCost: 32 } }, |
3786 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 30, .CodeSizeCost: 38, .SizeAndLatencyCost: 38 } }, |
3787 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 15, .CodeSizeCost: 29, .SizeAndLatencyCost: 29 } }, |
3788 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
3789 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
3790 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 19, .CodeSizeCost: 27, .SizeAndLatencyCost: 27 } }, |
3791 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 15, .CodeSizeCost: 22, .SizeAndLatencyCost: 22 } }, |
3792 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 11, .CodeSizeCost: 16, .SizeAndLatencyCost: 16 } }, |
3793 | { .ISD: ISD::CTTZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3794 | { .ISD: ISD::CTTZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3795 | { .ISD: ISD::CTTZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 17, .CodeSizeCost: 27, .SizeAndLatencyCost: 27 } }, |
3796 | { .ISD: ISD::CTTZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 21, .SizeAndLatencyCost: 21 } }, |
3797 | { .ISD: ISD::ROTL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3798 | { .ISD: ISD::ROTL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3799 | { .ISD: ISD::ROTL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3800 | { .ISD: ISD::ROTL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3801 | { .ISD: ISD::ROTL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3802 | { .ISD: ISD::ROTL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3803 | { .ISD: ISD::ROTR, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3804 | { .ISD: ISD::ROTR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3805 | { .ISD: ISD::ROTR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3806 | { .ISD: ISD::ROTR, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3807 | { .ISD: ISD::ROTR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3808 | { .ISD: ISD::ROTR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3809 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3810 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3811 | { .ISD: X86ISD::VROTLI, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3812 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3813 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3814 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3815 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
3816 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3817 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3818 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3819 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3820 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3821 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3822 | { .ISD: ISD::SADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3823 | { .ISD: ISD::SMAX, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3824 | { .ISD: ISD::SMAX, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3825 | { .ISD: ISD::SMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3826 | { .ISD: ISD::SMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3827 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3828 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3829 | { .ISD: ISD::SMIN, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3830 | { .ISD: ISD::SMIN, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3831 | { .ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3832 | { .ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3833 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3834 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3835 | { .ISD: ISD::SMULO, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 44, .CodeSizeCost: 81, .SizeAndLatencyCost: 93 } }, |
3836 | { .ISD: ISD::SMULO, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } }, |
3837 | { .ISD: ISD::SMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
3838 | { .ISD: ISD::SMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 28, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
3839 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 13, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
3840 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
3841 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 14, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
3842 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 14, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
3843 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
3844 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 14, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, |
3845 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3846 | { .ISD: ISD::SSUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3847 | { .ISD: ISD::UMAX, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3848 | { .ISD: ISD::UMAX, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3849 | { .ISD: ISD::UMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3850 | { .ISD: ISD::UMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3851 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3852 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3853 | { .ISD: ISD::UMIN, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3854 | { .ISD: ISD::UMIN, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3855 | { .ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3856 | { .ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3857 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3858 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3859 | { .ISD: ISD::UMULO, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 52, .LatencyCost: 52, .CodeSizeCost: 95, .SizeAndLatencyCost: 104} }, |
3860 | { .ISD: ISD::UMULO, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 12, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } }, |
3861 | { .ISD: ISD::UMULO, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 13, .CodeSizeCost: 16, .SizeAndLatencyCost: 16 } }, |
3862 | { .ISD: ISD::UMULO, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 30, .SizeAndLatencyCost: 30 } }, |
3863 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3864 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3865 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3866 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3867 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3868 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3869 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3870 | { .ISD: ISD::UADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3871 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3872 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3873 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3874 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3875 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3876 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3877 | { .ISD: ISD::USUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
3878 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3879 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3880 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3881 | { .ISD: ISD::FMAXNUM, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3882 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3883 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3884 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3885 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3886 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3887 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3888 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3889 | { .ISD: ISD::FSQRT, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
3890 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3891 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3892 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3893 | { .ISD: ISD::FSQRT, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
3894 | }; |
// XOPCostTbl: static table of per-(opcode, type) cost entries.
// NOTE(review): the code that consults this table is outside this chunk; that
// it is selected for subtargets with the XOP feature is inferred from the
// table name and the "XOP:" note below - confirm at the lookup site.
//
// Each row pairs an ISD / X86ISD opcode and an MVT with four cost values, in
// this order: reciprocal throughput, latency, code size, and combined
// size-and-latency. The rows cover BITREVERSE (vector and scalar integer
// types) and the rotates ROTL, ROTR and X86ISD::VROTLI (vector types only).
3895 | static const CostKindTblEntry XOPCostTbl[] = { |
// BITREVERSE: 256-bit vector types first, then 128-bit vector types, then
// scalar integers. All rows within one width group share the same costs.
3896 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3897 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3898 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3899 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3900 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3901 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3902 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3903 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3904 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
3905 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
3906 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
3907 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, |
// Rotates. In each rotate group below the 256-bit types are listed first and
// carry higher costs than the 128-bit types that follow them.
3908 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) |
3909 | { .ISD: ISD::ROTL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3910 | { .ISD: ISD::ROTL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3911 | { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3912 | { .ISD: ISD::ROTL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3913 | { .ISD: ISD::ROTL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3914 | { .ISD: ISD::ROTL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3915 | { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3916 | { .ISD: ISD::ROTL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
// ROTR rows have the same throughput and latency as the matching ROTL rows but
// larger code-size and size-and-latency costs (presumably accounting for the
// extra SUB(0,Y) in the expansion noted above).
3917 | { .ISD: ISD::ROTR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
3918 | { .ISD: ISD::ROTR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
3919 | { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
3920 | { .ISD: ISD::ROTR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, |
3921 | { .ISD: ISD::ROTR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3922 | { .ISD: ISD::ROTR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3923 | { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3924 | { .ISD: ISD::ROTR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
// X86ISD::VROTLI rows repeat the ROTL costs for the same vector types.
3925 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3926 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3927 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3928 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
3929 | { .ISD: X86ISD::VROTLI, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3930 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3931 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3932 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3933 | }; |
3934 | static const CostKindTblEntry AVX2CostTbl[] = { |
3935 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
3936 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
3937 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3938 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3939 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3940 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3941 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3942 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3943 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3944 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } }, |
3945 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3946 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } }, |
3947 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, |
3948 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } }, |
3949 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
3950 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 15 } }, |
3951 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3952 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3953 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3954 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3955 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3956 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3957 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 18, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } }, |
3958 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 18, .CodeSizeCost: 24, .SizeAndLatencyCost: 44 } }, |
3959 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 20 } }, |
3960 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 34 } }, |
3961 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, |
3962 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } }, |
3963 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } }, |
3964 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 14 } }, |
3965 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } }, |
3966 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
3967 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
3968 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
3969 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
3970 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 11, .SizeAndLatencyCost: 18 } }, |
3971 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
3972 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
3973 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 13 } }, |
3974 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 20 } }, |
3975 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 14, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
3976 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 17, .SizeAndLatencyCost: 24 } }, |
3977 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
3978 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } }, |
3979 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
3980 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 18 } }, |
3981 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
3982 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
3983 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, |
3984 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 7, .SizeAndLatencyCost: 13 } }, |
3985 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3986 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3987 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3988 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3989 | { .ISD: ISD::SMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3990 | { .ISD: ISD::SMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3991 | { .ISD: ISD::SMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3992 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3993 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3994 | { .ISD: ISD::SMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3995 | { .ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3996 | { .ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3997 | { .ISD: ISD::SMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 33, .SizeAndLatencyCost: 37 } }, |
3998 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 13, .SizeAndLatencyCost: 15 } }, |
3999 | { .ISD: ISD::SMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 20, .CodeSizeCost: 13, .SizeAndLatencyCost: 24 } }, |
4000 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
4001 | { .ISD: ISD::SMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 8, .SizeAndLatencyCost: 14 } }, |
4002 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4003 | { .ISD: ISD::SMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 15, .CodeSizeCost: 18, .SizeAndLatencyCost: 35 } }, |
4004 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 22, .CodeSizeCost: 14, .SizeAndLatencyCost: 21 } }, |
4005 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
4006 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
4007 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 14, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } }, |
4008 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 9, .SizeAndLatencyCost: 16 } }, |
4009 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4010 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4011 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4012 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 10 } }, |
4013 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 8 } }, |
4014 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4015 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4016 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
4017 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
4018 | { .ISD: ISD::UMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4019 | { .ISD: ISD::UMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4020 | { .ISD: ISD::UMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4021 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
4022 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
4023 | { .ISD: ISD::UMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4024 | { .ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4025 | { .ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4026 | { .ISD: ISD::UMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 24, .CodeSizeCost: 39, .SizeAndLatencyCost: 43 } }, |
4027 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 10, .CodeSizeCost: 15, .SizeAndLatencyCost: 19 } }, |
4028 | { .ISD: ISD::UMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 23 } }, |
4029 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 12, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
4030 | { .ISD: ISD::UMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 8, .SizeAndLatencyCost: 13 } }, |
4031 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4032 | { .ISD: ISD::UMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 17, .SizeAndLatencyCost: 33 } }, |
4033 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 19, .CodeSizeCost: 13, .SizeAndLatencyCost: 20 } }, |
4034 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4035 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 10 } }, |
4036 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4037 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4038 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4039 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
4040 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
4041 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
4042 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
4043 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
4044 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
4045 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtss |
4046 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtps |
4047 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtps |
4048 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtsd |
4049 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtpd |
4050 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtpd |
4051 | }; |
// Cost table for AVX1: one row per (ISD opcode, MVT type) pair, each carrying
// reciprocal-throughput, latency, code-size and size-and-latency costs.
// Per the file header, AVX-level numbers are modeled on the CPUs that first
// shipped the feature (Sandy Bridge / Jaguar / Bulldozer).
// Rows tagged "2 x 128-bit Op + extract/insert" are costed as two 128-bit
// halves plus the lane extract/insert.
// Every (ISD, Type) key appears exactly once in this table.
// NOTE(review): the lookup helper is presumed to return the first matching
// row, so a repeated key would be unreachable data — confirm against
// CostTableLookup before adding rows.
4052 | static const CostKindTblEntry AVX1CostTbl[] = { |
4053 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
4054 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
4055 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
4056 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
4057 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
4058 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4059 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
4060 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4061 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
4062 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4063 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 15, .CodeSizeCost: 17, .SizeAndLatencyCost: 26 } }, // 2 x 128-bit Op + extract/insert |
4064 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
4065 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
4066 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4067 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
4068 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4069 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
4070 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4071 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 29, .LatencyCost: 33, .CodeSizeCost: 49, .SizeAndLatencyCost: 58 } }, // 2 x 128-bit Op + extract/insert |
4072 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 24, .CodeSizeCost: 24, .SizeAndLatencyCost: 28 } }, |
4073 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 28, .CodeSizeCost: 39, .SizeAndLatencyCost: 48 } }, // 2 x 128-bit Op + extract/insert |
4074 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 20, .CodeSizeCost: 19, .SizeAndLatencyCost: 23 } }, |
4075 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 29, .SizeAndLatencyCost: 38 } }, // 2 x 128-bit Op + extract/insert |
4076 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 16, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
4077 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 15, .CodeSizeCost: 19, .SizeAndLatencyCost: 28 } }, // 2 x 128-bit Op + extract/insert |
4078 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
4079 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 18, .CodeSizeCost: 19, .SizeAndLatencyCost: 28 } }, // 2 x 128-bit Op + extract/insert |
4080 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 14, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
4081 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 27, .SizeAndLatencyCost: 36 } }, // 2 x 128-bit Op + extract/insert |
4082 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 20, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
4083 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 21, .CodeSizeCost: 22, .SizeAndLatencyCost: 31 } }, // 2 x 128-bit Op + extract/insert |
4084 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 18, .CodeSizeCost: 11, .SizeAndLatencyCost: 15 } }, |
4085 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 15, .CodeSizeCost: 16, .SizeAndLatencyCost: 25 } }, // 2 x 128-bit Op + extract/insert |
4086 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
4087 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 22, .CodeSizeCost: 24, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
4088 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 19, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
4089 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 27, .CodeSizeCost: 32, .SizeAndLatencyCost: 41 } }, // 2 x 128-bit Op + extract/insert |
4090 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 24, .CodeSizeCost: 17, .SizeAndLatencyCost: 21 } }, |
4091 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 27, .SizeAndLatencyCost: 36 } }, // 2 x 128-bit Op + extract/insert |
4092 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 21, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
4093 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 18, .CodeSizeCost: 21, .SizeAndLatencyCost: 30 } }, // 2 x 128-bit Op + extract/insert |
4094 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 16, .CodeSizeCost: 11, .SizeAndLatencyCost: 15 } }, |
4095 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
4096 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 20, .CodeSizeCost: 15, .SizeAndLatencyCost: 25 } }, // 2 x 128-bit Op + extract/insert |
4097 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 18, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } }, // 2 x 128-bit Op + extract/insert |
4098 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4099 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4100 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // 2 x 128-bit Op + extract/insert |
4101 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4102 | { .ISD: ISD::SMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4103 | { .ISD: ISD::SMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4104 | { .ISD: ISD::SMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4105 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // 2 x 128-bit Op + extract/insert |
4106 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4107 | { .ISD: ISD::SMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4108 | { .ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4109 | { .ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4110 | { .ISD: ISD::SMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 33, .SizeAndLatencyCost: 37 } }, |
4111 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 9, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
4112 | { .ISD: ISD::SMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 20, .CodeSizeCost: 24, .SizeAndLatencyCost: 29 } }, |
4113 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
4114 | { .ISD: ISD::SMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, |
4115 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4116 | { .ISD: ISD::SMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 37, .SizeAndLatencyCost: 39 } }, |
4117 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 22, .CodeSizeCost: 18, .SizeAndLatencyCost: 21 } }, |
4118 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
4119 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 21, .CodeSizeCost: 18, .SizeAndLatencyCost: 29 } }, // 2 x 128-bit Op + extract/insert |
4120 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19, .CodeSizeCost: 18, .SizeAndLatencyCost: 29 } }, // 2 x 128-bit Op + extract/insert |
4121 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4122 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4123 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4124 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, // 2 x 128-bit Op + extract/insert |
4125 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } }, // 2 x 128-bit Op + extract/insert |
4126 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4127 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4128 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 11, .SizeAndLatencyCost: 17 } }, // 2 x 128-bit Op + extract/insert |
4129 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, |
4130 | { .ISD: ISD::UMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4131 | { .ISD: ISD::UMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4132 | { .ISD: ISD::UMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4133 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 11, .SizeAndLatencyCost: 17 } }, // 2 x 128-bit Op + extract/insert |
4134 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, |
4135 | { .ISD: ISD::UMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4136 | { .ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4137 | { .ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4138 | { .ISD: ISD::UMULO, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 26, .CodeSizeCost: 39, .SizeAndLatencyCost: 45 } }, |
4139 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 12, .CodeSizeCost: 15, .SizeAndLatencyCost: 20 } }, |
4140 | { .ISD: ISD::UMULO, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 15, .CodeSizeCost: 23, .SizeAndLatencyCost: 28 } }, |
4141 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } }, |
4142 | { .ISD: ISD::UMULO, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 14 } }, |
4143 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4144 | { .ISD: ISD::UMULO, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 19, .CodeSizeCost: 35, .SizeAndLatencyCost: 37 } }, |
4145 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 19, .CodeSizeCost: 17, .SizeAndLatencyCost: 20 } }, |
4146 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4147 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } }, // 2 x 128-bit Op + extract/insert |
4148 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2 x 128-bit Op + extract/insert |
4150 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4151 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
4152 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
4153 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
4154 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
4155 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
4156 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
4157 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
4158 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtss |
4159 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtps |
4160 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtps |
4161 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 27, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtsd |
4162 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 27, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtpd |
4163 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 54, .LatencyCost: 54, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtpd |
4164 | }; |
// Cost table for GFNI: one row per (opcode, MVT type) pair, each carrying
// reciprocal-throughput, latency, code-size and size-and-latency costs.
// Covers ISD::BITREVERSE on scalar (i8..i64) and vector integer types, plus
// X86ISD::VROTLI on byte vectors; every row is annotated as a gf2p8affineqb
// lowering.
4165 | static const CostKindTblEntry GFNICostTbl[] = { |
4166 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4167 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // gf2p8affineqb |
4168 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // gf2p8affineqb |
4169 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // gf2p8affineqb |
4170 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
4171 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
4172 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
4173 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4174 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4175 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4176 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4177 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4178 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4179 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4180 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4181 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
4182 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
4183 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
4184 | { .ISD: X86ISD::VROTLI, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
4185 | }; |
// FSQRT cost table named "GLM" (presumably Goldmont — confirm at the site
// that selects this table). Rows cover scalar and 128-bit vector f32/f64 and
// carry reciprocal-throughput, latency, code-size and size-and-latency costs;
// the trailing comment on each row names the sqrt instruction being costed.
4186 | static const CostKindTblEntry GLMCostTbl[] = { |
4187 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtss |
4188 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 37, .LatencyCost: 41, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtps |
4189 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 34, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtsd |
4190 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 67, .LatencyCost: 71, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtpd |
4191 | }; |
// Cost table named "SLM" (presumably Silvermont — confirm at the site that
// selects this table). Holds BSWAP costs for 128-bit integer vectors and
// FSQRT costs for scalar and 128-bit vector f32/f64; each row carries
// reciprocal-throughput, latency, code-size and size-and-latency costs.
4192 | static const CostKindTblEntry SLMCostTbl[] = { |
4193 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4194 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4195 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4196 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtss |
4197 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 40, .LatencyCost: 41, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtps |
4198 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 35, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtsd |
4199 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 70, .LatencyCost: 71, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtpd |
4200 | }; |
// Cost table for SSE4.2: FMAXNUM on scalar and 128-bit vector f32/f64, and
// FSQRT on f32/v4f32 only (no f64/v2f64 FSQRT rows are listed here).
// Each row carries reciprocal-throughput, latency, code-size and
// size-and-latency costs. The FMAXNUM row comments give the max + unordered
// compare + blend sequence being costed; the FSQRT rows cite Nehalem figures,
// matching the file header's note that SSE 4.2 numbers are Nehalem-based.
4201 | static const CostKindTblEntry SSE42CostTbl[] = { |
4202 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
4203 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
4204 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
4205 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
4206 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
4207 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
4208 | }; |
// Cost table for SSE4.1 covering 128-bit integer vectors: ABS (v2i64 only),
// signed/unsigned saturating add and subtract, signed/unsigned min and max,
// and signed/unsigned multiply-with-overflow. Each row carries
// reciprocal-throughput, latency, code-size and size-and-latency costs.
// Per the file header, SSE 4.1 numbers are modeled on Penryn.
// Not every element width is listed for every opcode (e.g. SMAX/SMIN have no
// v8i16 row and UMAX/UMIN have no v16i8 row in this table).
4209 | static const CostKindTblEntry SSE41CostTbl[] = { |
4210 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) |
4211 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 14, .CodeSizeCost: 17, .SizeAndLatencyCost: 21 } }, |
4212 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } }, |
4213 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 19, .CodeSizeCost: 25, .SizeAndLatencyCost: 29 } }, |
4214 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 10, .SizeAndLatencyCost: 12 } }, |
4215 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4216 | { .ISD: ISD::SMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4217 | { .ISD: ISD::SMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4218 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4219 | { .ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4220 | { .ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4221 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
4222 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 24, .CodeSizeCost: 13, .SizeAndLatencyCost: 19 } }, |
4223 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 9, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
4224 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 22, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } }, |
4225 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
4226 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
4227 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
4228 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, |
4229 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
4230 | { .ISD: ISD::UMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4231 | { .ISD: ISD::UMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4232 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
4233 | { .ISD: ISD::UMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4234 | { .ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4235 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 15, .SizeAndLatencyCost: 20 } }, |
4236 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 12, .SizeAndLatencyCost: 18 } }, |
4237 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4238 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 18, .SizeAndLatencyCost: 20 } }, |
4239 | }; |
// SSSE3CostTbl: costs for 128-bit integer vector ABS, BITREVERSE, BSWAP, CTLZ,
// CTPOP and CTTZ, keyed by (ISD opcode, MVT).
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// This table has no ABS row for v2i64 and no BSWAP row for v16i8.
// NOTE(review): the code that selects and searches this table is outside this
// excerpt; presumably it is consulted when the subtarget has SSSE3 - confirm.
// Keep the row order unchanged in case the lookup is first-match.
4240 | static const CostKindTblEntry SSSE3CostTbl[] = { |
4241 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4242 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4243 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4244 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
4245 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
4246 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
4247 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4248 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4249 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4250 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4251 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 28, .CodeSizeCost: 28, .SizeAndLatencyCost: 35 } }, |
4252 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 20, .CodeSizeCost: 22, .SizeAndLatencyCost: 28 } }, |
4253 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 17, .CodeSizeCost: 16, .SizeAndLatencyCost: 22 } }, |
4254 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 15, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4255 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 12, .SizeAndLatencyCost: 18 } }, |
4256 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 16, .SizeAndLatencyCost: 22 } }, |
4257 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 18, .CodeSizeCost: 14, .SizeAndLatencyCost: 20 } }, |
4258 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4259 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 25, .CodeSizeCost: 15, .SizeAndLatencyCost: 22 } }, |
4260 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 26, .CodeSizeCost: 19, .SizeAndLatencyCost: 25 } }, |
4261 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 20, .CodeSizeCost: 17, .SizeAndLatencyCost: 23 } }, |
4262 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16, .CodeSizeCost: 13, .SizeAndLatencyCost: 19 } } |
4263 | }; |
// SSE2CostTbl: costs for 128-bit integer vector operations (ABS, BITREVERSE,
// BSWAP, CTLZ, CTPOP, CTTZ, saturating add/sub, signed/unsigned min/max and
// multiply-with-overflow) plus f64/v2f64 FMAXNUM and FSQRT.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// The FMAXNUM rows also cover minnum: the intrinsic switch further down maps
// both Intrinsic::maxnum and Intrinsic::minnum to ISD::FMAXNUM.
// NOTE(review): the code that selects and searches this table is outside this
// excerpt; presumably it is consulted when the subtarget has SSE2 - confirm.
// Keep the row order unchanged in case the lookup is first-match.
4264 | static const CostKindTblEntry SSE2CostTbl[] = { |
4265 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4266 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
4267 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4268 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4269 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 32, .SizeAndLatencyCost: 32 } }, |
4270 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 30, .SizeAndLatencyCost: 30 } }, |
4271 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 25, .SizeAndLatencyCost: 25 } }, |
4272 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 21, .SizeAndLatencyCost: 21 } }, |
4273 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
4274 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
4275 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
4276 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 45, .CodeSizeCost: 36, .SizeAndLatencyCost: 38 } }, |
4277 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 45, .CodeSizeCost: 38, .SizeAndLatencyCost: 40 } }, |
4278 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 38, .CodeSizeCost: 32, .SizeAndLatencyCost: 34 } }, |
4279 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 39, .CodeSizeCost: 29, .SizeAndLatencyCost: 32 } }, |
4280 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 26, .CodeSizeCost: 16, .SizeAndLatencyCost: 18 } }, |
4281 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 29, .CodeSizeCost: 21, .SizeAndLatencyCost: 23 } }, |
4282 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 25, .CodeSizeCost: 18, .SizeAndLatencyCost: 20 } }, |
4283 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 21, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
4284 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 28, .CodeSizeCost: 19, .SizeAndLatencyCost: 21 } }, |
4285 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 31, .CodeSizeCost: 24, .SizeAndLatencyCost: 26 } }, |
4286 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 27, .CodeSizeCost: 21, .SizeAndLatencyCost: 23 } }, |
4287 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 23, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
4288 | { .ISD: ISD::SADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 14, .CodeSizeCost: 24, .SizeAndLatencyCost: 24 } }, |
4289 | { .ISD: ISD::SADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } }, |
4290 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4291 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4292 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4293 | { .ISD: ISD::SMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4294 | { .ISD: ISD::SMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4295 | { .ISD: ISD::SMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4296 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4297 | { .ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4298 | { .ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4299 | { .ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4300 | { .ISD: ISD::SMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 30, .LatencyCost: 33, .CodeSizeCost: 13, .SizeAndLatencyCost: 23 } }, |
4301 | { .ISD: ISD::SMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 24, .CodeSizeCost: 23, .SizeAndLatencyCost: 23 } }, |
4302 | { .ISD: ISD::SMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
4303 | { .ISD: ISD::SMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 23, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } }, |
4304 | { .ISD: ISD::SSUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19, .CodeSizeCost: 31, .SizeAndLatencyCost: 31 } }, |
4305 | { .ISD: ISD::SSUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 12, .SizeAndLatencyCost: 13 } }, |
4306 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4307 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4308 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
4309 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4310 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4311 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4312 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4313 | { .ISD: ISD::UMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
4314 | { .ISD: ISD::UMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4315 | { .ISD: ISD::UMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4316 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4317 | { .ISD: ISD::UMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
4318 | { .ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4319 | { .ISD: ISD::UMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4320 | { .ISD: ISD::UMULO, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 30, .LatencyCost: 33, .CodeSizeCost: 15, .SizeAndLatencyCost: 29 } }, |
4321 | { .ISD: ISD::UMULO, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
4322 | { .ISD: ISD::UMULO, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4323 | { .ISD: ISD::UMULO, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
4324 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
4325 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4326 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4327 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4328 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4329 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4330 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
4331 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
4332 | }; |
// SSE1CostTbl: costs for single-precision FMAXNUM and FSQRT on f32 and v4f32.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// The FMAXNUM rows also cover minnum: the intrinsic switch further down maps
// both Intrinsic::maxnum and Intrinsic::minnum to ISD::FMAXNUM.
// NOTE(review): the code that selects this table is outside this excerpt;
// presumably it is consulted when the subtarget has SSE1 - confirm.
4333 | static const CostKindTblEntry SSE1CostTbl[] = { |
4334 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4335 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4336 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
4337 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 56, .LatencyCost: 56, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
4338 | }; |
// BMI64CostTbl: cost of scalar CTTZ on i64 (all four cost kinds are 1).
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// NOTE(review): presumably used when the subtarget has BMI on a 64-bit
// target - inferred from the table name and trailing comment only; confirm.
4339 | static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets |
4340 | { .ISD: ISD::CTTZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4341 | }; |
// BMI32CostTbl: costs of scalar CTTZ on i32, i16 and i8. The i16 and i8 rows
// carry a reciprocal-throughput cost of 2 versus 1 for i32; the other three
// cost kinds are 1 for every row.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// NOTE(review): presumably used when the subtarget has BMI - inferred from the
// table name only; confirm against the lookup code.
4342 | static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets |
4343 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4344 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4345 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4346 | }; |
// LZCNT64CostTbl: cost of scalar CTLZ on i64 (all four cost kinds are 1).
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// NOTE(review): presumably used when the subtarget has LZCNT on a 64-bit
// target - inferred from the table name and trailing comment only; confirm.
4347 | static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets |
4348 | { .ISD: ISD::CTLZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4349 | }; |
// LZCNT32CostTbl: costs of scalar CTLZ on i32, i16 and i8. The i16 and i8 rows
// carry a reciprocal-throughput cost of 2 versus 1 for i32; the other three
// cost kinds are 1 for every row.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// NOTE(review): presumably used when the subtarget has LZCNT - inferred from
// the table name only; confirm against the lookup code.
4350 | static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets |
4351 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4352 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4353 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4354 | }; |
// POPCNT64CostTbl: cost of scalar CTPOP on i64, annotated as a single popcnt
// (all four cost kinds are 1).
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// NOTE(review): presumably used when the subtarget has POPCNT on a 64-bit
// target - inferred from the table name and trailing comment only; confirm.
4355 | static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets |
4356 | { .ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // popcnt |
4357 | }; |
// POPCNT32CostTbl: costs of scalar CTPOP on i32, i16 and i8. The i16 and i8
// rows are annotated popcnt(zext()) and carry CodeSizeCost and
// SizeAndLatencyCost of 2 instead of 1.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// NOTE(review): presumably used when the subtarget has POPCNT - inferred from
// the table name only; confirm against the lookup code.
4358 | static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets |
4359 | { .ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // popcnt |
4360 | { .ISD: ISD::CTPOP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // popcnt(zext()) |
4361 | { .ISD: ISD::CTPOP, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // popcnt(zext()) |
4362 | }; |
// X64CostTbl: scalar costs, mostly for i64, with extra i32/i16/i8 rows for
// CTLZ and CTTZ.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// Some ISD opcodes stand in for more than one intrinsic, per the intrinsic
// switch further down:
//   - FSHL: both fshl and fshr.
//   - ROTL / ROTR: fshl / fshr whose first two arguments are the same value.
//   - X86ISD::VROTLI: either of those rotates with a constant amount.
//   - SADDO / UADDO: also ssub_with_overflow / usub_with_overflow.
// NOTE(review): the code that selects and searches this table is outside this
// excerpt. Keep the row order unchanged in case the lookup is first-match.
4363 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
4364 | { .ISD: ISD::ABS, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+CMOV |
4365 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 12, .CodeSizeCost: 20, .SizeAndLatencyCost: 22 } }, |
4366 | { .ISD: ISD::BSWAP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4367 | { .ISD: ISD::CTLZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
4368 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
4369 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
4370 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 3 } }, // MOV+BSR+XOR |
4371 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i64,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // BSR+XOR |
4372 | { .ISD: ISD::CTTZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
4373 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
4374 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
4375 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // MOV+BSF |
4376 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i64,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
4377 | { .ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 6, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
4378 | { .ISD: ISD::ROTL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4379 | { .ISD: ISD::ROTR, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4380 | { .ISD: X86ISD::VROTLI, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4381 | { .ISD: ISD::FSHL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 4 } }, |
4382 | { .ISD: ISD::SADDSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
4383 | { .ISD: ISD::SSUBSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
4384 | { .ISD: ISD::UADDSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4385 | { .ISD: ISD::USUBSAT, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4386 | { .ISD: ISD::SMAX, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4387 | { .ISD: ISD::SMIN, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4388 | { .ISD: ISD::UMAX, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4389 | { .ISD: ISD::UMIN, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4390 | { .ISD: ISD::SADDO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4391 | { .ISD: ISD::UADDO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4392 | { .ISD: ISD::SMULO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4393 | { .ISD: ISD::UMULO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4394 | }; |
// X86CostTbl: scalar costs for i32, i16 and i8. BSWAP is the exception: it
// has rows for i32 and i16 only.
// Row layout: { ISD, Type, { RecipThroughputCost, LatencyCost, CodeSizeCost,
// SizeAndLatencyCost } }.
// Some ISD opcodes stand in for more than one intrinsic, per the intrinsic
// switch further down:
//   - FSHL: both fshl and fshr.
//   - ROTL / ROTR: fshl / fshr whose first two arguments are the same value.
//   - X86ISD::VROTLI: either of those rotates with a constant amount.
//   - SADDO / UADDO: also ssub_with_overflow / usub_with_overflow.
// NOTE(review): the code that selects and searches this table is outside this
// excerpt. Keep the row order unchanged in case the lookup is first-match.
4395 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
4396 | { .ISD: ISD::ABS, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA or SUB+CMOV |
4397 | { .ISD: ISD::ABS, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA or SUB+CMOV |
4398 | { .ISD: ISD::ABS, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA |
4399 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
4400 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
4401 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 9, .CodeSizeCost: 13, .SizeAndLatencyCost: 14 } }, |
4402 | { .ISD: ISD::BSWAP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4403 | { .ISD: ISD::BSWAP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // ROL |
4404 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // BSR+XOR or BSR+XOR+CMOV |
4405 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // BSR+XOR or BSR+XOR+CMOV |
4406 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // BSR+XOR or BSR+XOR+CMOV |
4407 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i32,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // BSR+XOR |
4408 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // BSR+XOR |
4409 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // BSR+XOR |
4410 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4411 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4412 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4413 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i32,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
4414 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
4415 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BSF |
4416 | { .ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 7, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4417 | { .ISD: ISD::CTPOP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 8, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
4418 | { .ISD: ISD::CTPOP, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4419 | { .ISD: ISD::ROTL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4420 | { .ISD: ISD::ROTL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4421 | { .ISD: ISD::ROTL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4422 | { .ISD: ISD::ROTR, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4423 | { .ISD: ISD::ROTR, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4424 | { .ISD: ISD::ROTR, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4425 | { .ISD: X86ISD::VROTLI, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4426 | { .ISD: X86ISD::VROTLI, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4427 | { .ISD: X86ISD::VROTLI, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4428 | { .ISD: ISD::FSHL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 4 } }, |
4429 | { .ISD: ISD::FSHL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 5 } }, |
4430 | { .ISD: ISD::FSHL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 5 } }, |
4431 | { .ISD: ISD::SADDSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, |
4432 | { .ISD: ISD::SADDSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
4433 | { .ISD: ISD::SADDSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
4434 | { .ISD: ISD::SSUBSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
4435 | { .ISD: ISD::SSUBSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } }, |
4436 | { .ISD: ISD::SSUBSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 11 } }, |
4437 | { .ISD: ISD::UADDSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4438 | { .ISD: ISD::UADDSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4439 | { .ISD: ISD::UADDSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
4440 | { .ISD: ISD::USUBSAT, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4441 | { .ISD: ISD::USUBSAT, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 7 } }, |
4442 | { .ISD: ISD::USUBSAT, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, |
4443 | { .ISD: ISD::SMAX, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4444 | { .ISD: ISD::SMAX, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4445 | { .ISD: ISD::SMAX, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4446 | { .ISD: ISD::SMIN, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4447 | { .ISD: ISD::SMIN, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4448 | { .ISD: ISD::SMIN, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4449 | { .ISD: ISD::UMAX, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4450 | { .ISD: ISD::UMAX, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4451 | { .ISD: ISD::UMAX, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4452 | { .ISD: ISD::UMIN, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4453 | { .ISD: ISD::UMIN, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4454 | { .ISD: ISD::UMIN, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4455 | { .ISD: ISD::SADDO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4456 | { .ISD: ISD::SADDO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4457 | { .ISD: ISD::SADDO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4458 | { .ISD: ISD::UADDO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4459 | { .ISD: ISD::UADDO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4460 | { .ISD: ISD::UADDO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4461 | { .ISD: ISD::SMULO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4462 | { .ISD: ISD::SMULO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4463 | { .ISD: ISD::SMULO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4464 | { .ISD: ISD::UMULO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 8 } }, |
4465 | { .ISD: ISD::UMULO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 9 } }, |
4466 | { .ISD: ISD::UMULO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, |
4467 | }; |
4468 | |
// Locals that the intrinsic switch below fills in.
// NOTE(review): ISD and OpTy are presumably the key for the cost-table lookups
// that follow the switch; that code is outside this excerpt - confirm.
4469 | Type *RetTy = ICA.getReturnType(); |
// OpTy starts out as the return type; the add/sub *_with_overflow cases in the
// switch replace it with RetTy's first contained type.
4470 | Type *OpTy = RetTy; |
4471 | Intrinsic::ID IID = ICA.getID(); |
// ISD stays ISD::DELETED_NODE for any intrinsic that falls through to the
// switch's default case.
4472 | unsigned ISD = ISD::DELETED_NODE; |
4473 | switch (IID) { |
4474 | default: |
4475 | break; |
4476 | case Intrinsic::abs: |
4477 | ISD = ISD::ABS; |
4478 | break; |
4479 | case Intrinsic::bitreverse: |
4480 | ISD = ISD::BITREVERSE; |
4481 | break; |
4482 | case Intrinsic::bswap: |
4483 | ISD = ISD::BSWAP; |
4484 | break; |
4485 | case Intrinsic::ctlz: |
4486 | ISD = ISD::CTLZ; |
4487 | break; |
4488 | case Intrinsic::ctpop: |
4489 | ISD = ISD::CTPOP; |
4490 | break; |
4491 | case Intrinsic::cttz: |
4492 | ISD = ISD::CTTZ; |
4493 | break; |
4494 | case Intrinsic::fshl: |
4495 | ISD = ISD::FSHL; |
4496 | if (!ICA.isTypeBasedOnly()) { |
4497 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
4498 | if (Args[0] == Args[1]) { |
4499 | ISD = ISD::ROTL; |
4500 | // Handle uniform constant rotation amounts. |
4501 | // TODO: Handle funnel-shift cases. |
4502 | const APInt *Amt; |
4503 | if (Args[2] && |
4504 | PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))) |
4505 | ISD = X86ISD::VROTLI; |
4506 | } |
4507 | } |
4508 | break; |
4509 | case Intrinsic::fshr: |
4510 | // FSHR has same costs so don't duplicate. |
4511 | ISD = ISD::FSHL; |
4512 | if (!ICA.isTypeBasedOnly()) { |
4513 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
4514 | if (Args[0] == Args[1]) { |
4515 | ISD = ISD::ROTR; |
4516 | // Handle uniform constant rotation amount. |
4517 | // TODO: Handle funnel-shift cases. |
4518 | const APInt *Amt; |
4519 | if (Args[2] && |
4520 | PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))) |
4521 | ISD = X86ISD::VROTLI; |
4522 | } |
4523 | } |
4524 | break; |
4525 | case Intrinsic::lrint: |
4526 | case Intrinsic::llrint: { |
4527 | // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which |
4528 | // have the same costs as the CVTTP2SI (fptosi) instructions |
4529 | const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes(); |
4530 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: RetTy, Src: ArgTys[0], |
4531 | CCH: TTI::CastContextHint::None, CostKind); |
4532 | } |
4533 | case Intrinsic::maxnum: |
4534 | case Intrinsic::minnum: |
4535 | // FMINNUM has same costs so don't duplicate. |
4536 | ISD = ISD::FMAXNUM; |
4537 | break; |
4538 | case Intrinsic::sadd_sat: |
4539 | ISD = ISD::SADDSAT; |
4540 | break; |
4541 | case Intrinsic::smax: |
4542 | ISD = ISD::SMAX; |
4543 | break; |
4544 | case Intrinsic::smin: |
4545 | ISD = ISD::SMIN; |
4546 | break; |
4547 | case Intrinsic::ssub_sat: |
4548 | ISD = ISD::SSUBSAT; |
4549 | break; |
4550 | case Intrinsic::uadd_sat: |
4551 | ISD = ISD::UADDSAT; |
4552 | break; |
4553 | case Intrinsic::umax: |
4554 | ISD = ISD::UMAX; |
4555 | break; |
4556 | case Intrinsic::umin: |
4557 | ISD = ISD::UMIN; |
4558 | break; |
4559 | case Intrinsic::usub_sat: |
4560 | ISD = ISD::USUBSAT; |
4561 | break; |
4562 | case Intrinsic::sqrt: |
4563 | ISD = ISD::FSQRT; |
4564 | break; |
4565 | case Intrinsic::sadd_with_overflow: |
4566 | case Intrinsic::ssub_with_overflow: |
4567 | // SSUBO has same costs so don't duplicate. |
4568 | ISD = ISD::SADDO; |
4569 | OpTy = RetTy->getContainedType(i: 0); |
4570 | break; |
4571 | case Intrinsic::uadd_with_overflow: |
4572 | case Intrinsic::usub_with_overflow: |
4573 | // USUBO has same costs so don't duplicate. |
4574 | ISD = ISD::UADDO; |
4575 | OpTy = RetTy->getContainedType(i: 0); |
4576 | break; |
4577 | case Intrinsic::smul_with_overflow: |
4578 | ISD = ISD::SMULO; |
4579 | OpTy = RetTy->getContainedType(i: 0); |
4580 | break; |
4581 | case Intrinsic::umul_with_overflow: |
4582 | ISD = ISD::UMULO; |
4583 | OpTy = RetTy->getContainedType(i: 0); |
4584 | break; |
4585 | } |
4586 | |
4587 | if (ISD != ISD::DELETED_NODE) { |
4588 | auto adjustTableCost = [&](int ISD, unsigned Cost, |
4589 | std::pair<InstructionCost, MVT> LT, |
4590 | FastMathFlags FMF) -> InstructionCost { |
4591 | InstructionCost LegalizationCost = LT.first; |
4592 | MVT MTy = LT.second; |
4593 | |
4594 | // If there are no NANs to deal with, then these are reduced to a |
4595 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we |
4596 | // assume is used in the non-fast case. |
4597 | if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { |
4598 | if (FMF.noNaNs()) |
4599 | return LegalizationCost * 1; |
4600 | } |
4601 | |
4602 | // For cases where some ops can be folded into a load/store, assume free. |
4603 | if (MTy.isScalarInteger()) { |
4604 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { |
4605 | if (const Instruction *II = ICA.getInst()) { |
4606 | if (II->hasOneUse() && isa<StoreInst>(Val: II->user_back())) |
4607 | return TTI::TCC_Free; |
4608 | if (auto *LI = dyn_cast<LoadInst>(Val: II->getOperand(i: 0))) { |
4609 | if (LI->hasOneUse()) |
4610 | return TTI::TCC_Free; |
4611 | } |
4612 | } |
4613 | } |
4614 | } |
4615 | |
4616 | return LegalizationCost * (int)Cost; |
4617 | }; |
4618 | |
4619 | // Legalize the type. |
4620 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: OpTy); |
4621 | MVT MTy = LT.second; |
4622 | |
4623 | // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. |
4624 | if (((ISD == ISD::CTTZ && !ST->hasBMI()) || |
4625 | (ISD == ISD::CTLZ && !ST->hasLZCNT())) && |
4626 | !MTy.isVector() && !ICA.isTypeBasedOnly()) { |
4627 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
4628 | if (auto *Cst = dyn_cast<ConstantInt>(Val: Args[1])) |
4629 | if (Cst->isAllOnesValue()) |
4630 | ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; |
4631 | } |
4632 | |
4633 | // FSQRT is a single instruction. |
4634 | if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) |
4635 | return LT.first; |
4636 | |
4637 | if (ST->useGLMDivSqrtCosts()) |
4638 | if (const auto *Entry = CostTableLookup(Table: GLMCostTbl, ISD, Ty: MTy)) |
4639 | if (auto KindCost = Entry->Cost[CostKind]) |
4640 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4641 | |
4642 | if (ST->useSLMArithCosts()) |
4643 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
4644 | if (auto KindCost = Entry->Cost[CostKind]) |
4645 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4646 | |
4647 | if (ST->hasVBMI2()) |
4648 | if (const auto *Entry = CostTableLookup(Table: AVX512VBMI2CostTbl, ISD, Ty: MTy)) |
4649 | if (auto KindCost = Entry->Cost[CostKind]) |
4650 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4651 | |
4652 | if (ST->hasBITALG()) |
4653 | if (const auto *Entry = CostTableLookup(Table: AVX512BITALGCostTbl, ISD, Ty: MTy)) |
4654 | if (auto KindCost = Entry->Cost[CostKind]) |
4655 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4656 | |
4657 | if (ST->hasVPOPCNTDQ()) |
4658 | if (const auto *Entry = CostTableLookup(Table: AVX512VPOPCNTDQCostTbl, ISD, Ty: MTy)) |
4659 | if (auto KindCost = Entry->Cost[CostKind]) |
4660 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4661 | |
4662 | if (ST->hasGFNI()) |
4663 | if (const auto *Entry = CostTableLookup(Table: GFNICostTbl, ISD, Ty: MTy)) |
4664 | if (auto KindCost = Entry->Cost[CostKind]) |
4665 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4666 | |
4667 | if (ST->hasCDI()) |
4668 | if (const auto *Entry = CostTableLookup(Table: AVX512CDCostTbl, ISD, Ty: MTy)) |
4669 | if (auto KindCost = Entry->Cost[CostKind]) |
4670 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4671 | |
4672 | if (ST->hasBWI()) |
4673 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
4674 | if (auto KindCost = Entry->Cost[CostKind]) |
4675 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4676 | |
4677 | if (ST->hasAVX512()) |
4678 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTbl, ISD, Ty: MTy)) |
4679 | if (auto KindCost = Entry->Cost[CostKind]) |
4680 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4681 | |
4682 | if (ST->hasXOP()) |
4683 | if (const auto *Entry = CostTableLookup(Table: XOPCostTbl, ISD, Ty: MTy)) |
4684 | if (auto KindCost = Entry->Cost[CostKind]) |
4685 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4686 | |
4687 | if (ST->hasAVX2()) |
4688 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTbl, ISD, Ty: MTy)) |
4689 | if (auto KindCost = Entry->Cost[CostKind]) |
4690 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4691 | |
4692 | if (ST->hasAVX()) |
4693 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
4694 | if (auto KindCost = Entry->Cost[CostKind]) |
4695 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4696 | |
4697 | if (ST->hasSSE42()) |
4698 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTbl, ISD, Ty: MTy)) |
4699 | if (auto KindCost = Entry->Cost[CostKind]) |
4700 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4701 | |
4702 | if (ST->hasSSE41()) |
4703 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
4704 | if (auto KindCost = Entry->Cost[CostKind]) |
4705 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4706 | |
4707 | if (ST->hasSSSE3()) |
4708 | if (const auto *Entry = CostTableLookup(Table: SSSE3CostTbl, ISD, Ty: MTy)) |
4709 | if (auto KindCost = Entry->Cost[CostKind]) |
4710 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4711 | |
4712 | if (ST->hasSSE2()) |
4713 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
4714 | if (auto KindCost = Entry->Cost[CostKind]) |
4715 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4716 | |
4717 | if (ST->hasSSE1()) |
4718 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTbl, ISD, Ty: MTy)) |
4719 | if (auto KindCost = Entry->Cost[CostKind]) |
4720 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4721 | |
4722 | if (ST->hasBMI()) { |
4723 | if (ST->is64Bit()) |
4724 | if (const auto *Entry = CostTableLookup(Table: BMI64CostTbl, ISD, Ty: MTy)) |
4725 | if (auto KindCost = Entry->Cost[CostKind]) |
4726 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4727 | |
4728 | if (const auto *Entry = CostTableLookup(Table: BMI32CostTbl, ISD, Ty: MTy)) |
4729 | if (auto KindCost = Entry->Cost[CostKind]) |
4730 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4731 | } |
4732 | |
4733 | if (ST->hasLZCNT()) { |
4734 | if (ST->is64Bit()) |
4735 | if (const auto *Entry = CostTableLookup(Table: LZCNT64CostTbl, ISD, Ty: MTy)) |
4736 | if (auto KindCost = Entry->Cost[CostKind]) |
4737 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4738 | |
4739 | if (const auto *Entry = CostTableLookup(Table: LZCNT32CostTbl, ISD, Ty: MTy)) |
4740 | if (auto KindCost = Entry->Cost[CostKind]) |
4741 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4742 | } |
4743 | |
4744 | if (ST->hasPOPCNT()) { |
4745 | if (ST->is64Bit()) |
4746 | if (const auto *Entry = CostTableLookup(Table: POPCNT64CostTbl, ISD, Ty: MTy)) |
4747 | if (auto KindCost = Entry->Cost[CostKind]) |
4748 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4749 | |
4750 | if (const auto *Entry = CostTableLookup(Table: POPCNT32CostTbl, ISD, Ty: MTy)) |
4751 | if (auto KindCost = Entry->Cost[CostKind]) |
4752 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4753 | } |
4754 | |
4755 | if (ST->is64Bit()) |
4756 | if (const auto *Entry = CostTableLookup(Table: X64CostTbl, ISD, Ty: MTy)) |
4757 | if (auto KindCost = Entry->Cost[CostKind]) |
4758 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4759 | |
4760 | if (const auto *Entry = CostTableLookup(Table: X86CostTbl, ISD, Ty: MTy)) |
4761 | if (auto KindCost = Entry->Cost[CostKind]) |
4762 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4763 | |
4764 | // Without arg data, we need to compute the expanded costs of custom lowered |
4765 | // intrinsics to prevent use of the (very low) default costs. |
4766 | if (ICA.isTypeBasedOnly() && |
4767 | (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) { |
4768 | Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1); |
4769 | InstructionCost Cost = 0; |
4770 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::Or, Ty: RetTy, CostKind); |
4771 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::Sub, Ty: RetTy, CostKind); |
4772 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::Shl, Ty: RetTy, CostKind); |
4773 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::LShr, Ty: RetTy, CostKind); |
4774 | Cost += getArithmeticInstrCost(Opcode: BinaryOperator::And, Ty: RetTy, CostKind); |
4775 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::ICmp, ValTy: RetTy, CondTy, |
4776 | VecPred: CmpInst::ICMP_EQ, CostKind); |
4777 | Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy, |
4778 | VecPred: CmpInst::ICMP_EQ, CostKind); |
4779 | return Cost; |
4780 | } |
4781 | } |
4782 | |
4783 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
4784 | } |
4785 | |
4786 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
4787 | TTI::TargetCostKind CostKind, |
4788 | unsigned Index, const Value *Op0, |
4789 | const Value *Op1) const { |
4790 | static const CostTblEntry SLMCostTbl[] = { |
4791 | { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i8, .Cost: 4 }, |
4792 | { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i16, .Cost: 4 }, |
4793 | { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i32, .Cost: 4 }, |
4794 | { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i64, .Cost: 7 } |
4795 | }; |
4796 | |
4797 | assert(Val->isVectorTy() && "This must be a vector type" ); |
4798 | Type *ScalarType = Val->getScalarType(); |
4799 | InstructionCost RegisterFileMoveCost = 0; |
4800 | |
4801 | // Non-immediate extraction/insertion can be handled as a sequence of |
4802 | // aliased loads+stores via the stack. |
4803 | if (Index == -1U && (Opcode == Instruction::ExtractElement || |
4804 | Opcode == Instruction::InsertElement)) { |
4805 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: |
4806 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. |
4807 | |
4808 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. |
4809 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected" ); |
4810 | Align VecAlign = DL.getPrefTypeAlign(Ty: Val); |
4811 | Align SclAlign = DL.getPrefTypeAlign(Ty: ScalarType); |
4812 | |
4813 | // Extract - store vector to stack, load scalar. |
4814 | if (Opcode == Instruction::ExtractElement) { |
4815 | return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) + |
4816 | getMemoryOpCost(Opcode: Instruction::Load, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0, |
4817 | CostKind); |
4818 | } |
4819 | // Insert - store vector to stack, store scalar, load vector. |
4820 | if (Opcode == Instruction::InsertElement) { |
4821 | return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) + |
4822 | getMemoryOpCost(Opcode: Instruction::Store, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0, |
4823 | CostKind) + |
4824 | getMemoryOpCost(Opcode: Instruction::Load, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind); |
4825 | } |
4826 | } |
4827 | |
4828 | if (Index != -1U && (Opcode == Instruction::ExtractElement || |
4829 | Opcode == Instruction::InsertElement)) { |
4830 | // Extraction of vXi1 elements are now efficiently handled by MOVMSK. |
4831 | if (Opcode == Instruction::ExtractElement && |
4832 | ScalarType->getScalarSizeInBits() == 1 && |
4833 | cast<FixedVectorType>(Val)->getNumElements() > 1) |
4834 | return 1; |
4835 | |
4836 | // Legalize the type. |
4837 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val); |
4838 | |
4839 | // This type is legalized to a scalar type. |
4840 | if (!LT.second.isVector()) |
4841 | return TTI::TCC_Free; |
4842 | |
4843 | // The type may be split. Normalize the index to the new type. |
4844 | unsigned SizeInBits = LT.second.getSizeInBits(); |
4845 | unsigned NumElts = LT.second.getVectorNumElements(); |
4846 | unsigned SubNumElts = NumElts; |
4847 | Index = Index % NumElts; |
4848 | |
4849 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. |
4850 | // For inserts, we also need to insert the subvector back. |
4851 | if (SizeInBits > 128) { |
4852 | assert((SizeInBits % 128) == 0 && "Illegal vector" ); |
4853 | unsigned NumSubVecs = SizeInBits / 128; |
4854 | SubNumElts = NumElts / NumSubVecs; |
4855 | if (SubNumElts <= Index) { |
4856 | RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); |
4857 | Index %= SubNumElts; |
4858 | } |
4859 | } |
4860 | |
4861 | MVT MScalarTy = LT.second.getScalarType(); |
4862 | auto IsCheapPInsrPExtrInsertPS = [&]() { |
4863 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. |
4864 | // Inserting f32 into index0 is just movss. |
4865 | // Also, assume insertps is relatively cheap on all >= SSE41 targets. |
4866 | return (MScalarTy == MVT::i16 && ST->hasSSE2()) || |
4867 | (MScalarTy.isInteger() && ST->hasSSE41()) || |
4868 | (MScalarTy == MVT::f32 && ST->hasSSE1() && Index == 0 && |
4869 | Opcode == Instruction::InsertElement) || |
4870 | (MScalarTy == MVT::f32 && ST->hasSSE41() && |
4871 | Opcode == Instruction::InsertElement); |
4872 | }; |
4873 | |
4874 | if (Index == 0) { |
4875 | // Floating point scalars are already located in index #0. |
4876 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume |
4877 | // true for all. |
4878 | if (ScalarType->isFloatingPointTy() && |
4879 | (Opcode != Instruction::InsertElement || !Op0 || |
4880 | isa<UndefValue>(Val: Op0))) |
4881 | return RegisterFileMoveCost; |
4882 | |
4883 | if (Opcode == Instruction::InsertElement && |
4884 | isa_and_nonnull<UndefValue>(Val: Op0)) { |
4885 | // Consider the gather cost to be cheap. |
4886 | if (isa_and_nonnull<LoadInst>(Val: Op1)) |
4887 | return RegisterFileMoveCost; |
4888 | if (!IsCheapPInsrPExtrInsertPS()) { |
4889 | // mov constant-to-GPR + movd/movq GPR -> XMM. |
4890 | if (isa_and_nonnull<Constant>(Val: Op1) && Op1->getType()->isIntegerTy()) |
4891 | return 2 + RegisterFileMoveCost; |
4892 | // Assume movd/movq GPR -> XMM is relatively cheap on all targets. |
4893 | return 1 + RegisterFileMoveCost; |
4894 | } |
4895 | } |
4896 | |
4897 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. |
4898 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) |
4899 | return 1 + RegisterFileMoveCost; |
4900 | } |
4901 | |
4902 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
4903 | assert(ISD && "Unexpected vector opcode" ); |
4904 | if (ST->useSLMArithCosts()) |
4905 | if (auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MScalarTy)) |
4906 | return Entry->Cost + RegisterFileMoveCost; |
4907 | |
4908 | // Consider cheap cases. |
4909 | if (IsCheapPInsrPExtrInsertPS()) |
4910 | return 1 + RegisterFileMoveCost; |
4911 | |
4912 | // For extractions we just need to shuffle the element to index 0, which |
4913 | // should be very cheap (assume cost = 1). For insertions we need to shuffle |
4914 | // the elements to its destination. In both cases we must handle the |
4915 | // subvector move(s). |
4916 | // If the vector type is already less than 128-bits then don't reduce it. |
4917 | // TODO: Under what circumstances should we shuffle using the full width? |
4918 | InstructionCost ShuffleCost = 1; |
4919 | if (Opcode == Instruction::InsertElement) { |
4920 | auto *SubTy = cast<VectorType>(Val); |
4921 | EVT VT = TLI->getValueType(DL, Ty: Val); |
4922 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) |
4923 | SubTy = FixedVectorType::get(ElementType: ScalarType, NumElts: SubNumElts); |
4924 | ShuffleCost = getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SubTy, SrcTy: SubTy, Mask: {}, |
4925 | CostKind, Index: 0, SubTp: SubTy); |
4926 | } |
4927 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; |
4928 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; |
4929 | } |
4930 | |
4931 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) + |
4932 | RegisterFileMoveCost; |
4933 | } |
4934 | |
4935 | InstructionCost X86TTIImpl::getScalarizationOverhead( |
4936 | VectorType *Ty, const APInt &DemandedElts, bool Insert, bool , |
4937 | TTI::TargetCostKind CostKind, bool ForPoisonSrc, |
4938 | ArrayRef<Value *> VL) const { |
4939 | assert(DemandedElts.getBitWidth() == |
4940 | cast<FixedVectorType>(Ty)->getNumElements() && |
4941 | "Vector size mismatch" ); |
4942 | |
4943 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
4944 | MVT MScalarTy = LT.second.getScalarType(); |
4945 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); |
4946 | InstructionCost Cost = 0; |
4947 | |
4948 | constexpr unsigned LaneBitWidth = 128; |
4949 | assert((LegalVectorBitWidth < LaneBitWidth || |
4950 | (LegalVectorBitWidth % LaneBitWidth) == 0) && |
4951 | "Illegal vector" ); |
4952 | |
4953 | const int NumLegalVectors = LT.first.getValue(); |
4954 | assert(NumLegalVectors >= 0 && "Negative cost!" ); |
4955 | |
4956 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much |
4957 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. SLPVectorizer has |
4958 | // a special heuristic regarding poison input which is passed here in |
4959 | // ForPoisonSrc. |
4960 | if (Insert && !ForPoisonSrc) { |
4961 | // This is nearly identical to BaseT::getScalarizationOverhead(), except |
4962 | // it is passing nullptr to getVectorInstrCost() for Op0 (instead of |
4963 | // Constant::getNullValue()), which makes the X86TTIImpl |
4964 | // getVectorInstrCost() return 0 instead of 1. |
4965 | for (unsigned I : seq(Size: DemandedElts.getBitWidth())) { |
4966 | if (!DemandedElts[I]) |
4967 | continue; |
4968 | Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: I, |
4969 | Op0: Constant::getNullValue(Ty), |
4970 | Op1: VL.empty() ? nullptr : VL[I]); |
4971 | } |
4972 | return Cost; |
4973 | } |
4974 | |
4975 | if (Insert) { |
4976 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || |
4977 | (MScalarTy.isInteger() && ST->hasSSE41()) || |
4978 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { |
4979 | // For types we can insert directly, insertion into 128-bit sub vectors is |
4980 | // cheap, followed by a cheap chain of concatenations. |
4981 | if (LegalVectorBitWidth <= LaneBitWidth) { |
4982 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, |
4983 | /*Extract*/ false, CostKind); |
4984 | } else { |
4985 | // In each 128-lane, if at least one index is demanded but not all |
4986 | // indices are demanded and this 128-lane is not the first 128-lane of |
4987 | // the legalized-vector, then this 128-lane needs a extracti128; If in |
4988 | // each 128-lane, there is at least one demanded index, this 128-lane |
4989 | // needs a inserti128. |
4990 | |
4991 | // The following cases will help you build a better understanding: |
4992 | // Assume we insert several elements into a v8i32 vector in avx2, |
4993 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. |
4994 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + |
4995 | // inserti128. |
4996 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. |
4997 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector" ); |
4998 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
4999 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
5000 | unsigned NumLegalElts = |
5001 | LT.second.getVectorNumElements() * NumLegalVectors; |
5002 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
5003 | "Vector has been legalized to smaller element count" ); |
5004 | assert((NumLegalElts % NumLanesTotal) == 0 && |
5005 | "Unexpected elts per lane" ); |
5006 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
5007 | |
5008 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
5009 | auto *LaneTy = |
5010 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
5011 | |
5012 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
5013 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
5014 | numBits: NumEltsPerLane, bitPosition: NumEltsPerLane * I); |
5015 | if (LaneEltMask.isZero()) |
5016 | continue; |
5017 | // FIXME: we don't need to extract if all non-demanded elements |
5018 | // are legalization-inserted padding. |
5019 | if (!LaneEltMask.isAllOnes()) |
5020 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
5021 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
5022 | Cost += BaseT::getScalarizationOverhead(InTy: LaneTy, DemandedElts: LaneEltMask, Insert, |
5023 | /*Extract*/ false, CostKind); |
5024 | } |
5025 | |
5026 | APInt AffectedLanes = |
5027 | APIntOps::ScaleBitMask(A: WidenedDemandedElts, NewBitWidth: NumLanesTotal); |
5028 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( |
5029 | A: AffectedLanes, NewBitWidth: NumLegalVectors, /*MatchAllBits=*/true); |
5030 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { |
5031 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { |
5032 | unsigned I = NumLegalLanes * LegalVec + Lane; |
5033 | // No need to insert unaffected lane; or lane 0 of each legal vector |
5034 | // iff ALL lanes of that vector were affected and will be inserted. |
5035 | if (!AffectedLanes[I] || |
5036 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) |
5037 | continue; |
5038 | Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
5039 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
5040 | } |
5041 | } |
5042 | } |
5043 | } else if (LT.second.isVector()) { |
5044 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded |
5045 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a |
5046 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be |
5047 | // considered cheap. |
5048 | if (Ty->isIntOrIntVectorTy()) |
5049 | Cost += DemandedElts.popcount(); |
5050 | |
5051 | // Get the smaller of the legalized or original pow2-extended number of |
5052 | // vector elements, which represents the number of unpacks we'll end up |
5053 | // performing. |
5054 | unsigned NumElts = LT.second.getVectorNumElements(); |
5055 | unsigned Pow2Elts = |
5056 | PowerOf2Ceil(A: cast<FixedVectorType>(Val: Ty)->getNumElements()); |
5057 | Cost += (std::min<unsigned>(a: NumElts, b: Pow2Elts) - 1) * LT.first; |
5058 | } |
5059 | } |
5060 | |
5061 | if (Extract) { |
5062 | // vXi1 can be efficiently extracted with MOVMSK. |
5063 | // TODO: AVX512 predicate mask handling. |
5064 | // NOTE: This doesn't work well for roundtrip scalarization. |
5065 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { |
5066 | unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements(); |
5067 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; |
5068 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; |
5069 | return MOVMSKCost; |
5070 | } |
5071 | |
5072 | if (LT.second.isVector()) { |
5073 | unsigned NumLegalElts = |
5074 | LT.second.getVectorNumElements() * NumLegalVectors; |
5075 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
5076 | "Vector has been legalized to smaller element count" ); |
5077 | |
5078 | // If we're extracting elements from a 128-bit subvector lane, |
5079 | // we only need to extract each lane once, not for every element. |
5080 | if (LegalVectorBitWidth > LaneBitWidth) { |
5081 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
5082 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
5083 | assert((NumLegalElts % NumLanesTotal) == 0 && |
5084 | "Unexpected elts per lane" ); |
5085 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
5086 | |
5087 | // Add cost for each demanded 128-bit subvector extraction. |
5088 | // Luckily this is a lot easier than for insertion. |
5089 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
5090 | auto *LaneTy = |
5091 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
5092 | |
5093 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
5094 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
5095 | numBits: NumEltsPerLane, bitPosition: I * NumEltsPerLane); |
5096 | if (LaneEltMask.isZero()) |
5097 | continue; |
5098 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, CostKind, |
5099 | Index: I * NumEltsPerLane, SubTp: LaneTy); |
5100 | Cost += BaseT::getScalarizationOverhead( |
5101 | InTy: LaneTy, DemandedElts: LaneEltMask, /*Insert*/ false, Extract, CostKind); |
5102 | } |
5103 | |
5104 | return Cost; |
5105 | } |
5106 | } |
5107 | |
5108 | // Fallback to default extraction. |
5109 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, /*Insert*/ false, |
5110 | Extract, CostKind); |
5111 | } |
5112 | |
5113 | return Cost; |
5114 | } |
5115 | |
5116 | InstructionCost |
5117 | X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, |
5118 | int VF, const APInt &DemandedDstElts, |
5119 | TTI::TargetCostKind CostKind) const { |
5120 | const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: EltTy); |
5121 | // We don't differentiate element types here, only element bit width. |
5122 | EltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: EltTyBits); |
5123 | |
5124 | auto bailout = [&]() { |
5125 | return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, |
5126 | DemandedDstElts, CostKind); |
5127 | }; |
5128 | |
5129 | // For now, only deal with AVX512 cases. |
5130 | if (!ST->hasAVX512()) |
5131 | return bailout(); |
5132 | |
5133 | // Do we have a native shuffle for this element type, or should we promote? |
5134 | unsigned PromEltTyBits = EltTyBits; |
5135 | switch (EltTyBits) { |
5136 | case 32: |
5137 | case 64: |
5138 | break; // AVX512F. |
5139 | case 16: |
5140 | if (!ST->hasBWI()) |
5141 | PromEltTyBits = 32; // promote to i32, AVX512F. |
5142 | break; // AVX512BW |
5143 | case 8: |
5144 | if (!ST->hasVBMI()) |
5145 | PromEltTyBits = 32; // promote to i32, AVX512F. |
5146 | break; // AVX512VBMI |
5147 | case 1: |
5148 | // There is no support for shuffling i1 elements. We *must* promote. |
5149 | if (ST->hasBWI()) { |
5150 | if (ST->hasVBMI()) |
5151 | PromEltTyBits = 8; // promote to i8, AVX512VBMI. |
5152 | else |
5153 | PromEltTyBits = 16; // promote to i16, AVX512BW. |
5154 | break; |
5155 | } |
5156 | PromEltTyBits = 32; // promote to i32, AVX512F. |
5157 | break; |
5158 | default: |
5159 | return bailout(); |
5160 | } |
5161 | auto *PromEltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: PromEltTyBits); |
5162 | |
5163 | auto *SrcVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: VF); |
5164 | auto *PromSrcVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: VF); |
5165 | |
5166 | int NumDstElements = VF * ReplicationFactor; |
5167 | auto *PromDstVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: NumDstElements); |
5168 | auto *DstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumDstElements); |
5169 | |
5170 | // Legalize the types. |
5171 | MVT LegalSrcVecTy = getTypeLegalizationCost(Ty: SrcVecTy).second; |
5172 | MVT LegalPromSrcVecTy = getTypeLegalizationCost(Ty: PromSrcVecTy).second; |
5173 | MVT LegalPromDstVecTy = getTypeLegalizationCost(Ty: PromDstVecTy).second; |
5174 | MVT LegalDstVecTy = getTypeLegalizationCost(Ty: DstVecTy).second; |
5175 | // They should have legalized into vector types. |
5176 | if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || |
5177 | !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) |
5178 | return bailout(); |
5179 | |
5180 | if (PromEltTyBits != EltTyBits) { |
5181 | // If we have to perform the shuffle with wider elt type than our data type, |
5182 | // then we will first need to anyext (we don't care about the new bits) |
5183 | // the source elements, and then truncate Dst elements. |
5184 | InstructionCost PromotionCost; |
5185 | PromotionCost += getCastInstrCost( |
5186 | Opcode: Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, |
5187 | CCH: TargetTransformInfo::CastContextHint::None, CostKind); |
5188 | PromotionCost += |
5189 | getCastInstrCost(Opcode: Instruction::Trunc, /*Dst=*/DstVecTy, |
5190 | /*Src=*/PromDstVecTy, |
5191 | CCH: TargetTransformInfo::CastContextHint::None, CostKind); |
5192 | return PromotionCost + getReplicationShuffleCost(EltTy: PromEltTy, |
5193 | ReplicationFactor, VF, |
5194 | DemandedDstElts, CostKind); |
5195 | } |
5196 | |
5197 | assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && |
5198 | LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && |
5199 | "We expect that the legalization doesn't affect the element width, " |
5200 | "doesn't coalesce/split elements." ); |
5201 | |
5202 | unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); |
5203 | unsigned NumDstVectors = |
5204 | divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: NumEltsPerDstVec); |
5205 | |
5206 | auto *SingleDstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltsPerDstVec); |
5207 | |
5208 | // Not all the produced Dst elements may be demanded. In our case, |
5209 | // given that a single Dst vector is formed by a single shuffle, |
5210 | // if all elements that will form a single Dst vector aren't demanded, |
5211 | // then we won't need to do that shuffle, so adjust the cost accordingly. |
5212 | APInt DemandedDstVectors = APIntOps::ScaleBitMask( |
5213 | A: DemandedDstElts.zext(width: NumDstVectors * NumEltsPerDstVec), NewBitWidth: NumDstVectors); |
5214 | unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount(); |
5215 | |
5216 | InstructionCost SingleShuffleCost = |
5217 | getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: SingleDstVecTy, SrcTy: SingleDstVecTy, |
5218 | /*Mask=*/{}, CostKind, |
5219 | /*Index=*/0, /*SubTp=*/nullptr); |
5220 | return NumDstVectorsDemanded * SingleShuffleCost; |
5221 | } |
5222 | |
5223 | InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, |
5224 | Align Alignment, |
5225 | unsigned AddressSpace, |
5226 | TTI::TargetCostKind CostKind, |
5227 | TTI::OperandValueInfo OpInfo, |
5228 | const Instruction *I) const { |
5229 | // TODO: Handle other cost kinds. |
5230 | if (CostKind != TTI::TCK_RecipThroughput) { |
5231 | if (auto *SI = dyn_cast_or_null<StoreInst>(Val: I)) { |
5232 | // Store instruction with index and scale costs 2 Uops. |
5233 | // Check the preceding GEP to identify non-const indices. |
5234 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: SI->getPointerOperand())) { |
5235 | if (!all_of(Range: GEP->indices(), P: [](Value *V) { return isa<Constant>(Val: V); })) |
5236 | return TTI::TCC_Basic * 2; |
5237 | } |
5238 | } |
5239 | return TTI::TCC_Basic; |
5240 | } |
5241 | |
5242 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
5243 | "Invalid Opcode" ); |
5244 | // Type legalization can't handle structs |
5245 | if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other) |
5246 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
5247 | CostKind, OpInfo, I); |
5248 | |
5249 | // Legalize the type. |
5250 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src); |
5251 | |
5252 | auto *VTy = dyn_cast<FixedVectorType>(Val: Src); |
5253 | |
5254 | InstructionCost Cost = 0; |
5255 | |
5256 | // Add a cost for constant load to vector. |
5257 | if (Opcode == Instruction::Store && OpInfo.isConstant()) |
5258 | Cost += getMemoryOpCost(Opcode: Instruction::Load, Src, Alignment: DL.getABITypeAlign(Ty: Src), |
5259 | /*AddressSpace=*/0, CostKind, OpInfo); |
5260 | |
5261 | // Handle the simple case of non-vectors. |
5262 | // NOTE: this assumes that legalization never creates vector from scalars! |
5263 | if (!VTy || !LT.second.isVector()) { |
5264 | // Each load/store unit costs 1. |
5265 | return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1; |
5266 | } |
5267 | |
5268 | bool IsLoad = Opcode == Instruction::Load; |
5269 | |
5270 | Type *EltTy = VTy->getElementType(); |
5271 | |
5272 | const int EltTyBits = DL.getTypeSizeInBits(Ty: EltTy); |
5273 | |
5274 | // Source of truth: how many elements were there in the original IR vector? |
5275 | const unsigned SrcNumElt = VTy->getNumElements(); |
5276 | |
5277 | // How far have we gotten? |
5278 | int NumEltRemaining = SrcNumElt; |
5279 | // Note that we intentionally capture by-reference, NumEltRemaining changes. |
5280 | auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; |
5281 | |
5282 | const int MaxLegalOpSizeBytes = divideCeil(Numerator: LT.second.getSizeInBits(), Denominator: 8); |
5283 | |
5284 | // Note that even if we can store 64 bits of an XMM, we still operate on XMM. |
5285 | const unsigned XMMBits = 128; |
5286 | if (XMMBits % EltTyBits != 0) |
5287 | // Vector size must be a multiple of the element size. I.e. no padding. |
5288 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
5289 | CostKind, OpInfo, I); |
5290 | const int NumEltPerXMM = XMMBits / EltTyBits; |
5291 | |
5292 | auto *XMMVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltPerXMM); |
5293 | |
5294 | for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; |
5295 | NumEltRemaining > 0; CurrOpSizeBytes /= 2) { |
5296 | // How many elements would a single op deal with at once? |
5297 | if ((8 * CurrOpSizeBytes) % EltTyBits != 0) |
5298 | // Vector size must be a multiple of the element size. I.e. no padding. |
5299 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
5300 | CostKind, OpInfo, I); |
5301 | int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; |
5302 | |
5303 | assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?" ); |
5304 | assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || |
5305 | (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && |
5306 | "Unless we haven't halved the op size yet, " |
5307 | "we have less than two op's sized units of work left." ); |
5308 | |
5309 | auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM |
5310 | ? FixedVectorType::get(ElementType: EltTy, NumElts: CurrNumEltPerOp) |
5311 | : XMMVecTy; |
5312 | |
5313 | assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && |
5314 | "After halving sizes, the vector elt count is no longer a multiple " |
5315 | "of number of elements per operation?" ); |
5316 | auto *CoalescedVecTy = |
5317 | CurrNumEltPerOp == 1 |
5318 | ? CurrVecTy |
5319 | : FixedVectorType::get( |
5320 | ElementType: IntegerType::get(C&: Src->getContext(), |
5321 | NumBits: EltTyBits * CurrNumEltPerOp), |
5322 | NumElts: CurrVecTy->getNumElements() / CurrNumEltPerOp); |
5323 | assert(DL.getTypeSizeInBits(CoalescedVecTy) == |
5324 | DL.getTypeSizeInBits(CurrVecTy) && |
5325 | "coalesciing elements doesn't change vector width." ); |
5326 | |
5327 | while (NumEltRemaining > 0) { |
5328 | assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?" ); |
5329 | |
5330 | // Can we use this vector size, as per the remaining element count? |
5331 | // Iff the vector is naturally aligned, we can do a wide load regardless. |
5332 | if (NumEltRemaining < CurrNumEltPerOp && |
5333 | (!IsLoad || Alignment < CurrOpSizeBytes) && CurrOpSizeBytes != 1) |
5334 | break; // Try smalled vector size. |
5335 | |
5336 | // This isn't exactly right. We're using slow unaligned 32-byte accesses |
5337 | // as a proxy for a double-pumped AVX memory interface such as on |
5338 | // Sandybridge. |
5339 | // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or |
5340 | // will be scalarized. |
5341 | if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) |
5342 | Cost += 2; |
5343 | else if (CurrOpSizeBytes < 4) |
5344 | Cost += 2; |
5345 | else |
5346 | Cost += 1; |
5347 | |
5348 | // If we're loading a uniform value, then we don't need to split the load, |
5349 | // loading just a single (widest) vector can be reused by all splits. |
5350 | if (IsLoad && OpInfo.isUniform()) |
5351 | return Cost; |
5352 | |
5353 | bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; |
5354 | |
5355 | // If we have fully processed the previous reg, we need to replenish it. |
5356 | if (SubVecEltsLeft == 0) { |
5357 | SubVecEltsLeft += CurrVecTy->getNumElements(); |
5358 | // And that's free only for the 0'th subvector of a legalized vector. |
5359 | if (!Is0thSubVec) |
5360 | Cost += |
5361 | getShuffleCost(Kind: IsLoad ? TTI::ShuffleKind::SK_InsertSubvector |
5362 | : TTI::ShuffleKind::SK_ExtractSubvector, |
5363 | DstTy: VTy, SrcTy: VTy, Mask: {}, CostKind, Index: NumEltDone(), SubTp: CurrVecTy); |
5364 | } |
5365 | |
5366 | // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, |
5367 | // for smaller widths (32/16/8) we have to insert/extract them separately. |
5368 | // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, |
5369 | // but let's pretend that it is also true for 16/8 bit wide ops...) |
5370 | if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { |
5371 | int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; |
5372 | assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "" ); |
5373 | int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; |
5374 | APInt DemandedElts = |
5375 | APInt::getBitsSet(numBits: CoalescedVecTy->getNumElements(), |
5376 | loBit: CoalescedVecEltIdx, hiBit: CoalescedVecEltIdx + 1); |
5377 | assert(DemandedElts.popcount() == 1 && "Inserting single value" ); |
5378 | Cost += getScalarizationOverhead(Ty: CoalescedVecTy, DemandedElts, Insert: IsLoad, |
5379 | Extract: !IsLoad, CostKind); |
5380 | } |
5381 | |
5382 | SubVecEltsLeft -= CurrNumEltPerOp; |
5383 | NumEltRemaining -= CurrNumEltPerOp; |
5384 | Alignment = commonAlignment(A: Alignment, Offset: CurrOpSizeBytes); |
5385 | } |
5386 | } |
5387 | |
5388 | assert(NumEltRemaining <= 0 && "Should have processed all the elements." ); |
5389 | |
5390 | return Cost; |
5391 | } |
5392 | |
5393 | InstructionCost |
5394 | X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, |
5395 | unsigned AddressSpace, |
5396 | TTI::TargetCostKind CostKind) const { |
5397 | bool IsLoad = (Instruction::Load == Opcode); |
5398 | bool IsStore = (Instruction::Store == Opcode); |
5399 | |
5400 | auto *SrcVTy = dyn_cast<FixedVectorType>(Val: SrcTy); |
5401 | if (!SrcVTy) |
5402 | // To calculate scalar take the regular cost, without mask |
5403 | return getMemoryOpCost(Opcode, Src: SrcTy, Alignment, AddressSpace, CostKind); |
5404 | |
5405 | unsigned NumElem = SrcVTy->getNumElements(); |
5406 | auto *MaskTy = |
5407 | FixedVectorType::get(ElementType: Type::getInt8Ty(C&: SrcVTy->getContext()), NumElts: NumElem); |
5408 | if ((IsLoad && !isLegalMaskedLoad(DataType: SrcVTy, Alignment, AddressSpace)) || |
5409 | (IsStore && !isLegalMaskedStore(DataType: SrcVTy, Alignment, AddressSpace))) { |
5410 | // Scalarization |
5411 | APInt DemandedElts = APInt::getAllOnes(numBits: NumElem); |
5412 | InstructionCost MaskSplitCost = getScalarizationOverhead( |
5413 | Ty: MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind); |
5414 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( |
5415 | Opcode: Instruction::ICmp, ValTy: Type::getInt8Ty(C&: SrcVTy->getContext()), CondTy: nullptr, |
5416 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
5417 | InstructionCost BranchCost = getCFInstrCost(Opcode: Instruction::Br, CostKind); |
5418 | InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); |
5419 | InstructionCost ValueSplitCost = getScalarizationOverhead( |
5420 | Ty: SrcVTy, DemandedElts, Insert: IsLoad, Extract: IsStore, CostKind); |
5421 | InstructionCost MemopCost = |
5422 | NumElem * BaseT::getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(), |
5423 | Alignment, AddressSpace, CostKind); |
5424 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; |
5425 | } |
5426 | |
5427 | // Legalize the type. |
5428 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcVTy); |
5429 | auto VT = TLI->getValueType(DL, Ty: SrcVTy); |
5430 | InstructionCost Cost = 0; |
5431 | MVT Ty = LT.second; |
5432 | if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64) |
5433 | // APX masked load/store for scalar is cheap. |
5434 | return Cost + LT.first; |
5435 | |
5436 | if (VT.isSimple() && Ty != VT.getSimpleVT() && |
5437 | LT.second.getVectorNumElements() == NumElem) |
5438 | // Promotion requires extend/truncate for data and a shuffle for mask. |
5439 | Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SrcVTy, SrcTy: SrcVTy, Mask: {}, CostKind, |
5440 | Index: 0, SubTp: nullptr) + |
5441 | getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: MaskTy, SrcTy: MaskTy, Mask: {}, CostKind, |
5442 | Index: 0, SubTp: nullptr); |
5443 | |
5444 | else if (LT.first * Ty.getVectorNumElements() > NumElem) { |
5445 | auto *NewMaskTy = FixedVectorType::get(ElementType: MaskTy->getElementType(), |
5446 | NumElts: (unsigned)LT.first.getValue() * |
5447 | Ty.getVectorNumElements()); |
5448 | // Expanding requires fill mask with zeroes |
5449 | Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy: NewMaskTy, SrcTy: NewMaskTy, Mask: {}, |
5450 | CostKind, Index: 0, SubTp: MaskTy); |
5451 | } |
5452 | |
5453 | // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. |
5454 | if (!ST->hasAVX512()) |
5455 | return Cost + LT.first * (IsLoad ? 2 : 8); |
5456 | |
5457 | // AVX-512 masked load/store is cheaper |
5458 | return Cost + LT.first; |
5459 | } |
5460 | |
5461 | InstructionCost X86TTIImpl::getPointersChainCost( |
5462 | ArrayRef<const Value *> Ptrs, const Value *Base, |
5463 | const TTI::PointersChainInfo &Info, Type *AccessTy, |
5464 | TTI::TargetCostKind CostKind) const { |
5465 | if (Info.isSameBase() && Info.isKnownStride()) { |
5466 | // If all the pointers have known stride all the differences are translated |
5467 | // into constants. X86 memory addressing allows encoding it into |
5468 | // displacement. So we just need to take the base GEP cost. |
5469 | if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Val: Base)) { |
5470 | SmallVector<const Value *> Indices(BaseGEP->indices()); |
5471 | return getGEPCost(PointeeType: BaseGEP->getSourceElementType(), |
5472 | Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: nullptr, |
5473 | CostKind); |
5474 | } |
5475 | return TTI::TCC_Free; |
5476 | } |
5477 | return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); |
5478 | } |
5479 | |
5480 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, |
5481 | ScalarEvolution *SE, |
5482 | const SCEV *Ptr) const { |
5483 | // Address computations in vectorized code with non-consecutive addresses will |
5484 | // likely result in more instructions compared to scalar code where the |
5485 | // computation can more often be merged into the index mode. The resulting |
5486 | // extra micro-ops can significantly decrease throughput. |
5487 | const unsigned NumVectorInstToHideOverhead = 10; |
5488 | |
5489 | // Cost modeling of Strided Access Computation is hidden by the indexing |
5490 | // modes of X86 regardless of the stride value. We dont believe that there |
5491 | // is a difference between constant strided access in gerenal and constant |
5492 | // strided value which is less than or equal to 64. |
5493 | // Even in the case of (loop invariant) stride whose value is not known at |
5494 | // compile time, the address computation will not incur more than one extra |
5495 | // ADD instruction. |
5496 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { |
5497 | // TODO: AVX2 is the current cut-off because we don't have correct |
5498 | // interleaving costs for prior ISA's. |
5499 | if (!BaseT::isStridedAccess(Ptr)) |
5500 | return NumVectorInstToHideOverhead; |
5501 | if (!BaseT::getConstantStrideStep(SE, Ptr)) |
5502 | return 1; |
5503 | } |
5504 | |
5505 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); |
5506 | } |
5507 | |
5508 | InstructionCost |
5509 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
5510 | std::optional<FastMathFlags> FMF, |
5511 | TTI::TargetCostKind CostKind) const { |
5512 | if (TTI::requiresOrderedReduction(FMF)) |
5513 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
5514 | |
5515 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
5516 | // and make it as the cost. |
5517 | |
5518 | static const CostTblEntry SLMCostTbl[] = { |
5519 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: 3 }, |
5520 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 5 }, |
5521 | }; |
5522 | |
5523 | static const CostTblEntry SSE2CostTbl[] = { |
5524 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: 2 }, |
5525 | { .ISD: ISD::FADD, .Type: MVT::v2f32, .Cost: 2 }, |
5526 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: 4 }, |
5527 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2 }, // The data reported by the IACA tool is "1.6". |
5528 | { .ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: 2 }, // FIXME: chosen to be less than v4i32 |
5529 | { .ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 3 }, // The data reported by the IACA tool is "3.3". |
5530 | { .ISD: ISD::ADD, .Type: MVT::v2i16, .Cost: 2 }, // The data reported by the IACA tool is "4.3". |
5531 | { .ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 3 }, // The data reported by the IACA tool is "4.3". |
5532 | { .ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 4 }, // The data reported by the IACA tool is "4.3". |
5533 | { .ISD: ISD::ADD, .Type: MVT::v2i8, .Cost: 2 }, |
5534 | { .ISD: ISD::ADD, .Type: MVT::v4i8, .Cost: 2 }, |
5535 | { .ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2 }, |
5536 | { .ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 3 }, |
5537 | }; |
5538 | |
5539 | static const CostTblEntry AVX1CostTbl[] = { |
5540 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: 3 }, |
5541 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: 3 }, |
5542 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: 4 }, |
5543 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 1 }, // The data reported by the IACA tool is "1.5". |
5544 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: 3 }, |
5545 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: 5 }, |
5546 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: 5 }, |
5547 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: 4 }, |
5548 | }; |
5549 | |
5550 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
5551 | assert(ISD && "Invalid opcode" ); |
5552 | |
5553 | // Before legalizing the type, give a chance to look up illegal narrow types |
5554 | // in the table. |
5555 | // FIXME: Is there a better way to do this? |
5556 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
5557 | if (VT.isSimple()) { |
5558 | MVT MTy = VT.getSimpleVT(); |
5559 | if (ST->useSLMArithCosts()) |
5560 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
5561 | return Entry->Cost; |
5562 | |
5563 | if (ST->hasAVX()) |
5564 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5565 | return Entry->Cost; |
5566 | |
5567 | if (ST->hasSSE2()) |
5568 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5569 | return Entry->Cost; |
5570 | } |
5571 | |
5572 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5573 | |
5574 | MVT MTy = LT.second; |
5575 | |
5576 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
5577 | |
5578 | // Special case: vXi8 mul reductions are performed as vXi16. |
5579 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { |
5580 | auto *WideSclTy = IntegerType::get(C&: ValVTy->getContext(), NumBits: 16); |
5581 | auto *WideVecTy = FixedVectorType::get(ElementType: WideSclTy, NumElts: ValVTy->getNumElements()); |
5582 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: ValTy, |
5583 | CCH: TargetTransformInfo::CastContextHint::None, |
5584 | CostKind) + |
5585 | getArithmeticReductionCost(Opcode, ValTy: WideVecTy, FMF, CostKind); |
5586 | } |
5587 | |
5588 | InstructionCost ArithmeticCost = 0; |
5589 | if (LT.first != 1 && MTy.isVector() && |
5590 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5591 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
5592 | auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5593 | NumElts: MTy.getVectorNumElements()); |
5594 | ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind); |
5595 | ArithmeticCost *= LT.first - 1; |
5596 | } |
5597 | |
5598 | if (ST->useSLMArithCosts()) |
5599 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
5600 | return ArithmeticCost + Entry->Cost; |
5601 | |
5602 | if (ST->hasAVX()) |
5603 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5604 | return ArithmeticCost + Entry->Cost; |
5605 | |
5606 | if (ST->hasSSE2()) |
5607 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5608 | return ArithmeticCost + Entry->Cost; |
5609 | |
5610 | // FIXME: These assume a naive kshift+binop lowering, which is probably |
5611 | // conservative in most cases. |
5612 | static const CostTblEntry AVX512BoolReduction[] = { |
5613 | { .ISD: ISD::AND, .Type: MVT::v2i1, .Cost: 3 }, |
5614 | { .ISD: ISD::AND, .Type: MVT::v4i1, .Cost: 5 }, |
5615 | { .ISD: ISD::AND, .Type: MVT::v8i1, .Cost: 7 }, |
5616 | { .ISD: ISD::AND, .Type: MVT::v16i1, .Cost: 9 }, |
5617 | { .ISD: ISD::AND, .Type: MVT::v32i1, .Cost: 11 }, |
5618 | { .ISD: ISD::AND, .Type: MVT::v64i1, .Cost: 13 }, |
5619 | { .ISD: ISD::OR, .Type: MVT::v2i1, .Cost: 3 }, |
5620 | { .ISD: ISD::OR, .Type: MVT::v4i1, .Cost: 5 }, |
5621 | { .ISD: ISD::OR, .Type: MVT::v8i1, .Cost: 7 }, |
5622 | { .ISD: ISD::OR, .Type: MVT::v16i1, .Cost: 9 }, |
5623 | { .ISD: ISD::OR, .Type: MVT::v32i1, .Cost: 11 }, |
5624 | { .ISD: ISD::OR, .Type: MVT::v64i1, .Cost: 13 }, |
5625 | }; |
5626 | |
5627 | static const CostTblEntry AVX2BoolReduction[] = { |
5628 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: 2 }, // vpmovmskb + cmp |
5629 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: 2 }, // vpmovmskb + cmp |
5630 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: 2 }, // vpmovmskb + cmp |
5631 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: 2 }, // vpmovmskb + cmp |
5632 | }; |
5633 | |
5634 | static const CostTblEntry AVX1BoolReduction[] = { |
5635 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: 2 }, // vmovmskpd + cmp |
5636 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: 2 }, // vmovmskps + cmp |
5637 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: 4 }, // vextractf128 + vpand + vpmovmskb + cmp |
5638 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: 4 }, // vextractf128 + vpand + vpmovmskb + cmp |
5639 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: 2 }, // vmovmskpd + cmp |
5640 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: 2 }, // vmovmskps + cmp |
5641 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: 4 }, // vextractf128 + vpor + vpmovmskb + cmp |
5642 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: 4 }, // vextractf128 + vpor + vpmovmskb + cmp |
5643 | }; |
5644 | |
5645 | static const CostTblEntry SSE2BoolReduction[] = { |
5646 | { .ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 2 }, // movmskpd + cmp |
5647 | { .ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 2 }, // movmskps + cmp |
5648 | { .ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 2 }, // pmovmskb + cmp |
5649 | { .ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 2 }, // pmovmskb + cmp |
5650 | { .ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 2 }, // movmskpd + cmp |
5651 | { .ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 2 }, // movmskps + cmp |
5652 | { .ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 2 }, // pmovmskb + cmp |
5653 | { .ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 2 }, // pmovmskb + cmp |
5654 | }; |
5655 | |
5656 | // Handle bool allof/anyof patterns. |
5657 | if (ValVTy->getElementType()->isIntegerTy(Bitwidth: 1)) { |
5658 | InstructionCost ArithmeticCost = 0; |
5659 | if (LT.first != 1 && MTy.isVector() && |
5660 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5661 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
5662 | auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5663 | NumElts: MTy.getVectorNumElements()); |
5664 | ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind); |
5665 | ArithmeticCost *= LT.first - 1; |
5666 | } |
5667 | |
5668 | if (ST->hasAVX512()) |
5669 | if (const auto *Entry = CostTableLookup(Table: AVX512BoolReduction, ISD, Ty: MTy)) |
5670 | return ArithmeticCost + Entry->Cost; |
5671 | if (ST->hasAVX2()) |
5672 | if (const auto *Entry = CostTableLookup(Table: AVX2BoolReduction, ISD, Ty: MTy)) |
5673 | return ArithmeticCost + Entry->Cost; |
5674 | if (ST->hasAVX()) |
5675 | if (const auto *Entry = CostTableLookup(Table: AVX1BoolReduction, ISD, Ty: MTy)) |
5676 | return ArithmeticCost + Entry->Cost; |
5677 | if (ST->hasSSE2()) |
5678 | if (const auto *Entry = CostTableLookup(Table: SSE2BoolReduction, ISD, Ty: MTy)) |
5679 | return ArithmeticCost + Entry->Cost; |
5680 | |
5681 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind); |
5682 | } |
5683 | |
5684 | unsigned NumVecElts = ValVTy->getNumElements(); |
5685 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); |
5686 | |
5687 | // Special case power of 2 reductions where the scalar type isn't changed |
5688 | // by type legalization. |
5689 | if (!isPowerOf2_32(Value: NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) |
5690 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind); |
5691 | |
5692 | InstructionCost ReductionCost = 0; |
5693 | |
5694 | auto *Ty = ValVTy; |
5695 | if (LT.first != 1 && MTy.isVector() && |
5696 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5697 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
5698 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5699 | NumElts: MTy.getVectorNumElements()); |
5700 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
5701 | ReductionCost *= LT.first - 1; |
5702 | NumVecElts = MTy.getVectorNumElements(); |
5703 | } |
5704 | |
5705 | // Now handle reduction with the legal type, taking into account size changes |
5706 | // at each level. |
5707 | while (NumVecElts > 1) { |
5708 | // Determine the size of the remaining vector we need to reduce. |
5709 | unsigned Size = NumVecElts * ScalarSize; |
5710 | NumVecElts /= 2; |
5711 | // If we're reducing from 256/512 bits, use an extract_subvector. |
5712 | if (Size > 128) { |
5713 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
5714 | ReductionCost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
5715 | CostKind, Index: NumVecElts, SubTp: SubTy); |
5716 | Ty = SubTy; |
5717 | } else if (Size == 128) { |
5718 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
5719 | FixedVectorType *ShufTy; |
5720 | if (ValVTy->isFloatingPointTy()) |
5721 | ShufTy = |
5722 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValVTy->getContext()), NumElts: 2); |
5723 | else |
5724 | ShufTy = |
5725 | FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValVTy->getContext()), NumElts: 2); |
5726 | ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, |
5727 | Mask: {}, CostKind, Index: 0, SubTp: nullptr); |
5728 | } else if (Size == 64) { |
5729 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
5730 | FixedVectorType *ShufTy; |
5731 | if (ValVTy->isFloatingPointTy()) |
5732 | ShufTy = |
5733 | FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValVTy->getContext()), NumElts: 4); |
5734 | else |
5735 | ShufTy = |
5736 | FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValVTy->getContext()), NumElts: 4); |
5737 | ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, |
5738 | Mask: {}, CostKind, Index: 0, SubTp: nullptr); |
5739 | } else { |
5740 | // Reducing from smaller size is a shift by immediate. |
5741 | auto *ShiftTy = FixedVectorType::get( |
5742 | ElementType: Type::getIntNTy(C&: ValVTy->getContext(), N: Size), NumElts: 128 / Size); |
5743 | ReductionCost += getArithmeticInstrCost( |
5744 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind, |
5745 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
5746 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
5747 | } |
5748 | |
5749 | // Add the arithmetic op for this level. |
5750 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); |
5751 | } |
5752 | |
5753 | // Add the final extract element to the cost. |
5754 | return ReductionCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
5755 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
5756 | } |
5757 | |
5758 | InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty, |
5759 | TTI::TargetCostKind CostKind, |
5760 | FastMathFlags FMF) const { |
5761 | IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF); |
5762 | return getIntrinsicInstrCost(ICA, CostKind); |
5763 | } |
5764 | |
5765 | InstructionCost |
5766 | X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy, |
5767 | FastMathFlags FMF, |
5768 | TTI::TargetCostKind CostKind) const { |
5769 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5770 | |
5771 | MVT MTy = LT.second; |
5772 | |
5773 | int ISD; |
5774 | if (ValTy->isIntOrIntVectorTy()) { |
5775 | ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN |
5776 | : ISD::SMIN; |
5777 | } else { |
5778 | assert(ValTy->isFPOrFPVectorTy() && |
5779 | "Expected float point or integer vector type." ); |
5780 | ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum) |
5781 | ? ISD::FMINNUM |
5782 | : ISD::FMINIMUM; |
5783 | } |
5784 | |
5785 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
5786 | // and make it as the cost. |
5787 | |
5788 | static const CostTblEntry SSE2CostTbl[] = { |
5789 | {.ISD: ISD::UMIN, .Type: MVT::v2i16, .Cost: 5}, // need pxors to use pminsw/pmaxsw |
5790 | {.ISD: ISD::UMIN, .Type: MVT::v4i16, .Cost: 7}, // need pxors to use pminsw/pmaxsw |
5791 | {.ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: 9}, // need pxors to use pminsw/pmaxsw |
5792 | }; |
5793 | |
5794 | static const CostTblEntry SSE41CostTbl[] = { |
5795 | {.ISD: ISD::SMIN, .Type: MVT::v2i16, .Cost: 3}, // same as sse2 |
5796 | {.ISD: ISD::SMIN, .Type: MVT::v4i16, .Cost: 5}, // same as sse2 |
5797 | {.ISD: ISD::UMIN, .Type: MVT::v2i16, .Cost: 5}, // same as sse2 |
5798 | {.ISD: ISD::UMIN, .Type: MVT::v4i16, .Cost: 7}, // same as sse2 |
5799 | {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 4}, // phminposuw+xor |
5800 | {.ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: 4}, // FIXME: umin is cheaper than umax |
5801 | {.ISD: ISD::SMIN, .Type: MVT::v2i8, .Cost: 3}, // pminsb |
5802 | {.ISD: ISD::SMIN, .Type: MVT::v4i8, .Cost: 5}, // pminsb |
5803 | {.ISD: ISD::SMIN, .Type: MVT::v8i8, .Cost: 7}, // pminsb |
5804 | {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 6}, |
5805 | {.ISD: ISD::UMIN, .Type: MVT::v2i8, .Cost: 3}, // same as sse2 |
5806 | {.ISD: ISD::UMIN, .Type: MVT::v4i8, .Cost: 5}, // same as sse2 |
5807 | {.ISD: ISD::UMIN, .Type: MVT::v8i8, .Cost: 7}, // same as sse2 |
5808 | {.ISD: ISD::UMIN, .Type: MVT::v16i8, .Cost: 6}, // FIXME: umin is cheaper than umax |
5809 | }; |
5810 | |
5811 | static const CostTblEntry AVX1CostTbl[] = { |
5812 | {.ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: 6}, |
5813 | {.ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: 6}, // FIXME: umin is cheaper than umax |
5814 | {.ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: 8}, |
5815 | {.ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: 8}, |
5816 | }; |
5817 | |
5818 | static const CostTblEntry AVX512BWCostTbl[] = { |
5819 | {.ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: 8}, |
5820 | {.ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: 8}, // FIXME: umin is cheaper than umax |
5821 | {.ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: 10}, |
5822 | {.ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: 10}, |
5823 | }; |
5824 | |
5825 | // Before legalizing the type, give a chance to look up illegal narrow types |
5826 | // in the table. |
5827 | // FIXME: Is there a better way to do this? |
5828 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
5829 | if (VT.isSimple()) { |
5830 | MVT MTy = VT.getSimpleVT(); |
5831 | if (ST->hasBWI()) |
5832 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
5833 | return Entry->Cost; |
5834 | |
5835 | if (ST->hasAVX()) |
5836 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5837 | return Entry->Cost; |
5838 | |
5839 | if (ST->hasSSE41()) |
5840 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
5841 | return Entry->Cost; |
5842 | |
5843 | if (ST->hasSSE2()) |
5844 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5845 | return Entry->Cost; |
5846 | } |
5847 | |
5848 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
5849 | unsigned NumVecElts = ValVTy->getNumElements(); |
5850 | |
5851 | auto *Ty = ValVTy; |
5852 | InstructionCost MinMaxCost = 0; |
5853 | if (LT.first != 1 && MTy.isVector() && |
5854 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5855 | // Type needs to be split. We need LT.first - 1 operations ops. |
5856 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5857 | NumElts: MTy.getVectorNumElements()); |
5858 | MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF); |
5859 | MinMaxCost *= LT.first - 1; |
5860 | NumVecElts = MTy.getVectorNumElements(); |
5861 | } |
5862 | |
5863 | if (ST->hasBWI()) |
5864 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
5865 | return MinMaxCost + Entry->Cost; |
5866 | |
5867 | if (ST->hasAVX()) |
5868 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5869 | return MinMaxCost + Entry->Cost; |
5870 | |
5871 | if (ST->hasSSE41()) |
5872 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
5873 | return MinMaxCost + Entry->Cost; |
5874 | |
5875 | if (ST->hasSSE2()) |
5876 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5877 | return MinMaxCost + Entry->Cost; |
5878 | |
5879 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); |
5880 | |
5881 | // Special case power of 2 reductions where the scalar type isn't changed |
5882 | // by type legalization. |
5883 | if (!isPowerOf2_32(Value: ValVTy->getNumElements()) || |
5884 | ScalarSize != MTy.getScalarSizeInBits()) |
5885 | return BaseT::getMinMaxReductionCost(IID, Ty: ValTy, FMF, CostKind); |
5886 | |
5887 | // Now handle reduction with the legal type, taking into account size changes |
5888 | // at each level. |
5889 | while (NumVecElts > 1) { |
5890 | // Determine the size of the remaining vector we need to reduce. |
5891 | unsigned Size = NumVecElts * ScalarSize; |
5892 | NumVecElts /= 2; |
5893 | // If we're reducing from 256/512 bits, use an extract_subvector. |
5894 | if (Size > 128) { |
5895 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
5896 | MinMaxCost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, DstTy: Ty, SrcTy: Ty, Mask: {}, |
5897 | CostKind, Index: NumVecElts, SubTp: SubTy); |
5898 | Ty = SubTy; |
5899 | } else if (Size == 128) { |
5900 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
5901 | VectorType *ShufTy; |
5902 | if (ValTy->isFloatingPointTy()) |
5903 | ShufTy = |
5904 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValTy->getContext()), NumElts: 2); |
5905 | else |
5906 | ShufTy = FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValTy->getContext()), NumElts: 2); |
5907 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, Mask: {}, |
5908 | CostKind, Index: 0, SubTp: nullptr); |
5909 | } else if (Size == 64) { |
5910 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
5911 | FixedVectorType *ShufTy; |
5912 | if (ValTy->isFloatingPointTy()) |
5913 | ShufTy = FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), NumElts: 4); |
5914 | else |
5915 | ShufTy = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValTy->getContext()), NumElts: 4); |
5916 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, DstTy: ShufTy, SrcTy: ShufTy, Mask: {}, |
5917 | CostKind, Index: 0, SubTp: nullptr); |
5918 | } else { |
5919 | // Reducing from smaller size is a shift by immediate. |
5920 | auto *ShiftTy = FixedVectorType::get( |
5921 | ElementType: Type::getIntNTy(C&: ValTy->getContext(), N: Size), NumElts: 128 / Size); |
5922 | MinMaxCost += getArithmeticInstrCost( |
5923 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind: TTI::TCK_RecipThroughput, |
5924 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
5925 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
5926 | } |
5927 | |
5928 | // Add the arithmetic op for this level. |
5929 | MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF); |
5930 | } |
5931 | |
5932 | // Add the final extract element to the cost. |
5933 | return MinMaxCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
5934 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
5935 | } |
5936 | |
5937 | /// Calculate the cost of materializing a 64-bit value. This helper |
5938 | /// method might only calculate a fraction of a larger immediate. Therefore it |
5939 | /// is valid to return a cost of ZERO. |
5940 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) const { |
5941 | if (Val == 0) |
5942 | return TTI::TCC_Free; |
5943 | |
5944 | if (isInt<32>(x: Val)) |
5945 | return TTI::TCC_Basic; |
5946 | |
5947 | return 2 * TTI::TCC_Basic; |
5948 | } |
5949 | |
5950 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
5951 | TTI::TargetCostKind CostKind) const { |
5952 | assert(Ty->isIntegerTy()); |
5953 | |
5954 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
5955 | if (BitSize == 0) |
5956 | return ~0U; |
5957 | |
5958 | // Never hoist constants larger than 128bit, because this might lead to |
5959 | // incorrect code generation or assertions in codegen. |
5960 | // Fixme: Create a cost model for types larger than i128 once the codegen |
5961 | // issues have been fixed. |
5962 | if (BitSize > 128) |
5963 | return TTI::TCC_Free; |
5964 | |
5965 | if (Imm == 0) |
5966 | return TTI::TCC_Free; |
5967 | |
5968 | // Sign-extend all constants to a multiple of 64-bit. |
5969 | APInt ImmVal = Imm; |
5970 | if (BitSize % 64 != 0) |
5971 | ImmVal = Imm.sext(width: alignTo(Value: BitSize, Align: 64)); |
5972 | |
5973 | // Split the constant into 64-bit chunks and calculate the cost for each |
5974 | // chunk. |
5975 | InstructionCost Cost = 0; |
5976 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
5977 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
5978 | int64_t Val = Tmp.getSExtValue(); |
5979 | Cost += getIntImmCost(Val); |
5980 | } |
5981 | // We need at least one instruction to materialize the constant. |
5982 | return std::max<InstructionCost>(a: 1, b: Cost); |
5983 | } |
5984 | |
5985 | InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
5986 | const APInt &Imm, Type *Ty, |
5987 | TTI::TargetCostKind CostKind, |
5988 | Instruction *Inst) const { |
5989 | assert(Ty->isIntegerTy()); |
5990 | |
5991 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
5992 | unsigned ImmBitWidth = Imm.getBitWidth(); |
5993 | |
5994 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
5995 | // here, so that constant hoisting will ignore this constant. |
5996 | if (BitSize == 0) |
5997 | return TTI::TCC_Free; |
5998 | |
5999 | unsigned ImmIdx = ~0U; |
6000 | switch (Opcode) { |
6001 | default: |
6002 | return TTI::TCC_Free; |
6003 | case Instruction::GetElementPtr: |
6004 | // Always hoist the base address of a GetElementPtr. This prevents the |
6005 | // creation of new constants for every base constant that gets constant |
6006 | // folded with the offset. |
6007 | if (Idx == 0) |
6008 | return 2 * TTI::TCC_Basic; |
6009 | return TTI::TCC_Free; |
6010 | case Instruction::Store: |
6011 | ImmIdx = 0; |
6012 | break; |
6013 | case Instruction::ICmp: |
6014 | // This is an imperfect hack to prevent constant hoisting of |
6015 | // compares that might be trying to check if a 64-bit value fits in |
6016 | // 32-bits. The backend can optimize these cases using a right shift by 32. |
6017 | // There are other predicates and immediates the backend can use shifts for. |
6018 | if (Idx == 1 && ImmBitWidth == 64) { |
6019 | uint64_t ImmVal = Imm.getZExtValue(); |
6020 | if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) |
6021 | return TTI::TCC_Free; |
6022 | |
6023 | if (auto *Cmp = dyn_cast_or_null<CmpInst>(Val: Inst)) { |
6024 | if (Cmp->isEquality()) { |
6025 | KnownBits Known = computeKnownBits(V: Cmp->getOperand(i_nocapture: 0), DL); |
6026 | if (Known.countMinTrailingZeros() >= 32) |
6027 | return TTI::TCC_Free; |
6028 | } |
6029 | } |
6030 | } |
6031 | ImmIdx = 1; |
6032 | break; |
6033 | case Instruction::And: |
6034 | // We support 64-bit ANDs with immediates with 32-bits of leading zeroes |
6035 | // by using a 32-bit operation with implicit zero extension. Detect such |
6036 | // immediates here as the normal path expects bit 31 to be sign extended. |
6037 | if (Idx == 1 && ImmBitWidth == 64 && Imm.isIntN(N: 32)) |
6038 | return TTI::TCC_Free; |
6039 | // If we have BMI then we can use BEXTR/BZHI to mask out upper i64 bits. |
6040 | if (Idx == 1 && ImmBitWidth == 64 && ST->is64Bit() && ST->hasBMI() && |
6041 | Imm.isMask()) |
6042 | return X86TTIImpl::getIntImmCost(Val: ST->hasBMI2() ? 255 : 65535); |
6043 | ImmIdx = 1; |
6044 | break; |
6045 | case Instruction::Add: |
6046 | case Instruction::Sub: |
6047 | // For add/sub, we can use the opposite instruction for INT32_MIN. |
6048 | if (Idx == 1 && ImmBitWidth == 64 && Imm.getZExtValue() == 0x80000000) |
6049 | return TTI::TCC_Free; |
6050 | ImmIdx = 1; |
6051 | break; |
6052 | case Instruction::UDiv: |
6053 | case Instruction::SDiv: |
6054 | case Instruction::URem: |
6055 | case Instruction::SRem: |
6056 | // Division by constant is typically expanded later into a different |
6057 | // instruction sequence. This completely changes the constants. |
6058 | // Report them as "free" to stop ConstantHoist from marking them as opaque. |
6059 | return TTI::TCC_Free; |
6060 | case Instruction::Mul: |
6061 | case Instruction::Or: |
6062 | case Instruction::Xor: |
6063 | ImmIdx = 1; |
6064 | break; |
6065 | // Always return TCC_Free for the shift value of a shift instruction. |
6066 | case Instruction::Shl: |
6067 | case Instruction::LShr: |
6068 | case Instruction::AShr: |
6069 | if (Idx == 1) |
6070 | return TTI::TCC_Free; |
6071 | break; |
6072 | case Instruction::Trunc: |
6073 | case Instruction::ZExt: |
6074 | case Instruction::SExt: |
6075 | case Instruction::IntToPtr: |
6076 | case Instruction::PtrToInt: |
6077 | case Instruction::BitCast: |
6078 | case Instruction::PHI: |
6079 | case Instruction::Call: |
6080 | case Instruction::Select: |
6081 | case Instruction::Ret: |
6082 | case Instruction::Load: |
6083 | break; |
6084 | } |
6085 | |
6086 | if (Idx == ImmIdx) { |
6087 | uint64_t NumConstants = divideCeil(Numerator: BitSize, Denominator: 64); |
6088 | InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
6089 | return (Cost <= NumConstants * TTI::TCC_Basic) |
6090 | ? static_cast<int>(TTI::TCC_Free) |
6091 | : Cost; |
6092 | } |
6093 | |
6094 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
6095 | } |
6096 | |
6097 | InstructionCost |
6098 | X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
6099 | const APInt &Imm, Type *Ty, |
6100 | TTI::TargetCostKind CostKind) const { |
6101 | assert(Ty->isIntegerTy()); |
6102 | |
6103 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
6104 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
6105 | // here, so that constant hoisting will ignore this constant. |
6106 | if (BitSize == 0) |
6107 | return TTI::TCC_Free; |
6108 | |
6109 | switch (IID) { |
6110 | default: |
6111 | return TTI::TCC_Free; |
6112 | case Intrinsic::sadd_with_overflow: |
6113 | case Intrinsic::uadd_with_overflow: |
6114 | case Intrinsic::ssub_with_overflow: |
6115 | case Intrinsic::usub_with_overflow: |
6116 | case Intrinsic::smul_with_overflow: |
6117 | case Intrinsic::umul_with_overflow: |
6118 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 32)) |
6119 | return TTI::TCC_Free; |
6120 | break; |
6121 | case Intrinsic::experimental_stackmap: |
6122 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64))) |
6123 | return TTI::TCC_Free; |
6124 | break; |
6125 | case Intrinsic::experimental_patchpoint_void: |
6126 | case Intrinsic::experimental_patchpoint: |
6127 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64))) |
6128 | return TTI::TCC_Free; |
6129 | break; |
6130 | } |
6131 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
6132 | } |
6133 | |
6134 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, |
6135 | TTI::TargetCostKind CostKind, |
6136 | const Instruction *I) const { |
6137 | if (CostKind != TTI::TCK_RecipThroughput) |
6138 | return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic; |
6139 | // Branches are assumed to be predicted. |
6140 | return TTI::TCC_Free; |
6141 | } |
6142 | |
6143 | int X86TTIImpl::getGatherOverhead() const { |
6144 | // Some CPUs have more overhead for gather. The specified overhead is relative |
6145 | // to the Load operation. "2" is the number provided by Intel architects. This |
6146 | // parameter is used for cost estimation of Gather Op and comparison with |
6147 | // other alternatives. |
6148 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only |
6149 | // enable gather with a -march. |
6150 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) |
6151 | return 2; |
6152 | |
6153 | return 1024; |
6154 | } |
6155 | |
6156 | int X86TTIImpl::getScatterOverhead() const { |
6157 | if (ST->hasAVX512()) |
6158 | return 2; |
6159 | |
6160 | return 1024; |
6161 | } |
6162 | |
6163 | // Return an average cost of Gather / Scatter instruction, maybe improved later. |
6164 | InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, |
6165 | TTI::TargetCostKind CostKind, |
6166 | Type *SrcVTy, const Value *Ptr, |
6167 | Align Alignment, |
6168 | unsigned AddressSpace) const { |
6169 | |
6170 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost" ); |
6171 | unsigned VF = cast<FixedVectorType>(Val: SrcVTy)->getNumElements(); |
6172 | |
6173 | // Try to reduce index size from 64 bit (default for GEP) |
6174 | // to 32. It is essential for VF 16. If the index can't be reduced to 32, the |
6175 | // operation will use 16 x 64 indices which do not fit in a zmm and needs |
6176 | // to split. Also check that the base pointer is the same for all lanes, |
6177 | // and that there's at most one variable index. |
6178 | auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { |
6179 | unsigned IndexSize = DL.getPointerSizeInBits(); |
6180 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
6181 | if (IndexSize < 64 || !GEP) |
6182 | return IndexSize; |
6183 | |
6184 | unsigned NumOfVarIndices = 0; |
6185 | const Value *Ptrs = GEP->getPointerOperand(); |
6186 | if (Ptrs->getType()->isVectorTy() && !getSplatValue(V: Ptrs)) |
6187 | return IndexSize; |
6188 | for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) { |
6189 | if (isa<Constant>(Val: GEP->getOperand(i_nocapture: I))) |
6190 | continue; |
6191 | Type *IndxTy = GEP->getOperand(i_nocapture: I)->getType(); |
6192 | if (auto *IndexVTy = dyn_cast<VectorType>(Val: IndxTy)) |
6193 | IndxTy = IndexVTy->getElementType(); |
6194 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && |
6195 | !isa<SExtInst>(Val: GEP->getOperand(i_nocapture: I))) || |
6196 | ++NumOfVarIndices > 1) |
6197 | return IndexSize; // 64 |
6198 | } |
6199 | return (unsigned)32; |
6200 | }; |
6201 | |
6202 | // Trying to reduce IndexSize to 32 bits for vector 16. |
6203 | // By default the IndexSize is equal to pointer size. |
6204 | unsigned IndexSize = (ST->hasAVX512() && VF >= 16) |
6205 | ? getIndexSizeInBits(Ptr, DL) |
6206 | : DL.getPointerSizeInBits(); |
6207 | |
6208 | auto *IndexVTy = FixedVectorType::get( |
6209 | ElementType: IntegerType::get(C&: SrcVTy->getContext(), NumBits: IndexSize), NumElts: VF); |
6210 | std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(Ty: IndexVTy); |
6211 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcVTy); |
6212 | InstructionCost::CostType SplitFactor = |
6213 | std::max(a: IdxsLT.first, b: SrcLT.first).getValue(); |
6214 | if (SplitFactor > 1) { |
6215 | // Handle splitting of vector of pointers |
6216 | auto *SplitSrcTy = |
6217 | FixedVectorType::get(ElementType: SrcVTy->getScalarType(), NumElts: VF / SplitFactor); |
6218 | return SplitFactor * getGSVectorCost(Opcode, CostKind, SrcVTy: SplitSrcTy, Ptr, |
6219 | Alignment, AddressSpace); |
6220 | } |
6221 | |
6222 | // If we didn't split, this will be a single gather/scatter instruction. |
6223 | if (CostKind == TTI::TCK_CodeSize) |
6224 | return 1; |
6225 | |
6226 | // The gather / scatter cost is given by Intel architects. It is a rough |
6227 | // number since we are looking at one instruction in a time. |
6228 | const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead() |
6229 | : getScatterOverhead(); |
6230 | return GSOverhead + VF * getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(), |
6231 | Alignment, AddressSpace, CostKind); |
6232 | } |
6233 | |
6234 | /// Calculate the cost of Gather / Scatter operation |
6235 | InstructionCost X86TTIImpl::getGatherScatterOpCost( |
6236 | unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, |
6237 | Align Alignment, TTI::TargetCostKind CostKind, |
6238 | const Instruction *I = nullptr) const { |
6239 | if ((Opcode == Instruction::Load && |
6240 | (!isLegalMaskedGather(DataType: SrcVTy, Alignment: Align(Alignment)) || |
6241 | forceScalarizeMaskedGather(VTy: cast<VectorType>(Val: SrcVTy), |
6242 | Alignment: Align(Alignment)))) || |
6243 | (Opcode == Instruction::Store && |
6244 | (!isLegalMaskedScatter(DataType: SrcVTy, Alignment: Align(Alignment)) || |
6245 | forceScalarizeMaskedScatter(VTy: cast<VectorType>(Val: SrcVTy), |
6246 | Alignment: Align(Alignment))))) |
6247 | return BaseT::getGatherScatterOpCost(Opcode, DataTy: SrcVTy, Ptr, VariableMask, |
6248 | Alignment, CostKind, I); |
6249 | |
6250 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter" ); |
6251 | PointerType *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
6252 | if (!PtrTy && Ptr->getType()->isVectorTy()) |
6253 | PtrTy = dyn_cast<PointerType>( |
6254 | Val: cast<VectorType>(Val: Ptr->getType())->getElementType()); |
6255 | assert(PtrTy && "Unexpected type for Ptr argument" ); |
6256 | unsigned AddressSpace = PtrTy->getAddressSpace(); |
6257 | return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment, |
6258 | AddressSpace); |
6259 | } |
6260 | |
6261 | bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
6262 | const TargetTransformInfo::LSRCost &C2) const { |
6263 | // X86 specific here are "instruction number 1st priority". |
6264 | return std::tie(args: C1.Insns, args: C1.NumRegs, args: C1.AddRecCost, args: C1.NumIVMuls, |
6265 | args: C1.NumBaseAdds, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
6266 | std::tie(args: C2.Insns, args: C2.NumRegs, args: C2.AddRecCost, args: C2.NumIVMuls, |
6267 | args: C2.NumBaseAdds, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
6268 | } |
6269 | |
6270 | bool X86TTIImpl::canMacroFuseCmp() const { |
6271 | return ST->hasMacroFusion() || ST->hasBranchFusion(); |
6272 | } |
6273 | |
6274 | static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) { |
6275 | if (!ST->hasAVX()) |
6276 | return false; |
6277 | |
6278 | if (ScalarTy->isPointerTy()) |
6279 | return true; |
6280 | |
6281 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6282 | return true; |
6283 | |
6284 | if (ScalarTy->isHalfTy() && ST->hasBWI()) |
6285 | return true; |
6286 | |
6287 | if (ScalarTy->isBFloatTy() && ST->hasBF16()) |
6288 | return true; |
6289 | |
6290 | if (!ScalarTy->isIntegerTy()) |
6291 | return false; |
6292 | |
6293 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6294 | return IntWidth == 32 || IntWidth == 64 || |
6295 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); |
6296 | } |
6297 | |
6298 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, |
6299 | unsigned AddressSpace) const { |
6300 | Type *ScalarTy = DataTy->getScalarType(); |
6301 | |
6302 | // The backend can't handle a single element vector w/o CFCMOV. |
6303 | if (isa<VectorType>(Val: DataTy) && |
6304 | cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
6305 | return ST->hasCF() && |
6306 | hasConditionalLoadStoreForType(Ty: ScalarTy, /*IsStore=*/false); |
6307 | |
6308 | return isLegalMaskedLoadStore(ScalarTy, ST); |
6309 | } |
6310 | |
6311 | bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment, |
6312 | unsigned AddressSpace) const { |
6313 | Type *ScalarTy = DataTy->getScalarType(); |
6314 | |
6315 | // The backend can't handle a single element vector w/o CFCMOV. |
6316 | if (isa<VectorType>(Val: DataTy) && |
6317 | cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
6318 | return ST->hasCF() && |
6319 | hasConditionalLoadStoreForType(Ty: ScalarTy, /*IsStore=*/true); |
6320 | |
6321 | return isLegalMaskedLoadStore(ScalarTy, ST); |
6322 | } |
6323 | |
6324 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) const { |
6325 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
6326 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 |
6327 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 |
6328 | // (the equivalent stores only require AVX). |
6329 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) |
6330 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); |
6331 | |
6332 | return false; |
6333 | } |
6334 | |
6335 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) const { |
6336 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
6337 | |
6338 | // SSE4A supports nontemporal stores of float and double at arbitrary |
6339 | // alignment. |
6340 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) |
6341 | return true; |
6342 | |
6343 | // Besides the SSE4A subtarget exception above, only aligned stores are |
6344 | // available nontemporaly on any other subtarget. And only stores with a size |
6345 | // of 4..32 bytes (powers of 2, only) are permitted. |
6346 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || |
6347 | !isPowerOf2_32(Value: DataSize)) |
6348 | return false; |
6349 | |
6350 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent |
6351 | // loads require AVX2). |
6352 | if (DataSize == 32) |
6353 | return ST->hasAVX(); |
6354 | if (DataSize == 16) |
6355 | return ST->hasSSE1(); |
6356 | return true; |
6357 | } |
6358 | |
6359 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, |
6360 | ElementCount NumElements) const { |
6361 | // movddup |
6362 | return ST->hasSSE3() && !NumElements.isScalable() && |
6363 | NumElements.getFixedValue() == 2 && |
6364 | ElementTy == Type::getDoubleTy(C&: ElementTy->getContext()); |
6365 | } |
6366 | |
6367 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const { |
6368 | if (!isa<VectorType>(Val: DataTy)) |
6369 | return false; |
6370 | |
6371 | if (!ST->hasAVX512()) |
6372 | return false; |
6373 | |
6374 | // The backend can't handle a single element vector. |
6375 | if (cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
6376 | return false; |
6377 | |
6378 | Type *ScalarTy = cast<VectorType>(Val: DataTy)->getElementType(); |
6379 | |
6380 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6381 | return true; |
6382 | |
6383 | if (!ScalarTy->isIntegerTy()) |
6384 | return false; |
6385 | |
6386 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6387 | return IntWidth == 32 || IntWidth == 64 || |
6388 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); |
6389 | } |
6390 | |
6391 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, |
6392 | Align Alignment) const { |
6393 | return isLegalMaskedExpandLoad(DataTy, Alignment); |
6394 | } |
6395 | |
6396 | bool X86TTIImpl::supportsGather() const { |
6397 | // Some CPUs have better gather performance than others. |
6398 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only |
6399 | // enable gather with a -march. |
6400 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); |
6401 | } |
6402 | |
6403 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, |
6404 | Align Alignment) const { |
6405 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX |
6406 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend |
6407 | // it to 8 elements, but zeroing upper bits of the mask vector will add more |
6408 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: |
6409 | // Check, maybe the gather/scatter instruction is better in the VariableMask |
6410 | // case. |
6411 | unsigned NumElts = cast<FixedVectorType>(Val: VTy)->getNumElements(); |
6412 | return NumElts == 1 || |
6413 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); |
6414 | } |
6415 | |
6416 | bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, |
6417 | Align Alignment) const { |
6418 | Type *ScalarTy = DataTy->getScalarType(); |
6419 | if (ScalarTy->isPointerTy()) |
6420 | return true; |
6421 | |
6422 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6423 | return true; |
6424 | |
6425 | if (!ScalarTy->isIntegerTy()) |
6426 | return false; |
6427 | |
6428 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6429 | return IntWidth == 32 || IntWidth == 64; |
6430 | } |
6431 | |
6432 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) const { |
6433 | if (!supportsGather() || !ST->preferGather()) |
6434 | return false; |
6435 | return isLegalMaskedGatherScatter(DataTy, Alignment); |
6436 | } |
6437 | |
6438 | bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, |
6439 | unsigned Opcode1, |
6440 | const SmallBitVector &OpcodeMask) const { |
6441 | // ADDSUBPS 4xf32 SSE3 |
6442 | // VADDSUBPS 4xf32 AVX |
6443 | // VADDSUBPS 8xf32 AVX2 |
6444 | // ADDSUBPD 2xf64 SSE3 |
6445 | // VADDSUBPD 2xf64 AVX |
6446 | // VADDSUBPD 4xf64 AVX2 |
6447 | |
6448 | unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements(); |
6449 | assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible" ); |
6450 | if (!isPowerOf2_32(Value: NumElements)) |
6451 | return false; |
6452 | // Check the opcode pattern. We apply the mask on the opcode arguments and |
6453 | // then check if it is what we expect. |
6454 | for (int Lane : seq<int>(Begin: 0, End: NumElements)) { |
6455 | unsigned Opc = OpcodeMask.test(Idx: Lane) ? Opcode1 : Opcode0; |
6456 | // We expect FSub for even lanes and FAdd for odd lanes. |
6457 | if (Lane % 2 == 0 && Opc != Instruction::FSub) |
6458 | return false; |
6459 | if (Lane % 2 == 1 && Opc != Instruction::FAdd) |
6460 | return false; |
6461 | } |
6462 | // Now check that the pattern is supported by the target ISA. |
6463 | Type *ElemTy = cast<VectorType>(Val: VecTy)->getElementType(); |
6464 | if (ElemTy->isFloatTy()) |
6465 | return ST->hasSSE3() && NumElements % 4 == 0; |
6466 | if (ElemTy->isDoubleTy()) |
6467 | return ST->hasSSE3() && NumElements % 2 == 0; |
6468 | return false; |
6469 | } |
6470 | |
6471 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) const { |
6472 | // AVX2 doesn't support scatter |
6473 | if (!ST->hasAVX512() || !ST->preferScatter()) |
6474 | return false; |
6475 | return isLegalMaskedGatherScatter(DataTy: DataType, Alignment); |
6476 | } |
6477 | |
6478 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const { |
6479 | EVT VT = TLI->getValueType(DL, Ty: DataType); |
6480 | return TLI->isOperationLegal(Op: IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); |
6481 | } |
6482 | |
6483 | bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) const { |
6484 | // FDIV is always expensive, even if it has a very low uop count. |
6485 | // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? |
6486 | if (I->getOpcode() == Instruction::FDiv) |
6487 | return true; |
6488 | |
6489 | return BaseT::isExpensiveToSpeculativelyExecute(I); |
6490 | } |
6491 | |
6492 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const { return false; } |
6493 | |
6494 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, |
6495 | const Function *Callee) const { |
6496 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
6497 | |
6498 | // Work this as a subsetting of subtarget features. |
6499 | const FeatureBitset &CallerBits = |
6500 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
6501 | const FeatureBitset &CalleeBits = |
6502 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
6503 | |
6504 | // Check whether features are the same (apart from the ignore list). |
6505 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
6506 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
6507 | if (RealCallerBits == RealCalleeBits) |
6508 | return true; |
6509 | |
6510 | // If the features are a subset, we need to additionally check for calls |
6511 | // that may become ABI-incompatible as a result of inlining. |
6512 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) |
6513 | return false; |
6514 | |
6515 | for (const Instruction &I : instructions(F: Callee)) { |
6516 | if (const auto *CB = dyn_cast<CallBase>(Val: &I)) { |
6517 | // Having more target features is fine for inline ASM. |
6518 | if (CB->isInlineAsm()) |
6519 | continue; |
6520 | |
6521 | SmallVector<Type *, 8> Types; |
6522 | for (Value *Arg : CB->args()) |
6523 | Types.push_back(Elt: Arg->getType()); |
6524 | if (!CB->getType()->isVoidTy()) |
6525 | Types.push_back(Elt: CB->getType()); |
6526 | |
6527 | // Simple types are always ABI compatible. |
6528 | auto IsSimpleTy = [](Type *Ty) { |
6529 | return !Ty->isVectorTy() && !Ty->isAggregateType(); |
6530 | }; |
6531 | if (all_of(Range&: Types, P: IsSimpleTy)) |
6532 | continue; |
6533 | |
6534 | if (Function *NestedCallee = CB->getCalledFunction()) { |
6535 | // Assume that intrinsics are always ABI compatible. |
6536 | if (NestedCallee->isIntrinsic()) |
6537 | continue; |
6538 | |
6539 | // Do a precise compatibility check. |
6540 | if (!areTypesABICompatible(Caller, Callee: NestedCallee, Type: Types)) |
6541 | return false; |
6542 | } else { |
6543 | // We don't know the target features of the callee, |
6544 | // assume it is incompatible. |
6545 | return false; |
6546 | } |
6547 | } |
6548 | } |
6549 | return true; |
6550 | } |
6551 | |
6552 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, |
6553 | const Function *Callee, |
6554 | const ArrayRef<Type *> &Types) const { |
6555 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
6556 | return false; |
6557 | |
6558 | // If we get here, we know the target features match. If one function |
6559 | // considers 512-bit vectors legal and the other does not, consider them |
6560 | // incompatible. |
6561 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
6562 | |
6563 | if (TM.getSubtarget<X86Subtarget>(F: *Caller).useAVX512Regs() == |
6564 | TM.getSubtarget<X86Subtarget>(F: *Callee).useAVX512Regs()) |
6565 | return true; |
6566 | |
6567 | // Consider the arguments compatible if they aren't vectors or aggregates. |
6568 | // FIXME: Look at the size of vectors. |
6569 | // FIXME: Look at the element types of aggregates to see if there are vectors. |
6570 | return llvm::none_of(Range: Types, |
6571 | P: [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); |
6572 | } |
6573 | |
6574 | X86TTIImpl::TTI::MemCmpExpansionOptions |
6575 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
6576 | TTI::MemCmpExpansionOptions Options; |
6577 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
6578 | Options.NumLoadsPerBlock = 2; |
6579 | // All GPR and vector loads can be unaligned. |
6580 | Options.AllowOverlappingLoads = true; |
6581 | if (IsZeroCmp) { |
6582 | // Only enable vector loads for equality comparison. Right now the vector |
6583 | // version is not as fast for three way compare (see #33329). |
6584 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); |
6585 | if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) |
6586 | Options.LoadSizes.push_back(Elt: 64); |
6587 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(Elt: 32); |
6588 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(Elt: 16); |
6589 | } |
6590 | if (ST->is64Bit()) { |
6591 | Options.LoadSizes.push_back(Elt: 8); |
6592 | } |
6593 | Options.LoadSizes.push_back(Elt: 4); |
6594 | Options.LoadSizes.push_back(Elt: 2); |
6595 | Options.LoadSizes.push_back(Elt: 1); |
6596 | return Options; |
6597 | } |
6598 | |
6599 | bool X86TTIImpl::prefersVectorizedAddressing() const { |
6600 | return supportsGather(); |
6601 | } |
6602 | |
6603 | bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { |
6604 | return false; |
6605 | } |
6606 | |
6607 | bool X86TTIImpl::enableInterleavedAccessVectorization() const { |
6608 | // TODO: We expect this to be beneficial regardless of arch, |
6609 | // but there are currently some unexplained performance artifacts on Atom. |
6610 | // As a temporary solution, disable on Atom. |
6611 | return !(ST->isAtom()); |
6612 | } |
6613 | |
6614 | // Get estimation for interleaved load/store operations and strided load. |
6615 | // \p Indices contains indices for strided load. |
6616 | // \p Factor - the factor of interleaving. |
6617 | // AVX-512 provides 3-src shuffles that significantly reduces the cost. |
6618 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( |
6619 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, |
6620 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, |
6621 | TTI::TargetCostKind CostKind, bool UseMaskForCond, |
6622 | bool UseMaskForGaps) const { |
6623 | // VecTy for interleave memop is <VF*Factor x Elt>. |
6624 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
6625 | // VecTy = <12 x i32>. |
6626 | |
6627 | // Calculate the number of memory operations (NumOfMemOps), required |
6628 | // for load/store the VecTy. |
6629 | MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second; |
6630 | unsigned VecTySize = DL.getTypeStoreSize(Ty: VecTy); |
6631 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
6632 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
6633 | |
6634 | // Get the cost of one memory operation. |
6635 | auto *SingleMemOpTy = FixedVectorType::get(ElementType: VecTy->getElementType(), |
6636 | NumElts: LegalVT.getVectorNumElements()); |
6637 | InstructionCost MemOpCost; |
6638 | bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; |
6639 | if (UseMaskedMemOp) |
6640 | MemOpCost = getMaskedMemoryOpCost(Opcode, SrcTy: SingleMemOpTy, Alignment, |
6641 | AddressSpace, CostKind); |
6642 | else |
6643 | MemOpCost = getMemoryOpCost(Opcode, Src: SingleMemOpTy, Alignment, AddressSpace, |
6644 | CostKind); |
6645 | |
6646 | unsigned VF = VecTy->getNumElements() / Factor; |
6647 | MVT VT = |
6648 | MVT::getVectorVT(VT: TLI->getSimpleValueType(DL, Ty: VecTy->getScalarType()), NumElements: VF); |
6649 | |
6650 | InstructionCost MaskCost; |
6651 | if (UseMaskedMemOp) { |
6652 | APInt DemandedLoadStoreElts = APInt::getZero(numBits: VecTy->getNumElements()); |
6653 | for (unsigned Index : Indices) { |
6654 | assert(Index < Factor && "Invalid index for interleaved memory op" ); |
6655 | for (unsigned Elm = 0; Elm < VF; Elm++) |
6656 | DemandedLoadStoreElts.setBit(Index + Elm * Factor); |
6657 | } |
6658 | |
6659 | Type *I1Type = Type::getInt1Ty(C&: VecTy->getContext()); |
6660 | |
6661 | MaskCost = getReplicationShuffleCost( |
6662 | EltTy: I1Type, ReplicationFactor: Factor, VF, |
6663 | DemandedDstElts: UseMaskForGaps ? DemandedLoadStoreElts |
6664 | : APInt::getAllOnes(numBits: VecTy->getNumElements()), |
6665 | CostKind); |
6666 | |
6667 | // The Gaps mask is invariant and created outside the loop, therefore the |
6668 | // cost of creating it is not accounted for here. However if we have both |
6669 | // a MaskForGaps and some other mask that guards the execution of the |
6670 | // memory access, we need to account for the cost of And-ing the two masks |
6671 | // inside the loop. |
6672 | if (UseMaskForGaps) { |
6673 | auto *MaskVT = FixedVectorType::get(ElementType: I1Type, NumElts: VecTy->getNumElements()); |
6674 | MaskCost += getArithmeticInstrCost(Opcode: BinaryOperator::And, Ty: MaskVT, CostKind); |
6675 | } |
6676 | } |
6677 | |
6678 | if (Opcode == Instruction::Load) { |
6679 | // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) |
6680 | // contain the cost of the optimized shuffle sequence that the |
6681 | // X86InterleavedAccess pass will generate. |
6682 | // The cost of loads and stores are computed separately from the table. |
6683 | |
6684 | // X86InterleavedAccess support only the following interleaved-access group. |
6685 | static const CostTblEntry AVX512InterleavedLoadTbl[] = { |
6686 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 12}, //(load 48i8 and) deinterleave into 3 x 16i8 |
6687 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, //(load 96i8 and) deinterleave into 3 x 32i8 |
6688 | {.ISD: 3, .Type: MVT::v64i8, .Cost: 22}, //(load 96i8 and) deinterleave into 3 x 32i8 |
6689 | }; |
6690 | |
6691 | if (const auto *Entry = |
6692 | CostTableLookup(Table: AVX512InterleavedLoadTbl, ISD: Factor, Ty: VT)) |
6693 | return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; |
6694 | //If an entry does not exist, fallback to the default implementation. |
6695 | |
6696 | // Kind of shuffle depends on number of loaded values. |
6697 | // If we load the entire data in one register, we can use a 1-src shuffle. |
6698 | // Otherwise, we'll merge 2 sources in each operation. |
6699 | TTI::ShuffleKind ShuffleKind = |
6700 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; |
6701 | |
6702 | InstructionCost ShuffleCost = getShuffleCost( |
6703 | Kind: ShuffleKind, DstTy: SingleMemOpTy, SrcTy: SingleMemOpTy, Mask: {}, CostKind, Index: 0, SubTp: nullptr); |
6704 | |
6705 | unsigned NumOfLoadsInInterleaveGrp = |
6706 | Indices.size() ? Indices.size() : Factor; |
6707 | auto *ResultTy = FixedVectorType::get(ElementType: VecTy->getElementType(), |
6708 | NumElts: VecTy->getNumElements() / Factor); |
6709 | InstructionCost NumOfResults = |
6710 | getTypeLegalizationCost(Ty: ResultTy).first * NumOfLoadsInInterleaveGrp; |
6711 | |
6712 | // About a half of the loads may be folded in shuffles when we have only |
6713 | // one result. If we have more than one result, or the loads are masked, |
6714 | // we do not fold loads at all. |
6715 | unsigned NumOfUnfoldedLoads = |
6716 | UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; |
6717 | |
6718 | // Get a number of shuffle operations per result. |
6719 | unsigned NumOfShufflesPerResult = |
6720 | std::max(a: (unsigned)1, b: (unsigned)(NumOfMemOps - 1)); |
6721 | |
6722 | // The SK_MergeTwoSrc shuffle clobbers one of src operands. |
6723 | // When we have more than one destination, we need additional instructions |
6724 | // to keep sources. |
6725 | InstructionCost NumOfMoves = 0; |
6726 | if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) |
6727 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; |
6728 | |
6729 | InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + |
6730 | MaskCost + NumOfUnfoldedLoads * MemOpCost + |
6731 | NumOfMoves; |
6732 | |
6733 | return Cost; |
6734 | } |
6735 | |
6736 | // Store. |
6737 | assert(Opcode == Instruction::Store && |
6738 | "Expected Store Instruction at this point" ); |
6739 | // X86InterleavedAccess support only the following interleaved-access group. |
6740 | static const CostTblEntry AVX512InterleavedStoreTbl[] = { |
6741 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 12}, // interleave 3 x 16i8 into 48i8 (and store) |
6742 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, // interleave 3 x 32i8 into 96i8 (and store) |
6743 | {.ISD: 3, .Type: MVT::v64i8, .Cost: 26}, // interleave 3 x 64i8 into 96i8 (and store) |
6744 | |
6745 | {.ISD: 4, .Type: MVT::v8i8, .Cost: 10}, // interleave 4 x 8i8 into 32i8 (and store) |
6746 | {.ISD: 4, .Type: MVT::v16i8, .Cost: 11}, // interleave 4 x 16i8 into 64i8 (and store) |
6747 | {.ISD: 4, .Type: MVT::v32i8, .Cost: 14}, // interleave 4 x 32i8 into 128i8 (and store) |
6748 | {.ISD: 4, .Type: MVT::v64i8, .Cost: 24} // interleave 4 x 32i8 into 256i8 (and store) |
6749 | }; |
6750 | |
6751 | if (const auto *Entry = |
6752 | CostTableLookup(Table: AVX512InterleavedStoreTbl, ISD: Factor, Ty: VT)) |
6753 | return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; |
6754 | //If an entry does not exist, fallback to the default implementation. |
6755 | |
6756 | // There is no strided stores meanwhile. And store can't be folded in |
6757 | // shuffle. |
6758 | unsigned NumOfSources = Factor; // The number of values to be merged. |
6759 | InstructionCost ShuffleCost = |
6760 | getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, DstTy: SingleMemOpTy, SrcTy: SingleMemOpTy, Mask: {}, |
6761 | CostKind, Index: 0, SubTp: nullptr); |
6762 | unsigned NumOfShufflesPerStore = NumOfSources - 1; |
6763 | |
6764 | // The SK_MergeTwoSrc shuffle clobbers one of src operands. |
6765 | // We need additional instructions to keep sources. |
6766 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; |
6767 | InstructionCost Cost = |
6768 | MaskCost + |
6769 | NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + |
6770 | NumOfMoves; |
6771 | return Cost; |
6772 | } |
6773 | |
6774 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( |
6775 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, |
6776 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
6777 | bool UseMaskForCond, bool UseMaskForGaps) const { |
6778 | auto *VecTy = cast<FixedVectorType>(Val: BaseTy); |
6779 | |
6780 | auto isSupportedOnAVX512 = [&](Type *VecTy) { |
6781 | Type *EltTy = cast<VectorType>(Val: VecTy)->getElementType(); |
6782 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(Bitwidth: 64) || |
6783 | EltTy->isIntegerTy(Bitwidth: 32) || EltTy->isPointerTy()) |
6784 | return true; |
6785 | if (EltTy->isIntegerTy(Bitwidth: 16) || EltTy->isIntegerTy(Bitwidth: 8) || EltTy->isHalfTy()) |
6786 | return ST->hasBWI(); |
6787 | if (EltTy->isBFloatTy()) |
6788 | return ST->hasBF16(); |
6789 | return false; |
6790 | }; |
6791 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy)) |
6792 | return getInterleavedMemoryOpCostAVX512( |
6793 | Opcode, VecTy, Factor, Indices, Alignment, |
6794 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); |
6795 | |
6796 | if (UseMaskForCond || UseMaskForGaps) |
6797 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6798 | Alignment, AddressSpace, CostKind, |
6799 | UseMaskForCond, UseMaskForGaps); |
6800 | |
6801 | // Get estimation for interleaved load/store operations for SSE-AVX2. |
6802 | // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow |
6803 | // computing the cost using a generic formula as a function of generic |
6804 | // shuffles. We therefore use a lookup table instead, filled according to |
6805 | // the instruction sequences that codegen currently generates. |
6806 | |
6807 | // VecTy for interleave memop is <VF*Factor x Elt>. |
6808 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
6809 | // VecTy = <12 x i32>. |
6810 | MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second; |
6811 | |
6812 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case |
6813 | // the VF=2, while v2i128 is an unsupported MVT vector type |
6814 | // (see MachineValueType.h::getVectorVT()). |
6815 | if (!LegalVT.isVector()) |
6816 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6817 | Alignment, AddressSpace, CostKind); |
6818 | |
6819 | unsigned VF = VecTy->getNumElements() / Factor; |
6820 | Type *ScalarTy = VecTy->getElementType(); |
6821 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. |
6822 | if (!ScalarTy->isIntegerTy()) |
6823 | ScalarTy = |
6824 | Type::getIntNTy(C&: ScalarTy->getContext(), N: DL.getTypeSizeInBits(Ty: ScalarTy)); |
6825 | |
6826 | // Get the cost of all the memory operations. |
6827 | // FIXME: discount dead loads. |
6828 | InstructionCost MemOpCosts = |
6829 | getMemoryOpCost(Opcode, Src: VecTy, Alignment, AddressSpace, CostKind); |
6830 | |
6831 | auto *VT = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF); |
6832 | EVT ETy = TLI->getValueType(DL, Ty: VT); |
6833 | if (!ETy.isSimple()) |
6834 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6835 | Alignment, AddressSpace, CostKind); |
6836 | |
6837 | // TODO: Complete for other data-types and strides. |
6838 | // Each combination of Stride, element bit width and VF results in a different |
6839 | // sequence; The cost tables are therefore accessed with: |
6840 | // Factor (stride) and VectorType=VFxiN. |
6841 | // The Cost accounts only for the shuffle sequence; |
6842 | // The cost of the loads/stores is accounted for separately. |
6843 | // |
6844 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { |
6845 | {.ISD: 2, .Type: MVT::v2i8, .Cost: 2}, // (load 4i8 and) deinterleave into 2 x 2i8 |
6846 | {.ISD: 2, .Type: MVT::v4i8, .Cost: 2}, // (load 8i8 and) deinterleave into 2 x 4i8 |
6847 | {.ISD: 2, .Type: MVT::v8i8, .Cost: 2}, // (load 16i8 and) deinterleave into 2 x 8i8 |
6848 | {.ISD: 2, .Type: MVT::v16i8, .Cost: 4}, // (load 32i8 and) deinterleave into 2 x 16i8 |
6849 | {.ISD: 2, .Type: MVT::v32i8, .Cost: 6}, // (load 64i8 and) deinterleave into 2 x 32i8 |
6850 | |
6851 | {.ISD: 2, .Type: MVT::v8i16, .Cost: 6}, // (load 16i16 and) deinterleave into 2 x 8i16 |
6852 | {.ISD: 2, .Type: MVT::v16i16, .Cost: 9}, // (load 32i16 and) deinterleave into 2 x 16i16 |
6853 | {.ISD: 2, .Type: MVT::v32i16, .Cost: 18}, // (load 64i16 and) deinterleave into 2 x 32i16 |
6854 | |
6855 | {.ISD: 2, .Type: MVT::v8i32, .Cost: 4}, // (load 16i32 and) deinterleave into 2 x 8i32 |
6856 | {.ISD: 2, .Type: MVT::v16i32, .Cost: 8}, // (load 32i32 and) deinterleave into 2 x 16i32 |
6857 | {.ISD: 2, .Type: MVT::v32i32, .Cost: 16}, // (load 64i32 and) deinterleave into 2 x 32i32 |
6858 | |
6859 | {.ISD: 2, .Type: MVT::v4i64, .Cost: 4}, // (load 8i64 and) deinterleave into 2 x 4i64 |
6860 | {.ISD: 2, .Type: MVT::v8i64, .Cost: 8}, // (load 16i64 and) deinterleave into 2 x 8i64 |
6861 | {.ISD: 2, .Type: MVT::v16i64, .Cost: 16}, // (load 32i64 and) deinterleave into 2 x 16i64 |
6862 | {.ISD: 2, .Type: MVT::v32i64, .Cost: 32}, // (load 64i64 and) deinterleave into 2 x 32i64 |
6863 | |
6864 | {.ISD: 3, .Type: MVT::v2i8, .Cost: 3}, // (load 6i8 and) deinterleave into 3 x 2i8 |
6865 | {.ISD: 3, .Type: MVT::v4i8, .Cost: 3}, // (load 12i8 and) deinterleave into 3 x 4i8 |
6866 | {.ISD: 3, .Type: MVT::v8i8, .Cost: 6}, // (load 24i8 and) deinterleave into 3 x 8i8 |
6867 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 11}, // (load 48i8 and) deinterleave into 3 x 16i8 |
6868 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, // (load 96i8 and) deinterleave into 3 x 32i8 |
6869 | |
6870 | {.ISD: 3, .Type: MVT::v2i16, .Cost: 5}, // (load 6i16 and) deinterleave into 3 x 2i16 |
6871 | {.ISD: 3, .Type: MVT::v4i16, .Cost: 7}, // (load 12i16 and) deinterleave into 3 x 4i16 |
6872 | {.ISD: 3, .Type: MVT::v8i16, .Cost: 9}, // (load 24i16 and) deinterleave into 3 x 8i16 |
6873 | {.ISD: 3, .Type: MVT::v16i16, .Cost: 28}, // (load 48i16 and) deinterleave into 3 x 16i16 |
6874 | {.ISD: 3, .Type: MVT::v32i16, .Cost: 56}, // (load 96i16 and) deinterleave into 3 x 32i16 |
6875 | |
6876 | {.ISD: 3, .Type: MVT::v2i32, .Cost: 3}, // (load 6i32 and) deinterleave into 3 x 2i32 |
6877 | {.ISD: 3, .Type: MVT::v4i32, .Cost: 3}, // (load 12i32 and) deinterleave into 3 x 4i32 |
6878 | {.ISD: 3, .Type: MVT::v8i32, .Cost: 7}, // (load 24i32 and) deinterleave into 3 x 8i32 |
6879 | {.ISD: 3, .Type: MVT::v16i32, .Cost: 14}, // (load 48i32 and) deinterleave into 3 x 16i32 |
6880 | {.ISD: 3, .Type: MVT::v32i32, .Cost: 32}, // (load 96i32 and) deinterleave into 3 x 32i32 |
6881 | |
6882 | {.ISD: 3, .Type: MVT::v2i64, .Cost: 1}, // (load 6i64 and) deinterleave into 3 x 2i64 |
6883 | {.ISD: 3, .Type: MVT::v4i64, .Cost: 5}, // (load 12i64 and) deinterleave into 3 x 4i64 |
6884 | {.ISD: 3, .Type: MVT::v8i64, .Cost: 10}, // (load 24i64 and) deinterleave into 3 x 8i64 |
6885 | {.ISD: 3, .Type: MVT::v16i64, .Cost: 20}, // (load 48i64 and) deinterleave into 3 x 16i64 |
6886 | |
6887 | {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // (load 8i8 and) deinterleave into 4 x 2i8 |
6888 | {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // (load 16i8 and) deinterleave into 4 x 4i8 |
6889 | {.ISD: 4, .Type: MVT::v8i8, .Cost: 12}, // (load 32i8 and) deinterleave into 4 x 8i8 |
6890 | {.ISD: 4, .Type: MVT::v16i8, .Cost: 24}, // (load 64i8 and) deinterleave into 4 x 16i8 |
6891 | {.ISD: 4, .Type: MVT::v32i8, .Cost: 56}, // (load 128i8 and) deinterleave into 4 x 32i8 |
6892 | |
6893 | {.ISD: 4, .Type: MVT::v2i16, .Cost: 6}, // (load 8i16 and) deinterleave into 4 x 2i16 |
6894 | {.ISD: 4, .Type: MVT::v4i16, .Cost: 17}, // (load 16i16 and) deinterleave into 4 x 4i16 |
6895 | {.ISD: 4, .Type: MVT::v8i16, .Cost: 33}, // (load 32i16 and) deinterleave into 4 x 8i16 |
6896 | {.ISD: 4, .Type: MVT::v16i16, .Cost: 75}, // (load 64i16 and) deinterleave into 4 x 16i16 |
6897 | {.ISD: 4, .Type: MVT::v32i16, .Cost: 150}, // (load 128i16 and) deinterleave into 4 x 32i16 |
6898 | |
6899 | {.ISD: 4, .Type: MVT::v2i32, .Cost: 4}, // (load 8i32 and) deinterleave into 4 x 2i32 |
6900 | {.ISD: 4, .Type: MVT::v4i32, .Cost: 8}, // (load 16i32 and) deinterleave into 4 x 4i32 |
6901 | {.ISD: 4, .Type: MVT::v8i32, .Cost: 16}, // (load 32i32 and) deinterleave into 4 x 8i32 |
6902 | {.ISD: 4, .Type: MVT::v16i32, .Cost: 32}, // (load 64i32 and) deinterleave into 4 x 16i32 |
6903 | {.ISD: 4, .Type: MVT::v32i32, .Cost: 68}, // (load 128i32 and) deinterleave into 4 x 32i32 |
6904 | |
6905 | {.ISD: 4, .Type: MVT::v2i64, .Cost: 6}, // (load 8i64 and) deinterleave into 4 x 2i64 |
6906 | {.ISD: 4, .Type: MVT::v4i64, .Cost: 8}, // (load 16i64 and) deinterleave into 4 x 4i64 |
6907 | {.ISD: 4, .Type: MVT::v8i64, .Cost: 20}, // (load 32i64 and) deinterleave into 4 x 8i64 |
6908 | {.ISD: 4, .Type: MVT::v16i64, .Cost: 40}, // (load 64i64 and) deinterleave into 4 x 16i64 |
6909 | |
6910 | {.ISD: 6, .Type: MVT::v2i8, .Cost: 6}, // (load 12i8 and) deinterleave into 6 x 2i8 |
6911 | {.ISD: 6, .Type: MVT::v4i8, .Cost: 14}, // (load 24i8 and) deinterleave into 6 x 4i8 |
6912 | {.ISD: 6, .Type: MVT::v8i8, .Cost: 18}, // (load 48i8 and) deinterleave into 6 x 8i8 |
6913 | {.ISD: 6, .Type: MVT::v16i8, .Cost: 43}, // (load 96i8 and) deinterleave into 6 x 16i8 |
6914 | {.ISD: 6, .Type: MVT::v32i8, .Cost: 82}, // (load 192i8 and) deinterleave into 6 x 32i8 |
6915 | |
6916 | {.ISD: 6, .Type: MVT::v2i16, .Cost: 13}, // (load 12i16 and) deinterleave into 6 x 2i16 |
6917 | {.ISD: 6, .Type: MVT::v4i16, .Cost: 9}, // (load 24i16 and) deinterleave into 6 x 4i16 |
6918 | {.ISD: 6, .Type: MVT::v8i16, .Cost: 39}, // (load 48i16 and) deinterleave into 6 x 8i16 |
6919 | {.ISD: 6, .Type: MVT::v16i16, .Cost: 106}, // (load 96i16 and) deinterleave into 6 x 16i16 |
6920 | {.ISD: 6, .Type: MVT::v32i16, .Cost: 212}, // (load 192i16 and) deinterleave into 6 x 32i16 |
6921 | |
6922 | {.ISD: 6, .Type: MVT::v2i32, .Cost: 6}, // (load 12i32 and) deinterleave into 6 x 2i32 |
6923 | {.ISD: 6, .Type: MVT::v4i32, .Cost: 15}, // (load 24i32 and) deinterleave into 6 x 4i32 |
6924 | {.ISD: 6, .Type: MVT::v8i32, .Cost: 31}, // (load 48i32 and) deinterleave into 6 x 8i32 |
6925 | {.ISD: 6, .Type: MVT::v16i32, .Cost: 64}, // (load 96i32 and) deinterleave into 6 x 16i32 |
6926 | |
6927 | {.ISD: 6, .Type: MVT::v2i64, .Cost: 6}, // (load 12i64 and) deinterleave into 6 x 2i64 |
6928 | {.ISD: 6, .Type: MVT::v4i64, .Cost: 18}, // (load 24i64 and) deinterleave into 6 x 4i64 |
6929 | {.ISD: 6, .Type: MVT::v8i64, .Cost: 36}, // (load 48i64 and) deinterleave into 6 x 8i64 |
6930 | |
6931 | {.ISD: 8, .Type: MVT::v8i32, .Cost: 40} // (load 64i32 and) deinterleave into 8 x 8i32 |
6932 | }; |
6933 | |
6934 | static const CostTblEntry SSSE3InterleavedLoadTbl[] = { |
6935 | {.ISD: 2, .Type: MVT::v4i16, .Cost: 2}, // (load 8i16 and) deinterleave into 2 x 4i16 |
6936 | }; |
6937 | |
6938 | static const CostTblEntry SSE2InterleavedLoadTbl[] = { |
6939 | {.ISD: 2, .Type: MVT::v2i16, .Cost: 2}, // (load 4i16 and) deinterleave into 2 x 2i16 |
6940 | {.ISD: 2, .Type: MVT::v4i16, .Cost: 7}, // (load 8i16 and) deinterleave into 2 x 4i16 |
6941 | |
6942 | {.ISD: 2, .Type: MVT::v2i32, .Cost: 2}, // (load 4i32 and) deinterleave into 2 x 2i32 |
6943 | {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // (load 8i32 and) deinterleave into 2 x 4i32 |
6944 | |
6945 | {.ISD: 2, .Type: MVT::v2i64, .Cost: 2}, // (load 4i64 and) deinterleave into 2 x 2i64 |
6946 | }; |
6947 | |
6948 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { |
6949 | {.ISD: 2, .Type: MVT::v16i8, .Cost: 3}, // interleave 2 x 16i8 into 32i8 (and store) |
6950 | {.ISD: 2, .Type: MVT::v32i8, .Cost: 4}, // interleave 2 x 32i8 into 64i8 (and store) |
6951 | |
6952 | {.ISD: 2, .Type: MVT::v8i16, .Cost: 3}, // interleave 2 x 8i16 into 16i16 (and store) |
6953 | {.ISD: 2, .Type: MVT::v16i16, .Cost: 4}, // interleave 2 x 16i16 into 32i16 (and store) |
6954 | {.ISD: 2, .Type: MVT::v32i16, .Cost: 8}, // interleave 2 x 32i16 into 64i16 (and store) |
6955 | |
6956 | {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // interleave 2 x 4i32 into 8i32 (and store) |
6957 | {.ISD: 2, .Type: MVT::v8i32, .Cost: 4}, // interleave 2 x 8i32 into 16i32 (and store) |
6958 | {.ISD: 2, .Type: MVT::v16i32, .Cost: 8}, // interleave 2 x 16i32 into 32i32 (and store) |
6959 | {.ISD: 2, .Type: MVT::v32i32, .Cost: 16}, // interleave 2 x 32i32 into 64i32 (and store) |
6960 | |
6961 | {.ISD: 2, .Type: MVT::v2i64, .Cost: 2}, // interleave 2 x 2i64 into 4i64 (and store) |
6962 | {.ISD: 2, .Type: MVT::v4i64, .Cost: 4}, // interleave 2 x 4i64 into 8i64 (and store) |
6963 | {.ISD: 2, .Type: MVT::v8i64, .Cost: 8}, // interleave 2 x 8i64 into 16i64 (and store) |
6964 | {.ISD: 2, .Type: MVT::v16i64, .Cost: 16}, // interleave 2 x 16i64 into 32i64 (and store) |
6965 | {.ISD: 2, .Type: MVT::v32i64, .Cost: 32}, // interleave 2 x 32i64 into 64i64 (and store) |
6966 | |
6967 | {.ISD: 3, .Type: MVT::v2i8, .Cost: 4}, // interleave 3 x 2i8 into 6i8 (and store) |
6968 | {.ISD: 3, .Type: MVT::v4i8, .Cost: 4}, // interleave 3 x 4i8 into 12i8 (and store) |
6969 | {.ISD: 3, .Type: MVT::v8i8, .Cost: 6}, // interleave 3 x 8i8 into 24i8 (and store) |
6970 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 11}, // interleave 3 x 16i8 into 48i8 (and store) |
6971 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 13}, // interleave 3 x 32i8 into 96i8 (and store) |
6972 | |
6973 | {.ISD: 3, .Type: MVT::v2i16, .Cost: 4}, // interleave 3 x 2i16 into 6i16 (and store) |
6974 | {.ISD: 3, .Type: MVT::v4i16, .Cost: 6}, // interleave 3 x 4i16 into 12i16 (and store) |
6975 | {.ISD: 3, .Type: MVT::v8i16, .Cost: 12}, // interleave 3 x 8i16 into 24i16 (and store) |
6976 | {.ISD: 3, .Type: MVT::v16i16, .Cost: 27}, // interleave 3 x 16i16 into 48i16 (and store) |
6977 | {.ISD: 3, .Type: MVT::v32i16, .Cost: 54}, // interleave 3 x 32i16 into 96i16 (and store) |
6978 | |
6979 | {.ISD: 3, .Type: MVT::v2i32, .Cost: 4}, // interleave 3 x 2i32 into 6i32 (and store) |
6980 | {.ISD: 3, .Type: MVT::v4i32, .Cost: 5}, // interleave 3 x 4i32 into 12i32 (and store) |
6981 | {.ISD: 3, .Type: MVT::v8i32, .Cost: 11}, // interleave 3 x 8i32 into 24i32 (and store) |
6982 | {.ISD: 3, .Type: MVT::v16i32, .Cost: 22}, // interleave 3 x 16i32 into 48i32 (and store) |
6983 | {.ISD: 3, .Type: MVT::v32i32, .Cost: 48}, // interleave 3 x 32i32 into 96i32 (and store) |
6984 | |
6985 | {.ISD: 3, .Type: MVT::v2i64, .Cost: 4}, // interleave 3 x 2i64 into 6i64 (and store) |
6986 | {.ISD: 3, .Type: MVT::v4i64, .Cost: 6}, // interleave 3 x 4i64 into 12i64 (and store) |
6987 | {.ISD: 3, .Type: MVT::v8i64, .Cost: 12}, // interleave 3 x 8i64 into 24i64 (and store) |
6988 | {.ISD: 3, .Type: MVT::v16i64, .Cost: 24}, // interleave 3 x 16i64 into 48i64 (and store) |
6989 | |
6990 | {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // interleave 4 x 2i8 into 8i8 (and store) |
6991 | {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // interleave 4 x 4i8 into 16i8 (and store) |
6992 | {.ISD: 4, .Type: MVT::v8i8, .Cost: 4}, // interleave 4 x 8i8 into 32i8 (and store) |
6993 | {.ISD: 4, .Type: MVT::v16i8, .Cost: 8}, // interleave 4 x 16i8 into 64i8 (and store) |
6994 | {.ISD: 4, .Type: MVT::v32i8, .Cost: 12}, // interleave 4 x 32i8 into 128i8 (and store) |
6995 | |
6996 | {.ISD: 4, .Type: MVT::v2i16, .Cost: 2}, // interleave 4 x 2i16 into 8i16 (and store) |
6997 | {.ISD: 4, .Type: MVT::v4i16, .Cost: 6}, // interleave 4 x 4i16 into 16i16 (and store) |
6998 | {.ISD: 4, .Type: MVT::v8i16, .Cost: 10}, // interleave 4 x 8i16 into 32i16 (and store) |
6999 | {.ISD: 4, .Type: MVT::v16i16, .Cost: 32}, // interleave 4 x 16i16 into 64i16 (and store) |
7000 | {.ISD: 4, .Type: MVT::v32i16, .Cost: 64}, // interleave 4 x 32i16 into 128i16 (and store) |
7001 | |
7002 | {.ISD: 4, .Type: MVT::v2i32, .Cost: 5}, // interleave 4 x 2i32 into 8i32 (and store) |
7003 | {.ISD: 4, .Type: MVT::v4i32, .Cost: 6}, // interleave 4 x 4i32 into 16i32 (and store) |
7004 | {.ISD: 4, .Type: MVT::v8i32, .Cost: 16}, // interleave 4 x 8i32 into 32i32 (and store) |
7005 | {.ISD: 4, .Type: MVT::v16i32, .Cost: 32}, // interleave 4 x 16i32 into 64i32 (and store) |
7006 | {.ISD: 4, .Type: MVT::v32i32, .Cost: 64}, // interleave 4 x 32i32 into 128i32 (and store) |
7007 | |
7008 | {.ISD: 4, .Type: MVT::v2i64, .Cost: 6}, // interleave 4 x 2i64 into 8i64 (and store) |
7009 | {.ISD: 4, .Type: MVT::v4i64, .Cost: 8}, // interleave 4 x 4i64 into 16i64 (and store) |
7010 | {.ISD: 4, .Type: MVT::v8i64, .Cost: 20}, // interleave 4 x 8i64 into 32i64 (and store) |
7011 | {.ISD: 4, .Type: MVT::v16i64, .Cost: 40}, // interleave 4 x 16i64 into 64i64 (and store) |
7012 | |
7013 | {.ISD: 6, .Type: MVT::v2i8, .Cost: 7}, // interleave 6 x 2i8 into 12i8 (and store) |
7014 | {.ISD: 6, .Type: MVT::v4i8, .Cost: 9}, // interleave 6 x 4i8 into 24i8 (and store) |
7015 | {.ISD: 6, .Type: MVT::v8i8, .Cost: 16}, // interleave 6 x 8i8 into 48i8 (and store) |
7016 | {.ISD: 6, .Type: MVT::v16i8, .Cost: 27}, // interleave 6 x 16i8 into 96i8 (and store) |
7017 | {.ISD: 6, .Type: MVT::v32i8, .Cost: 90}, // interleave 6 x 32i8 into 192i8 (and store) |
7018 | |
7019 | {.ISD: 6, .Type: MVT::v2i16, .Cost: 10}, // interleave 6 x 2i16 into 12i16 (and store) |
7020 | {.ISD: 6, .Type: MVT::v4i16, .Cost: 15}, // interleave 6 x 4i16 into 24i16 (and store) |
7021 | {.ISD: 6, .Type: MVT::v8i16, .Cost: 21}, // interleave 6 x 8i16 into 48i16 (and store) |
7022 | {.ISD: 6, .Type: MVT::v16i16, .Cost: 58}, // interleave 6 x 16i16 into 96i16 (and store) |
7023 | {.ISD: 6, .Type: MVT::v32i16, .Cost: 90}, // interleave 6 x 32i16 into 192i16 (and store) |
7024 | |
7025 | {.ISD: 6, .Type: MVT::v2i32, .Cost: 9}, // interleave 6 x 2i32 into 12i32 (and store) |
7026 | {.ISD: 6, .Type: MVT::v4i32, .Cost: 12}, // interleave 6 x 4i32 into 24i32 (and store) |
7027 | {.ISD: 6, .Type: MVT::v8i32, .Cost: 33}, // interleave 6 x 8i32 into 48i32 (and store) |
7028 | {.ISD: 6, .Type: MVT::v16i32, .Cost: 66}, // interleave 6 x 16i32 into 96i32 (and store) |
7029 | |
7030 | {.ISD: 6, .Type: MVT::v2i64, .Cost: 8}, // interleave 6 x 2i64 into 12i64 (and store) |
7031 | {.ISD: 6, .Type: MVT::v4i64, .Cost: 15}, // interleave 6 x 4i64 into 24i64 (and store) |
7032 | {.ISD: 6, .Type: MVT::v8i64, .Cost: 30}, // interleave 6 x 8i64 into 48i64 (and store) |
7033 | }; |
7034 | |
7035 | static const CostTblEntry SSE2InterleavedStoreTbl[] = { |
7036 | {.ISD: 2, .Type: MVT::v2i8, .Cost: 1}, // interleave 2 x 2i8 into 4i8 (and store) |
7037 | {.ISD: 2, .Type: MVT::v4i8, .Cost: 1}, // interleave 2 x 4i8 into 8i8 (and store) |
7038 | {.ISD: 2, .Type: MVT::v8i8, .Cost: 1}, // interleave 2 x 8i8 into 16i8 (and store) |
7039 | |
7040 | {.ISD: 2, .Type: MVT::v2i16, .Cost: 1}, // interleave 2 x 2i16 into 4i16 (and store) |
7041 | {.ISD: 2, .Type: MVT::v4i16, .Cost: 1}, // interleave 2 x 4i16 into 8i16 (and store) |
7042 | |
7043 | {.ISD: 2, .Type: MVT::v2i32, .Cost: 1}, // interleave 2 x 2i32 into 4i32 (and store) |
7044 | }; |
7045 | |
7046 | if (Opcode == Instruction::Load) { |
7047 | auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), |
7048 | MemOpCosts](const CostTblEntry *Entry) { |
7049 | // NOTE: this is just an approximation! |
7050 | // It can over/under -estimate the cost! |
7051 | return MemOpCosts + divideCeil(Numerator: NumMembers * Entry->Cost, Denominator: Factor); |
7052 | }; |
7053 | |
7054 | if (ST->hasAVX2()) |
7055 | if (const auto *Entry = CostTableLookup(Table: AVX2InterleavedLoadTbl, ISD: Factor, |
7056 | Ty: ETy.getSimpleVT())) |
7057 | return GetDiscountedCost(Entry); |
7058 | |
7059 | if (ST->hasSSSE3()) |
7060 | if (const auto *Entry = CostTableLookup(Table: SSSE3InterleavedLoadTbl, ISD: Factor, |
7061 | Ty: ETy.getSimpleVT())) |
7062 | return GetDiscountedCost(Entry); |
7063 | |
7064 | if (ST->hasSSE2()) |
7065 | if (const auto *Entry = CostTableLookup(Table: SSE2InterleavedLoadTbl, ISD: Factor, |
7066 | Ty: ETy.getSimpleVT())) |
7067 | return GetDiscountedCost(Entry); |
7068 | } else { |
7069 | assert(Opcode == Instruction::Store && |
7070 | "Expected Store Instruction at this point" ); |
7071 | assert((!Indices.size() || Indices.size() == Factor) && |
7072 | "Interleaved store only supports fully-interleaved groups." ); |
7073 | if (ST->hasAVX2()) |
7074 | if (const auto *Entry = CostTableLookup(Table: AVX2InterleavedStoreTbl, ISD: Factor, |
7075 | Ty: ETy.getSimpleVT())) |
7076 | return MemOpCosts + Entry->Cost; |
7077 | |
7078 | if (ST->hasSSE2()) |
7079 | if (const auto *Entry = CostTableLookup(Table: SSE2InterleavedStoreTbl, ISD: Factor, |
7080 | Ty: ETy.getSimpleVT())) |
7081 | return MemOpCosts + Entry->Cost; |
7082 | } |
7083 | |
7084 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
7085 | Alignment, AddressSpace, CostKind, |
7086 | UseMaskForCond, UseMaskForGaps); |
7087 | } |
7088 | |
7089 | InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
7090 | StackOffset BaseOffset, |
7091 | bool HasBaseReg, int64_t Scale, |
7092 | unsigned AddrSpace) const { |
7093 | // Scaling factors are not free at all. |
7094 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), |
7095 | // will take 2 allocations in the out of order engine instead of 1 |
7096 | // for plain addressing mode, i.e. inst (reg1). |
7097 | // E.g., |
7098 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 |
7099 | // Requires two allocations (one for the load, one for the computation) |
7100 | // whereas: |
7101 | // vaddps (%rsi), %ymm0, %ymm1 |
7102 | // Requires just 1 allocation, i.e., freeing allocations for other operations |
7103 | // and having less micro operations to execute. |
7104 | // |
7105 | // For some X86 architectures, this is even worse because for instance for |
7106 | // stores, the complex addressing mode forces the instruction to use the |
7107 | // "load" ports instead of the dedicated "store" port. |
7108 | // E.g., on Haswell: |
7109 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. |
7110 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. |
7111 | TargetLoweringBase::AddrMode AM; |
7112 | AM.BaseGV = BaseGV; |
7113 | AM.BaseOffs = BaseOffset.getFixed(); |
7114 | AM.HasBaseReg = HasBaseReg; |
7115 | AM.Scale = Scale; |
7116 | AM.ScalableOffset = BaseOffset.getScalable(); |
7117 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
7118 | // Scale represents reg2 * scale, thus account for 1 |
7119 | // as soon as we use a second register. |
7120 | return AM.Scale != 0; |
7121 | return InstructionCost::getInvalid(); |
7122 | } |
7123 | |
7124 | InstructionCost X86TTIImpl::getBranchMispredictPenalty() const { |
7125 | // TODO: Hook MispredictPenalty of SchedMachineModel into this. |
7126 | return 14; |
7127 | } |
7128 | |
7129 | bool X86TTIImpl::isVectorShiftByScalarCheap(Type *Ty) const { |
7130 | unsigned Bits = Ty->getScalarSizeInBits(); |
7131 | |
7132 | // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. |
7133 | // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. |
7134 | if (ST->hasXOP() && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) |
7135 | return false; |
7136 | |
7137 | // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable |
7138 | // shifts just as cheap as scalar ones. |
7139 | if (ST->hasAVX2() && (Bits == 32 || Bits == 64)) |
7140 | return false; |
7141 | |
7142 | // AVX512BW has shifts such as vpsllvw. |
7143 | if (ST->hasBWI() && Bits == 16) |
7144 | return false; |
7145 | |
7146 | // Otherwise, it's significantly cheaper to shift by a scalar amount than by a |
7147 | // fully general vector. |
7148 | return true; |
7149 | } |
7150 | |
7151 | unsigned X86TTIImpl::getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, |
7152 | Type *ScalarValTy) const { |
7153 | if (ST->hasF16C() && ScalarMemTy->isHalfTy()) { |
7154 | return 4; |
7155 | } |
7156 | return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); |
7157 | } |
7158 | |
7159 | bool X86TTIImpl::isProfitableToSinkOperands(Instruction *I, |
7160 | SmallVectorImpl<Use *> &Ops) const { |
7161 | using namespace llvm::PatternMatch; |
7162 | |
7163 | FixedVectorType *VTy = dyn_cast<FixedVectorType>(Val: I->getType()); |
7164 | if (!VTy) |
7165 | return false; |
7166 | |
7167 | if (I->getOpcode() == Instruction::Mul && |
7168 | VTy->getElementType()->isIntegerTy(Bitwidth: 64)) { |
7169 | for (auto &Op : I->operands()) { |
7170 | // Make sure we are not already sinking this operand |
7171 | if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; })) |
7172 | continue; |
7173 | |
7174 | // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or |
7175 | // the PMULUDQ pattern where the input is a zext_inreg from vXi32. |
7176 | if (ST->hasSSE41() && |
7177 | match(V: Op.get(), P: m_AShr(L: m_Shl(L: m_Value(), R: m_SpecificInt(V: 32)), |
7178 | R: m_SpecificInt(V: 32)))) { |
7179 | Ops.push_back(Elt: &cast<Instruction>(Val&: Op)->getOperandUse(i: 0)); |
7180 | Ops.push_back(Elt: &Op); |
7181 | } else if (ST->hasSSE2() && |
7182 | match(V: Op.get(), |
7183 | P: m_And(L: m_Value(), R: m_SpecificInt(UINT64_C(0xffffffff))))) { |
7184 | Ops.push_back(Elt: &Op); |
7185 | } |
7186 | } |
7187 | |
7188 | return !Ops.empty(); |
7189 | } |
7190 | |
7191 | // A uniform shift amount in a vector shift or funnel shift may be much |
7192 | // cheaper than a generic variable vector shift, so make that pattern visible |
7193 | // to SDAG by sinking the shuffle instruction next to the shift. |
7194 | int ShiftAmountOpNum = -1; |
7195 | if (I->isShift()) |
7196 | ShiftAmountOpNum = 1; |
7197 | else if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) { |
7198 | if (II->getIntrinsicID() == Intrinsic::fshl || |
7199 | II->getIntrinsicID() == Intrinsic::fshr) |
7200 | ShiftAmountOpNum = 2; |
7201 | } |
7202 | |
7203 | if (ShiftAmountOpNum == -1) |
7204 | return false; |
7205 | |
7206 | auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: I->getOperand(i: ShiftAmountOpNum)); |
7207 | if (Shuf && getSplatIndex(Mask: Shuf->getShuffleMask()) >= 0 && |
7208 | isVectorShiftByScalarCheap(Ty: I->getType())) { |
7209 | Ops.push_back(Elt: &I->getOperandUse(i: ShiftAmountOpNum)); |
7210 | return true; |
7211 | } |
7212 | |
7213 | return false; |
7214 | } |
7215 | |