1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This file implements a TargetTransformInfo analysis pass specific to the |
10 | /// X86 target machine. It uses the target's detailed information to provide |
11 | /// more precise answers to certain TTI queries, while letting the target |
12 | /// independent and default TTI implementations handle the rest. |
13 | /// |
14 | //===----------------------------------------------------------------------===// |
15 | /// About Cost Model numbers used below it's necessary to say the following: |
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of a |
17 | /// specific CPU model. Usually the numbers correspond to the CPU where the |
18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in |
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU |
20 | /// to support that feature level and thus has most likely the worst case cost, |
21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). |
22 | /// |
23 | /// Some examples of other technologies/CPUs: |
24 | /// SSE 3 - Pentium4 / Athlon64 |
25 | /// SSE 4.1 - Penryn |
26 | /// SSE 4.2 - Nehalem / Silvermont |
27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer |
28 | /// AVX2 - Haswell / Ryzen |
29 | /// AVX-512 - Xeon Phi / Skylake |
30 | /// |
31 | /// And some examples of instruction target dependent costs (latency) |
32 | /// divss sqrtss rsqrtss |
33 | /// AMD K7 11-16 19 3 |
34 | /// Piledriver 9-24 13-15 5 |
35 | /// Jaguar 14 16 2 |
36 | /// Pentium II,III 18 30 2 |
37 | /// Nehalem 7-14 7-18 3 |
38 | /// Haswell 10-13 11 5 |
39 | /// |
40 | /// Interpreting the 4 TargetCostKind types: |
41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case |
42 | /// values reported by the CPU scheduler models (and llvm-mca). |
43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the |
44 | /// actual encoding size of the instruction. |
45 | /// TCK_SizeAndLatency should match the worst case micro-op counts reported |
46 | /// by the CPU scheduler models (and llvm-mca), to ensure that they are |
47 | /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are |
48 | /// often used as the cost thresholds where TCK_SizeAndLatency is requested. |
49 | //===----------------------------------------------------------------------===// |
50 | |
51 | #include "X86TargetTransformInfo.h" |
52 | #include "llvm/Analysis/TargetTransformInfo.h" |
53 | #include "llvm/CodeGen/BasicTTIImpl.h" |
54 | #include "llvm/CodeGen/CostTable.h" |
55 | #include "llvm/CodeGen/TargetLowering.h" |
56 | #include "llvm/IR/InstIterator.h" |
57 | #include "llvm/IR/IntrinsicInst.h" |
58 | #include "llvm/Support/Debug.h" |
59 | #include <optional> |
60 | |
61 | using namespace llvm; |
62 | |
63 | #define DEBUG_TYPE "x86tti" |
64 | |
65 | //===----------------------------------------------------------------------===// |
66 | // |
67 | // X86 cost model. |
68 | // |
69 | //===----------------------------------------------------------------------===// |
70 | |
71 | // Helper struct to store/access costs for each cost kind. |
72 | // TODO: Move this to allow other targets to use it? |
73 | struct CostKindCosts { |
74 | unsigned RecipThroughputCost = ~0U; |
75 | unsigned LatencyCost = ~0U; |
76 | unsigned CodeSizeCost = ~0U; |
77 | unsigned SizeAndLatencyCost = ~0U; |
78 | |
79 | std::optional<unsigned> |
80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { |
81 | unsigned Cost = ~0U; |
82 | switch (Kind) { |
83 | case TargetTransformInfo::TCK_RecipThroughput: |
84 | Cost = RecipThroughputCost; |
85 | break; |
86 | case TargetTransformInfo::TCK_Latency: |
87 | Cost = LatencyCost; |
88 | break; |
89 | case TargetTransformInfo::TCK_CodeSize: |
90 | Cost = CodeSizeCost; |
91 | break; |
92 | case TargetTransformInfo::TCK_SizeAndLatency: |
93 | Cost = SizeAndLatencyCost; |
94 | break; |
95 | } |
96 | if (Cost == ~0U) |
97 | return std::nullopt; |
98 | return Cost; |
99 | } |
100 | }; |
101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; |
102 | using TypeConversionCostKindTblEntry = TypeConversionCostTblEntryT<CostKindCosts>; |
103 | |
104 | TargetTransformInfo::PopcntSupportKind |
105 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { |
106 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
107 | // TODO: Currently the __builtin_popcount() implementation using SSE3 |
108 | // instructions is inefficient. Once the problem is fixed, we should |
109 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). |
110 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; |
111 | } |
112 | |
113 | std::optional<unsigned> X86TTIImpl::getCacheSize( |
114 | TargetTransformInfo::CacheLevel Level) const { |
115 | switch (Level) { |
116 | case TargetTransformInfo::CacheLevel::L1D: |
117 | // - Penryn |
118 | // - Nehalem |
119 | // - Westmere |
120 | // - Sandy Bridge |
121 | // - Ivy Bridge |
122 | // - Haswell |
123 | // - Broadwell |
124 | // - Skylake |
125 | // - Kabylake |
126 | return 32 * 1024; // 32 KByte |
127 | case TargetTransformInfo::CacheLevel::L2D: |
128 | // - Penryn |
129 | // - Nehalem |
130 | // - Westmere |
131 | // - Sandy Bridge |
132 | // - Ivy Bridge |
133 | // - Haswell |
134 | // - Broadwell |
135 | // - Skylake |
136 | // - Kabylake |
137 | return 256 * 1024; // 256 KByte |
138 | } |
139 | |
140 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
141 | } |
142 | |
143 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( |
144 | TargetTransformInfo::CacheLevel Level) const { |
145 | // - Penryn |
146 | // - Nehalem |
147 | // - Westmere |
148 | // - Sandy Bridge |
149 | // - Ivy Bridge |
150 | // - Haswell |
151 | // - Broadwell |
152 | // - Skylake |
153 | // - Kabylake |
154 | switch (Level) { |
155 | case TargetTransformInfo::CacheLevel::L1D: |
156 | [[fallthrough]]; |
157 | case TargetTransformInfo::CacheLevel::L2D: |
158 | return 8; |
159 | } |
160 | |
161 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
162 | } |
163 | |
164 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
165 | bool Vector = (ClassID == 1); |
166 | if (Vector && !ST->hasSSE1()) |
167 | return 0; |
168 | |
169 | if (ST->is64Bit()) { |
170 | if (Vector && ST->hasAVX512()) |
171 | return 32; |
172 | if (!Vector && ST->hasEGPR()) |
173 | return 32; |
174 | return 16; |
175 | } |
176 | return 8; |
177 | } |
178 | |
179 | bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const { |
180 | if (!ST->hasCF()) |
181 | return false; |
182 | if (!Ty) |
183 | return true; |
184 | // Conditional faulting is supported by CFCMOV, which only accepts |
185 | // 16/32/64-bit operands. |
186 | // TODO: Support f32/f64 with VMOVSS/VMOVSD with zero mask when it's |
187 | // profitable. |
188 | auto *VTy = dyn_cast<FixedVectorType>(Val: Ty); |
189 | if (!Ty->isIntegerTy() && (!VTy || VTy->getNumElements() != 1)) |
190 | return false; |
191 | auto *ScalarTy = Ty->getScalarType(); |
192 | switch (cast<IntegerType>(Val: ScalarTy)->getBitWidth()) { |
193 | default: |
194 | return false; |
195 | case 16: |
196 | case 32: |
197 | case 64: |
198 | return true; |
199 | } |
200 | } |
201 | |
202 | TypeSize |
203 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
204 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); |
205 | switch (K) { |
206 | case TargetTransformInfo::RGK_Scalar: |
207 | return TypeSize::getFixed(ExactSize: ST->is64Bit() ? 64 : 32); |
208 | case TargetTransformInfo::RGK_FixedWidthVector: |
209 | if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) |
210 | return TypeSize::getFixed(ExactSize: 512); |
211 | if (ST->hasAVX() && PreferVectorWidth >= 256) |
212 | return TypeSize::getFixed(ExactSize: 256); |
213 | if (ST->hasSSE1() && PreferVectorWidth >= 128) |
214 | return TypeSize::getFixed(ExactSize: 128); |
215 | return TypeSize::getFixed(ExactSize: 0); |
216 | case TargetTransformInfo::RGK_ScalableVector: |
217 | return TypeSize::getScalable(MinimumSize: 0); |
218 | } |
219 | |
220 | llvm_unreachable("Unsupported register kind" ); |
221 | } |
222 | |
223 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { |
224 | return getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
225 | .getFixedValue(); |
226 | } |
227 | |
228 | unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
229 | // If the loop will not be vectorized, don't interleave the loop. |
230 | // Let regular unroll to unroll the loop, which saves the overflow |
231 | // check and memory check cost. |
232 | if (VF.isScalar()) |
233 | return 1; |
234 | |
235 | if (ST->isAtom()) |
236 | return 1; |
237 | |
238 | // Sandybridge and Haswell have multiple execution ports and pipelined |
239 | // vector units. |
240 | if (ST->hasAVX()) |
241 | return 4; |
242 | |
243 | return 2; |
244 | } |
245 | |
246 | InstructionCost X86TTIImpl::getArithmeticInstrCost( |
247 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
248 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
249 | ArrayRef<const Value *> Args, |
250 | const Instruction *CxtI) { |
251 | |
252 | // vXi8 multiplications are always promoted to vXi16. |
253 | // Sub-128-bit types can be extended/packed more efficiently. |
254 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && |
255 | Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) { |
256 | Type *WideVecTy = |
257 | VectorType::getExtendedElementVectorType(VTy: cast<VectorType>(Val: Ty)); |
258 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: Ty, |
259 | CCH: TargetTransformInfo::CastContextHint::None, |
260 | CostKind) + |
261 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: Ty, Src: WideVecTy, |
262 | CCH: TargetTransformInfo::CastContextHint::None, |
263 | CostKind) + |
264 | getArithmeticInstrCost(Opcode, Ty: WideVecTy, CostKind, Op1Info, Op2Info); |
265 | } |
266 | |
267 | // Legalize the type. |
268 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
269 | |
270 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
271 | assert(ISD && "Invalid opcode" ); |
272 | |
273 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && |
274 | (LT.second.getScalarType() == MVT::i32 || |
275 | LT.second.getScalarType() == MVT::i64)) { |
276 | // Check if the operands can be represented as a smaller datatype. |
277 | bool Op1Signed = false, Op2Signed = false; |
278 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Val: Args[0], isSigned&: Op1Signed); |
279 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Val: Args[1], isSigned&: Op2Signed); |
280 | unsigned OpMinSize = std::max(a: Op1MinSize, b: Op2MinSize); |
281 | bool SignedMode = Op1Signed || Op2Signed; |
282 | |
283 | // If both vXi32 are representable as i15 and at least one is constant, |
284 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we |
285 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. |
286 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow() && |
287 | LT.second.getScalarType() == MVT::i32) { |
288 | bool Op1Constant = |
289 | isa<ConstantDataVector>(Val: Args[0]) || isa<ConstantVector>(Val: Args[0]); |
290 | bool Op2Constant = |
291 | isa<ConstantDataVector>(Val: Args[1]) || isa<ConstantVector>(Val: Args[1]); |
292 | bool Op1Sext = isa<SExtInst>(Val: Args[0]) && |
293 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); |
294 | bool Op2Sext = isa<SExtInst>(Val: Args[1]) && |
295 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); |
296 | |
297 | bool IsZeroExtended = !Op1Signed || !Op2Signed; |
298 | bool IsConstant = Op1Constant || Op2Constant; |
299 | bool IsSext = Op1Sext || Op2Sext; |
300 | if (IsConstant || IsZeroExtended || IsSext) |
301 | LT.second = |
302 | MVT::getVectorVT(VT: MVT::i16, NumElements: 2 * LT.second.getVectorNumElements()); |
303 | } |
304 | |
305 | // Check if the vXi32 operands can be shrunk into a smaller datatype. |
306 | // This should match the codegen from reduceVMULWidth. |
307 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). |
308 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { |
309 | if (OpMinSize <= 7) |
310 | return LT.first * 3; // pmullw/sext |
311 | if (!SignedMode && OpMinSize <= 8) |
312 | return LT.first * 3; // pmullw/zext |
313 | if (OpMinSize <= 15) |
314 | return LT.first * 5; // pmullw/pmulhw/pshuf |
315 | if (!SignedMode && OpMinSize <= 16) |
316 | return LT.first * 5; // pmullw/pmulhw/pshuf |
317 | } |
318 | |
319 | // If both vXi64 are representable as (unsigned) i32, then we can perform |
320 | // the multiple with a single PMULUDQ instruction. |
321 | // TODO: Add (SSE41+) PMULDQ handling for signed extensions. |
322 | if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64) |
323 | ISD = X86ISD::PMULUDQ; |
324 | } |
325 | |
326 | // Vector multiply by pow2 will be simplified to shifts. |
327 | // Vector multiply by -pow2 will be simplified to shifts/negates. |
328 | if (ISD == ISD::MUL && Op2Info.isConstant() && |
329 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { |
330 | InstructionCost Cost = |
331 | getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind, |
332 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
333 | if (Op2Info.isNegatedPowerOf2()) |
334 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind); |
335 | return Cost; |
336 | } |
337 | |
338 | // On X86, vector signed division by constants power-of-two are |
339 | // normally expanded to the sequence SRA + SRL + ADD + SRA. |
340 | // The OperandValue properties may not be the same as that of the previous |
341 | // operation; conservatively assume OP_None. |
342 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && |
343 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
344 | InstructionCost Cost = |
345 | 2 * getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
346 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
347 | Cost += getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
348 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
349 | Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
350 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
351 | |
352 | if (ISD == ISD::SREM) { |
353 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) |
354 | Cost += getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
355 | Op2Info: Op2Info.getNoProps()); |
356 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
357 | Op2Info: Op2Info.getNoProps()); |
358 | } |
359 | |
360 | return Cost; |
361 | } |
362 | |
363 | // Vector unsigned division/remainder will be simplified to shifts/masks. |
364 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && |
365 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
366 | if (ISD == ISD::UDIV) |
367 | return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
368 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
369 | // UREM |
370 | return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, |
371 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
372 | } |
373 | |
374 | static const CostKindTblEntry GFNIUniformConstCostTable[] = { |
375 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
376 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
377 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
378 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
379 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
380 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
381 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
382 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
383 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
384 | }; |
385 | |
386 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI()) |
387 | if (const auto *Entry = |
388 | CostTableLookup(Table: GFNIUniformConstCostTable, ISD, Ty: LT.second)) |
389 | if (auto KindCost = Entry->Cost[CostKind]) |
390 | return LT.first * *KindCost; |
391 | |
392 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { |
393 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
394 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
395 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw, pand, pxor, psubb. |
396 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
397 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
398 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw, pand, pxor, psubb. |
399 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
400 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
401 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
402 | |
403 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
404 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
405 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
406 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
407 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
408 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
409 | }; |
410 | |
411 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) |
412 | if (const auto *Entry = |
413 | CostTableLookup(Table: AVX512BWUniformConstCostTable, ISD, Ty: LT.second)) |
414 | if (auto KindCost = Entry->Cost[CostKind]) |
415 | return LT.first * *KindCost; |
416 | |
417 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { |
418 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 12, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psllw + pand. |
419 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 12, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw + pand. |
420 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 12, .SizeAndLatencyCost: 12 } }, // psrlw, pand, pxor, psubb. |
421 | |
422 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psllw + split. |
423 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psrlw + split. |
424 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // psraw + split. |
425 | |
426 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
427 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
428 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
429 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
430 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
431 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
432 | |
433 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
434 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
435 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
436 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
437 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
438 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
439 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraq |
440 | |
441 | { .ISD: ISD::SDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
442 | { .ISD: ISD::SREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
443 | { .ISD: ISD::UDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
444 | { .ISD: ISD::UREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
445 | }; |
446 | |
447 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) |
448 | if (const auto *Entry = |
449 | CostTableLookup(Table: AVX512UniformConstCostTable, ISD, Ty: LT.second)) |
450 | if (auto KindCost = Entry->Cost[CostKind]) |
451 | return LT.first * *KindCost; |
452 | |
453 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { |
454 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
455 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
456 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
457 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // psllw + pand. |
458 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // psrlw + pand. |
459 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // psrlw, pand, pxor, psubb. |
460 | |
461 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw |
462 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw |
463 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw |
464 | { .ISD: ISD::SHL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw |
465 | { .ISD: ISD::SRL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw |
466 | { .ISD: ISD::SRA, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw |
467 | |
468 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
469 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld |
470 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad |
471 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
472 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld |
473 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad |
474 | |
475 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq |
476 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq |
477 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // psrad + shuffle. |
478 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
479 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
480 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // psrad + shuffle + split. |
481 | |
482 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
483 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
484 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
485 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
486 | }; |
487 | |
488 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) |
489 | if (const auto *Entry = |
490 | CostTableLookup(Table: AVX2UniformConstCostTable, ISD, Ty: LT.second)) |
491 | if (auto KindCost = Entry->Cost[CostKind]) |
492 | return LT.first * *KindCost; |
493 | |
494 | static const CostKindTblEntry AVXUniformConstCostTable[] = { |
495 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
496 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
497 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
498 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2*(psllw + pand) + split. |
499 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 8 } }, // 2*(psrlw + pand) + split. |
500 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 12, .SizeAndLatencyCost: 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. |
501 | |
502 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw. |
503 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw. |
504 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw. |
505 | { .ISD: ISD::SHL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psllw + split. |
506 | { .ISD: ISD::SRL, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrlw + split. |
507 | { .ISD: ISD::SRA, .Type: MVT::v16i16,.Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psraw + split. |
508 | |
509 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld. |
510 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld. |
511 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad. |
512 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // pslld + split. |
513 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrld + split. |
514 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // psrad + split. |
515 | |
516 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq. |
517 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq. |
518 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // psrad + shuffle. |
519 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // 2 x psllq + split. |
520 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // 2 x psrlq + split.
521 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } }, // 2 x psrad + shuffle + split. |
522 | |
523 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmuludq sequence + split. |
524 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmuludq+mul+sub sequence + split. |
525 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 12 } }, // 2*pmuludq sequence + split. |
526 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 16 } }, // 2*pmuludq+mul+sub sequence + split. |
527 | }; |
528 | |
529 | // XOP has faster vXi8 shifts. |
530 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && |
531 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
532 | if (const auto *Entry = |
533 | CostTableLookup(Table: AVXUniformConstCostTable, ISD, Ty: LT.second)) |
534 | if (auto KindCost = Entry->Cost[CostKind]) |
535 | return LT.first * *KindCost; |
536 | |
537 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { |
538 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw + pand. |
539 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw + pand. |
540 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psrlw, pand, pxor, psubb. |
541 | |
542 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllw. |
543 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlw. |
544 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psraw. |
545 | |
546 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pslld |
547 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrld. |
548 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrad. |
549 | |
550 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psllq. |
551 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psrlq. |
552 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, // 2 x psrad + shuffle. |
553 | |
554 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6 } }, // pmuludq sequence |
555 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 8 } }, // pmuludq+mul+sub sequence |
556 | { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5 } }, // pmuludq sequence |
557 | { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7 } }, // pmuludq+mul+sub sequence |
558 | }; |
559 | |
560 | // XOP has faster vXi8 shifts. |
561 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && |
562 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
563 | if (const auto *Entry = |
564 | CostTableLookup(Table: SSE2UniformConstCostTable, ISD, Ty: LT.second)) |
565 | if (auto KindCost = Entry->Cost[CostKind]) |
566 | return LT.first * *KindCost; |
567 | |
568 | static const CostKindTblEntry AVX512BWConstCostTable[] = { |
569 | { .ISD: ISD::SDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
570 | { .ISD: ISD::SREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
571 | { .ISD: ISD::UDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
572 | { .ISD: ISD::UREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
573 | |
574 | { .ISD: ISD::SDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhw sequence |
575 | { .ISD: ISD::SREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhw+mul+sub sequence |
576 | { .ISD: ISD::UDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhuw sequence |
577 | { .ISD: ISD::UREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhuw+mul+sub sequence |
578 | }; |
579 | |
580 | if (Op2Info.isConstant() && ST->hasBWI()) |
581 | if (const auto *Entry = |
582 | CostTableLookup(Table: AVX512BWConstCostTable, ISD, Ty: LT.second)) |
583 | if (auto KindCost = Entry->Cost[CostKind]) |
584 | return LT.first * *KindCost; |
585 | |
586 | static const CostKindTblEntry AVX512ConstCostTable[] = { |
587 | { .ISD: ISD::SDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 28 } }, // 4*ext+4*pmulhw sequence |
588 | { .ISD: ISD::SREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
589 | { .ISD: ISD::UDIV, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 28 } }, // 4*ext+4*pmulhw sequence |
590 | { .ISD: ISD::UREM, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
591 | |
592 | { .ISD: ISD::SDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 12 } }, // 2*vpmulhw sequence |
593 | { .ISD: ISD::SREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 16 } }, // 2*vpmulhw+mul+sub sequence |
594 | { .ISD: ISD::UDIV, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 12 } }, // 2*vpmulhuw sequence |
595 | { .ISD: ISD::UREM, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 16 } }, // 2*vpmulhuw+mul+sub sequence |
596 | |
597 | { .ISD: ISD::SDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuldq sequence |
598 | { .ISD: ISD::SREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 17 } }, // vpmuldq+mul+sub sequence |
599 | { .ISD: ISD::UDIV, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
600 | { .ISD: ISD::UREM, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 17 } }, // vpmuludq+mul+sub sequence |
601 | }; |
602 | |
603 | if (Op2Info.isConstant() && ST->hasAVX512()) |
604 | if (const auto *Entry = |
605 | CostTableLookup(Table: AVX512ConstCostTable, ISD, Ty: LT.second)) |
606 | if (auto KindCost = Entry->Cost[CostKind]) |
607 | return LT.first * *KindCost; |
608 | |
609 | static const CostKindTblEntry AVX2ConstCostTable[] = { |
610 | { .ISD: ISD::SDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
611 | { .ISD: ISD::SREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
612 | { .ISD: ISD::UDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
613 | { .ISD: ISD::UREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
614 | |
615 | { .ISD: ISD::SDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhw sequence |
616 | { .ISD: ISD::SREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhw+mul+sub sequence |
617 | { .ISD: ISD::UDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6 } }, // vpmulhuw sequence |
618 | { .ISD: ISD::UREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8 } }, // vpmulhuw+mul+sub sequence |
619 | |
620 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuldq sequence |
621 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 19 } }, // vpmuldq+mul+sub sequence |
622 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
623 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 19 } }, // vpmuludq+mul+sub sequence |
624 | }; |
625 | |
626 | if (Op2Info.isConstant() && ST->hasAVX2()) |
627 | if (const auto *Entry = CostTableLookup(Table: AVX2ConstCostTable, ISD, Ty: LT.second)) |
628 | if (auto KindCost = Entry->Cost[CostKind]) |
629 | return LT.first * *KindCost; |
630 | |
631 | static const CostKindTblEntry AVXConstCostTable[] = { |
632 | { .ISD: ISD::SDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 30 } }, // 4*ext+4*pmulhw sequence + split. |
633 | { .ISD: ISD::SREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
634 | { .ISD: ISD::UDIV, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 30 } }, // 4*ext+4*pmulhw sequence + split. |
635 | { .ISD: ISD::UREM, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
636 | |
637 | { .ISD: ISD::SDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmulhw sequence + split. |
638 | { .ISD: ISD::SREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmulhw+mul+sub sequence + split. |
639 | { .ISD: ISD::UDIV, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 14 } }, // 2*pmulhuw sequence + split. |
640 | { .ISD: ISD::UREM, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18 } }, // 2*pmulhuw+mul+sub sequence + split. |
641 | |
642 | { .ISD: ISD::SDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 32 } }, // vpmuludq sequence |
643 | { .ISD: ISD::SREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 38 } }, // vpmuludq+mul+sub sequence |
644 | { .ISD: ISD::UDIV, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 32 } }, // 2*pmuludq sequence + split. |
645 | { .ISD: ISD::UREM, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 42 } }, // 2*pmuludq+mul+sub sequence + split. |
646 | }; |
647 | |
648 | if (Op2Info.isConstant() && ST->hasAVX()) |
649 | if (const auto *Entry = CostTableLookup(Table: AVXConstCostTable, ISD, Ty: LT.second)) |
650 | if (auto KindCost = Entry->Cost[CostKind]) |
651 | return LT.first * *KindCost; |
652 | |
653 | static const CostKindTblEntry SSE41ConstCostTable[] = { |
654 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15 } }, // vpmuludq sequence |
655 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20 } }, // vpmuludq+mul+sub sequence |
656 | }; |
657 | |
658 | if (Op2Info.isConstant() && ST->hasSSE41()) |
659 | if (const auto *Entry = |
660 | CostTableLookup(Table: SSE41ConstCostTable, ISD, Ty: LT.second)) |
661 | if (auto KindCost = Entry->Cost[CostKind]) |
662 | return LT.first * *KindCost; |
663 | |
664 | static const CostKindTblEntry SSE2ConstCostTable[] = { |
665 | { .ISD: ISD::SDIV, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
666 | { .ISD: ISD::SREM, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
667 | { .ISD: ISD::UDIV, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14 } }, // 2*ext+2*pmulhw sequence |
668 | { .ISD: ISD::UREM, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
669 | |
670 | { .ISD: ISD::SDIV, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6 } }, // pmulhw sequence |
671 | { .ISD: ISD::SREM, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8 } }, // pmulhw+mul+sub sequence |
672 | { .ISD: ISD::UDIV, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6 } }, // pmulhuw sequence |
673 | { .ISD: ISD::UREM, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8 } }, // pmulhuw+mul+sub sequence |
674 | |
675 | { .ISD: ISD::SDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 19 } }, // pmuludq sequence |
676 | { .ISD: ISD::SREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 24 } }, // pmuludq+mul+sub sequence |
677 | { .ISD: ISD::UDIV, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15 } }, // pmuludq sequence |
678 | { .ISD: ISD::UREM, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 20 } }, // pmuludq+mul+sub sequence |
679 | }; |
680 | |
681 | if (Op2Info.isConstant() && ST->hasSSE2()) |
682 | if (const auto *Entry = CostTableLookup(Table: SSE2ConstCostTable, ISD, Ty: LT.second)) |
683 | if (auto KindCost = Entry->Cost[CostKind]) |
684 | return LT.first * *KindCost; |
685 | |
686 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { |
687 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + pand. |
688 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
689 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4,.LatencyCost: 12, .CodeSizeCost: 8,.SizeAndLatencyCost: 12 } }, // psrlw, pand, pxor, psubb. |
690 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
691 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
692 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
693 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
694 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // psrlw + pand. |
695 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 15 } }, // psrlw, pand, pxor, psubb. |
696 | |
697 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw |
698 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw |
699 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psraw
700 | }; |
701 | |
702 | if (ST->hasBWI() && Op2Info.isUniform()) |
703 | if (const auto *Entry = |
704 | CostTableLookup(Table: AVX512BWUniformCostTable, ISD, Ty: LT.second)) |
705 | if (auto KindCost = Entry->Cost[CostKind]) |
706 | return LT.first * *KindCost; |
707 | |
708 | static const CostKindTblEntry AVX512UniformCostTable[] = { |
709 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + split. |
710 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrlw + split. |
711 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 5,.LatencyCost: 10, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psraw + split. |
712 | |
713 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // pslld |
714 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrld |
715 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrad |
716 | |
717 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
718 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
719 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
720 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
721 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
722 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
723 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraq |
724 | }; |
725 | |
726 | if (ST->hasAVX512() && Op2Info.isUniform()) |
727 | if (const auto *Entry = |
728 | CostTableLookup(Table: AVX512UniformCostTable, ISD, Ty: LT.second)) |
729 | if (auto KindCost = Entry->Cost[CostKind]) |
730 | return LT.first * *KindCost; |
731 | |
732 | static const CostKindTblEntry AVX2UniformCostTable[] = { |
733 | // Uniform splats are cheaper for the following instructions. |
734 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + pand. |
735 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
736 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
737 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
738 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
739 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // psrlw, pand, pxor, psubb. |
740 | |
741 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
742 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
743 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
744 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psllw. |
745 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrlw. |
746 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psraw. |
747 | |
748 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
749 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld |
750 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad |
751 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // pslld |
752 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrld |
753 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, // psrad |
754 | |
755 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
756 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
757 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2 x psrad + shuffle. |
758 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq |
759 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq |
760 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // 2 x psrad + shuffle. |
761 | }; |
762 | |
763 | if (ST->hasAVX2() && Op2Info.isUniform()) |
764 | if (const auto *Entry = |
765 | CostTableLookup(Table: AVX2UniformCostTable, ISD, Ty: LT.second)) |
766 | if (auto KindCost = Entry->Cost[CostKind]) |
767 | return LT.first * *KindCost; |
768 | |
769 | static const CostKindTblEntry AVXUniformCostTable[] = { |
770 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } }, // psllw + pand. |
771 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } }, // psrlw + pand. |
772 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 6, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // psrlw, pand, pxor, psubb. |
773 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 8,.CodeSizeCost: 11,.SizeAndLatencyCost: 14 } }, // psllw + pand + split. |
774 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 9,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // psrlw + pand + split. |
775 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 10,.LatencyCost: 11,.CodeSizeCost: 16,.SizeAndLatencyCost: 21 } }, // psrlw, pand, pxor, psubb + split. |
776 | |
777 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
778 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
779 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
780 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psllw + split. |
781 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrlw + split. |
782 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psraw + split. |
783 | |
784 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld. |
785 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld. |
786 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad. |
787 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // pslld + split. |
788 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrld + split. |
789 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // psrad + split. |
790 | |
791 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq. |
792 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq. |
793 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2 x psrad + shuffle. |
794 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psllq + split. |
795 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // psrlq + split. |
796 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 10,.SizeAndLatencyCost: 13 } }, // 2 x (2 x psrad + shuffle) + split. |
797 | }; |
798 | |
799 | // XOP has faster vXi8 shifts. |
800 | if (ST->hasAVX() && Op2Info.isUniform() && |
801 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
802 | if (const auto *Entry = |
803 | CostTableLookup(Table: AVXUniformCostTable, ISD, Ty: LT.second)) |
804 | if (auto KindCost = Entry->Cost[CostKind]) |
805 | return LT.first * *KindCost; |
806 | |
807 | static const CostKindTblEntry SSE2UniformCostTable[] = { |
808 | // Uniform splats are cheaper for the following instructions. |
809 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } }, // psllw + pand. |
810 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // psrlw + pand. |
811 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 15, .CodeSizeCost: 9,.SizeAndLatencyCost: 13 } }, // pcmpgtb sequence. |
812 | |
813 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllw. |
814 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlw. |
815 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psraw. |
816 | |
817 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pslld |
818 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrld. |
819 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrad. |
820 | |
821 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psllq. |
822 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psrlq. |
823 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 9, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // 2*psrlq + xor + sub. |
824 | }; |
825 | |
826 | if (ST->hasSSE2() && Op2Info.isUniform() && |
827 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
828 | if (const auto *Entry = |
829 | CostTableLookup(Table: SSE2UniformCostTable, ISD, Ty: LT.second)) |
830 | if (auto KindCost = Entry->Cost[CostKind]) |
831 | return LT.first * *KindCost; |
832 | |
833 | static const CostKindTblEntry AVX512DQCostTable[] = { |
834 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmullq |
835 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmullq |
836 | { .ISD: ISD::MUL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } } // pmullq |
837 | }; |
838 | |
839 | // Look for AVX512DQ lowering tricks for custom cases. |
840 | if (ST->hasDQI()) |
841 | if (const auto *Entry = CostTableLookup(Table: AVX512DQCostTable, ISD, Ty: LT.second)) |
842 | if (auto KindCost = Entry->Cost[CostKind]) |
843 | return LT.first * *KindCost; |
844 | |
845 | static const CostKindTblEntry AVX512BWCostTable[] = { |
846 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsllvw/pack sequence. |
847 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsrlvw/pack sequence. |
848 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/vpsravw/pack sequence. |
849 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 23,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // extend/vpsllvw/pack sequence. |
850 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 30,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // extend/vpsrlvw/pack sequence. |
851 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13,.CodeSizeCost: 24,.SizeAndLatencyCost: 30 } }, // extend/vpsravw/pack sequence. |
852 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 19,.CodeSizeCost: 13,.SizeAndLatencyCost: 15 } }, // extend/vpsllvw/pack sequence. |
853 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 27,.CodeSizeCost: 15,.SizeAndLatencyCost: 18 } }, // extend/vpsrlvw/pack sequence. |
854 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 15,.CodeSizeCost: 30,.SizeAndLatencyCost: 30 } }, // extend/vpsravw/pack sequence. |
855 | |
856 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw |
857 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw |
858 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw |
859 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw |
860 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw |
861 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw |
862 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvw |
863 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvw |
864 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsravw |
865 | |
866 | { .ISD: ISD::ADD, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddb |
867 | { .ISD: ISD::ADD, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddw |
868 | |
869 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddb |
870 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddw |
871 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddd |
872 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddq |
873 | |
874 | { .ISD: ISD::SUB, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubb |
875 | { .ISD: ISD::SUB, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubw |
876 | |
877 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 12, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // extend/pmullw/trunc |
878 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // pmaddubsw |
879 | { .ISD: ISD::MUL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 7,.SizeAndLatencyCost: 10 } }, // pmaddubsw |
880 | { .ISD: ISD::MUL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw |
881 | |
882 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubb |
883 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubw |
884 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubd |
885 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubq |
886 | }; |
887 | |
888 | // Look for AVX512BW lowering tricks for custom cases. |
889 | if (ST->hasBWI()) |
890 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTable, ISD, Ty: LT.second)) |
891 | if (auto KindCost = Entry->Cost[CostKind]) |
892 | return LT.first * *KindCost; |
893 | |
894 | static const CostKindTblEntry AVX512CostTable[] = { |
895 | { .ISD: ISD::SHL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19,.CodeSizeCost: 27,.SizeAndLatencyCost: 33 } }, // vpblendv+split sequence. |
896 | { .ISD: ISD::SRL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 19,.CodeSizeCost: 30,.SizeAndLatencyCost: 36 } }, // vpblendv+split sequence. |
897 | { .ISD: ISD::SRA, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 37, .LatencyCost: 37,.CodeSizeCost: 51,.SizeAndLatencyCost: 63 } }, // vpblendv+split sequence. |
898 | |
899 | { .ISD: ISD::SHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsrlvd/pack sequence. |
900 | { .ISD: ISD::SRL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsrlvd/pack sequence. |
901 | { .ISD: ISD::SRA, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // 2*extend/vpsravd/pack sequence. |
902 | |
903 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
904 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
905 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
906 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
907 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
908 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
909 | { .ISD: ISD::SHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
910 | { .ISD: ISD::SRL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
911 | { .ISD: ISD::SRA, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
912 | |
913 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
914 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
915 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
916 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
917 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
918 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
919 | { .ISD: ISD::SHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
920 | { .ISD: ISD::SRL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
921 | { .ISD: ISD::SRA, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
922 | |
923 | { .ISD: ISD::ADD, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*paddb + split |
924 | { .ISD: ISD::ADD, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*paddw + split |
925 | |
926 | { .ISD: ISD::SUB, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*psubb + split |
927 | { .ISD: ISD::SUB, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // 2*psubw + split |
928 | |
929 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
930 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
931 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
932 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
933 | |
934 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
935 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
936 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
937 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
938 | |
939 | { .ISD: ISD::XOR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
940 | { .ISD: ISD::XOR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
941 | { .ISD: ISD::XOR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
942 | { .ISD: ISD::XOR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
943 | |
944 | { .ISD: ISD::MUL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org) |
945 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org) |
946 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld (Skylake from agner.org) |
947 | { .ISD: ISD::MUL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // 3*pmuludq/3*shift/2*add |
948 | { .ISD: ISD::MUL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Skylake from http://www.agner.org/ |
949 | |
950 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
951 | |
952 | { .ISD: ISD::FNEG, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Skylake from http://www.agner.org/ |
953 | { .ISD: ISD::FADD, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
954 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
955 | { .ISD: ISD::FSUB, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
956 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
957 | { .ISD: ISD::FMUL, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
958 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
959 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
960 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
961 | |
962 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
963 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
964 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
965 | { .ISD: ISD::FDIV, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 23, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
966 | |
967 | { .ISD: ISD::FNEG, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Skylake from http://www.agner.org/ |
968 | { .ISD: ISD::FADD, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
969 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
970 | { .ISD: ISD::FSUB, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
971 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
972 | { .ISD: ISD::FMUL, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
973 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
974 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
975 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
976 | |
977 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
978 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
979 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
980 | { .ISD: ISD::FDIV, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
981 | }; |
982 | |
983 | if (ST->hasAVX512()) |
984 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTable, ISD, Ty: LT.second)) |
985 | if (auto KindCost = Entry->Cost[CostKind]) |
986 | return LT.first * *KindCost; |
987 | |
988 | static const CostKindTblEntry AVX2ShiftCostTable[] = { |
// Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
// custom so we can detect the cases where the shift amount is a scalar one.
991 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsllvd (Haswell from agner.org) |
992 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsrlvd (Haswell from agner.org) |
993 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsravd (Haswell from agner.org) |
994 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsllvd (Haswell from agner.org) |
995 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsrlvd (Haswell from agner.org) |
996 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vpsravd (Haswell from agner.org) |
997 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllvq (Haswell from agner.org) |
998 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsrlvq (Haswell from agner.org) |
999 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vpsllvq (Haswell from agner.org) |
1000 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vpsrlvq (Haswell from agner.org) |
1001 | }; |
1002 | |
1003 | if (ST->hasAVX512()) { |
1004 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) |
1005 | // On AVX512, a packed v32i16 shift left by a constant build_vector |
1006 | // is lowered into a vector multiply (vpmullw). |
1007 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
1008 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1009 | } |
1010 | |
1011 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). |
1012 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { |
1013 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && |
1014 | Op2Info.isConstant()) |
1015 | // On AVX2, a packed v16i16 shift left by a constant build_vector |
1016 | // is lowered into a vector multiply (vpmullw). |
1017 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
1018 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1019 | |
1020 | if (const auto *Entry = CostTableLookup(Table: AVX2ShiftCostTable, ISD, Ty: LT.second)) |
1021 | if (auto KindCost = Entry->Cost[CostKind]) |
1022 | return LT.first * *KindCost; |
1023 | } |
1024 | |
1025 | static const CostKindTblEntry XOPShiftCostTable[] = { |
1026 | // 128bit shifts take 1cy, but right shifts require negation beforehand. |
1027 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1028 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1029 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1030 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1031 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1032 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1033 | { .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1034 | { .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1035 | { .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1036 | { .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1037 | { .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1038 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1039 | // 256bit shifts require splitting if AVX2 didn't catch them above. |
1040 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1041 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1042 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1043 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1044 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1045 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1046 | { .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1047 | { .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1048 | { .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1049 | { .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1050 | { .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1051 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, |
1052 | }; |
1053 | |
1054 | // Look for XOP lowering tricks. |
1055 | if (ST->hasXOP()) { |
1056 | // If the right shift is constant then we'll fold the negation so |
1057 | // it's as cheap as a left shift. |
1058 | int ShiftISD = ISD; |
1059 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) |
1060 | ShiftISD = ISD::SHL; |
1061 | if (const auto *Entry = |
1062 | CostTableLookup(Table: XOPShiftCostTable, ISD: ShiftISD, Ty: LT.second)) |
1063 | if (auto KindCost = Entry->Cost[CostKind]) |
1064 | return LT.first * *KindCost; |
1065 | } |
1066 | |
1067 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { |
1068 | MVT VT = LT.second; |
1069 | // Vector shift left by non uniform constant can be lowered |
1070 | // into vector multiply. |
1071 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || |
1072 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) |
1073 | ISD = ISD::MUL; |
1074 | } |
1075 | |
1076 | static const CostKindTblEntry GLMCostTable[] = { |
1077 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 19, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divss |
1078 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 35, .LatencyCost: 36, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divps |
1079 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 33, .LatencyCost: 34, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divsd |
1080 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 65, .LatencyCost: 66, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divpd |
1081 | }; |
1082 | |
1083 | if (ST->useGLMDivSqrtCosts()) |
1084 | if (const auto *Entry = CostTableLookup(Table: GLMCostTable, ISD, Ty: LT.second)) |
1085 | if (auto KindCost = Entry->Cost[CostKind]) |
1086 | return LT.first * *KindCost; |
1087 | |
1088 | static const CostKindTblEntry SLMCostTable[] = { |
1089 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 7 } }, // pmulld |
1090 | { .ISD: ISD::MUL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw |
1091 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulsd |
1092 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulss |
1093 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulpd |
1094 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // mulps |
1095 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 19, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divss |
1096 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 39, .LatencyCost: 39, .CodeSizeCost: 1, .SizeAndLatencyCost: 6 } }, // divps |
1097 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 34, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // divsd |
1098 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 69, .LatencyCost: 69, .CodeSizeCost: 1, .SizeAndLatencyCost: 6 } }, // divpd |
1099 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // addpd |
1100 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // subpd |
// v2i64/v4i64 mul is custom lowered as a sequence of long
// multiplies (3), shifts (3) and adds (2).
// slm pmuludq throughput is 2 and paddq throughput is 4,
// thus: 3X2 (pmuludq throughput) + 3X1 (shift throughput) +
// 2X4 (paddq throughput) = 17
1106 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 22, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
// slm addq/subq throughput is 4
1108 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
1109 | { .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
1110 | }; |
1111 | |
1112 | if (ST->useSLMArithCosts()) |
1113 | if (const auto *Entry = CostTableLookup(Table: SLMCostTable, ISD, Ty: LT.second)) |
1114 | if (auto KindCost = Entry->Cost[CostKind]) |
1115 | return LT.first * *KindCost; |
1116 | |
1117 | static const CostKindTblEntry AVX2CostTable[] = { |
1118 | { .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 21,.CodeSizeCost: 11,.SizeAndLatencyCost: 16 } }, // vpblendvb sequence. |
1119 | { .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 23,.CodeSizeCost: 11,.SizeAndLatencyCost: 22 } }, // vpblendvb sequence. |
1120 | { .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsrlvd/pack sequence. |
1121 | { .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsrlvd/pack sequence. |
1122 | |
1123 | { .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 27,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // vpblendvb sequence. |
1124 | { .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 30,.CodeSizeCost: 12,.SizeAndLatencyCost: 24 } }, // vpblendvb sequence. |
1125 | { .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsrlvd/pack sequence. |
1126 | { .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsrlvd/pack sequence. |
1127 | |
1128 | { .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 17,.CodeSizeCost: 24,.SizeAndLatencyCost: 30 } }, // vpblendvb sequence. |
1129 | { .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 20,.CodeSizeCost: 24,.SizeAndLatencyCost: 43 } }, // vpblendvb sequence. |
1130 | { .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 5,.SizeAndLatencyCost: 10 } }, // extend/vpsravd/pack sequence. |
1131 | { .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // extend/vpsravd/pack sequence. |
1132 | { .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // srl/xor/sub sequence. |
1133 | { .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 9 } }, // srl/xor/sub sequence. |
1134 | |
1135 | { .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubb |
1136 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddb |
1137 | { .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubw |
1138 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddw |
1139 | { .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubd |
1140 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddd |
1141 | { .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubq |
1142 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddq |
1143 | |
1144 | { .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18, .CodeSizeCost: 6,.SizeAndLatencyCost: 12 } }, // extend/pmullw/pack |
1145 | { .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 8,.SizeAndLatencyCost: 16 } }, // pmaddubsw |
1146 | { .ISD: ISD::MUL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmullw |
1147 | { .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld |
1148 | { .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pmulld |
1149 | { .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 8,.SizeAndLatencyCost: 13 } }, // 3*pmuludq/3*shift/2*add |
1150 | { .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, // 3*pmuludq/3*shift/2*add |
1151 | |
1152 | { .ISD: X86ISD::PMULUDQ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1153 | |
1154 | { .ISD: ISD::FNEG, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorpd |
1155 | { .ISD: ISD::FNEG, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps |
1156 | |
1157 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddsd |
1158 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddss |
1159 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddpd |
1160 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vaddps |
1161 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vaddpd |
1162 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vaddps |
1163 | |
1164 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubsd |
1165 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubss |
1166 | { .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubpd |
1167 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsubps |
1168 | { .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vsubpd |
1169 | { .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vsubps |
1170 | |
1171 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulsd |
1172 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulss |
1173 | { .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulpd |
1174 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vmulps |
1175 | { .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vmulpd |
1176 | { .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vmulps |
1177 | |
1178 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivss |
1179 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 13, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivps |
1180 | { .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vdivps |
1181 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivsd |
1182 | { .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vdivpd |
1183 | { .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vdivpd |
1184 | }; |
1185 | |
// Look for AVX2 lowering tricks for custom cases.
// Scale the per-op cost by LT.first (the number of ops after type
// legalization). If the matched entry has no cost recorded for the
// requested cost kind, fall through to the less specific tables below.
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(Table: AVX2CostTable, ISD, Ty: LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
// Costs when only AVX1 is available. 256-bit integer ops have no native
// support here, so many entries model a 128-bit op pair plus extract/insert
// (the "+ split" comments below). Each cost tuple is
// { reciprocal throughput, latency, code size, size+latency }.
static const CostKindTblEntry AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
{ .ISD: ISD::MUL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 11, .CodeSizeCost: 18, .SizeAndLatencyCost: 19 } }, // pmaddubsw + split
{ .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, // 2*pmaddubsw/3*and/psllw/or
{ .ISD: ISD::MUL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // pmullw + split
{ .ISD: ISD::MUL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, // pmulld + split
{ .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pmulld
{ .ISD: ISD::MUL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 15, .CodeSizeCost: 19, .SizeAndLatencyCost: 20 } },

// Bitwise logic ops lower to a single (split) vandps/vorps/vxorps.
{ .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
{ .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
{ .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps
{ .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vandps

{ .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
{ .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
{ .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps
{ .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vorps

{ .ISD: ISD::XOR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
{ .ISD: ISD::XOR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
{ .ISD: ISD::XOR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps
{ .ISD: ISD::XOR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vxorps

// 256-bit integer add/sub must be split into two 128-bit halves.
{ .ISD: ISD::SUB, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubb + split
{ .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddb + split
{ .ISD: ISD::SUB, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubw + split
{ .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddw + split
{ .ISD: ISD::SUB, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubd + split
{ .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddd + split
{ .ISD: ISD::SUB, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // psubq + split
{ .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // paddq + split
{ .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // psubq
{ .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // paddq

// Variable shifts: no native per-element shift before AVX2, so these model
// blend-based or per-lane shift sequences (plus a split for 256-bit types).
{ .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 21,.CodeSizeCost: 11,.SizeAndLatencyCost: 17 } }, // pblendvb sequence.
{ .ISD: ISD::SHL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22,.CodeSizeCost: 27,.SizeAndLatencyCost: 40 } }, // pblendvb sequence + split.
{ .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9,.CodeSizeCost: 11,.SizeAndLatencyCost: 11 } }, // pblendvb sequence.
{ .ISD: ISD::SHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 24,.SizeAndLatencyCost: 25 } }, // pblendvb sequence + split.
{ .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // pslld/paddd/cvttps2dq/pmulld
{ .ISD: ISD::SHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 11,.CodeSizeCost: 12,.SizeAndLatencyCost: 17 } }, // pslld/paddd/cvttps2dq/pmulld + split
{ .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // Shift each lane + blend.
{ .ISD: ISD::SHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // Shift each lane + blend + split.

{ .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 27,.CodeSizeCost: 12,.SizeAndLatencyCost: 18 } }, // pblendvb sequence.
{ .ISD: ISD::SRL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 23, .LatencyCost: 23,.CodeSizeCost: 30,.SizeAndLatencyCost: 43 } }, // pblendvb sequence + split.
{ .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 14,.SizeAndLatencyCost: 22 } }, // pblendvb sequence.
{ .ISD: ISD::SRL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30,.CodeSizeCost: 31,.SizeAndLatencyCost: 48 } }, // pblendvb sequence + split.
{ .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // Shift each lane + blend.
{ .ISD: ISD::SRL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14,.CodeSizeCost: 26,.SizeAndLatencyCost: 34 } }, // Shift each lane + blend + split.
{ .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // Shift each lane + blend.
{ .ISD: ISD::SRL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 11,.SizeAndLatencyCost: 15 } }, // Shift each lane + blend + split.

{ .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 22,.CodeSizeCost: 24,.SizeAndLatencyCost: 36 } }, // pblendvb sequence.
{ .ISD: ISD::SRA, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 45,.CodeSizeCost: 51,.SizeAndLatencyCost: 76 } }, // pblendvb sequence + split.
{ .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 16,.CodeSizeCost: 14,.SizeAndLatencyCost: 22 } }, // pblendvb sequence.
{ .ISD: ISD::SRA, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30,.CodeSizeCost: 31,.SizeAndLatencyCost: 48 } }, // pblendvb sequence + split.
{ .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 7,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // Shift each lane + blend.
{ .ISD: ISD::SRA, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14,.CodeSizeCost: 26,.SizeAndLatencyCost: 34 } }, // Shift each lane + blend + split.
{ .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6,.CodeSizeCost: 10,.SizeAndLatencyCost: 14 } }, // Shift each lane + blend.
{ .ISD: ISD::SRA, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 22,.SizeAndLatencyCost: 30 } }, // Shift each lane + blend + split.

// FP costs below are taken from specific CPUs' measured numbers (see the
// per-entry agner.org references), used here as the generic AVX1 baseline.
{ .ISD: ISD::FNEG, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/
{ .ISD: ISD::FNEG, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/

{ .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/

{ .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BDVER2 from http://www.agner.org/

{ .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BTVER2 from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // BTVER2 from http://www.agner.org/

{ .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 29, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // SNB from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // SNB from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 44, .LatencyCost: 45, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // SNB from http://www.agner.org/
};

// If a matching AVX1 entry carries a cost for this cost kind, scale it by
// the legalization op count (LT.first); otherwise fall through.
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(Table: AVX1CostTable, ISD, Ty: LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
1292 | |
// SSE4.2-level costs (Nehalem-era baseline; see the per-entry agner.org
// references). Cost tuples are
// { reciprocal throughput, latency, code size, size+latency }.
static const CostKindTblEntry SSE42CostTable[] = {
{ .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/

{ .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/

{ .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/

{ .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 22, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/

{ .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 10 } } // 3*pmuludq/3*shift/2*add
};

// If a matching SSE4.2 entry carries a cost for this cost kind, scale it by
// the legalization op count (LT.first); otherwise fall through.
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(Table: SSE42CostTable, ISD, Ty: LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
1321 | |
// SSE4.1-level costs: variable shifts are modeled as blend/shift-per-lane
// sequences since there is no per-element vector shift. Cost tuples are
// { reciprocal throughput, latency, code size, size+latency }.
static const CostKindTblEntry SSE41CostTable[] = {
{ .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 24,.CodeSizeCost: 17,.SizeAndLatencyCost: 22 } }, // pblendvb sequence.
{ .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 14,.CodeSizeCost: 11,.SizeAndLatencyCost: 11 } }, // pblendvb sequence.
{ .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 20, .CodeSizeCost: 4,.SizeAndLatencyCost: 10 } }, // pslld/paddd/cvttps2dq/pmulld

{ .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 27,.CodeSizeCost: 18,.SizeAndLatencyCost: 24 } }, // pblendvb sequence.
{ .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 26,.CodeSizeCost: 23,.SizeAndLatencyCost: 27 } }, // pblendvb sequence.
{ .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 17,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend.
{ .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence.

{ .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 41,.CodeSizeCost: 30,.SizeAndLatencyCost: 36 } }, // pblendvb sequence.
{ .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 22, .LatencyCost: 26,.CodeSizeCost: 23,.SizeAndLatencyCost: 27 } }, // pblendvb sequence.
{ .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 17,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend.
{ .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 17, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence.

{ .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } } // pmulld (Nehalem from agner.org)
};

// If a matching SSE4.1 entry carries a cost for this cost kind, scale it by
// the legalization op count (LT.first); otherwise fall through.
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(Table: SSE41CostTable, ISD, Ty: LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
1344 | |
// SSSE3-level costs: only v16i8 multiply differs from the SSE2 baseline
// (pmaddubsw becomes available). Cost tuple is
// { reciprocal throughput, latency, code size, size+latency }.
static const CostKindTblEntry SSSE3CostTable[] = {
{ .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 18,.CodeSizeCost: 10,.SizeAndLatencyCost: 12 } }, // 2*pmaddubsw/3*and/psllw/or
};

// If the SSSE3 entry carries a cost for this cost kind, scale it by the
// legalization op count (LT.first); otherwise fall through.
if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(Table: SSSE3CostTable, ISD, Ty: LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
1353 | |
// SSE2 baseline costs (Pentium IV-era; see the per-entry agner.org
// references). Cost tuples are
// { reciprocal throughput, latency, code size, size+latency }.
static const CostKindTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
// Variable shifts: no per-element shift instruction, so costs model
// compare/blend or per-lane shift sequences.
{ .ISD: ISD::SHL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 21,.CodeSizeCost: 26,.SizeAndLatencyCost: 28 } }, // cmpgtb sequence.
{ .ISD: ISD::SHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 27,.CodeSizeCost: 16,.SizeAndLatencyCost: 20 } }, // cmpgtw sequence.
{ .ISD: ISD::SHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 19,.CodeSizeCost: 10,.SizeAndLatencyCost: 12 } }, // pslld/paddd/cvttps2dq/pmuludq.
{ .ISD: ISD::SHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence.

{ .ISD: ISD::SRL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 28,.CodeSizeCost: 27,.SizeAndLatencyCost: 30 } }, // cmpgtb sequence.
{ .ISD: ISD::SRL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19,.CodeSizeCost: 31,.SizeAndLatencyCost: 31 } }, // cmpgtw sequence.
{ .ISD: ISD::SRL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend.
{ .ISD: ISD::SRL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, // splat+shuffle sequence.

{ .ISD: ISD::SRA, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 30,.CodeSizeCost: 54,.SizeAndLatencyCost: 54 } }, // unpacked cmpgtb sequence.
{ .ISD: ISD::SRA, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 19,.CodeSizeCost: 31,.SizeAndLatencyCost: 31 } }, // cmpgtw sequence.
{ .ISD: ISD::SRA, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 12,.CodeSizeCost: 15,.SizeAndLatencyCost: 19 } }, // Shift each lane + blend.
{ .ISD: ISD::SRA, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 11,.CodeSizeCost: 12,.SizeAndLatencyCost: 16 } }, // srl/xor/sub splat+shuffle sequence.

{ .ISD: ISD::AND, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand
{ .ISD: ISD::AND, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand
{ .ISD: ISD::AND, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand
{ .ISD: ISD::AND, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pand

{ .ISD: ISD::OR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por
{ .ISD: ISD::OR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por
{ .ISD: ISD::OR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por
{ .ISD: ISD::OR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // por

{ .ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor
{ .ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor
{ .ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor
{ .ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pxor

{ .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // paddq
{ .ISD: ISD::SUB, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // psubq

{ .ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18,.CodeSizeCost: 12,.SizeAndLatencyCost: 12 } }, // 2*unpack/2*pmullw/2*and/pack
{ .ISD: ISD::MUL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pmullw
{ .ISD: ISD::MUL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // 3*pmuludq/4*shuffle
{ .ISD: ISD::MUL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 10,.CodeSizeCost: 10,.SizeAndLatencyCost: 10 } }, // 3*pmuludq/3*shift/2*add

{ .ISD: X86ISD::PMULUDQ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 23, .LatencyCost: 23, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 39, .LatencyCost: 39, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 38, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FDIV, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 69, .LatencyCost: 69, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/

{ .ISD: ISD::FNEG, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FNEG, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FNEG, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FNEG, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/

{ .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/

{ .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FSUB, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/

{ .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
{ .ISD: ISD::FMUL, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium IV from http://www.agner.org/
};

// If a matching SSE2 entry carries a cost for this cost kind, scale it by
// the legalization op count (LT.first); otherwise fall through.
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(Table: SSE2CostTable, ISD, Ty: LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
1423 | |
1424 | static const CostKindTblEntry SSE1CostTable[] = { |
1425 | { .ISD: ISD::FDIV, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1426 | { .ISD: ISD::FDIV, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 34, .LatencyCost: 48, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1427 | |
1428 | { .ISD: ISD::FNEG, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
1429 | { .ISD: ISD::FNEG, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
1430 | |
1431 | { .ISD: ISD::FADD, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1432 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1433 | |
1434 | { .ISD: ISD::FSUB, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1435 | { .ISD: ISD::FSUB, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1436 | |
1437 | { .ISD: ISD::FMUL, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1438 | { .ISD: ISD::FMUL, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Pentium III from http://www.agner.org/ |
1439 | }; |
1440 | |
1441 | if (ST->hasSSE1()) |
1442 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTable, ISD, Ty: LT.second)) |
1443 | if (auto KindCost = Entry->Cost[CostKind]) |
1444 | return LT.first * *KindCost; |
1445 | |
1446 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
1447 | { .ISD: ISD::ADD, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Core (Merom) from http://www.agner.org/ |
1448 | { .ISD: ISD::SUB, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, // Core (Merom) from http://www.agner.org/ |
1449 | { .ISD: ISD::MUL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
1450 | }; |
1451 | |
1452 | if (ST->is64Bit()) |
1453 | if (const auto *Entry = CostTableLookup(Table: X64CostTbl, ISD, Ty: LT.second)) |
1454 | if (auto KindCost = Entry->Cost[CostKind]) |
1455 | return LT.first * *KindCost; |
1456 | |
1457 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
1458 | { .ISD: ISD::ADD, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1459 | { .ISD: ISD::ADD, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1460 | { .ISD: ISD::ADD, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1461 | |
1462 | { .ISD: ISD::SUB, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1463 | { .ISD: ISD::SUB, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1464 | { .ISD: ISD::SUB, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, // Pentium III from http://www.agner.org/ |
1465 | |
1466 | { .ISD: ISD::MUL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1467 | { .ISD: ISD::MUL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1468 | { .ISD: ISD::MUL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
1469 | |
1470 | { .ISD: ISD::FNEG, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // (x87) |
1471 | { .ISD: ISD::FADD, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1472 | { .ISD: ISD::FSUB, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1473 | { .ISD: ISD::FMUL, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1474 | { .ISD: ISD::FDIV, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 38, .LatencyCost: 38, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // (x87) |
1475 | }; |
1476 | |
1477 | if (const auto *Entry = CostTableLookup(Table: X86CostTbl, ISD, Ty: LT.second)) |
1478 | if (auto KindCost = Entry->Cost[CostKind]) |
1479 | return LT.first * *KindCost; |
1480 | |
1481 | // It is not a good idea to vectorize division. We have to scalarize it and |
1482 | // in the process we will often end up having to spilling regular |
1483 | // registers. The overhead of division is going to dominate most kernels |
1484 | // anyways so try hard to prevent vectorization of division - it is |
1485 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able |
1486 | // to hide "20 cycles" for each lane. |
1487 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && |
1488 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || |
1489 | ISD == ISD::UREM)) { |
1490 | InstructionCost ScalarCost = |
1491 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind, |
1492 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1493 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; |
1494 | } |
1495 | |
1496 | // Handle some basic single instruction code size cases. |
1497 | if (CostKind == TTI::TCK_CodeSize) { |
1498 | switch (ISD) { |
1499 | case ISD::FADD: |
1500 | case ISD::FSUB: |
1501 | case ISD::FMUL: |
1502 | case ISD::FDIV: |
1503 | case ISD::FNEG: |
1504 | case ISD::AND: |
1505 | case ISD::OR: |
1506 | case ISD::XOR: |
1507 | return LT.first; |
1508 | break; |
1509 | } |
1510 | } |
1511 | |
1512 | // Fallback to the default implementation. |
1513 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1514 | Args, CxtI); |
1515 | } |
1516 | |
1517 | InstructionCost |
1518 | X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, |
1519 | unsigned Opcode1, const SmallBitVector &OpcodeMask, |
1520 | TTI::TargetCostKind CostKind) const { |
1521 | if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) |
1522 | return TTI::TCC_Basic; |
1523 | return InstructionCost::getInvalid(); |
1524 | } |
1525 | |
1526 | InstructionCost X86TTIImpl::getShuffleCost( |
1527 | TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask, |
1528 | TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, |
1529 | ArrayRef<const Value *> Args, const Instruction *CxtI) { |
1530 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. |
1531 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. |
1532 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: BaseTp); |
1533 | |
1534 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: BaseTp, Index, SubTy&: SubTp); |
1535 | |
1536 | // Recognize a basic concat_vector shuffle. |
1537 | if (Kind == TTI::SK_PermuteTwoSrc && |
1538 | Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) && |
1539 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size())) |
1540 | return getShuffleCost(Kind: TTI::SK_InsertSubvector, |
1541 | BaseTp: VectorType::getDoubleElementsVectorType(VTy: BaseTp), Mask, |
1542 | CostKind, Index: Mask.size() / 2, SubTp: BaseTp); |
1543 | |
1544 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. |
1545 | if (Kind == TTI::SK_Transpose) |
1546 | Kind = TTI::SK_PermuteTwoSrc; |
1547 | |
1548 | if (Kind == TTI::SK_Broadcast) { |
1549 | // For Broadcasts we are splatting the first element from the first input |
1550 | // register, so only need to reference that input and all the output |
1551 | // registers are the same. |
1552 | LT.first = 1; |
1553 | |
1554 | // If we're broadcasting a load then AVX/AVX2 can do this for free. |
1555 | using namespace PatternMatch; |
1556 | if (!Args.empty() && match(V: Args[0], P: m_OneUse(SubPattern: m_Load(Op: m_Value()))) && |
1557 | (ST->hasAVX2() || |
1558 | (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32))) |
1559 | return TTI::TCC_Free; |
1560 | } |
1561 | |
1562 | // Treat <X x bfloat> shuffles as <X x half>. |
1563 | if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16) |
1564 | LT.second = LT.second.changeVectorElementType(EltVT: MVT::f16); |
1565 | |
1566 | // Subvector extractions are free if they start at the beginning of a |
1567 | // vector and cheap if the subvectors are aligned. |
1568 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { |
1569 | int NumElts = LT.second.getVectorNumElements(); |
1570 | if ((Index % NumElts) == 0) |
1571 | return 0; |
1572 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
1573 | if (SubLT.second.isVector()) { |
1574 | int NumSubElts = SubLT.second.getVectorNumElements(); |
1575 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
1576 | return SubLT.first; |
1577 | // Handle some cases for widening legalization. For now we only handle |
1578 | // cases where the original subvector was naturally aligned and evenly |
1579 | // fit in its legalized subvector type. |
1580 | // FIXME: Remove some of the alignment restrictions. |
1581 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit |
1582 | // vectors. |
1583 | int OrigSubElts = cast<FixedVectorType>(Val: SubTp)->getNumElements(); |
1584 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && |
1585 | (NumSubElts % OrigSubElts) == 0 && |
1586 | LT.second.getVectorElementType() == |
1587 | SubLT.second.getVectorElementType() && |
1588 | LT.second.getVectorElementType().getSizeInBits() == |
1589 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { |
1590 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts && |
1591 | "Unexpected number of elements!" ); |
1592 | auto *VecTy = FixedVectorType::get(ElementType: BaseTp->getElementType(), |
1593 | NumElts: LT.second.getVectorNumElements()); |
1594 | auto *SubTy = FixedVectorType::get(ElementType: BaseTp->getElementType(), |
1595 | NumElts: SubLT.second.getVectorNumElements()); |
1596 | int = alignDown(Value: (Index % NumElts), Align: NumSubElts); |
1597 | InstructionCost = |
1598 | getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: VecTy, Mask: std::nullopt, |
1599 | CostKind, Index: ExtractIndex, SubTp: SubTy); |
1600 | |
1601 | // If the original size is 32-bits or more, we can use pshufd. Otherwise |
1602 | // if we have SSSE3 we can use pshufb. |
1603 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) |
1604 | return ExtractCost + 1; // pshufd or pshufb |
1605 | |
1606 | assert(SubTp->getPrimitiveSizeInBits() == 16 && |
1607 | "Unexpected vector size" ); |
1608 | |
1609 | return ExtractCost + 2; // worst case pshufhw + pshufd |
1610 | } |
1611 | } |
1612 | // If the extract subvector is not optimal, treat it as single op shuffle. |
1613 | Kind = TTI::SK_PermuteSingleSrc; |
1614 | } |
1615 | |
1616 | // Subvector insertions are cheap if the subvectors are aligned. |
1617 | // Note that in general, the insertion starting at the beginning of a vector |
1618 | // isn't free, because we need to preserve the rest of the wide vector. |
1619 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { |
1620 | int NumElts = LT.second.getVectorNumElements(); |
1621 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
1622 | if (SubLT.second.isVector()) { |
1623 | int NumSubElts = SubLT.second.getVectorNumElements(); |
1624 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
1625 | return SubLT.first; |
1626 | } |
1627 | |
1628 | // If the insertion isn't aligned, treat it like a 2-op shuffle. |
1629 | Kind = TTI::SK_PermuteTwoSrc; |
1630 | } |
1631 | |
1632 | // Handle some common (illegal) sub-vector types as they are often very cheap |
1633 | // to shuffle even on targets without PSHUFB. |
1634 | EVT VT = TLI->getValueType(DL, Ty: BaseTp); |
1635 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && |
1636 | !ST->hasSSSE3()) { |
1637 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { |
1638 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1}, // pshuflw |
1639 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i16, .Cost: 1}, // pshuflw |
1640 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 2}, // punpck/pshuflw |
1641 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i8, .Cost: 2}, // punpck/pshuflw |
1642 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i8, .Cost: 1}, // punpck |
1643 | |
1644 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // pshuflw |
1645 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i16, .Cost: 1}, // pshuflw |
1646 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i8, .Cost: 3}, // punpck/pshuflw/packus |
1647 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i8, .Cost: 1}, // punpck |
1648 | |
1649 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 2}, // punpck+psrldq |
1650 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i16, .Cost: 2}, // punpck+psrldq |
1651 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i8, .Cost: 2}, // punpck+psrldq |
1652 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i8, .Cost: 2}, // punpck+psrldq |
1653 | |
1654 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i16, .Cost: 2}, // punpck/pshuflw |
1655 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i16, .Cost: 2}, // punpck/pshuflw |
1656 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i8, .Cost: 7}, // punpck/pshuflw |
1657 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i8, .Cost: 4}, // punpck/pshuflw |
1658 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i8, .Cost: 2}, // punpck |
1659 | |
1660 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 1}, // pshuflw |
1661 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i16, .Cost: 1}, // pshuflw |
1662 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 5}, // punpck/pshuflw |
1663 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i8, .Cost: 3}, // punpck/pshuflw |
1664 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i8, .Cost: 1}, // punpck |
1665 | }; |
1666 | |
1667 | if (ST->hasSSE2()) |
1668 | if (const auto *Entry = |
1669 | CostTableLookup(Table: SSE2SubVectorShuffleTbl, ISD: Kind, Ty: VT.getSimpleVT())) |
1670 | return Entry->Cost; |
1671 | } |
1672 | |
1673 | // We are going to permute multiple sources and the result will be in multiple |
1674 | // destinations. Providing an accurate cost only for splits where the element |
1675 | // type remains the same. |
1676 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { |
1677 | MVT LegalVT = LT.second; |
1678 | if (LegalVT.isVector() && |
1679 | LegalVT.getVectorElementType().getSizeInBits() == |
1680 | BaseTp->getElementType()->getPrimitiveSizeInBits() && |
1681 | LegalVT.getVectorNumElements() < |
1682 | cast<FixedVectorType>(Val: BaseTp)->getNumElements()) { |
1683 | unsigned VecTySize = DL.getTypeStoreSize(Ty: BaseTp); |
1684 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
1685 | // Number of source vectors after legalization: |
1686 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
1687 | // Number of destination vectors after legalization: |
1688 | InstructionCost NumOfDests = LT.first; |
1689 | |
1690 | auto *SingleOpTy = FixedVectorType::get(ElementType: BaseTp->getElementType(), |
1691 | NumElts: LegalVT.getVectorNumElements()); |
1692 | |
1693 | if (!Mask.empty() && NumOfDests.isValid()) { |
1694 | // Try to perform better estimation of the permutation. |
1695 | // 1. Split the source/destination vectors into real registers. |
1696 | // 2. Do the mask analysis to identify which real registers are |
1697 | // permuted. If more than 1 source registers are used for the |
1698 | // destination register building, the cost for this destination register |
1699 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one |
1700 | // source register is used, build mask and calculate the cost as a cost |
1701 | // of PermuteSingleSrc. |
1702 | // Also, for the single register permute we try to identify if the |
1703 | // destination register is just a copy of the source register or the |
1704 | // copy of the previous destination register (the cost is |
1705 | // TTI::TCC_Basic). If the source register is just reused, the cost for |
1706 | // this operation is 0. |
1707 | NumOfDests = |
1708 | getTypeLegalizationCost( |
1709 | Ty: FixedVectorType::get(ElementType: BaseTp->getElementType(), NumElts: Mask.size())) |
1710 | .first; |
1711 | unsigned E = *NumOfDests.getValue(); |
1712 | unsigned NormalizedVF = |
1713 | LegalVT.getVectorNumElements() * std::max(a: NumOfSrcs, b: E); |
1714 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
1715 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
1716 | SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); |
1717 | copy(Range&: Mask, Out: NormalizedMask.begin()); |
1718 | unsigned PrevSrcReg = 0; |
1719 | ArrayRef<int> PrevRegMask; |
1720 | InstructionCost Cost = 0; |
1721 | processShuffleMasks( |
1722 | Mask: NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfUsedRegs: NumOfDestRegs, NoInputAction: []() {}, |
1723 | SingleInputAction: [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, |
1724 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { |
1725 | if (!ShuffleVectorInst::isIdentityMask(Mask: RegMask, NumSrcElts: RegMask.size())) { |
1726 | // Check if the previous register can be just copied to the next |
1727 | // one. |
1728 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || |
1729 | PrevRegMask != RegMask) |
1730 | Cost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: SingleOpTy, |
1731 | Mask: RegMask, CostKind, Index: 0, SubTp: nullptr); |
1732 | else |
1733 | // Just a copy of previous destination register. |
1734 | Cost += TTI::TCC_Basic; |
1735 | return; |
1736 | } |
1737 | if (SrcReg != DestReg && |
1738 | any_of(Range&: RegMask, P: [](int I) { return I != PoisonMaskElem; })) { |
1739 | // Just a copy of the source register. |
1740 | Cost += TTI::TCC_Basic; |
1741 | } |
1742 | PrevSrcReg = SrcReg; |
1743 | PrevRegMask = RegMask; |
1744 | }, |
1745 | ManyInputsAction: [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, |
1746 | unsigned /*Unused*/, |
1747 | unsigned /*Unused*/) { |
1748 | Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SingleOpTy, Mask: RegMask, |
1749 | CostKind, Index: 0, SubTp: nullptr); |
1750 | }); |
1751 | return Cost; |
1752 | } |
1753 | |
1754 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; |
1755 | return NumOfShuffles * getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SingleOpTy, |
1756 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
1757 | } |
1758 | |
1759 | return BaseT::getShuffleCost(Kind, Tp: BaseTp, Mask, CostKind, Index, SubTp); |
1760 | } |
1761 | |
1762 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. |
1763 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { |
1764 | // We assume that source and destination have the same vector type. |
1765 | InstructionCost NumOfDests = LT.first; |
1766 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; |
1767 | LT.first = NumOfDests * NumOfShufflesPerDest; |
1768 | } |
1769 | |
1770 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { |
1771 | {.ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: 1}, // vpermb |
1772 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: 1}, // vpermb |
1773 | |
1774 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: 1}, // vpermb |
1775 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: 1}, // vpermb |
1776 | |
1777 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: 2}, // vpermt2b |
1778 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: 2}, // vpermt2b |
1779 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: 2} // vpermt2b |
1780 | }; |
1781 | |
1782 | if (ST->hasVBMI()) |
1783 | if (const auto *Entry = |
1784 | CostTableLookup(Table: AVX512VBMIShuffleTbl, ISD: Kind, Ty: LT.second)) |
1785 | return LT.first * Entry->Cost; |
1786 | |
1787 | static const CostTblEntry AVX512BWShuffleTbl[] = { |
1788 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i16, .Cost: 1}, // vpbroadcastw |
1789 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32f16, .Cost: 1}, // vpbroadcastw |
1790 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v64i8, .Cost: 1}, // vpbroadcastb |
1791 | |
1792 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i16, .Cost: 2}, // vpermw |
1793 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32f16, .Cost: 2}, // vpermw |
1794 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: 2}, // vpermw |
1795 | {.ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: 2}, // pshufb + vshufi64x2 |
1796 | |
1797 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i16, .Cost: 2}, // vpermw |
1798 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32f16, .Cost: 2}, // vpermw |
1799 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: 2}, // vpermw |
1800 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: 2}, // vpermw |
1801 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: 8}, // extend to v32i16 |
1802 | |
1803 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i16, .Cost: 2}, // vpermt2w |
1804 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32f16, .Cost: 2}, // vpermt2w |
1805 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: 2}, // vpermt2w |
1806 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: 2}, // vpermt2w |
1807 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: 19}, // 6 * v32i8 + 1 |
1808 | |
1809 | {.ISD: TTI::SK_Select, .Type: MVT::v32i16, .Cost: 1}, // vblendmw |
1810 | {.ISD: TTI::SK_Select, .Type: MVT::v64i8, .Cost: 1}, // vblendmb |
1811 | |
1812 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i16, .Cost: 2}, // vshufi64x2 + palignr |
1813 | {.ISD: TTI::SK_Splice, .Type: MVT::v32f16, .Cost: 2}, // vshufi64x2 + palignr |
1814 | {.ISD: TTI::SK_Splice, .Type: MVT::v64i8, .Cost: 2}, // vshufi64x2 + palignr |
1815 | }; |
1816 | |
1817 | if (ST->hasBWI()) |
1818 | if (const auto *Entry = |
1819 | CostTableLookup(Table: AVX512BWShuffleTbl, ISD: Kind, Ty: LT.second)) |
1820 | return LT.first * Entry->Cost; |
1821 | |
1822 | static const CostKindTblEntry AVX512ShuffleTbl[] = { |
1823 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastsd |
1824 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vbroadcastss |
1825 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastq |
1826 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastd |
1827 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1828 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastw |
1829 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpbroadcastb |
1830 | |
1831 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1832 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1833 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1834 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1835 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
1836 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
1837 | {.ISD: TTI::SK_Reverse, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // per mca |
1838 | |
1839 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1840 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1841 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1842 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1843 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1844 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1845 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1846 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpalignd |
1847 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
1848 | {.ISD: TTI::SK_Splice, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
1849 | {.ISD: TTI::SK_Splice, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, // split + palignr |
1850 | |
1851 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1852 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1853 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermpd |
1854 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1855 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1856 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermps |
1857 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1858 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1859 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermq |
1860 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1861 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1862 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermd |
1863 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // pshufb |
1864 | |
1865 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
1866 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
1867 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
1868 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
1869 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
1870 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
1871 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
1872 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
1873 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2pd |
1874 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2ps |
1875 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2q |
1876 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpermt2d |
1877 | |
1878 | // FIXME: This just applies the type legalization cost rules above |
1879 | // assuming these completely split. |
1880 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
1881 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
1882 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } }, |
1883 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
1884 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
1885 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 42, .SizeAndLatencyCost: 42 } }, |
1886 | |
1887 | {.ISD: TTI::SK_Select, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
1888 | {.ISD: TTI::SK_Select, .Type: MVT::v32f16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
1889 | {.ISD: TTI::SK_Select, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq |
1890 | {.ISD: TTI::SK_Select, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmpd |
1891 | {.ISD: TTI::SK_Select, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmps |
1892 | {.ISD: TTI::SK_Select, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmq |
1893 | {.ISD: TTI::SK_Select, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vblendmd |
1894 | }; |
1895 | |
1896 | if (ST->hasAVX512()) |
1897 | if (const auto *Entry = CostTableLookup(Table: AVX512ShuffleTbl, ISD: Kind, Ty: LT.second)) |
1898 | if (auto KindCost = Entry->Cost[CostKind]) |
1899 | return LT.first * *KindCost; |
1900 | |
1901 | static const CostTblEntry AVX2ShuffleTbl[] = { |
1902 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f64, .Cost: 1}, // vbroadcastpd |
1903 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f32, .Cost: 1}, // vbroadcastps |
1904 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i64, .Cost: 1}, // vpbroadcastq |
1905 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i32, .Cost: 1}, // vpbroadcastd |
1906 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i16, .Cost: 1}, // vpbroadcastw |
1907 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f16, .Cost: 1}, // vpbroadcastw |
1908 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i8, .Cost: 1}, // vpbroadcastb |
1909 | |
1910 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f64, .Cost: 1}, // vpermpd |
1911 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f32, .Cost: 1}, // vpermps |
1912 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i64, .Cost: 1}, // vpermq |
1913 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i32, .Cost: 1}, // vpermd |
1914 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: 2}, // vperm2i128 + pshufb |
1915 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f16, .Cost: 2}, // vperm2i128 + pshufb |
1916 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: 2}, // vperm2i128 + pshufb |
1917 | |
1918 | {.ISD: TTI::SK_Select, .Type: MVT::v16i16, .Cost: 1}, // vpblendvb |
1919 | {.ISD: TTI::SK_Select, .Type: MVT::v16f16, .Cost: 1}, // vpblendvb |
1920 | {.ISD: TTI::SK_Select, .Type: MVT::v32i8, .Cost: 1}, // vpblendvb |
1921 | |
1922 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: 2}, // vperm2i128 + vpalignr |
1923 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: 2}, // vperm2i128 + vpalignr |
1924 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i16, .Cost: 2}, // vperm2i128 + vpalignr |
1925 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f16, .Cost: 2}, // vperm2i128 + vpalignr |
1926 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i8, .Cost: 2}, // vperm2i128 + vpalignr |
1927 | |
1928 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: 1}, // vpermpd |
1929 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: 1}, // vpermps |
1930 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: 1}, // vpermq |
1931 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: 1}, // vpermd |
1932 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: 4}, // vperm2i128 + 2*vpshufb |
1933 | // + vpblendvb |
1934 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: 4}, // vperm2i128 + 2*vpshufb |
1935 | // + vpblendvb |
1936 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: 4}, // vperm2i128 + 2*vpshufb |
1937 | // + vpblendvb |
1938 | |
1939 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: 3}, // 2*vpermpd + vblendpd |
1940 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: 3}, // 2*vpermps + vblendps |
1941 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: 3}, // 2*vpermq + vpblendd |
1942 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: 3}, // 2*vpermd + vpblendd |
1943 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: 7}, // 2*vperm2i128 + 4*vpshufb |
1944 | // + vpblendvb |
1945 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: 7}, // 2*vperm2i128 + 4*vpshufb |
1946 | // + vpblendvb |
1947 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: 7}, // 2*vperm2i128 + 4*vpshufb |
1948 | // + vpblendvb |
1949 | }; |
1950 | |
1951 | if (ST->hasAVX2()) |
1952 | if (const auto *Entry = CostTableLookup(Table: AVX2ShuffleTbl, ISD: Kind, Ty: LT.second)) |
1953 | return LT.first * Entry->Cost; |
1954 | |
1955 | static const CostTblEntry XOPShuffleTbl[] = { |
1956 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: 2}, // vperm2f128 + vpermil2pd |
1957 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: 2}, // vperm2f128 + vpermil2ps |
1958 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: 2}, // vperm2f128 + vpermil2pd |
1959 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: 2}, // vperm2f128 + vpermil2ps |
1960 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: 4}, // vextractf128 + 2*vpperm |
1961 | // + vinsertf128 |
1962 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: 4}, // vextractf128 + 2*vpperm |
1963 | // + vinsertf128 |
1964 | |
1965 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: 9}, // 2*vextractf128 + 6*vpperm |
1966 | // + vinsertf128 |
1967 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: 1}, // vpperm |
1968 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: 9}, // 2*vextractf128 + 6*vpperm |
1969 | // + vinsertf128 |
1970 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: 1}, // vpperm |
1971 | }; |
1972 | |
1973 | if (ST->hasXOP()) |
1974 | if (const auto *Entry = CostTableLookup(Table: XOPShuffleTbl, ISD: Kind, Ty: LT.second)) |
1975 | return LT.first * Entry->Cost; |
1976 | |
1977 | static const CostTblEntry AVX1ShuffleTbl[] = { |
1978 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f64, .Cost: 2}, // vperm2f128 + vpermilpd |
1979 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f32, .Cost: 2}, // vperm2f128 + vpermilps |
1980 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i64, .Cost: 2}, // vperm2f128 + vpermilpd |
1981 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i32, .Cost: 2}, // vperm2f128 + vpermilps |
1982 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i16, .Cost: 3}, // vpshuflw + vpshufd + vinsertf128 |
1983 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16f16, .Cost: 3}, // vpshuflw + vpshufd + vinsertf128 |
1984 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v32i8, .Cost: 2}, // vpshufb + vinsertf128 |
1985 | |
1986 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4f64, .Cost: 2}, // vperm2f128 + vpermilpd |
1987 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f32, .Cost: 2}, // vperm2f128 + vpermilps |
1988 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i64, .Cost: 2}, // vperm2f128 + vpermilpd |
1989 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i32, .Cost: 2}, // vperm2f128 + vpermilps |
1990 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i16, .Cost: 4}, // vextractf128 + 2*pshufb |
1991 | // + vinsertf128 |
1992 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16f16, .Cost: 4}, // vextractf128 + 2*pshufb |
1993 | // + vinsertf128 |
1994 | {.ISD: TTI::SK_Reverse, .Type: MVT::v32i8, .Cost: 4}, // vextractf128 + 2*pshufb |
1995 | // + vinsertf128 |
1996 | |
1997 | {.ISD: TTI::SK_Select, .Type: MVT::v4i64, .Cost: 1}, // vblendpd |
1998 | {.ISD: TTI::SK_Select, .Type: MVT::v4f64, .Cost: 1}, // vblendpd |
1999 | {.ISD: TTI::SK_Select, .Type: MVT::v8i32, .Cost: 1}, // vblendps |
2000 | {.ISD: TTI::SK_Select, .Type: MVT::v8f32, .Cost: 1}, // vblendps |
2001 | {.ISD: TTI::SK_Select, .Type: MVT::v16i16, .Cost: 3}, // vpand + vpandn + vpor |
2002 | {.ISD: TTI::SK_Select, .Type: MVT::v16f16, .Cost: 3}, // vpand + vpandn + vpor |
2003 | {.ISD: TTI::SK_Select, .Type: MVT::v32i8, .Cost: 3}, // vpand + vpandn + vpor |
2004 | |
2005 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i64, .Cost: 2}, // vperm2f128 + shufpd |
2006 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f64, .Cost: 2}, // vperm2f128 + shufpd |
2007 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i32, .Cost: 4}, // 2*vperm2f128 + 2*vshufps |
2008 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f32, .Cost: 4}, // 2*vperm2f128 + 2*vshufps |
2009 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i16, .Cost: 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
2010 | {.ISD: TTI::SK_Splice, .Type: MVT::v16f16, .Cost: 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
2011 | {.ISD: TTI::SK_Splice, .Type: MVT::v32i8, .Cost: 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 |
2012 | |
2013 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f64, .Cost: 2}, // vperm2f128 + vshufpd |
2014 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i64, .Cost: 2}, // vperm2f128 + vshufpd |
2015 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f32, .Cost: 4}, // 2*vperm2f128 + 2*vshufps |
2016 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i32, .Cost: 4}, // 2*vperm2f128 + 2*vshufps |
2017 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i16, .Cost: 8}, // vextractf128 + 4*pshufb |
2018 | // + 2*por + vinsertf128 |
2019 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16f16, .Cost: 8}, // vextractf128 + 4*pshufb |
2020 | // + 2*por + vinsertf128 |
2021 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v32i8, .Cost: 8}, // vextractf128 + 4*pshufb |
2022 | // + 2*por + vinsertf128 |
2023 | |
2024 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f64, .Cost: 3}, // 2*vperm2f128 + vshufpd |
2025 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i64, .Cost: 3}, // 2*vperm2f128 + vshufpd |
2026 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f32, .Cost: 4}, // 2*vperm2f128 + 2*vshufps |
2027 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i32, .Cost: 4}, // 2*vperm2f128 + 2*vshufps |
2028 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i16, .Cost: 15}, // 2*vextractf128 + 8*pshufb |
2029 | // + 4*por + vinsertf128 |
2030 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16f16, .Cost: 15}, // 2*vextractf128 + 8*pshufb |
2031 | // + 4*por + vinsertf128 |
2032 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v32i8, .Cost: 15}, // 2*vextractf128 + 8*pshufb |
2033 | // + 4*por + vinsertf128 |
2034 | }; |
2035 | |
2036 | if (ST->hasAVX()) |
2037 | if (const auto *Entry = CostTableLookup(Table: AVX1ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2038 | return LT.first * Entry->Cost; |
2039 | |
2040 | static const CostTblEntry SSE41ShuffleTbl[] = { |
2041 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // pblendw |
2042 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // movsd |
2043 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 1}, // pblendw |
2044 | {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 1}, // blendps |
2045 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: 1}, // pblendw |
2046 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: 1}, // pblendw |
2047 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: 1} // pblendvb |
2048 | }; |
2049 | |
2050 | if (ST->hasSSE41()) |
2051 | if (const auto *Entry = CostTableLookup(Table: SSE41ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2052 | return LT.first * Entry->Cost; |
2053 | |
2054 | static const CostTblEntry SSSE3ShuffleTbl[] = { |
2055 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1}, // pshufb |
2056 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1}, // pshufb |
2057 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1}, // pshufb |
2058 | |
2059 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 1}, // pshufb |
2060 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 1}, // pshufb |
2061 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 1}, // pshufb |
2062 | |
2063 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: 3}, // 2*pshufb + por |
2064 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: 3}, // 2*pshufb + por |
2065 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: 3}, // 2*pshufb + por |
2066 | |
2067 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1}, // palignr |
2068 | {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1}, // palignr |
2069 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1}, // palignr |
2070 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1}, // palignr |
2071 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1}, // palignr |
2072 | |
2073 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 1}, // pshufb |
2074 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 1}, // pshufb |
2075 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 1}, // pshufb |
2076 | |
2077 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: 3}, // 2*pshufb + por |
2078 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f16, .Cost: 3}, // 2*pshufb + por |
2079 | {.ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: 3}, // 2*pshufb + por |
2080 | }; |
2081 | |
2082 | if (ST->hasSSSE3()) |
2083 | if (const auto *Entry = CostTableLookup(Table: SSSE3ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2084 | return LT.first * Entry->Cost; |
2085 | |
2086 | static const CostTblEntry SSE2ShuffleTbl[] = { |
2087 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1}, // shufpd |
2088 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1}, // pshufd |
2089 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1}, // pshufd |
2090 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 2}, // pshuflw + pshufd |
2091 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 2}, // pshuflw + pshufd |
2092 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 3}, // unpck + pshuflw + pshufd |
2093 | |
2094 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // shufpd |
2095 | {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // pshufd |
2096 | {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 1}, // pshufd |
2097 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 3}, // pshuflw + pshufhw + pshufd |
2098 | {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 3}, // pshuflw + pshufhw + pshufd |
2099 | {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 9}, // 2*pshuflw + 2*pshufhw |
2100 | // + 2*pshufd + 2*unpck + packus |
2101 | |
2102 | {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // movsd |
2103 | {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // movsd |
2104 | {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // 2*shufps |
2105 | {.ISD: TTI::SK_Select, .Type: MVT::v8i16, .Cost: 3}, // pand + pandn + por |
2106 | {.ISD: TTI::SK_Select, .Type: MVT::v8f16, .Cost: 3}, // pand + pandn + por |
2107 | {.ISD: TTI::SK_Select, .Type: MVT::v16i8, .Cost: 3}, // pand + pandn + por |
2108 | |
2109 | {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1}, // shufpd |
2110 | {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1}, // shufpd |
2111 | {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 2}, // 2*{unpck,movsd,pshufd} |
2112 | {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 3}, // psrldq + psrlldq + por |
2113 | {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 3}, // psrldq + psrlldq + por |
2114 | {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 3}, // psrldq + psrlldq + por |
2115 | |
2116 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // shufpd |
2117 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // pshufd |
2118 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 1}, // pshufd |
2119 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 5}, // 2*pshuflw + 2*pshufhw |
2120 | // + pshufd/unpck |
2121 | {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 5}, // 2*pshuflw + 2*pshufhw |
2122 | // + pshufd/unpck |
2123 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 10 }, // 2*pshuflw + 2*pshufhw |
2124 | // + 2*pshufd + 2*unpck + 2*packus |
2125 | |
2126 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2f64, .Cost: 1 }, // shufpd |
2127 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v2i64, .Cost: 1 }, // shufpd |
2128 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4i32, .Cost: 2 }, // 2*{unpck,movsd,pshufd} |
2129 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8i16, .Cost: 8 }, // blend+permute |
2130 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v8f16, .Cost: 8 }, // blend+permute |
2131 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v16i8, .Cost: 13 }, // blend+permute |
2132 | }; |
2133 | |
2134 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { |
2135 | {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 0}, // broadcast handled by movddup |
2136 | }; |
2137 | |
2138 | if (ST->hasSSE2()) { |
2139 | bool IsLoad = |
2140 | llvm::any_of(Range&: Args, P: [](const auto &V) { return isa<LoadInst>(V); }); |
2141 | if (ST->hasSSE3() && IsLoad) |
2142 | if (const auto *Entry = |
2143 | CostTableLookup(Table: SSE3BroadcastLoadTbl, ISD: Kind, Ty: LT.second)) { |
2144 | assert(isLegalBroadcastLoad(BaseTp->getElementType(), |
2145 | LT.second.getVectorElementCount()) && |
2146 | "Table entry missing from isLegalBroadcastLoad()" ); |
2147 | return LT.first * Entry->Cost; |
2148 | } |
2149 | |
2150 | if (const auto *Entry = CostTableLookup(Table: SSE2ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2151 | return LT.first * Entry->Cost; |
2152 | } |
2153 | |
2154 | static const CostTblEntry SSE1ShuffleTbl[] = { |
2155 | { .ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1 }, // shufps |
2156 | { .ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 1 }, // shufps |
2157 | { .ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2 }, // 2*shufps |
2158 | { .ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 2 }, // 2*shufps |
2159 | { .ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 1 }, // shufps |
2160 | { .ISD: TTI::SK_PermuteTwoSrc, .Type: MVT::v4f32, .Cost: 2 }, // 2*shufps |
2161 | }; |
2162 | |
2163 | if (ST->hasSSE1()) |
2164 | if (const auto *Entry = CostTableLookup(Table: SSE1ShuffleTbl, ISD: Kind, Ty: LT.second)) |
2165 | return LT.first * Entry->Cost; |
2166 | |
2167 | return BaseT::getShuffleCost(Kind, Tp: BaseTp, Mask, CostKind, Index, SubTp); |
2168 | } |
2169 | |
2170 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
2171 | Type *Src, |
2172 | TTI::CastContextHint CCH, |
2173 | TTI::TargetCostKind CostKind, |
2174 | const Instruction *I) { |
2175 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
2176 | assert(ISD && "Invalid opcode" ); |
2177 | |
2178 | // The cost tables include both specific, custom (non-legal) src/dst type |
2179 | // conversions and generic, legalized types. We test for customs first, before |
2180 | // falling back to legalization. |
2181 | // FIXME: Need a better design of the cost table to handle non-simple types of |
2182 | // potential massive combinations (elem_num x src_type x dst_type). |
2183 | static const TypeConversionCostKindTblEntry AVX512BWConversionTbl[]{ |
2184 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2185 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2186 | |
2187 | // Mask sign extend has an instruction. |
2188 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2189 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2190 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2191 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2192 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2193 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2194 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2195 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2196 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2197 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2198 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2199 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2200 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2201 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2202 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2203 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v64i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2204 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2205 | |
2206 | // Mask zero extend is a sext + shift. |
2207 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2208 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2209 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2210 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2211 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2212 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2213 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2214 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2215 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2216 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2217 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2218 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2219 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2220 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2221 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2222 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v64i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2223 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2224 | |
2225 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2226 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2227 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2228 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2229 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2230 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2231 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2232 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2233 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2234 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2235 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2236 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2237 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2238 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2239 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2240 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2241 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2242 | |
2243 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2244 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // widen to zmm |
2245 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
2246 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
2247 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovwb |
2248 | }; |
2249 | |
2250 | static const TypeConversionCostKindTblEntry AVX512DQConversionTbl[] = { |
2251 | // Mask sign extend has an instruction. |
2252 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2253 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2254 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2255 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2256 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2257 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2258 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2259 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2260 | |
2261 | // Mask zero extend is a sext + shift. |
2262 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2263 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2264 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2265 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2266 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2267 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2268 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2269 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1, } }, |
2270 | |
2271 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2272 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2273 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2274 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2275 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2276 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2277 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2278 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2279 | |
2280 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2281 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2282 | |
2283 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2284 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2285 | |
2286 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2287 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i64, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2288 | |
2289 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2290 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i64, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2291 | }; |
2292 | |
2293 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and |
2294 | // 256-bit wide vectors. |
2295 | |
// Conversion costs assuming the baseline AVX-512F feature level (no BW/DQ/VL).
// Each entry maps an ISD conversion opcode plus (Dst, Src) MVT pair to a
// 4-way cost: { reciprocal throughput, latency, code size, size-and-latency }.
// Per the file header, numbers model a "generic" CPU at this feature level.
// NOTE(review): latency/code-size/size-and-latency columns are mostly 1 here
// even where throughput is large — presumably still being tuned; confirm
// before relying on the non-throughput cost kinds.
static const TypeConversionCostKindTblEntry AVX512FConversionTbl[] = {
  // FP widening/narrowing between f32 and f64 vectors.
  { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_EXTEND, .Dst: MVT::v16f64, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // 2*vcvtps2pd+vextractf64x4
  { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Truncations down to i1 masks (per-entry comments give the lowering).
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpsllq+vptestmq
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpsllq+vptestmq
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq
  // Integer-to-integer narrowing (vpmov* truncating moves).
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdw
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdw
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpshufb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i16, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqw
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpmovqd
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },// 2*vpmovqd+concat+vpmovdb

  // i16 -> i8 truncation has no direct AVX512F instruction; goes via i32.
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // extend to v16i32
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i8, .Src: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Mask (i1) extensions to i8.
  // Sign extend is zmm vpternlogd+vptruncdb.
  // Zero extend is zmm broadcast load+vptruncdw.
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Mask (i1) extensions to i16.
  // Sign extend is zmm vpternlogd+vptruncdw.
  // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Mask (i1) extensions to i32/i64, sub-512-bit (widened to zmm).
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogd+psrld
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq+psrlq
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // zmm vpternlogq+psrlq

  // Mask (i1) extensions at full 512-bit width.
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq

  // Plain 512-bit integer extensions (cost 1 at this feature level).
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // FIXME: May not be right
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i16, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // FIXME: May not be right

  // Signed int -> fp conversions.
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Unsigned int -> fp conversions. i64 sources are expensive without
  // AVX512DQ (which provides direct vcvtuqq2ps/pd — see the DQ table).
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i64, .Cost: {.RecipThroughputCost: 26, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // fp -> signed int conversions (including types split across registers).
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v32f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v64i8, .Src: MVT::v64f32, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v64i8, .Src: MVT::v64f64, .Cost: {.RecipThroughputCost: 31, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i16, .Src: MVT::v32f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i16, .Src: MVT::v32f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // fp -> unsigned int conversions.
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
2432 | |
// Conversion costs when both AVX512BW and AVX512VL are available, i.e. the
// byte/word mask instructions can operate on 128/256-bit (xmm/ymm) vectors.
// Entries are keyed by ISD opcode + (Dst, Src) MVT; the cost is the 4-tuple
// { reciprocal throughput, latency, code size, size-and-latency }.
// NOTE(review): several entries pair mismatched element counts (e.g. v16i8
// from v2i1) — presumably covering widened/custom-legalized forms; confirm
// against the lookup sites before pruning any of them.
static const TypeConversionCostKindTblEntry AVX512BWVLConversionTbl[] {
  // Mask sign extend has an instruction.
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Mask zero extend is a sext + shift.
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v32i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v32i8, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v64i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Truncations of i8/i16 vectors down to i1 masks.
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v32i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v64i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

  // Plain integer narrowing (i16 -> i8).
  { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
2492 | |
// Conversion costs when both AVX-512DQ and AVX-512VL are available, i.e.
// DQ instructions operating on 128/256-bit vectors. Each entry maps an
// (ISD opcode, Dst type, Src type) triple to per-cost-kind costs
// {recip-throughput, latency, code-size, size+latency}.
static const TypeConversionCostKindTblEntry AVX512DQVLConversionTbl[] = {
// Mask sign extend has an instruction.
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Mask zero extend is a sext + shift.
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Truncation to a mask: shift the "bool" bit to the sign/test position
// and compare (hence cost 2 here vs the sext-based paths elsewhere).
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// i64 <-> fp conversions: modeled as a single instruction (cost 1) since
// AVX512DQ provides direct vcvt(u)qq2ps/pd and vcvttp[sd]2(u)qq forms.
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i64, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i64, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
2543 | |
// Conversion costs when AVX-512VL is available (AVX-512 foundation
// instructions on 128/256-bit vectors; no DQ/BW — those cases live in the
// more specific tables, which must be consulted first). Each entry maps an
// (ISD opcode, Dst type, Src type) triple to per-cost-kind costs
// {recip-throughput, latency, code-size, size+latency}.
static const TypeConversionCostKindTblEntry AVX512VLConversionTbl[] = {
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // split+2*v8i8
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sext+vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // split+2*v8i16
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpslld+vptestmd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpsllq+vptestmq
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqd
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovqb
{ .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpmovdb (i32->i8 truncate; vpmovwb would be i16->i8)

// Extension of an i1 mask to i8 elements (no AVX512BW here, so no direct
// mask-to-byte move; expansion is via wider elements then re-packing).
// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i8, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i8, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i8, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i8, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Extension of an i1 mask to i16 elements (one pack step fewer than i8,
// hence cost is one lower than the matching entries above).
// sign extend is vpcmpeq+maskedmove+vpmovdw
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i16, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i16, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Mask to i32/i64 elements: single vpternlog for sext; zext needs the
// extra logical shift to clear all but the low bit.
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i32, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogd+psrld

{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vpternlogq+psrlq

// Integer element widening: single-instruction (vpmov[sz]x*) cases.
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// Unsigned int -> fp: AVX-512 provides direct vcvtusi2s[sd]/vcvtudq2p[sd]
// forms (cost 1); i64-element sources still require an expansion (cost 5).
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v32f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
2644 | |
2645 | static const TypeConversionCostKindTblEntry AVX2ConversionTbl[] = { |
2646 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2647 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2648 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2649 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2650 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2651 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2652 | |
2653 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2654 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2655 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2656 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2657 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2658 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2659 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2660 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2661 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2662 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2663 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2664 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2665 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2666 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2667 | |
2668 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2669 | |
2670 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2671 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2672 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2673 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2674 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2675 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2676 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2677 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2678 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2679 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2680 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2681 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2682 | |
2683 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2684 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v8f32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2685 | |
2686 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2687 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2688 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2689 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2690 | |
2691 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2692 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2693 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2694 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2695 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2696 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2697 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2698 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2699 | |
2700 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2701 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2702 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2703 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2704 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2705 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2706 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2707 | |
2708 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2709 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2710 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2711 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2712 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2713 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2714 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2715 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2716 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2717 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2718 | }; |
2719 | |
2720 | static const TypeConversionCostKindTblEntry AVXConversionTbl[] = { |
2721 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2722 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2723 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2724 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2725 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2726 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i1, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2727 | |
2728 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2729 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2730 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2731 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2732 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2733 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2734 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2735 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2736 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2737 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2738 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2739 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2740 | |
2741 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2742 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2743 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2744 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2745 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: {.RecipThroughputCost: 11, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2746 | |
2747 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2748 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2749 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // and+extract+packuswb |
2750 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2751 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2752 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2753 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // and+extract+2*packusdw |
2754 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2755 | |
2756 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2757 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2758 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2759 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2760 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2761 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2762 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2763 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2764 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2765 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2766 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2767 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2768 | |
2769 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2770 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i1, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2771 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i1, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2772 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2773 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2774 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2775 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2776 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2777 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2778 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2779 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2780 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2781 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f64, .Src: MVT::v8i32, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2782 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2783 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 18, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2784 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2785 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2786 | |
2787 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2788 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2789 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2790 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v32i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2791 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2792 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2793 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2794 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2795 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2796 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2797 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2798 | |
2799 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2800 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2801 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v32i8, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2802 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v32i8, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2803 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2804 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2805 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2806 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2807 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2808 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2809 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2810 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2811 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2812 | |
2813 | { .ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2814 | { .ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2815 | }; |
2816 | |
2817 | static const TypeConversionCostKindTblEntry SSE41ConversionTbl[] = { |
2818 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2819 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2820 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2821 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2822 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2823 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2824 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2825 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2826 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2827 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2828 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2829 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2830 | |
2831 | // These truncates end up widening elements. |
2832 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZBQ |
2833 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZWQ |
2834 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PMOVXZBD |
2835 | |
2836 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2837 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2838 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2839 | |
2840 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2841 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2842 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2843 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2844 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2845 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2846 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2847 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2848 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2849 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2850 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2851 | |
2852 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2853 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2854 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2855 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2856 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2857 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2858 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2859 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2860 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2861 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2862 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2863 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 12, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2864 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i64, .Cost: {.RecipThroughputCost: 22, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2865 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2866 | |
2867 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2868 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2869 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2870 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2871 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2872 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2873 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2874 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2875 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2876 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2877 | |
2878 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2879 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2880 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2881 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2882 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2883 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2884 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2885 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2886 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2887 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2888 | }; |
2889 | |
2890 | static const TypeConversionCostKindTblEntry SSE2ConversionTbl[] = { |
2891 | // These are somewhat magic numbers justified by comparing the |
2892 | // output of llvm-mca for our various supported scheduler models |
2893 | // and basing it off the worst case scenario. |
2894 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2895 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2896 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2897 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2898 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2899 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2900 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2901 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2902 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2903 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2904 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2905 | { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2906 | |
2907 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2908 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2909 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f32, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2910 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::f64, .Src: MVT::i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2911 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2912 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2913 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2914 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2915 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2916 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2917 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2918 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2919 | { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v2i64, .Cost: {.RecipThroughputCost: 18, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2920 | |
2921 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2922 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2923 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2924 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2925 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2926 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2927 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2928 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2929 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2930 | { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2931 | |
2932 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2933 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2934 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i32, .Src: MVT::f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2935 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::i64, .Src: MVT::f64, .Cost: {.RecipThroughputCost: 15, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2936 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2937 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2938 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2939 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2940 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2941 | { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v2f64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2942 | |
2943 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2944 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2945 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2946 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2947 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2948 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2949 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2950 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2951 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2952 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2953 | { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2954 | { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2955 | |
2956 | // These truncates are really widening elements. |
2957 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD |
2958 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLWD+DQ |
2959 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i1, .Src: MVT::v2i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW+WD+PSHUFD |
2960 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLWD |
2961 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i1, .Src: MVT::v4i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW+WD |
2962 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i1, .Src: MVT::v8i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PUNPCKLBW |
2963 | |
2964 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+PACKUSWB |
2965 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2966 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+2*PACKUSWB |
2967 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2968 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2969 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2970 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2971 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: {.RecipThroughputCost: 10, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
2972 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PAND+3*PACKUSWB |
2973 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD+PSHUFLW |
2974 | { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // PSHUFD |
2975 | }; |
2976 | |
2977 | // Attempt to map directly to (simple) MVT types to let us match custom entries. |
2978 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
2979 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
2980 | |
2981 | // The function getSimpleVT only handles simple value types. |
2982 | if (SrcTy.isSimple() && DstTy.isSimple()) { |
2983 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); |
2984 | MVT SimpleDstTy = DstTy.getSimpleVT(); |
2985 | |
2986 | if (ST->useAVX512Regs()) { |
2987 | if (ST->hasBWI()) |
2988 | if (const auto *Entry = ConvertCostTableLookup( |
2989 | Table: AVX512BWConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
2990 | if (auto KindCost = Entry->Cost[CostKind]) |
2991 | return *KindCost; |
2992 | |
2993 | if (ST->hasDQI()) |
2994 | if (const auto *Entry = ConvertCostTableLookup( |
2995 | Table: AVX512DQConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
2996 | if (auto KindCost = Entry->Cost[CostKind]) |
2997 | return *KindCost; |
2998 | |
2999 | if (ST->hasAVX512()) |
3000 | if (const auto *Entry = ConvertCostTableLookup( |
3001 | Table: AVX512FConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3002 | if (auto KindCost = Entry->Cost[CostKind]) |
3003 | return *KindCost; |
3004 | } |
3005 | |
3006 | if (ST->hasBWI()) |
3007 | if (const auto *Entry = ConvertCostTableLookup( |
3008 | Table: AVX512BWVLConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3009 | if (auto KindCost = Entry->Cost[CostKind]) |
3010 | return *KindCost; |
3011 | |
3012 | if (ST->hasDQI()) |
3013 | if (const auto *Entry = ConvertCostTableLookup( |
3014 | Table: AVX512DQVLConversionTbl, ISD, Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3015 | if (auto KindCost = Entry->Cost[CostKind]) |
3016 | return *KindCost; |
3017 | |
3018 | if (ST->hasAVX512()) |
3019 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512VLConversionTbl, ISD, |
3020 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3021 | if (auto KindCost = Entry->Cost[CostKind]) |
3022 | return *KindCost; |
3023 | |
3024 | if (ST->hasAVX2()) { |
3025 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX2ConversionTbl, ISD, |
3026 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3027 | if (auto KindCost = Entry->Cost[CostKind]) |
3028 | return *KindCost; |
3029 | } |
3030 | |
3031 | if (ST->hasAVX()) { |
3032 | if (const auto *Entry = ConvertCostTableLookup(Table: AVXConversionTbl, ISD, |
3033 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3034 | if (auto KindCost = Entry->Cost[CostKind]) |
3035 | return *KindCost; |
3036 | } |
3037 | |
3038 | if (ST->hasSSE41()) { |
3039 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE41ConversionTbl, ISD, |
3040 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3041 | if (auto KindCost = Entry->Cost[CostKind]) |
3042 | return *KindCost; |
3043 | } |
3044 | |
3045 | if (ST->hasSSE2()) { |
3046 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE2ConversionTbl, ISD, |
3047 | Dst: SimpleDstTy, Src: SimpleSrcTy)) |
3048 | if (auto KindCost = Entry->Cost[CostKind]) |
3049 | return *KindCost; |
3050 | } |
3051 | } |
3052 | |
3053 | // Fall back to legalized types. |
3054 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Ty: Src); |
3055 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Ty: Dst); |
3056 | |
// If we're truncating to the same legalized type - just assume it's free.
3058 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) |
3059 | return TTI::TCC_Free; |
3060 | |
3061 | if (ST->useAVX512Regs()) { |
3062 | if (ST->hasBWI()) |
3063 | if (const auto *Entry = ConvertCostTableLookup( |
3064 | Table: AVX512BWConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
3065 | if (auto KindCost = Entry->Cost[CostKind]) |
3066 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3067 | |
3068 | if (ST->hasDQI()) |
3069 | if (const auto *Entry = ConvertCostTableLookup( |
3070 | Table: AVX512DQConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
3071 | if (auto KindCost = Entry->Cost[CostKind]) |
3072 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3073 | |
3074 | if (ST->hasAVX512()) |
3075 | if (const auto *Entry = ConvertCostTableLookup( |
3076 | Table: AVX512FConversionTbl, ISD, Dst: LTDest.second, Src: LTSrc.second)) |
3077 | if (auto KindCost = Entry->Cost[CostKind]) |
3078 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3079 | } |
3080 | |
3081 | if (ST->hasBWI()) |
3082 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512BWVLConversionTbl, ISD, |
3083 | Dst: LTDest.second, Src: LTSrc.second)) |
3084 | if (auto KindCost = Entry->Cost[CostKind]) |
3085 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3086 | |
3087 | if (ST->hasDQI()) |
3088 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512DQVLConversionTbl, ISD, |
3089 | Dst: LTDest.second, Src: LTSrc.second)) |
3090 | if (auto KindCost = Entry->Cost[CostKind]) |
3091 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3092 | |
3093 | if (ST->hasAVX512()) |
3094 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX512VLConversionTbl, ISD, |
3095 | Dst: LTDest.second, Src: LTSrc.second)) |
3096 | if (auto KindCost = Entry->Cost[CostKind]) |
3097 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3098 | |
3099 | if (ST->hasAVX2()) |
3100 | if (const auto *Entry = ConvertCostTableLookup(Table: AVX2ConversionTbl, ISD, |
3101 | Dst: LTDest.second, Src: LTSrc.second)) |
3102 | if (auto KindCost = Entry->Cost[CostKind]) |
3103 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3104 | |
3105 | if (ST->hasAVX()) |
3106 | if (const auto *Entry = ConvertCostTableLookup(Table: AVXConversionTbl, ISD, |
3107 | Dst: LTDest.second, Src: LTSrc.second)) |
3108 | if (auto KindCost = Entry->Cost[CostKind]) |
3109 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3110 | |
3111 | if (ST->hasSSE41()) |
3112 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE41ConversionTbl, ISD, |
3113 | Dst: LTDest.second, Src: LTSrc.second)) |
3114 | if (auto KindCost = Entry->Cost[CostKind]) |
3115 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3116 | |
3117 | if (ST->hasSSE2()) |
3118 | if (const auto *Entry = ConvertCostTableLookup(Table: SSE2ConversionTbl, ISD, |
3119 | Dst: LTDest.second, Src: LTSrc.second)) |
3120 | if (auto KindCost = Entry->Cost[CostKind]) |
3121 | return std::max(a: LTSrc.first, b: LTDest.first) * *KindCost; |
3122 | |
3123 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for |
3124 | // sitofp. |
3125 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
3126 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { |
3127 | Type *ExtSrc = Src->getWithNewBitWidth(NewBitWidth: 32); |
3128 | unsigned ExtOpc = |
3129 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; |
3130 | |
3131 | // For scalar loads the extend would be free. |
3132 | InstructionCost ExtCost = 0; |
3133 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(Val: I->getOperand(i: 0)))) |
3134 | ExtCost = getCastInstrCost(Opcode: ExtOpc, Dst: ExtSrc, Src, CCH, CostKind); |
3135 | |
3136 | return ExtCost + getCastInstrCost(Opcode: Instruction::SIToFP, Dst, Src: ExtSrc, |
3137 | CCH: TTI::CastContextHint::None, CostKind); |
3138 | } |
3139 | |
// Fallback for fptosi/fptoui i8/i16 cases: convert to i32 first (via
// fptosi i32) and then truncate the result.
3142 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && |
3143 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { |
3144 | Type *TruncDst = Dst->getWithNewBitWidth(NewBitWidth: 32); |
3145 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: TruncDst, Src, CCH, CostKind) + |
3146 | getCastInstrCost(Opcode: Instruction::Trunc, Dst, Src: TruncDst, |
3147 | CCH: TTI::CastContextHint::None, CostKind); |
3148 | } |
3149 | |
3150 | // TODO: Allow non-throughput costs that aren't binary. |
3151 | auto AdjustCost = [&CostKind](InstructionCost Cost, |
3152 | InstructionCost N = 1) -> InstructionCost { |
3153 | if (CostKind != TTI::TCK_RecipThroughput) |
3154 | return Cost == 0 ? 0 : N; |
3155 | return Cost * N; |
3156 | }; |
3157 | return AdjustCost( |
3158 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
3159 | } |
3160 | |
3161 | InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
3162 | Type *CondTy, |
3163 | CmpInst::Predicate VecPred, |
3164 | TTI::TargetCostKind CostKind, |
3165 | const Instruction *I) { |
3166 | // Early out if this type isn't scalar/vector integer/float. |
3167 | if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) |
3168 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
3169 | I); |
3170 | |
3171 | // Legalize the type. |
3172 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3173 | |
3174 | MVT MTy = LT.second; |
3175 | |
3176 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3177 | assert(ISD && "Invalid opcode" ); |
3178 | |
3179 | InstructionCost = 0; |
3180 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { |
3181 | // Some vector comparison predicates cost extra instructions. |
3182 | // TODO: Adjust ExtraCost based on CostKind? |
3183 | // TODO: Should we invert this and assume worst case cmp costs |
3184 | // and reduce for particular predicates? |
3185 | if (MTy.isVector() && |
3186 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || |
3187 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || |
3188 | ST->hasBWI())) { |
3189 | // Fallback to I if a specific predicate wasn't specified. |
3190 | CmpInst::Predicate Pred = VecPred; |
3191 | if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || |
3192 | Pred == CmpInst::BAD_FCMP_PREDICATE)) |
3193 | Pred = cast<CmpInst>(Val: I)->getPredicate(); |
3194 | |
3195 | bool CmpWithConstant = false; |
3196 | if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(Val: I)) |
3197 | CmpWithConstant = isa<Constant>(Val: CmpInstr->getOperand(i_nocapture: 1)); |
3198 | |
3199 | switch (Pred) { |
3200 | case CmpInst::Predicate::ICMP_NE: |
3201 | // xor(cmpeq(x,y),-1) |
3202 | ExtraCost = CmpWithConstant ? 0 : 1; |
3203 | break; |
3204 | case CmpInst::Predicate::ICMP_SGE: |
3205 | case CmpInst::Predicate::ICMP_SLE: |
3206 | // xor(cmpgt(x,y),-1) |
3207 | ExtraCost = CmpWithConstant ? 0 : 1; |
3208 | break; |
3209 | case CmpInst::Predicate::ICMP_ULT: |
3210 | case CmpInst::Predicate::ICMP_UGT: |
3211 | // cmpgt(xor(x,signbit),xor(y,signbit)) |
3212 | // xor(cmpeq(pmaxu(x,y),x),-1) |
3213 | ExtraCost = CmpWithConstant ? 1 : 2; |
3214 | break; |
3215 | case CmpInst::Predicate::ICMP_ULE: |
3216 | case CmpInst::Predicate::ICMP_UGE: |
3217 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || |
3218 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { |
3219 | // cmpeq(psubus(x,y),0) |
3220 | // cmpeq(pminu(x,y),x) |
3221 | ExtraCost = 1; |
3222 | } else { |
3223 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) |
3224 | ExtraCost = CmpWithConstant ? 2 : 3; |
3225 | } |
3226 | break; |
3227 | case CmpInst::Predicate::FCMP_ONE: |
3228 | case CmpInst::Predicate::FCMP_UEQ: |
3229 | // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases. |
3230 | // Use FCMP_UEQ expansion - FCMP_ONE should be the same. |
3231 | if (CondTy && !ST->hasAVX()) |
3232 | return getCmpSelInstrCost(Opcode, ValTy, CondTy, |
3233 | VecPred: CmpInst::Predicate::FCMP_UNO, CostKind) + |
3234 | getCmpSelInstrCost(Opcode, ValTy, CondTy, |
3235 | VecPred: CmpInst::Predicate::FCMP_OEQ, CostKind) + |
3236 | getArithmeticInstrCost(Opcode: Instruction::Or, Ty: CondTy, CostKind); |
3237 | |
3238 | break; |
3239 | case CmpInst::Predicate::BAD_ICMP_PREDICATE: |
3240 | case CmpInst::Predicate::BAD_FCMP_PREDICATE: |
3241 | // Assume worst case scenario and add the maximum extra cost. |
3242 | ExtraCost = 3; |
3243 | break; |
3244 | default: |
3245 | break; |
3246 | } |
3247 | } |
3248 | } |
3249 | |
3250 | static const CostKindTblEntry SLMCostTbl[] = { |
3251 | // slm pcmpeq/pcmpgt throughput is 2 |
3252 | { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
3253 | // slm pblendvb/blendvpd/blendvps throughput is 4 |
3254 | { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vblendvpd |
3255 | { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vblendvps |
3256 | { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3257 | { .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3258 | { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3259 | { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // pblendvb |
3260 | }; |
3261 | |
// Compare/select costs with AVX-512BW: byte/word vector SETCC and SELECT
// entries are all unit cost for every cost kind (presumably single
// mask-compare / mask-select instructions — confirm against the ISA docs).
static const CostKindTblEntry AVX512BWCostTbl[] = {
{ .ISD: ISD::SETCC, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::SELECT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
3271 | |
// Compare/select costs with base AVX-512F (no BWI). Cost tuple fields are
// {RecipThroughput, Latency, CodeSize, SizeAndLatency}.
static const CostKindTblEntry AVX512CostTbl[] = {
{ .ISD: ISD::SETCC, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::SETCC, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
// 512-bit i16/i8 compares cost more here — presumably expanded when BWI is
// unavailable (the BWI table above supersedes these when present).
{ .ISD: ISD::SETCC, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } },
{ .ISD: ISD::SETCC, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } },

{ .ISD: ISD::SELECT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v8f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// 512-bit i16/i8 selects also cost more without BWI (see the corresponding
// unit-cost entries in AVX512BWCostTbl).
{ .ISD: ISD::SELECT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } },
{ .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } },
{ .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
3308 | |
// Compare/select costs with AVX2: native 256-bit integer compares, and
// variable blends (vblendv*/pblendvb) for selects.
static const CostKindTblEntry AVX2CostTbl[] = {
{ .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

{ .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },

{ .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd
{ .ISD: ISD::SELECT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps
{ .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
{ .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
{ .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
{ .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
};
3329 | |
// Extra SETCC costs for XOP targets: 128-bit i64 compares are unit cost;
// the 256-bit v4i64 entry is much more expensive (matching the AVX1 v4i64
// entry below — presumably split into 128-bit halves; confirm).
static const CostKindTblEntry XOPCostTbl[] = {
{ .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
3334 | |
// Compare/select costs with AVX1: full-width FP compares, but 256-bit
// integer ops must be emulated (hence the expensive integer SETCC entries).
static const CostKindTblEntry AVX1CostTbl[] = {
{ .ISD: ISD::SETCC, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

// AVX1 does not support 8-wide integer compare.
{ .ISD: ISD::SETCC, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::SETCC, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::SETCC, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::SETCC, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 2, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },

{ .ISD: ISD::SELECT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd
{ .ISD: ISD::SELECT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps
{ .ISD: ISD::SELECT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvpd
{ .ISD: ISD::SELECT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // vblendvps
{ .ISD: ISD::SELECT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // vandps + vandnps + vorps
{ .ISD: ISD::SELECT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // vandps + vandnps + vorps
};
3356 | |
  // SSE4.2: v2i64 compares become a single operation (cost 1 throughput),
  // unlike the multi-op SSE2 expansion below.
  static const CostKindTblEntry SSE42CostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
  };
3360 | |
  // SSE4.1: vector/scalar selects lower to a single variable-blend
  // (blendvpd/blendvps/pblendvb), so SELECT is much cheaper than the
  // and/andn/or sequence required on plain SSE2 (see SSE2CostTbl).
  static const CostKindTblEntry SSE41CostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvpd
    { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvpd
    { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvps
    { .ISD: ISD::SELECT, .Type: MVT::f32 , .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // blendvps
    { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
    { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // pblendvb
  };
3374 | |
  // SSE2 baseline: there is no 64-bit element compare instruction, so v2i64
  // SETCC is a multi-op expansion, and with no variable-blend instruction
  // every SELECT lowers to a 3-op and/andn/or mask sequence.
  static const CostKindTblEntry SSE2CostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SETCC, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, // pcmpeqd/pcmpgtd expansion
    { .ISD: ISD::SETCC, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SELECT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andpd + andnpd + orpd
    { .ISD: ISD::SELECT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andpd + andnpd + orpd
    { .ISD: ISD::SELECT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por
    { .ISD: ISD::SELECT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por
    { .ISD: ISD::SELECT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por
    { .ISD: ISD::SELECT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // pand + pandn + por
  };
3391 | |
  // SSE1: only f32 vectors are available; SELECT lowers to the 3-op
  // andps/andnps/orps mask sequence.
  static const CostKindTblEntry SSE1CostTbl[] = {
    { .ISD: ISD::SETCC, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SETCC, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },

    { .ISD: ISD::SELECT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andps + andnps + orps
    { .ISD: ISD::SELECT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // andps + andnps + orps
  };
3399 | |
  // Consult the cost tables in strict order from the most feature-rich ISA
  // level the subtarget supports down to the least (SLM special-case first,
  // then AVX512BW -> AVX512 -> AVX2 -> XOP -> AVX -> SSE4.2 -> SSE4.1 ->
  // SSE2 -> SSE1). The first table containing an (ISD, MTy) entry that
  // defines a cost for the requested CostKind wins. The per-op cost is
  // added to ExtraCost (computed earlier in this function) and scaled by
  // LT.first, the number of pieces type legalization splits the vector into.
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(Table: AVX512CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(Table: AVX2CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasXOP())
    if (const auto *Entry = CostTableLookup(Table: XOPCostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(Table: SSE42CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(Table: SSE1CostTbl, ISD, Ty: MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return LT.first * (ExtraCost + *KindCost);

  // No table entry matched. Assume a 3cy latency for fp select ops.
  if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select)
    if (ValTy->getScalarType()->isFloatingPointTy())
      return 3;

  // Otherwise defer to the target-independent implementation.
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3456 | } |
3457 | |
3458 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } |
3459 | |
3460 | InstructionCost |
3461 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
3462 | TTI::TargetCostKind CostKind) { |
3463 | // Costs should match the codegen from: |
3464 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll |
3465 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll |
3466 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll |
3467 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll |
3468 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll |
3469 | |
3470 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not |
3471 | // specialized in these tables yet. |
  // AVX512VBMI2: funnel shifts (FSHL) and 16-bit rotates are a single
  // operation for every cost kind on all supported widths.
  static const CostKindTblEntry AVX512VBMI2CostTbl[] = {
    { .ISD: ISD::FSHL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::FSHL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ROTL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ROTR, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  };
  // AVX512BITALG: population count on byte/word element vectors is a
  // single operation at every width.
  static const CostKindTblEntry AVX512BITALGCostTbl[] = {
    { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  };
  // AVX512VPOPCNTDQ: population count on dword/qword element vectors is a
  // single operation at every width.
  static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = {
    { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
  };
  // AVX512CD: leading-zero count is cheap (single op throughput) for
  // i32/i64 element vectors; i8/i16 elements and all CTTZ cases lower to
  // multi-instruction sequences, reflected in the larger costs.
  static const CostKindTblEntry AVX512CDCostTbl[] = {
    { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 27, .CodeSizeCost: 23, .SizeAndLatencyCost: 27 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 16, .CodeSizeCost: 9, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 19, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 15, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 10, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } },

    { .ISD: ISD::CTTZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } },
  };
  // AVX512BW: extends AVX-512 ops to 8/16-bit elements and full 512-bit
  // byte/word vectors. ABS, min/max and the *SAT ops are single
  // instructions; bit-manipulation ops (BITREVERSE/CTLZ/CTPOP/CTTZ) and
  // variable rotates are multi-op lowerings, costed per kind below.
  static const CostKindTblEntry AVX512BWCostTbl[] = {
    { .ISD: ISD::ABS, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::ABS, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } },
    { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 12 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::BSWAP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 22, .CodeSizeCost: 23, .SizeAndLatencyCost: 23 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 23, .CodeSizeCost: 25, .SizeAndLatencyCost: 25 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 15, .CodeSizeCost: 15, .SizeAndLatencyCost: 16 } },
    { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 9 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 8, .CodeSizeCost: 10, .SizeAndLatencyCost: 12 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 11, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } },
    { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 10 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 10, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } },
    { .ISD: ISD::CTTZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 13 } },
    { .ISD: ISD::ROTL, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } },
    { .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } },
    { .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } },
    { .ISD: ISD::ROTL, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 12 } },
    { .ISD: ISD::ROTL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } },
    { .ISD: ISD::ROTL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 15, .CodeSizeCost: 7, .SizeAndLatencyCost: 10 } },
    { .ISD: ISD::ROTR, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 8 } },
    { .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } },
    { .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } },
    { .ISD: ISD::ROTR, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 12, .SizeAndLatencyCost: 14 } },
    { .ISD: ISD::ROTR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 14, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } },
    { .ISD: ISD::ROTR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 14, .CodeSizeCost: 6, .SizeAndLatencyCost: 9 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 5, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 9, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
    { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
    // *SAT entries only define the throughput cost; other kinds fall
    // through to a later table / the base implementation.
    { .ISD: ISD::SADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::SADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::SMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::SSUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::SSUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::UADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::UADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::UMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::UMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
    { .ISD: ISD::USUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1 } },
    { .ISD: ISD::USUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1 } },
  };
3610 | static const CostKindTblEntry AVX512CostTbl[] = { |
3611 | { .ISD: ISD::ABS, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3612 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3613 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3614 | { .ISD: ISD::ABS, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3615 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3616 | { .ISD: ISD::ABS, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3617 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3618 | { .ISD: ISD::ABS, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
3619 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3620 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
3621 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
3622 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 13, .CodeSizeCost: 20, .SizeAndLatencyCost: 20 } }, |
3623 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
3624 | { .ISD: ISD::BSWAP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3625 | { .ISD: ISD::BSWAP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3626 | { .ISD: ISD::BSWAP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3627 | { .ISD: ISD::CTLZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 28, .CodeSizeCost: 32, .SizeAndLatencyCost: 32 } }, |
3628 | { .ISD: ISD::CTLZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 30, .CodeSizeCost: 38, .SizeAndLatencyCost: 38 } }, |
3629 | { .ISD: ISD::CTLZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 15, .CodeSizeCost: 29, .SizeAndLatencyCost: 29 } }, |
3630 | { .ISD: ISD::CTLZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 11, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
3631 | { .ISD: ISD::CTPOP, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
3632 | { .ISD: ISD::CTPOP, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 19, .CodeSizeCost: 27, .SizeAndLatencyCost: 27 } }, |
3633 | { .ISD: ISD::CTPOP, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 15, .CodeSizeCost: 22, .SizeAndLatencyCost: 22 } }, |
3634 | { .ISD: ISD::CTPOP, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 11, .CodeSizeCost: 16, .SizeAndLatencyCost: 16 } }, |
3635 | { .ISD: ISD::CTTZ, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3636 | { .ISD: ISD::CTTZ, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3637 | { .ISD: ISD::CTTZ, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 17, .CodeSizeCost: 27, .SizeAndLatencyCost: 27 } }, |
3638 | { .ISD: ISD::CTTZ, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 13, .CodeSizeCost: 21, .SizeAndLatencyCost: 21 } }, |
3639 | { .ISD: ISD::ROTL, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3640 | { .ISD: ISD::ROTL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3641 | { .ISD: ISD::ROTL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3642 | { .ISD: ISD::ROTL, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3643 | { .ISD: ISD::ROTL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3644 | { .ISD: ISD::ROTL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3645 | { .ISD: ISD::ROTR, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3646 | { .ISD: ISD::ROTR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3647 | { .ISD: ISD::ROTR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3648 | { .ISD: ISD::ROTR, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3649 | { .ISD: ISD::ROTR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3650 | { .ISD: ISD::ROTR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3651 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3652 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3653 | { .ISD: X86ISD::VROTLI, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3654 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3655 | { .ISD: X86ISD::VROTLI, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3656 | { .ISD: X86ISD::VROTLI, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3657 | { .ISD: ISD::SMAX, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3658 | { .ISD: ISD::SMAX, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3659 | { .ISD: ISD::SMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3660 | { .ISD: ISD::SMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3661 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3662 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3663 | { .ISD: ISD::SMIN, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3664 | { .ISD: ISD::SMIN, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3665 | { .ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3666 | { .ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3667 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3668 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3669 | { .ISD: ISD::UMAX, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3670 | { .ISD: ISD::UMAX, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3671 | { .ISD: ISD::UMAX, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3672 | { .ISD: ISD::UMAX, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3673 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3674 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3675 | { .ISD: ISD::UMIN, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3676 | { .ISD: ISD::UMIN, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3677 | { .ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3678 | { .ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
3679 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3680 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3681 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 2 } }, // pmaxud + psubd |
3682 | { .ISD: ISD::USUBSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2 } }, // pmaxuq + psubq |
3683 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2 } }, // pmaxuq + psubq |
3684 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 2 } }, // pmaxuq + psubq |
3685 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 3 } }, // not + pminud + paddd |
3686 | { .ISD: ISD::UADDSAT, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3 } }, // not + pminuq + paddq |
3687 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3 } }, // not + pminuq + paddq |
3688 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 3 } }, // not + pminuq + paddq |
3689 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2 } }, |
3690 | { .ISD: ISD::SADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2 } }, |
3691 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2 } }, |
3692 | { .ISD: ISD::SSUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2 } }, |
3693 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2 } }, |
3694 | { .ISD: ISD::UADDSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2 } }, |
3695 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 2 } }, |
3696 | { .ISD: ISD::USUBSAT, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 2 } }, |
3697 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3698 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3699 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3700 | { .ISD: ISD::FMAXNUM, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3701 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3702 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3703 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3704 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
3705 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3706 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3707 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3708 | { .ISD: ISD::FSQRT, .Type: MVT::v16f32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
3709 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3710 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3711 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Skylake from http://www.agner.org/ |
3712 | { .ISD: ISD::FSQRT, .Type: MVT::v8f64, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // Skylake from http://www.agner.org/ |
3713 | }; |
// Costs used when the subtarget supports AMD XOP.
// Each cost tuple is { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
// The 128-bit BITREVERSE/rotate rows are cheap because XOP provides direct
// vector permute/rotate instructions; the 256-bit rows cost roughly double
// plus extra -- presumably 2 x 128-bit ops with extract/insert, matching the
// pattern documented in the neighboring AVX1 table. TODO confirm against the
// actual lowering.
static const CostKindTblEntry XOPCostTbl[] = {
{ .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
// Scalar BITREVERSE rows: costed here because XOP allows a cheap
// move-to-vector / reverse / move-back sequence for scalar types.
// NOTE(review): assumption inferred from the uniform 2/2/3/4 costs --
// confirm against the scalar lowering.
{ .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } },
// XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y))
// ROTR costs a little more in CodeSize/SizeAndLatency than ROTL because of
// the extra negation of the rotate amount (see the identity above).
{ .ISD: ISD::ROTL, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::ROTL, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::ROTL, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::ROTL, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::ROTL, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ROTL, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ROTL, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ROTL, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ROTR, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } },
{ .ISD: ISD::ROTR, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } },
{ .ISD: ISD::ROTR, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } },
{ .ISD: ISD::ROTR, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 8, .SizeAndLatencyCost: 9 } },
{ .ISD: ISD::ROTR, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::ROTR, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::ROTR, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::ROTR, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } },
// Rotate-by-immediate: same costs as the variable-amount ROTL rows above.
{ .ISD: X86ISD::VROTLI, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 7, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
};
// Costs used when the subtarget supports AVX2 (256-bit integer vectors,
// per the file header these numbers track Haswell-class CPUs).
// Each cost tuple is { RecipThroughput, Latency, CodeSize, SizeAndLatency }.
// Rows that spell out only RecipThroughput (the *SAT entries) leave the
// other cost kinds to the entry's defaults -- NOTE(review): confirm how
// CostKindTblEntry treats unspecified kinds before relying on them.
static const CostKindTblEntry AVX2CostTbl[] = {
// i64 ABS has no single instruction pre-AVX512; costed as the
// blend/subtract sequence noted on the right.
{ .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
{ .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
{ .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } },
{ .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
// Bit-manipulation ops (BITREVERSE/BSWAP/CTLZ/CTPOP/CTTZ) have no direct
// AVX2 instructions; the costs model multi-instruction shuffle/LUT
// expansions -- NOTE(review): exact sequences live in the lowering code,
// not visible here.
{ .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 11 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 10, .SizeAndLatencyCost: 17 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } },
{ .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 15 } },
{ .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 18, .CodeSizeCost: 24, .SizeAndLatencyCost: 25 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 18, .CodeSizeCost: 24, .SizeAndLatencyCost: 44 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 20 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 16, .CodeSizeCost: 19, .SizeAndLatencyCost: 34 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 13, .CodeSizeCost: 14, .SizeAndLatencyCost: 15 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 14, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 10 } },
{ .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 14 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 9, .CodeSizeCost: 10, .SizeAndLatencyCost: 10 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 11, .SizeAndLatencyCost: 18 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } },
{ .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 13 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 11, .CodeSizeCost: 13, .SizeAndLatencyCost: 20 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 14, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 17, .SizeAndLatencyCost: 24 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 14 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 14, .SizeAndLatencyCost: 24 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } },
{ .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 11, .SizeAndLatencyCost: 18 } },
// Saturating add/sub on i8/i16 map to single VPADDS*/VPSUBS*/VPADDUS*/
// VPSUBUS*-style instructions, hence cost 1; i32/i64 variants need the
// multi-op sequences noted on those rows.
{ .ISD: ISD::SADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::SADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1 } },
// i64 SMAX/SMIN/UMAX/UMIN have no native AVX2 instruction (compare+blend
// sequences); the i8/i16/i32 forms are single instructions.
{ .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::SMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } },
{ .ISD: ISD::SMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::SSUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::SSUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::UADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::UADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3 } }, // not + pminud + paddd
{ .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } },
{ .ISD: ISD::UMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::UMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::UMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } },
{ .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 8 } },
{ .ISD: ISD::UMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } },
{ .ISD: ISD::USUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::USUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1 } },
{ .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 2 } }, // pmaxud + psubd
// FMAXNUM needs NaN-propagation handling on x86 (MAX* picks the second
// operand on NaN), hence the 3-instruction sequences on the right.
{ .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
{ .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
{ .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS
{ .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
{ .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
{ .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD
{ .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtss
{ .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 15, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtps
{ .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtps
{ .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtsd
{ .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtpd
{ .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtpd
};
3843 | static const CostKindTblEntry AVX1CostTbl[] = { |
3844 | { .ISD: ISD::ABS, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 8, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
3845 | { .ISD: ISD::ABS, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
3846 | { .ISD: ISD::ABS, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
3847 | { .ISD: ISD::ABS, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
3848 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
3849 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
3850 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
3851 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
3852 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 20, .CodeSizeCost: 20, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
3853 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 13, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
3854 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 15, .CodeSizeCost: 17, .SizeAndLatencyCost: 26 } }, // 2 x 128-bit Op + extract/insert |
3855 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 7, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
3856 | { .ISD: ISD::BSWAP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
3857 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
3858 | { .ISD: ISD::BSWAP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
3859 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
3860 | { .ISD: ISD::BSWAP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 10 } }, |
3861 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
3862 | { .ISD: ISD::CTLZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 29, .LatencyCost: 33, .CodeSizeCost: 49, .SizeAndLatencyCost: 58 } }, // 2 x 128-bit Op + extract/insert |
3863 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 24, .CodeSizeCost: 24, .SizeAndLatencyCost: 28 } }, |
3864 | { .ISD: ISD::CTLZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 24, .LatencyCost: 28, .CodeSizeCost: 39, .SizeAndLatencyCost: 48 } }, // 2 x 128-bit Op + extract/insert |
3865 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 20, .CodeSizeCost: 19, .SizeAndLatencyCost: 23 } }, |
3866 | { .ISD: ISD::CTLZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 22, .CodeSizeCost: 29, .SizeAndLatencyCost: 38 } }, // 2 x 128-bit Op + extract/insert |
3867 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 16, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
3868 | { .ISD: ISD::CTLZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 15, .CodeSizeCost: 19, .SizeAndLatencyCost: 28 } }, // 2 x 128-bit Op + extract/insert |
3869 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 12, .CodeSizeCost: 9, .SizeAndLatencyCost: 13 } }, |
3870 | { .ISD: ISD::CTPOP, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 18, .CodeSizeCost: 19, .SizeAndLatencyCost: 28 } }, // 2 x 128-bit Op + extract/insert |
3871 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 14, .CodeSizeCost: 10, .SizeAndLatencyCost: 14 } }, |
3872 | { .ISD: ISD::CTPOP, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 27, .SizeAndLatencyCost: 36 } }, // 2 x 128-bit Op + extract/insert |
3873 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 20, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
3874 | { .ISD: ISD::CTPOP, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 21, .CodeSizeCost: 22, .SizeAndLatencyCost: 31 } }, // 2 x 128-bit Op + extract/insert |
3875 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 18, .CodeSizeCost: 11, .SizeAndLatencyCost: 15 } }, |
3876 | { .ISD: ISD::CTPOP, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 15, .CodeSizeCost: 16, .SizeAndLatencyCost: 25 } }, // 2 x 128-bit Op + extract/insert |
3877 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 12, .CodeSizeCost: 8, .SizeAndLatencyCost: 12 } }, |
3878 | { .ISD: ISD::CTTZ, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 17, .LatencyCost: 22, .CodeSizeCost: 24, .SizeAndLatencyCost: 33 } }, // 2 x 128-bit Op + extract/insert |
3879 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 19, .CodeSizeCost: 13, .SizeAndLatencyCost: 17 } }, |
3880 | { .ISD: ISD::CTTZ, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 27, .CodeSizeCost: 32, .SizeAndLatencyCost: 41 } }, // 2 x 128-bit Op + extract/insert |
3881 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 24, .CodeSizeCost: 17, .SizeAndLatencyCost: 21 } }, |
3882 | { .ISD: ISD::CTTZ, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 27, .SizeAndLatencyCost: 36 } }, // 2 x 128-bit Op + extract/insert |
3883 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 21, .CodeSizeCost: 14, .SizeAndLatencyCost: 18 } }, |
3884 | { .ISD: ISD::CTTZ, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 18, .CodeSizeCost: 21, .SizeAndLatencyCost: 30 } }, // 2 x 128-bit Op + extract/insert |
3885 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 16, .CodeSizeCost: 11, .SizeAndLatencyCost: 15 } }, |
3886 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3887 | { .ISD: ISD::SADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3888 | { .ISD: ISD::SMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // 2 x 128-bit Op + extract/insert |
3889 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
3890 | { .ISD: ISD::SMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3891 | { .ISD: ISD::SMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3892 | { .ISD: ISD::SMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3893 | { .ISD: ISD::SMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 6, .LatencyCost: 9, .CodeSizeCost: 6, .SizeAndLatencyCost: 12 } }, // 2 x 128-bit Op + extract/insert |
3894 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3895 | { .ISD: ISD::SMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3896 | { .ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3897 | { .ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3898 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3899 | { .ISD: ISD::SSUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3900 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3901 | { .ISD: ISD::UADDSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3902 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 8 } }, // 2 x 128-bit Op + extract/insert |
3903 | { .ISD: ISD::UMAX, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 11, .SizeAndLatencyCost: 17 } }, // 2 x 128-bit Op + extract/insert |
3904 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, |
3905 | { .ISD: ISD::UMAX, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3906 | { .ISD: ISD::UMAX, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3907 | { .ISD: ISD::UMAX, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3908 | { .ISD: ISD::UMIN, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 10, .CodeSizeCost: 11, .SizeAndLatencyCost: 17 } }, // 2 x 128-bit Op + extract/insert |
3909 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 5, .SizeAndLatencyCost: 7 } }, |
3910 | { .ISD: ISD::UMIN, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3911 | { .ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3912 | { .ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3913 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3914 | { .ISD: ISD::USUBSAT, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 4 } }, // 2 x 128-bit Op + extract/insert |
3915 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 6 } }, // 2 x 128-bit Op + extract/insert |
3916 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
3917 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
3918 | { .ISD: ISD::FMAXNUM, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
3919 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
3920 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
3921 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 7, .CodeSizeCost: 3, .SizeAndLatencyCost: 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
3922 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtss |
3923 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 21, .LatencyCost: 21, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtps |
3924 | { .ISD: ISD::FSQRT, .Type: MVT::v8f32, .Cost: { .RecipThroughputCost: 42, .LatencyCost: 42, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtps |
3925 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 27, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtsd |
3926 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 27, .LatencyCost: 27, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // vsqrtpd |
3927 | { .ISD: ISD::FSQRT, .Type: MVT::v4f64, .Cost: { .RecipThroughputCost: 54, .LatencyCost: 54, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, // vsqrtpd |
3928 | }; |
3929 | static const CostKindTblEntry GFNICostTbl[] = { |
3930 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3931 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // gf2p8affineqb |
3932 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // gf2p8affineqb |
3933 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 3, .CodeSizeCost: 4, .SizeAndLatencyCost: 6 } }, // gf2p8affineqb |
3934 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
3935 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
3936 | { .ISD: ISD::BITREVERSE, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
3937 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3938 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3939 | { .ISD: ISD::BITREVERSE, .Type: MVT::v32i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3940 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3941 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3942 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3943 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 8, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3944 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3945 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 9, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, // gf2p8affineqb |
3946 | { .ISD: X86ISD::VROTLI, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
3947 | { .ISD: X86ISD::VROTLI, .Type: MVT::v32i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
3948 | { .ISD: X86ISD::VROTLI, .Type: MVT::v64i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 6, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // gf2p8affineqb |
3949 | }; |
3950 | static const CostKindTblEntry GLMCostTbl[] = { |
3951 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 19, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtss |
3952 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 37, .LatencyCost: 41, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtps |
3953 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 34, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtsd |
3954 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 67, .LatencyCost: 71, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtpd |
3955 | }; |
3956 | static const CostKindTblEntry SLMCostTbl[] = { |
3957 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
3958 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
3959 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
3960 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 20, .LatencyCost: 20, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtss |
3961 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 40, .LatencyCost: 41, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtps |
3962 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 35, .LatencyCost: 35, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // sqrtsd |
3963 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 70, .LatencyCost: 71, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, // sqrtpd |
3964 | }; |
3965 | static const CostKindTblEntry SSE42CostTbl[] = { |
3966 | { .ISD: ISD::USUBSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2 } }, // pmaxud + psubd |
3967 | { .ISD: ISD::UADDSAT, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 3 } }, // not + pminud + paddd |
3968 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
3969 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
3970 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
3971 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
3972 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
3973 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 18, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
3974 | }; |
3975 | static const CostKindTblEntry SSE41CostTbl[] = { |
3976 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 4, .CodeSizeCost: 3, .SizeAndLatencyCost: 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) |
3977 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3978 | { .ISD: ISD::SMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3979 | { .ISD: ISD::SMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3980 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 7, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
3981 | { .ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3982 | { .ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3983 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3984 | { .ISD: ISD::UMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3985 | { .ISD: ISD::UMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3986 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 11, .CodeSizeCost: 6, .SizeAndLatencyCost: 7 } }, |
3987 | { .ISD: ISD::UMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3988 | { .ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3989 | }; |
3990 | static const CostKindTblEntry SSSE3CostTbl[] = { |
3991 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3992 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3993 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
3994 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
3995 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
3996 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 11, .SizeAndLatencyCost: 21 } }, |
3997 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
3998 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
3999 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4000 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 5 } }, |
4001 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 28, .CodeSizeCost: 28, .SizeAndLatencyCost: 35 } }, |
4002 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 20, .CodeSizeCost: 22, .SizeAndLatencyCost: 28 } }, |
4003 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 17, .CodeSizeCost: 16, .SizeAndLatencyCost: 22 } }, |
4004 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 15, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4005 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 19, .CodeSizeCost: 12, .SizeAndLatencyCost: 18 } }, |
4006 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 24, .CodeSizeCost: 16, .SizeAndLatencyCost: 22 } }, |
4007 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 18, .CodeSizeCost: 14, .SizeAndLatencyCost: 20 } }, |
4008 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 10, .SizeAndLatencyCost: 16 } }, |
4009 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 25, .CodeSizeCost: 15, .SizeAndLatencyCost: 22 } }, |
4010 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 26, .CodeSizeCost: 19, .SizeAndLatencyCost: 25 } }, |
4011 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 20, .CodeSizeCost: 17, .SizeAndLatencyCost: 23 } }, |
4012 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 16, .CodeSizeCost: 13, .SizeAndLatencyCost: 19 } } |
4013 | }; |
4014 | static const CostKindTblEntry SSE2CostTbl[] = { |
4015 | { .ISD: ISD::ABS, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 3, .LatencyCost: 6, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4016 | { .ISD: ISD::ABS, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 4 } }, |
4017 | { .ISD: ISD::ABS, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4018 | { .ISD: ISD::ABS, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4019 | { .ISD: ISD::BITREVERSE, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 32, .SizeAndLatencyCost: 32 } }, |
4020 | { .ISD: ISD::BITREVERSE, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 30, .SizeAndLatencyCost: 30 } }, |
4021 | { .ISD: ISD::BITREVERSE, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 20, .CodeSizeCost: 25, .SizeAndLatencyCost: 25 } }, |
4022 | { .ISD: ISD::BITREVERSE, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 11, .LatencyCost: 12, .CodeSizeCost: 21, .SizeAndLatencyCost: 21 } }, |
4023 | { .ISD: ISD::BSWAP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 6, .CodeSizeCost: 11, .SizeAndLatencyCost: 11 } }, |
4024 | { .ISD: ISD::BSWAP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 9, .SizeAndLatencyCost: 9 } }, |
4025 | { .ISD: ISD::BSWAP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 4, .SizeAndLatencyCost: 5 } }, |
4026 | { .ISD: ISD::CTLZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 45, .CodeSizeCost: 36, .SizeAndLatencyCost: 38 } }, |
4027 | { .ISD: ISD::CTLZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 45, .CodeSizeCost: 38, .SizeAndLatencyCost: 40 } }, |
4028 | { .ISD: ISD::CTLZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 38, .CodeSizeCost: 32, .SizeAndLatencyCost: 34 } }, |
4029 | { .ISD: ISD::CTLZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 39, .CodeSizeCost: 29, .SizeAndLatencyCost: 32 } }, |
4030 | { .ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 12, .LatencyCost: 26, .CodeSizeCost: 16, .SizeAndLatencyCost: 18 } }, |
4031 | { .ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 15, .LatencyCost: 29, .CodeSizeCost: 21, .SizeAndLatencyCost: 23 } }, |
4032 | { .ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 25, .CodeSizeCost: 18, .SizeAndLatencyCost: 20 } }, |
4033 | { .ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 21, .CodeSizeCost: 14, .SizeAndLatencyCost: 16 } }, |
4034 | { .ISD: ISD::CTTZ, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 14, .LatencyCost: 28, .CodeSizeCost: 19, .SizeAndLatencyCost: 21 } }, |
4035 | { .ISD: ISD::CTTZ, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 18, .LatencyCost: 31, .CodeSizeCost: 24, .SizeAndLatencyCost: 26 } }, |
4036 | { .ISD: ISD::CTTZ, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 16, .LatencyCost: 27, .CodeSizeCost: 21, .SizeAndLatencyCost: 23 } }, |
4037 | { .ISD: ISD::CTTZ, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 13, .LatencyCost: 23, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
4038 | { .ISD: ISD::SADDSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1 } }, |
4039 | { .ISD: ISD::SADDSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1 } }, |
4040 | { .ISD: ISD::SMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4041 | { .ISD: ISD::SMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4042 | { .ISD: ISD::SMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4043 | { .ISD: ISD::SMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4044 | { .ISD: ISD::SMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4045 | { .ISD: ISD::SMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4046 | { .ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4047 | { .ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 5, .SizeAndLatencyCost: 5 } }, |
4048 | { .ISD: ISD::SSUBSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1 } }, |
4049 | { .ISD: ISD::SSUBSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1 } }, |
4050 | { .ISD: ISD::UADDSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1 } }, |
4051 | { .ISD: ISD::UADDSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1 } }, |
4052 | { .ISD: ISD::UMAX, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4053 | { .ISD: ISD::UMAX, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
4054 | { .ISD: ISD::UMAX, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4055 | { .ISD: ISD::UMAX, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4056 | { .ISD: ISD::UMIN, .Type: MVT::v2i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 8, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4057 | { .ISD: ISD::UMIN, .Type: MVT::v4i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 5, .CodeSizeCost: 8, .SizeAndLatencyCost: 8 } }, |
4058 | { .ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, |
4059 | { .ISD: ISD::UMIN, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4060 | { .ISD: ISD::USUBSAT, .Type: MVT::v8i16, .Cost: { .RecipThroughputCost: 1 } }, |
4061 | { .ISD: ISD::USUBSAT, .Type: MVT::v16i8, .Cost: { .RecipThroughputCost: 1 } }, |
4062 | { .ISD: ISD::FMAXNUM, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4063 | { .ISD: ISD::FMAXNUM, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4064 | { .ISD: ISD::FSQRT, .Type: MVT::f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
4065 | { .ISD: ISD::FSQRT, .Type: MVT::v2f64, .Cost: { .RecipThroughputCost: 32, .LatencyCost: 32, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // Nehalem from http://www.agner.org/ |
4066 | }; |
4067 | static const CostKindTblEntry SSE1CostTbl[] = { |
4068 | { .ISD: ISD::FMAXNUM, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 5, .LatencyCost: 5, .CodeSizeCost: 7, .SizeAndLatencyCost: 7 } }, |
4069 | { .ISD: ISD::FMAXNUM, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4070 | { .ISD: ISD::FSQRT, .Type: MVT::f32, .Cost: { .RecipThroughputCost: 28, .LatencyCost: 30, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
4071 | { .ISD: ISD::FSQRT, .Type: MVT::v4f32, .Cost: { .RecipThroughputCost: 56, .LatencyCost: 56, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // Pentium III from http://www.agner.org/ |
4072 | }; |
4073 | static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets |
4074 | { .ISD: ISD::CTTZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, |
4075 | }; |
4076 | static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets |
4077 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, |
4078 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, |
4079 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, |
4080 | }; |
4081 | static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets |
4082 | { .ISD: ISD::CTLZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, |
4083 | }; |
4084 | static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets |
4085 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, |
4086 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2 } }, |
4087 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2 } }, |
4088 | }; |
4089 | static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets |
4090 | { .ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // popcnt |
4091 | }; |
4092 | static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets |
4093 | { .ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // popcnt |
4094 | { .ISD: ISD::CTPOP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // popcnt(zext()) |
4095 | { .ISD: ISD::CTPOP, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 2, .SizeAndLatencyCost: 2 } }, // popcnt(zext()) |
4096 | }; |
4097 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
4098 | { .ISD: ISD::ABS, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+CMOV |
4099 | { .ISD: ISD::BITREVERSE, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 12, .CodeSizeCost: 20, .SizeAndLatencyCost: 22 } }, |
4100 | { .ISD: ISD::BSWAP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, |
4101 | { .ISD: ISD::CTLZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4 } }, // BSR+XOR or BSR+XOR+CMOV |
4102 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i64,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BSR+XOR |
4103 | { .ISD: ISD::CTTZ, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4104 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i64,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BSR |
4105 | { .ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 10, .LatencyCost: 6, .CodeSizeCost: 19, .SizeAndLatencyCost: 19 } }, |
4106 | { .ISD: ISD::ROTL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4107 | { .ISD: ISD::ROTR, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4108 | { .ISD: X86ISD::VROTLI, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4109 | { .ISD: ISD::FSHL, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 4 } }, |
4110 | { .ISD: ISD::SMAX, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4111 | { .ISD: ISD::SMIN, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4112 | { .ISD: ISD::UMAX, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4113 | { .ISD: ISD::UMIN, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 3, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4114 | { .ISD: ISD::SADDO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, |
4115 | { .ISD: ISD::UADDO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 1 } }, |
4116 | { .ISD: ISD::UMULO, .Type: MVT::i64, .Cost: { .RecipThroughputCost: 2 } }, // mulq + seto |
4117 | }; |
4118 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
4119 | { .ISD: ISD::ABS, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA or SUB+CMOV |
4120 | { .ISD: ISD::ABS, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA or SUB+CMOV |
4121 | { .ISD: ISD::ABS, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 4, .CodeSizeCost: 4, .SizeAndLatencyCost: 3 } }, // SUB+XOR+SRA |
4122 | { .ISD: ISD::BITREVERSE, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
4123 | { .ISD: ISD::BITREVERSE, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 12, .CodeSizeCost: 17, .SizeAndLatencyCost: 19 } }, |
4124 | { .ISD: ISD::BITREVERSE, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 9, .CodeSizeCost: 13, .SizeAndLatencyCost: 14 } }, |
4125 | { .ISD: ISD::BSWAP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4126 | { .ISD: ISD::BSWAP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 2 } }, // ROL |
4127 | { .ISD: ISD::CTLZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 4 } }, // BSR+XOR or BSR+XOR+CMOV |
4128 | { .ISD: ISD::CTLZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4 } }, // BSR+XOR or BSR+XOR+CMOV |
4129 | { .ISD: ISD::CTLZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4 } }, // BSR+XOR or BSR+XOR+CMOV |
4130 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i32,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BSR+XOR |
4131 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // BSR+XOR |
4132 | { .ISD: ISD::CTLZ_ZERO_UNDEF, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 3, .SizeAndLatencyCost: 3 } }, // BSR+XOR |
4133 | { .ISD: ISD::CTTZ, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4134 | { .ISD: ISD::CTTZ, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4135 | { .ISD: ISD::CTTZ, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 3 } }, // TEST+BSF+CMOV/BRANCH |
4136 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i32,.Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BSF |
4137 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i16,.Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BSF |
4138 | { .ISD: ISD::CTTZ_ZERO_UNDEF, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 2, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, // BSF |
4139 | { .ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 8, .LatencyCost: 7, .CodeSizeCost: 15, .SizeAndLatencyCost: 15 } }, |
4140 | { .ISD: ISD::CTPOP, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 9, .LatencyCost: 8, .CodeSizeCost: 17, .SizeAndLatencyCost: 17 } }, |
4141 | { .ISD: ISD::CTPOP, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 7, .LatencyCost: 6, .CodeSizeCost: 6, .SizeAndLatencyCost: 6 } }, |
4142 | { .ISD: ISD::ROTL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4143 | { .ISD: ISD::ROTL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4144 | { .ISD: ISD::ROTL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4145 | { .ISD: ISD::ROTR, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4146 | { .ISD: ISD::ROTR, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4147 | { .ISD: ISD::ROTR, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2, .LatencyCost: 3, .CodeSizeCost: 1, .SizeAndLatencyCost: 3 } }, |
4148 | { .ISD: X86ISD::VROTLI, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4149 | { .ISD: X86ISD::VROTLI, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4150 | { .ISD: X86ISD::VROTLI, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 1, .CodeSizeCost: 1, .SizeAndLatencyCost: 1 } }, |
4151 | { .ISD: ISD::FSHL, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 1, .SizeAndLatencyCost: 4 } }, |
4152 | { .ISD: ISD::FSHL, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 5 } }, |
4153 | { .ISD: ISD::FSHL, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 4, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 5 } }, |
4154 | { .ISD: ISD::SMAX, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4155 | { .ISD: ISD::SMAX, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4156 | { .ISD: ISD::SMAX, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4157 | { .ISD: ISD::SMIN, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4158 | { .ISD: ISD::SMIN, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4159 | { .ISD: ISD::SMIN, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4160 | { .ISD: ISD::UMAX, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4161 | { .ISD: ISD::UMAX, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4162 | { .ISD: ISD::UMAX, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4163 | { .ISD: ISD::UMIN, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 2, .CodeSizeCost: 2, .SizeAndLatencyCost: 3 } }, |
4164 | { .ISD: ISD::UMIN, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4165 | { .ISD: ISD::UMIN, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1, .LatencyCost: 4, .CodeSizeCost: 2, .SizeAndLatencyCost: 4 } }, |
4166 | { .ISD: ISD::SADDO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, |
4167 | { .ISD: ISD::SADDO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, |
4168 | { .ISD: ISD::SADDO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, |
4169 | { .ISD: ISD::UADDO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 1 } }, |
4170 | { .ISD: ISD::UADDO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 1 } }, |
4171 | { .ISD: ISD::UADDO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 1 } }, |
4172 | { .ISD: ISD::UMULO, .Type: MVT::i32, .Cost: { .RecipThroughputCost: 2 } }, // mul + seto |
4173 | { .ISD: ISD::UMULO, .Type: MVT::i16, .Cost: { .RecipThroughputCost: 2 } }, |
4174 | { .ISD: ISD::UMULO, .Type: MVT::i8, .Cost: { .RecipThroughputCost: 2 } }, |
4175 | }; |
4176 | |
4177 | Type *RetTy = ICA.getReturnType(); |
4178 | Type *OpTy = RetTy; |
4179 | Intrinsic::ID IID = ICA.getID(); |
4180 | unsigned ISD = ISD::DELETED_NODE; |
4181 | switch (IID) { |
4182 | default: |
4183 | break; |
4184 | case Intrinsic::abs: |
4185 | ISD = ISD::ABS; |
4186 | break; |
4187 | case Intrinsic::bitreverse: |
4188 | ISD = ISD::BITREVERSE; |
4189 | break; |
4190 | case Intrinsic::bswap: |
4191 | ISD = ISD::BSWAP; |
4192 | break; |
4193 | case Intrinsic::ctlz: |
4194 | ISD = ISD::CTLZ; |
4195 | break; |
4196 | case Intrinsic::ctpop: |
4197 | ISD = ISD::CTPOP; |
4198 | break; |
4199 | case Intrinsic::cttz: |
4200 | ISD = ISD::CTTZ; |
4201 | break; |
4202 | case Intrinsic::fshl: |
4203 | ISD = ISD::FSHL; |
4204 | if (!ICA.isTypeBasedOnly()) { |
4205 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
4206 | if (Args[0] == Args[1]) { |
4207 | ISD = ISD::ROTL; |
4208 | // Handle uniform constant rotation amounts. |
4209 | // TODO: Handle funnel-shift cases. |
4210 | const APInt *Amt; |
4211 | if (Args[2] && |
4212 | PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))) |
4213 | ISD = X86ISD::VROTLI; |
4214 | } |
4215 | } |
4216 | break; |
4217 | case Intrinsic::fshr: |
4218 | // FSHR has same costs so don't duplicate. |
4219 | ISD = ISD::FSHL; |
4220 | if (!ICA.isTypeBasedOnly()) { |
4221 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
4222 | if (Args[0] == Args[1]) { |
4223 | ISD = ISD::ROTR; |
4224 | // Handle uniform constant rotation amount. |
4225 | // TODO: Handle funnel-shift cases. |
4226 | const APInt *Amt; |
4227 | if (Args[2] && |
4228 | PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))) |
4229 | ISD = X86ISD::VROTLI; |
4230 | } |
4231 | } |
4232 | break; |
4233 | case Intrinsic::lrint: |
4234 | case Intrinsic::llrint: |
4235 | // X86 can use the CVTP2SI instructions to lower lrint/llrint calls, which |
4236 | // have the same costs as the CVTTP2SI (fptosi) instructions |
4237 | if (!ICA.isTypeBasedOnly()) { |
4238 | const SmallVectorImpl<Type *> &ArgTys = ICA.getArgTypes(); |
4239 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: RetTy, Src: ArgTys[0], |
4240 | CCH: TTI::CastContextHint::None, CostKind); |
4241 | } |
4242 | break; |
4243 | case Intrinsic::maxnum: |
4244 | case Intrinsic::minnum: |
4245 | // FMINNUM has same costs so don't duplicate. |
4246 | ISD = ISD::FMAXNUM; |
4247 | break; |
4248 | case Intrinsic::sadd_sat: |
4249 | ISD = ISD::SADDSAT; |
4250 | break; |
4251 | case Intrinsic::smax: |
4252 | ISD = ISD::SMAX; |
4253 | break; |
4254 | case Intrinsic::smin: |
4255 | ISD = ISD::SMIN; |
4256 | break; |
4257 | case Intrinsic::ssub_sat: |
4258 | ISD = ISD::SSUBSAT; |
4259 | break; |
4260 | case Intrinsic::uadd_sat: |
4261 | ISD = ISD::UADDSAT; |
4262 | break; |
4263 | case Intrinsic::umax: |
4264 | ISD = ISD::UMAX; |
4265 | break; |
4266 | case Intrinsic::umin: |
4267 | ISD = ISD::UMIN; |
4268 | break; |
4269 | case Intrinsic::usub_sat: |
4270 | ISD = ISD::USUBSAT; |
4271 | break; |
4272 | case Intrinsic::sqrt: |
4273 | ISD = ISD::FSQRT; |
4274 | break; |
4275 | case Intrinsic::sadd_with_overflow: |
4276 | case Intrinsic::ssub_with_overflow: |
4277 | // SSUBO has same costs so don't duplicate. |
4278 | ISD = ISD::SADDO; |
4279 | OpTy = RetTy->getContainedType(i: 0); |
4280 | break; |
4281 | case Intrinsic::uadd_with_overflow: |
4282 | case Intrinsic::usub_with_overflow: |
4283 | // USUBO has same costs so don't duplicate. |
4284 | ISD = ISD::UADDO; |
4285 | OpTy = RetTy->getContainedType(i: 0); |
4286 | break; |
4287 | case Intrinsic::umul_with_overflow: |
4288 | case Intrinsic::smul_with_overflow: |
4289 | // SMULO has same costs so don't duplicate. |
4290 | ISD = ISD::UMULO; |
4291 | OpTy = RetTy->getContainedType(i: 0); |
4292 | break; |
4293 | } |
4294 | |
4295 | if (ISD != ISD::DELETED_NODE) { |
4296 | auto adjustTableCost = [&](int ISD, unsigned Cost, |
4297 | std::pair<InstructionCost, MVT> LT, |
4298 | FastMathFlags FMF) -> InstructionCost { |
4299 | InstructionCost LegalizationCost = LT.first; |
4300 | MVT MTy = LT.second; |
4301 | |
4302 | // If there are no NANs to deal with, then these are reduced to a |
4303 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we |
4304 | // assume is used in the non-fast case. |
4305 | if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { |
4306 | if (FMF.noNaNs()) |
4307 | return LegalizationCost * 1; |
4308 | } |
4309 | |
4310 | // For cases where some ops can be folded into a load/store, assume free. |
4311 | if (MTy.isScalarInteger()) { |
4312 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { |
4313 | if (const Instruction *II = ICA.getInst()) { |
4314 | if (II->hasOneUse() && isa<StoreInst>(Val: II->user_back())) |
4315 | return TTI::TCC_Free; |
4316 | if (auto *LI = dyn_cast<LoadInst>(Val: II->getOperand(i: 0))) { |
4317 | if (LI->hasOneUse()) |
4318 | return TTI::TCC_Free; |
4319 | } |
4320 | } |
4321 | } |
4322 | } |
4323 | |
4324 | return LegalizationCost * (int)Cost; |
4325 | }; |
4326 | |
4327 | // Legalize the type. |
4328 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: OpTy); |
4329 | MVT MTy = LT.second; |
4330 | |
4331 | // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. |
4332 | if (((ISD == ISD::CTTZ && !ST->hasBMI()) || |
4333 | (ISD == ISD::CTLZ && !ST->hasLZCNT())) && |
4334 | !MTy.isVector() && !ICA.isTypeBasedOnly()) { |
4335 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); |
4336 | if (auto *Cst = dyn_cast<ConstantInt>(Val: Args[1])) |
4337 | if (Cst->isAllOnesValue()) |
4338 | ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; |
4339 | } |
4340 | |
4341 | // FSQRT is a single instruction. |
4342 | if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) |
4343 | return LT.first; |
4344 | |
4345 | if (ST->useGLMDivSqrtCosts()) |
4346 | if (const auto *Entry = CostTableLookup(Table: GLMCostTbl, ISD, Ty: MTy)) |
4347 | if (auto KindCost = Entry->Cost[CostKind]) |
4348 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4349 | |
4350 | if (ST->useSLMArithCosts()) |
4351 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
4352 | if (auto KindCost = Entry->Cost[CostKind]) |
4353 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4354 | |
4355 | if (ST->hasVBMI2()) |
4356 | if (const auto *Entry = CostTableLookup(Table: AVX512VBMI2CostTbl, ISD, Ty: MTy)) |
4357 | if (auto KindCost = Entry->Cost[CostKind]) |
4358 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4359 | |
4360 | if (ST->hasBITALG()) |
4361 | if (const auto *Entry = CostTableLookup(Table: AVX512BITALGCostTbl, ISD, Ty: MTy)) |
4362 | if (auto KindCost = Entry->Cost[CostKind]) |
4363 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4364 | |
4365 | if (ST->hasVPOPCNTDQ()) |
4366 | if (const auto *Entry = CostTableLookup(Table: AVX512VPOPCNTDQCostTbl, ISD, Ty: MTy)) |
4367 | if (auto KindCost = Entry->Cost[CostKind]) |
4368 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4369 | |
4370 | if (ST->hasGFNI()) |
4371 | if (const auto *Entry = CostTableLookup(Table: GFNICostTbl, ISD, Ty: MTy)) |
4372 | if (auto KindCost = Entry->Cost[CostKind]) |
4373 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4374 | |
4375 | if (ST->hasCDI()) |
4376 | if (const auto *Entry = CostTableLookup(Table: AVX512CDCostTbl, ISD, Ty: MTy)) |
4377 | if (auto KindCost = Entry->Cost[CostKind]) |
4378 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4379 | |
4380 | if (ST->hasBWI()) |
4381 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
4382 | if (auto KindCost = Entry->Cost[CostKind]) |
4383 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4384 | |
4385 | if (ST->hasAVX512()) |
4386 | if (const auto *Entry = CostTableLookup(Table: AVX512CostTbl, ISD, Ty: MTy)) |
4387 | if (auto KindCost = Entry->Cost[CostKind]) |
4388 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4389 | |
4390 | if (ST->hasXOP()) |
4391 | if (const auto *Entry = CostTableLookup(Table: XOPCostTbl, ISD, Ty: MTy)) |
4392 | if (auto KindCost = Entry->Cost[CostKind]) |
4393 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4394 | |
4395 | if (ST->hasAVX2()) |
4396 | if (const auto *Entry = CostTableLookup(Table: AVX2CostTbl, ISD, Ty: MTy)) |
4397 | if (auto KindCost = Entry->Cost[CostKind]) |
4398 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4399 | |
4400 | if (ST->hasAVX()) |
4401 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
4402 | if (auto KindCost = Entry->Cost[CostKind]) |
4403 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4404 | |
4405 | if (ST->hasSSE42()) |
4406 | if (const auto *Entry = CostTableLookup(Table: SSE42CostTbl, ISD, Ty: MTy)) |
4407 | if (auto KindCost = Entry->Cost[CostKind]) |
4408 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4409 | |
4410 | if (ST->hasSSE41()) |
4411 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
4412 | if (auto KindCost = Entry->Cost[CostKind]) |
4413 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4414 | |
4415 | if (ST->hasSSSE3()) |
4416 | if (const auto *Entry = CostTableLookup(Table: SSSE3CostTbl, ISD, Ty: MTy)) |
4417 | if (auto KindCost = Entry->Cost[CostKind]) |
4418 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4419 | |
4420 | if (ST->hasSSE2()) |
4421 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
4422 | if (auto KindCost = Entry->Cost[CostKind]) |
4423 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4424 | |
4425 | if (ST->hasSSE1()) |
4426 | if (const auto *Entry = CostTableLookup(Table: SSE1CostTbl, ISD, Ty: MTy)) |
4427 | if (auto KindCost = Entry->Cost[CostKind]) |
4428 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4429 | |
4430 | if (ST->hasBMI()) { |
4431 | if (ST->is64Bit()) |
4432 | if (const auto *Entry = CostTableLookup(Table: BMI64CostTbl, ISD, Ty: MTy)) |
4433 | if (auto KindCost = Entry->Cost[CostKind]) |
4434 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4435 | |
4436 | if (const auto *Entry = CostTableLookup(Table: BMI32CostTbl, ISD, Ty: MTy)) |
4437 | if (auto KindCost = Entry->Cost[CostKind]) |
4438 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4439 | } |
4440 | |
4441 | if (ST->hasLZCNT()) { |
4442 | if (ST->is64Bit()) |
4443 | if (const auto *Entry = CostTableLookup(Table: LZCNT64CostTbl, ISD, Ty: MTy)) |
4444 | if (auto KindCost = Entry->Cost[CostKind]) |
4445 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4446 | |
4447 | if (const auto *Entry = CostTableLookup(Table: LZCNT32CostTbl, ISD, Ty: MTy)) |
4448 | if (auto KindCost = Entry->Cost[CostKind]) |
4449 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4450 | } |
4451 | |
4452 | if (ST->hasPOPCNT()) { |
4453 | if (ST->is64Bit()) |
4454 | if (const auto *Entry = CostTableLookup(Table: POPCNT64CostTbl, ISD, Ty: MTy)) |
4455 | if (auto KindCost = Entry->Cost[CostKind]) |
4456 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4457 | |
4458 | if (const auto *Entry = CostTableLookup(Table: POPCNT32CostTbl, ISD, Ty: MTy)) |
4459 | if (auto KindCost = Entry->Cost[CostKind]) |
4460 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4461 | } |
4462 | |
4463 | if (ST->is64Bit()) |
4464 | if (const auto *Entry = CostTableLookup(Table: X64CostTbl, ISD, Ty: MTy)) |
4465 | if (auto KindCost = Entry->Cost[CostKind]) |
4466 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4467 | |
4468 | if (const auto *Entry = CostTableLookup(Table: X86CostTbl, ISD, Ty: MTy)) |
4469 | if (auto KindCost = Entry->Cost[CostKind]) |
4470 | return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags()); |
4471 | } |
4472 | |
4473 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
4474 | } |
4475 | |
// Return the x86 cost of inserting or extracting a single element of vector
// type \p Val at lane \p Index (-1U means the index is unknown / variable).
// For insertions, \p Op0 is the source vector operand and \p Op1 the inserted
// scalar when known; either may be null for type-based-only queries.
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  // Silvermont-class cores: integer element extraction (XMM -> GPR crossing)
  // is noticeably more expensive than on big cores.
  static const CostTblEntry SLMCostTbl[] = {
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i8, .Cost: 4 },
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i16, .Cost: 4 },
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i32, .Cost: 4 },
    { .ISD: ISD::EXTRACT_VECTOR_ELT, .Type: MVT::i64, .Cost: 7 }
  };

  assert(Val->isVectorTy() && "This must be a vector type" );
  Type *ScalarType = Val->getScalarType();
  // Accumulates the extra cost of moving 128-bit subvectors in/out of wider
  // (256/512-bit) registers; added to whichever cost path we return below.
  InstructionCost RegisterFileMoveCost = 0;

  // Non-immediate extraction/insertion can be handled as a sequence of
  // aliased loads+stores via the stack.
  if (Index == -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.

    // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
    assert(isa<FixedVectorType>(Val) && "Fixed vector type expected" );
    Align VecAlign = DL.getPrefTypeAlign(Ty: Val);
    Align SclAlign = DL.getPrefTypeAlign(Ty: ScalarType);

    // Extract - store vector to stack, load scalar.
    if (Opcode == Instruction::ExtractElement) {
      return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
             getMemoryOpCost(Opcode: Instruction::Load, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
                             CostKind);
    }
    // Insert - store vector to stack, store scalar, load vector.
    if (Opcode == Instruction::InsertElement) {
      return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
             getMemoryOpCost(Opcode: Instruction::Store, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
                             CostKind) +
             getMemoryOpCost(Opcode: Instruction::Load, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind);
    }
  }

  // Known (immediate) index path.
  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
    if (Opcode == Instruction::ExtractElement &&
        ScalarType->getScalarSizeInBits() == 1 &&
        cast<FixedVectorType>(Val)->getNumElements() > 1)
      return 1;

    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned SizeInBits = LT.second.getSizeInBits();
    unsigned NumElts = LT.second.getVectorNumElements();
    unsigned SubNumElts = NumElts;
    Index = Index % NumElts;

    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
    // For inserts, we also need to insert the subvector back.
    if (SizeInBits > 128) {
      assert((SizeInBits % 128) == 0 && "Illegal vector" );
      unsigned NumSubVecs = SizeInBits / 128;
      SubNumElts = NumElts / NumSubVecs;
      if (SubNumElts <= Index) {
        // Element lives in an upper 128-bit lane: extract (and for inserts,
        // re-insert) that subvector, then work with the in-lane index.
        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
        Index %= SubNumElts;
      }
    }

    MVT MScalarTy = LT.second.getScalarType();
    auto IsCheapPInsrPExtrInsertPS = [&]() {
      // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
      // Also, assume insertps is relatively cheap on all >= SSE41 targets.
      return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
             (MScalarTy.isInteger() && ST->hasSSE41()) ||
             (MScalarTy == MVT::f32 && ST->hasSSE41() &&
              Opcode == Instruction::InsertElement);
    };

    if (Index == 0) {
      // Floating point scalars are already located in index #0.
      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
      // true for all.
      if (ScalarType->isFloatingPointTy() &&
          (Opcode != Instruction::InsertElement || !Op0 ||
           isa<UndefValue>(Val: Op0)))
        return RegisterFileMoveCost;

      if (Opcode == Instruction::InsertElement &&
          isa_and_nonnull<UndefValue>(Val: Op0)) {
        // Consider the gather cost to be cheap.
        if (isa_and_nonnull<LoadInst>(Val: Op1))
          return RegisterFileMoveCost;
        if (!IsCheapPInsrPExtrInsertPS()) {
          // mov constant-to-GPR + movd/movq GPR -> XMM.
          if (isa_and_nonnull<Constant>(Val: Op1) && Op1->getType()->isIntegerTy())
            return 2 + RegisterFileMoveCost;
          // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
          return 1 + RegisterFileMoveCost;
        }
      }

      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
        return 1 + RegisterFileMoveCost;
    }

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Unexpected vector opcode" );
    // Silvermont-class overrides for (costly) integer extraction.
    if (ST->useSLMArithCosts())
      if (auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MScalarTy))
        return Entry->Cost + RegisterFileMoveCost;

    // Consider cheap cases.
    if (IsCheapPInsrPExtrInsertPS())
      return 1 + RegisterFileMoveCost;

    // For extractions we just need to shuffle the element to index 0, which
    // should be very cheap (assume cost = 1). For insertions we need to shuffle
    // the elements to its destination. In both cases we must handle the
    // subvector move(s).
    // If the vector type is already less than 128-bits then don't reduce it.
    // TODO: Under what circumstances should we shuffle using the full width?
    InstructionCost ShuffleCost = 1;
    if (Opcode == Instruction::InsertElement) {
      auto *SubTy = cast<VectorType>(Val);
      EVT VT = TLI->getValueType(DL, Ty: Val);
      if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
        SubTy = FixedVectorType::get(ElementType: ScalarType, NumElts: SubNumElts);
      ShuffleCost = getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SubTy, Mask: std::nullopt,
                                   CostKind, Index: 0, SubTp: SubTy);
    }
    // Integer elements pay one extra GPR<->XMM transfer; fp values stay in
    // the vector register file.
    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
  }

  // Fall back to the generic implementation, still accounting for any
  // subvector moves computed above.
  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
         RegisterFileMoveCost;
}
4621 | |
4622 | InstructionCost |
4623 | X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, |
4624 | bool Insert, bool , |
4625 | TTI::TargetCostKind CostKind) { |
4626 | assert(DemandedElts.getBitWidth() == |
4627 | cast<FixedVectorType>(Ty)->getNumElements() && |
4628 | "Vector size mismatch" ); |
4629 | |
4630 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
4631 | MVT MScalarTy = LT.second.getScalarType(); |
4632 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); |
4633 | InstructionCost Cost = 0; |
4634 | |
4635 | constexpr unsigned LaneBitWidth = 128; |
4636 | assert((LegalVectorBitWidth < LaneBitWidth || |
4637 | (LegalVectorBitWidth % LaneBitWidth) == 0) && |
4638 | "Illegal vector" ); |
4639 | |
4640 | const int NumLegalVectors = *LT.first.getValue(); |
4641 | assert(NumLegalVectors >= 0 && "Negative cost!" ); |
4642 | |
4643 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much |
4644 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. |
4645 | if (Insert) { |
4646 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || |
4647 | (MScalarTy.isInteger() && ST->hasSSE41()) || |
4648 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { |
4649 | // For types we can insert directly, insertion into 128-bit sub vectors is |
4650 | // cheap, followed by a cheap chain of concatenations. |
4651 | if (LegalVectorBitWidth <= LaneBitWidth) { |
4652 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, |
4653 | /*Extract*/ false, CostKind); |
4654 | } else { |
4655 | // In each 128-lane, if at least one index is demanded but not all |
4656 | // indices are demanded and this 128-lane is not the first 128-lane of |
4657 | // the legalized-vector, then this 128-lane needs a extracti128; If in |
4658 | // each 128-lane, there is at least one demanded index, this 128-lane |
4659 | // needs a inserti128. |
4660 | |
4661 | // The following cases will help you build a better understanding: |
4662 | // Assume we insert several elements into a v8i32 vector in avx2, |
4663 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. |
4664 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + |
4665 | // inserti128. |
4666 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. |
4667 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector" ); |
4668 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
4669 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
4670 | unsigned NumLegalElts = |
4671 | LT.second.getVectorNumElements() * NumLegalVectors; |
4672 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
4673 | "Vector has been legalized to smaller element count" ); |
4674 | assert((NumLegalElts % NumLanesTotal) == 0 && |
4675 | "Unexpected elts per lane" ); |
4676 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
4677 | |
4678 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
4679 | auto *LaneTy = |
4680 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
4681 | |
4682 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
4683 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
4684 | numBits: NumEltsPerLane, bitPosition: NumEltsPerLane * I); |
4685 | if (LaneEltMask.isZero()) |
4686 | continue; |
4687 | // FIXME: we don't need to extract if all non-demanded elements |
4688 | // are legalization-inserted padding. |
4689 | if (!LaneEltMask.isAllOnes()) |
4690 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, |
4691 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
4692 | Cost += BaseT::getScalarizationOverhead(InTy: LaneTy, DemandedElts: LaneEltMask, Insert, |
4693 | /*Extract*/ false, CostKind); |
4694 | } |
4695 | |
4696 | APInt AffectedLanes = |
4697 | APIntOps::ScaleBitMask(A: WidenedDemandedElts, NewBitWidth: NumLanesTotal); |
4698 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( |
4699 | A: AffectedLanes, NewBitWidth: NumLegalVectors, /*MatchAllBits=*/true); |
4700 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { |
4701 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { |
4702 | unsigned I = NumLegalLanes * LegalVec + Lane; |
4703 | // No need to insert unaffected lane; or lane 0 of each legal vector |
4704 | // iff ALL lanes of that vector were affected and will be inserted. |
4705 | if (!AffectedLanes[I] || |
4706 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) |
4707 | continue; |
4708 | Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, BaseTp: Ty, Mask: std::nullopt, |
4709 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
4710 | } |
4711 | } |
4712 | } |
4713 | } else if (LT.second.isVector()) { |
4714 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded |
4715 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a |
4716 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be |
4717 | // considered cheap. |
4718 | if (Ty->isIntOrIntVectorTy()) |
4719 | Cost += DemandedElts.popcount(); |
4720 | |
4721 | // Get the smaller of the legalized or original pow2-extended number of |
4722 | // vector elements, which represents the number of unpacks we'll end up |
4723 | // performing. |
4724 | unsigned NumElts = LT.second.getVectorNumElements(); |
4725 | unsigned Pow2Elts = |
4726 | PowerOf2Ceil(A: cast<FixedVectorType>(Val: Ty)->getNumElements()); |
4727 | Cost += (std::min<unsigned>(a: NumElts, b: Pow2Elts) - 1) * LT.first; |
4728 | } |
4729 | } |
4730 | |
4731 | if (Extract) { |
4732 | // vXi1 can be efficiently extracted with MOVMSK. |
4733 | // TODO: AVX512 predicate mask handling. |
4734 | // NOTE: This doesn't work well for roundtrip scalarization. |
4735 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { |
4736 | unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements(); |
4737 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; |
4738 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; |
4739 | return MOVMSKCost; |
4740 | } |
4741 | |
4742 | if (LT.second.isVector()) { |
4743 | unsigned NumLegalElts = |
4744 | LT.second.getVectorNumElements() * NumLegalVectors; |
4745 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
4746 | "Vector has been legalized to smaller element count" ); |
4747 | |
4748 | // If we're extracting elements from a 128-bit subvector lane, |
4749 | // we only need to extract each lane once, not for every element. |
4750 | if (LegalVectorBitWidth > LaneBitWidth) { |
4751 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
4752 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
4753 | assert((NumLegalElts % NumLanesTotal) == 0 && |
4754 | "Unexpected elts per lane" ); |
4755 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
4756 | |
4757 | // Add cost for each demanded 128-bit subvector extraction. |
4758 | // Luckily this is a lot easier than for insertion. |
4759 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
4760 | auto *LaneTy = |
4761 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
4762 | |
4763 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
4764 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
4765 | numBits: NumEltsPerLane, bitPosition: I * NumEltsPerLane); |
4766 | if (LaneEltMask.isZero()) |
4767 | continue; |
4768 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, |
4769 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
4770 | Cost += BaseT::getScalarizationOverhead( |
4771 | InTy: LaneTy, DemandedElts: LaneEltMask, /*Insert*/ false, Extract, CostKind); |
4772 | } |
4773 | |
4774 | return Cost; |
4775 | } |
4776 | } |
4777 | |
4778 | // Fallback to default extraction. |
4779 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, /*Insert*/ false, |
4780 | Extract, CostKind); |
4781 | } |
4782 | |
4783 | return Cost; |
4784 | } |
4785 | |
/// Estimate the cost of a "replication" shuffle that repeats each of the VF
/// source elements ReplicationFactor times (e.g. <a,b> -> <a,a,b,b>), only
/// charging for the destination vectors that are actually demanded.
InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
  const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: EltTyBits);

  // Fall back to the target-independent (scalarization-based) estimate.
  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    PromEltTyBits = 32; // promote to i32, AVX512F.
    break;
  default:
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: PromEltTyBits);

  // Source is VF elements; destination is VF * ReplicationFactor elements,
  // each built in both the original and the (possibly wider) promoted type.
  auto *SrcVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: VF);
  auto *PromSrcVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: VF);

  int NumDstElements = VF * ReplicationFactor;
  auto *PromDstVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: NumDstElements);
  auto *DstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = getTypeLegalizationCost(Ty: SrcVecTy).second;
  MVT LegalPromSrcVecTy = getTypeLegalizationCost(Ty: PromSrcVecTy).second;
  MVT LegalPromDstVecTy = getTypeLegalizationCost(Ty: PromDstVecTy).second;
  MVT LegalDstVecTy = getTypeLegalizationCost(Ty: DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with wider elt type than our data type,
    // then we will first need to anyext (we don't care about the new bits)
    // the source elements, and then truncate Dst elements.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Opcode: Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        CCH: TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Opcode: Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         CCH: TargetTransformInfo::CastContextHint::None, CostKind);
    // Recurse with the promoted element type, for which a native shuffle
    // exists (the switch above guarantees no further promotion is needed).
    return PromotionCost + getReplicationShuffleCost(EltTy: PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements." );

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      A: DemandedDstElts.zext(width: NumDstVectors * NumEltsPerDstVec), NewBitWidth: NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();

  // Each demanded legal destination vector costs one single-source permute.
  InstructionCost SingleShuffleCost = getShuffleCost(
      Kind: TTI::SK_PermuteSingleSrc, BaseTp: SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
      /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}
4891 | |
/// Compute the cost of a (possibly vector) load or store, modelling how type
/// legalization splits the access into a sequence of progressively narrower
/// memory ops, plus the subvector insert/extract "glue" between them.
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(Val: I)) {
      // Store instruction with index and scale costs 2 Uops.
      // Check the preceding GEP to identify non-const indices.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: SI->getPointerOperand())) {
        if (!all_of(Range: GEP->indices(), P: [](Value *V) { return isa<Constant>(Val: V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode" );
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);

  auto *VTy = dyn_cast<FixedVectorType>(Val: Src);

  InstructionCost Cost = 0;

  // Add a cost for constant load to vector.
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getMemoryOpCost(Opcode: Instruction::Load, Src, Alignment: DL.getABITypeAlign(Ty: Src),
                            /*AddressSpace=*/0, CostKind);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vector from scalars!
  if (!VTy || !LT.second.isVector()) {
    // Each load/store unit costs 1.
    // NOTE(review): the constant-load extra is only kept for FP scalars —
    // presumably integer constants can be stored as immediates; confirm.
    return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
  }

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(Ty: EltTy);

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by-reference, NumEltRemaining changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(Numerator: LT.second.getSizeInBits(), Denominator: 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltPerXMM);

  // Outer loop: try op sizes from the widest legal one downwards, halving
  // each iteration. Inner loop: issue as many ops of the current size as the
  // remaining element count (and alignment) permit.
  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?" );
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left." );

    // Sub-XMM ops still operate on a full XMM register.
    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(ElementType: EltTy, NumElts: CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of number of elements per operation?" );
    // Model all elements touched by one op as a single wide integer element,
    // so a single insert/extract below covers the whole op.
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  ElementType: IntegerType::get(C&: Src->getContext(),
                                   NumBits: EltTyBits * CurrNumEltPerOp),
                  NumElts: CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "coalesciing elements doesn't change vector width." );

    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?" );

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
          CurrOpSizeBytes != 1)
        break; // Try a smaller vector size.

      // Is this the first subvector of the (sole/current) legal vector?
      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost += getShuffleCost(Kind: IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                        : TTI::ShuffleKind::SK_ExtractSubvector,
                                 BaseTp: VTy, Mask: std::nullopt, CostKind, Index: NumEltDone(),
                                 SubTp: CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "" );
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        // Demand exactly the one coalesced element this op touches.
        APInt DemandedElts =
            APInt::getBitsSet(numBits: CoalescedVecTy->getNumElements(),
                              loBit: CoalescedVecEltIdx, hiBit: CoalescedVecEltIdx + 1);
        assert(DemandedElts.popcount() == 1 && "Inserting single value" );
        Cost += getScalarizationOverhead(Ty: CoalescedVecTy, DemandedElts, Insert: IsLoad,
                                         Extract: !IsLoad, CostKind);
      }

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
      // will be scalarized.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else if (CurrOpSizeBytes < 4)
        Cost += 2;
      else
        Cost += 1;

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      // Any remaining accesses can assume at most CurrOpSizeBytes alignment.
      Alignment = commonAlignment(A: Alignment.valueOrOne(), Offset: CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements." );

  return Cost;
}
5057 | |
/// Cost of a masked load/store. If the target has no legal masked op for this
/// type, model full scalarization (per-lane mask extract, compare, branch and
/// scalar memop); otherwise charge legalization shuffles plus the native
/// maskmov/AVX-512 op cost.
InstructionCost
X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  bool IsLoad = (Instruction::Load == Opcode);
  bool IsStore = (Instruction::Store == Opcode);

  auto *SrcVTy = dyn_cast<FixedVectorType>(Val: SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, Src: SrcTy, Alignment, AddressSpace, CostKind);

  unsigned NumElem = SrcVTy->getNumElements();
  // Model the mask as one i8 per data lane.
  auto *MaskTy =
      FixedVectorType::get(ElementType: Type::getInt8Ty(C&: SrcVTy->getContext()), NumElts: NumElem);
  if ((IsLoad && !isLegalMaskedLoad(DataType: SrcVTy, Alignment)) ||
      (IsStore && !isLegalMaskedStore(DataType: SrcVTy, Alignment))) {
    // Scalarization
    APInt DemandedElts = APInt::getAllOnes(numBits: NumElem);
    // Extract every mask lane...
    InstructionCost MaskSplitCost = getScalarizationOverhead(
        Ty: MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
    // ...compare it and branch on the result per lane...
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Opcode: Instruction::ICmp, ValTy: Type::getInt8Ty(C&: SrcVTy->getContext()), CondTy: nullptr,
        VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Opcode: Instruction::Br, CostKind);
    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
    // ...then insert (load) or extract (store) each data element, plus one
    // scalar memory op per lane.
    InstructionCost ValueSplitCost = getScalarizationOverhead(
        Ty: SrcVTy, DemandedElts, Insert: IsLoad, Extract: IsStore, CostKind);
    InstructionCost MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(),
                                         Alignment, AddressSpace, CostKind);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcVTy);
  auto VT = TLI->getValueType(DL, Ty: SrcVTy);
  InstructionCost Cost = 0;
  MVT Ty = LT.second;
  if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
    // APX masked load/store for scalar is cheap.
    return Cost + LT.first;

  if (VT.isSimple() && Ty != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires extend/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SrcVTy, Mask: std::nullopt,
                           CostKind, Index: 0, SubTp: nullptr) +
            getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: MaskTy, Mask: std::nullopt,
                           CostKind, Index: 0, SubTp: nullptr);

  else if (LT.first * Ty.getVectorNumElements() > NumElem) {
    // Legalization widened the vector; the mask must be widened to match.
    auto *NewMaskTy = FixedVectorType::get(ElementType: MaskTy->getElementType(),
                                           NumElts: Ty.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, BaseTp: NewMaskTy, Mask: std::nullopt,
                           CostKind, Index: 0, SubTp: MaskTy);
  }

  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
  if (!ST->hasAVX512())
    return Cost + LT.first * (IsLoad ? 2 : 8);

  // AVX-512 masked load/store is cheaper
  return Cost + LT.first;
}
5124 | |
5125 | InstructionCost |
5126 | X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs, |
5127 | const Value *Base, |
5128 | const TTI::PointersChainInfo &Info, |
5129 | Type *AccessTy, TTI::TargetCostKind CostKind) { |
5130 | if (Info.isSameBase() && Info.isKnownStride()) { |
5131 | // If all the pointers have known stride all the differences are translated |
5132 | // into constants. X86 memory addressing allows encoding it into |
5133 | // displacement. So we just need to take the base GEP cost. |
5134 | if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Val: Base)) { |
5135 | SmallVector<const Value *> Indices(BaseGEP->indices()); |
5136 | return getGEPCost(PointeeType: BaseGEP->getSourceElementType(), |
5137 | Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: nullptr, |
5138 | CostKind); |
5139 | } |
5140 | return TTI::TCC_Free; |
5141 | } |
5142 | return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); |
5143 | } |
5144 | |
5145 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, |
5146 | ScalarEvolution *SE, |
5147 | const SCEV *Ptr) { |
5148 | // Address computations in vectorized code with non-consecutive addresses will |
5149 | // likely result in more instructions compared to scalar code where the |
5150 | // computation can more often be merged into the index mode. The resulting |
5151 | // extra micro-ops can significantly decrease throughput. |
5152 | const unsigned NumVectorInstToHideOverhead = 10; |
5153 | |
5154 | // Cost modeling of Strided Access Computation is hidden by the indexing |
5155 | // modes of X86 regardless of the stride value. We dont believe that there |
5156 | // is a difference between constant strided access in gerenal and constant |
5157 | // strided value which is less than or equal to 64. |
5158 | // Even in the case of (loop invariant) stride whose value is not known at |
5159 | // compile time, the address computation will not incur more than one extra |
5160 | // ADD instruction. |
5161 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { |
5162 | // TODO: AVX2 is the current cut-off because we don't have correct |
5163 | // interleaving costs for prior ISA's. |
5164 | if (!BaseT::isStridedAccess(Ptr)) |
5165 | return NumVectorInstToHideOverhead; |
5166 | if (!BaseT::getConstantStrideStep(SE, Ptr)) |
5167 | return 1; |
5168 | } |
5169 | |
5170 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); |
5171 | } |
5172 | |
5173 | InstructionCost |
5174 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
5175 | std::optional<FastMathFlags> FMF, |
5176 | TTI::TargetCostKind CostKind) { |
5177 | if (TTI::requiresOrderedReduction(FMF)) |
5178 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
5179 | |
5180 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
5181 | // and make it as the cost. |
5182 | |
5183 | static const CostTblEntry SLMCostTbl[] = { |
5184 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: 3 }, |
5185 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 5 }, |
5186 | }; |
5187 | |
5188 | static const CostTblEntry SSE2CostTbl[] = { |
5189 | { .ISD: ISD::FADD, .Type: MVT::v2f64, .Cost: 2 }, |
5190 | { .ISD: ISD::FADD, .Type: MVT::v2f32, .Cost: 2 }, |
5191 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: 4 }, |
5192 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2 }, // The data reported by the IACA tool is "1.6". |
5193 | { .ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: 2 }, // FIXME: chosen to be less than v4i32 |
5194 | { .ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 3 }, // The data reported by the IACA tool is "3.3". |
5195 | { .ISD: ISD::ADD, .Type: MVT::v2i16, .Cost: 2 }, // The data reported by the IACA tool is "4.3". |
5196 | { .ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 3 }, // The data reported by the IACA tool is "4.3". |
5197 | { .ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 4 }, // The data reported by the IACA tool is "4.3". |
5198 | { .ISD: ISD::ADD, .Type: MVT::v2i8, .Cost: 2 }, |
5199 | { .ISD: ISD::ADD, .Type: MVT::v4i8, .Cost: 2 }, |
5200 | { .ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2 }, |
5201 | { .ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 3 }, |
5202 | }; |
5203 | |
5204 | static const CostTblEntry AVX1CostTbl[] = { |
5205 | { .ISD: ISD::FADD, .Type: MVT::v4f64, .Cost: 3 }, |
5206 | { .ISD: ISD::FADD, .Type: MVT::v4f32, .Cost: 3 }, |
5207 | { .ISD: ISD::FADD, .Type: MVT::v8f32, .Cost: 4 }, |
5208 | { .ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 1 }, // The data reported by the IACA tool is "1.5". |
5209 | { .ISD: ISD::ADD, .Type: MVT::v4i64, .Cost: 3 }, |
5210 | { .ISD: ISD::ADD, .Type: MVT::v8i32, .Cost: 5 }, |
5211 | { .ISD: ISD::ADD, .Type: MVT::v16i16, .Cost: 5 }, |
5212 | { .ISD: ISD::ADD, .Type: MVT::v32i8, .Cost: 4 }, |
5213 | }; |
5214 | |
5215 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
5216 | assert(ISD && "Invalid opcode" ); |
5217 | |
5218 | // Before legalizing the type, give a chance to look up illegal narrow types |
5219 | // in the table. |
5220 | // FIXME: Is there a better way to do this? |
5221 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
5222 | if (VT.isSimple()) { |
5223 | MVT MTy = VT.getSimpleVT(); |
5224 | if (ST->useSLMArithCosts()) |
5225 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
5226 | return Entry->Cost; |
5227 | |
5228 | if (ST->hasAVX()) |
5229 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5230 | return Entry->Cost; |
5231 | |
5232 | if (ST->hasSSE2()) |
5233 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5234 | return Entry->Cost; |
5235 | } |
5236 | |
5237 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5238 | |
5239 | MVT MTy = LT.second; |
5240 | |
5241 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
5242 | |
5243 | // Special case: vXi8 mul reductions are performed as vXi16. |
5244 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { |
5245 | auto *WideSclTy = IntegerType::get(C&: ValVTy->getContext(), NumBits: 16); |
5246 | auto *WideVecTy = FixedVectorType::get(ElementType: WideSclTy, NumElts: ValVTy->getNumElements()); |
5247 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: ValTy, |
5248 | CCH: TargetTransformInfo::CastContextHint::None, |
5249 | CostKind) + |
5250 | getArithmeticReductionCost(Opcode, ValTy: WideVecTy, FMF, CostKind); |
5251 | } |
5252 | |
5253 | InstructionCost ArithmeticCost = 0; |
5254 | if (LT.first != 1 && MTy.isVector() && |
5255 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5256 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
5257 | auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5258 | NumElts: MTy.getVectorNumElements()); |
5259 | ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind); |
5260 | ArithmeticCost *= LT.first - 1; |
5261 | } |
5262 | |
5263 | if (ST->useSLMArithCosts()) |
5264 | if (const auto *Entry = CostTableLookup(Table: SLMCostTbl, ISD, Ty: MTy)) |
5265 | return ArithmeticCost + Entry->Cost; |
5266 | |
5267 | if (ST->hasAVX()) |
5268 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5269 | return ArithmeticCost + Entry->Cost; |
5270 | |
5271 | if (ST->hasSSE2()) |
5272 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5273 | return ArithmeticCost + Entry->Cost; |
5274 | |
5275 | // FIXME: These assume a naive kshift+binop lowering, which is probably |
5276 | // conservative in most cases. |
5277 | static const CostTblEntry AVX512BoolReduction[] = { |
5278 | { .ISD: ISD::AND, .Type: MVT::v2i1, .Cost: 3 }, |
5279 | { .ISD: ISD::AND, .Type: MVT::v4i1, .Cost: 5 }, |
5280 | { .ISD: ISD::AND, .Type: MVT::v8i1, .Cost: 7 }, |
5281 | { .ISD: ISD::AND, .Type: MVT::v16i1, .Cost: 9 }, |
5282 | { .ISD: ISD::AND, .Type: MVT::v32i1, .Cost: 11 }, |
5283 | { .ISD: ISD::AND, .Type: MVT::v64i1, .Cost: 13 }, |
5284 | { .ISD: ISD::OR, .Type: MVT::v2i1, .Cost: 3 }, |
5285 | { .ISD: ISD::OR, .Type: MVT::v4i1, .Cost: 5 }, |
5286 | { .ISD: ISD::OR, .Type: MVT::v8i1, .Cost: 7 }, |
5287 | { .ISD: ISD::OR, .Type: MVT::v16i1, .Cost: 9 }, |
5288 | { .ISD: ISD::OR, .Type: MVT::v32i1, .Cost: 11 }, |
5289 | { .ISD: ISD::OR, .Type: MVT::v64i1, .Cost: 13 }, |
5290 | }; |
5291 | |
5292 | static const CostTblEntry AVX2BoolReduction[] = { |
5293 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: 2 }, // vpmovmskb + cmp |
5294 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: 2 }, // vpmovmskb + cmp |
5295 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: 2 }, // vpmovmskb + cmp |
5296 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: 2 }, // vpmovmskb + cmp |
5297 | }; |
5298 | |
5299 | static const CostTblEntry AVX1BoolReduction[] = { |
5300 | { .ISD: ISD::AND, .Type: MVT::v4i64, .Cost: 2 }, // vmovmskpd + cmp |
5301 | { .ISD: ISD::AND, .Type: MVT::v8i32, .Cost: 2 }, // vmovmskps + cmp |
5302 | { .ISD: ISD::AND, .Type: MVT::v16i16, .Cost: 4 }, // vextractf128 + vpand + vpmovmskb + cmp |
5303 | { .ISD: ISD::AND, .Type: MVT::v32i8, .Cost: 4 }, // vextractf128 + vpand + vpmovmskb + cmp |
5304 | { .ISD: ISD::OR, .Type: MVT::v4i64, .Cost: 2 }, // vmovmskpd + cmp |
5305 | { .ISD: ISD::OR, .Type: MVT::v8i32, .Cost: 2 }, // vmovmskps + cmp |
5306 | { .ISD: ISD::OR, .Type: MVT::v16i16, .Cost: 4 }, // vextractf128 + vpor + vpmovmskb + cmp |
5307 | { .ISD: ISD::OR, .Type: MVT::v32i8, .Cost: 4 }, // vextractf128 + vpor + vpmovmskb + cmp |
5308 | }; |
5309 | |
5310 | static const CostTblEntry SSE2BoolReduction[] = { |
5311 | { .ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 2 }, // movmskpd + cmp |
5312 | { .ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 2 }, // movmskps + cmp |
5313 | { .ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 2 }, // pmovmskb + cmp |
5314 | { .ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 2 }, // pmovmskb + cmp |
5315 | { .ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 2 }, // movmskpd + cmp |
5316 | { .ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 2 }, // movmskps + cmp |
5317 | { .ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 2 }, // pmovmskb + cmp |
5318 | { .ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 2 }, // pmovmskb + cmp |
5319 | }; |
5320 | |
5321 | // Handle bool allof/anyof patterns. |
5322 | if (ValVTy->getElementType()->isIntegerTy(Bitwidth: 1)) { |
5323 | InstructionCost ArithmeticCost = 0; |
5324 | if (LT.first != 1 && MTy.isVector() && |
5325 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5326 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
5327 | auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5328 | NumElts: MTy.getVectorNumElements()); |
5329 | ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind); |
5330 | ArithmeticCost *= LT.first - 1; |
5331 | } |
5332 | |
5333 | if (ST->hasAVX512()) |
5334 | if (const auto *Entry = CostTableLookup(Table: AVX512BoolReduction, ISD, Ty: MTy)) |
5335 | return ArithmeticCost + Entry->Cost; |
5336 | if (ST->hasAVX2()) |
5337 | if (const auto *Entry = CostTableLookup(Table: AVX2BoolReduction, ISD, Ty: MTy)) |
5338 | return ArithmeticCost + Entry->Cost; |
5339 | if (ST->hasAVX()) |
5340 | if (const auto *Entry = CostTableLookup(Table: AVX1BoolReduction, ISD, Ty: MTy)) |
5341 | return ArithmeticCost + Entry->Cost; |
5342 | if (ST->hasSSE2()) |
5343 | if (const auto *Entry = CostTableLookup(Table: SSE2BoolReduction, ISD, Ty: MTy)) |
5344 | return ArithmeticCost + Entry->Cost; |
5345 | |
5346 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind); |
5347 | } |
5348 | |
5349 | unsigned NumVecElts = ValVTy->getNumElements(); |
5350 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); |
5351 | |
5352 | // Special case power of 2 reductions where the scalar type isn't changed |
5353 | // by type legalization. |
5354 | if (!isPowerOf2_32(Value: NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) |
5355 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind); |
5356 | |
5357 | InstructionCost ReductionCost = 0; |
5358 | |
5359 | auto *Ty = ValVTy; |
5360 | if (LT.first != 1 && MTy.isVector() && |
5361 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5362 | // Type needs to be split. We need LT.first - 1 arithmetic ops. |
5363 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5364 | NumElts: MTy.getVectorNumElements()); |
5365 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
5366 | ReductionCost *= LT.first - 1; |
5367 | NumVecElts = MTy.getVectorNumElements(); |
5368 | } |
5369 | |
5370 | // Now handle reduction with the legal type, taking into account size changes |
5371 | // at each level. |
5372 | while (NumVecElts > 1) { |
5373 | // Determine the size of the remaining vector we need to reduce. |
5374 | unsigned Size = NumVecElts * ScalarSize; |
5375 | NumVecElts /= 2; |
5376 | // If we're reducing from 256/512 bits, use an extract_subvector. |
5377 | if (Size > 128) { |
5378 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
5379 | ReductionCost += |
5380 | getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, CostKind, |
5381 | Index: NumVecElts, SubTp: SubTy); |
5382 | Ty = SubTy; |
5383 | } else if (Size == 128) { |
5384 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
5385 | FixedVectorType *ShufTy; |
5386 | if (ValVTy->isFloatingPointTy()) |
5387 | ShufTy = |
5388 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValVTy->getContext()), NumElts: 2); |
5389 | else |
5390 | ShufTy = |
5391 | FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValVTy->getContext()), NumElts: 2); |
5392 | ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy, |
5393 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
5394 | } else if (Size == 64) { |
5395 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
5396 | FixedVectorType *ShufTy; |
5397 | if (ValVTy->isFloatingPointTy()) |
5398 | ShufTy = |
5399 | FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValVTy->getContext()), NumElts: 4); |
5400 | else |
5401 | ShufTy = |
5402 | FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValVTy->getContext()), NumElts: 4); |
5403 | ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy, |
5404 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
5405 | } else { |
5406 | // Reducing from smaller size is a shift by immediate. |
5407 | auto *ShiftTy = FixedVectorType::get( |
5408 | ElementType: Type::getIntNTy(C&: ValVTy->getContext(), N: Size), NumElts: 128 / Size); |
5409 | ReductionCost += getArithmeticInstrCost( |
5410 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind, |
5411 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
5412 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
5413 | } |
5414 | |
5415 | // Add the arithmetic op for this level. |
5416 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); |
5417 | } |
5418 | |
5419 | // Add the final extract element to the cost. |
5420 | return ReductionCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
5421 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
5422 | } |
5423 | |
5424 | InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty, |
5425 | TTI::TargetCostKind CostKind, |
5426 | FastMathFlags FMF) { |
5427 | IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF); |
5428 | return getIntrinsicInstrCost(ICA, CostKind); |
5429 | } |
5430 | |
5431 | InstructionCost |
5432 | X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy, |
5433 | FastMathFlags FMF, |
5434 | TTI::TargetCostKind CostKind) { |
5435 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5436 | |
5437 | MVT MTy = LT.second; |
5438 | |
5439 | int ISD; |
5440 | if (ValTy->isIntOrIntVectorTy()) { |
5441 | ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN |
5442 | : ISD::SMIN; |
5443 | } else { |
5444 | assert(ValTy->isFPOrFPVectorTy() && |
5445 | "Expected float point or integer vector type." ); |
5446 | ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum) |
5447 | ? ISD::FMINNUM |
5448 | : ISD::FMINIMUM; |
5449 | } |
5450 | |
5451 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
5452 | // and make it as the cost. |
5453 | |
5454 | static const CostTblEntry SSE2CostTbl[] = { |
5455 | {.ISD: ISD::UMIN, .Type: MVT::v2i16, .Cost: 5}, // need pxors to use pminsw/pmaxsw |
5456 | {.ISD: ISD::UMIN, .Type: MVT::v4i16, .Cost: 7}, // need pxors to use pminsw/pmaxsw |
5457 | {.ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: 9}, // need pxors to use pminsw/pmaxsw |
5458 | }; |
5459 | |
5460 | static const CostTblEntry SSE41CostTbl[] = { |
5461 | {.ISD: ISD::SMIN, .Type: MVT::v2i16, .Cost: 3}, // same as sse2 |
5462 | {.ISD: ISD::SMIN, .Type: MVT::v4i16, .Cost: 5}, // same as sse2 |
5463 | {.ISD: ISD::UMIN, .Type: MVT::v2i16, .Cost: 5}, // same as sse2 |
5464 | {.ISD: ISD::UMIN, .Type: MVT::v4i16, .Cost: 7}, // same as sse2 |
5465 | {.ISD: ISD::SMIN, .Type: MVT::v8i16, .Cost: 4}, // phminposuw+xor |
5466 | {.ISD: ISD::UMIN, .Type: MVT::v8i16, .Cost: 4}, // FIXME: umin is cheaper than umax |
5467 | {.ISD: ISD::SMIN, .Type: MVT::v2i8, .Cost: 3}, // pminsb |
5468 | {.ISD: ISD::SMIN, .Type: MVT::v4i8, .Cost: 5}, // pminsb |
5469 | {.ISD: ISD::SMIN, .Type: MVT::v8i8, .Cost: 7}, // pminsb |
5470 | {.ISD: ISD::SMIN, .Type: MVT::v16i8, .Cost: 6}, |
5471 | {.ISD: ISD::UMIN, .Type: MVT::v2i8, .Cost: 3}, // same as sse2 |
5472 | {.ISD: ISD::UMIN, .Type: MVT::v4i8, .Cost: 5}, // same as sse2 |
5473 | {.ISD: ISD::UMIN, .Type: MVT::v8i8, .Cost: 7}, // same as sse2 |
5474 | {.ISD: ISD::UMIN, .Type: MVT::v16i8, .Cost: 6}, // FIXME: umin is cheaper than umax |
5475 | }; |
5476 | |
5477 | static const CostTblEntry AVX1CostTbl[] = { |
5478 | {.ISD: ISD::SMIN, .Type: MVT::v16i16, .Cost: 6}, |
5479 | {.ISD: ISD::UMIN, .Type: MVT::v16i16, .Cost: 6}, // FIXME: umin is cheaper than umax |
5480 | {.ISD: ISD::SMIN, .Type: MVT::v32i8, .Cost: 8}, |
5481 | {.ISD: ISD::UMIN, .Type: MVT::v32i8, .Cost: 8}, |
5482 | }; |
5483 | |
5484 | static const CostTblEntry AVX512BWCostTbl[] = { |
5485 | {.ISD: ISD::SMIN, .Type: MVT::v32i16, .Cost: 8}, |
5486 | {.ISD: ISD::UMIN, .Type: MVT::v32i16, .Cost: 8}, // FIXME: umin is cheaper than umax |
5487 | {.ISD: ISD::SMIN, .Type: MVT::v64i8, .Cost: 10}, |
5488 | {.ISD: ISD::UMIN, .Type: MVT::v64i8, .Cost: 10}, |
5489 | }; |
5490 | |
5491 | // Before legalizing the type, give a chance to look up illegal narrow types |
5492 | // in the table. |
5493 | // FIXME: Is there a better way to do this? |
5494 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
5495 | if (VT.isSimple()) { |
5496 | MVT MTy = VT.getSimpleVT(); |
5497 | if (ST->hasBWI()) |
5498 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
5499 | return Entry->Cost; |
5500 | |
5501 | if (ST->hasAVX()) |
5502 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5503 | return Entry->Cost; |
5504 | |
5505 | if (ST->hasSSE41()) |
5506 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
5507 | return Entry->Cost; |
5508 | |
5509 | if (ST->hasSSE2()) |
5510 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5511 | return Entry->Cost; |
5512 | } |
5513 | |
5514 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
5515 | unsigned NumVecElts = ValVTy->getNumElements(); |
5516 | |
5517 | auto *Ty = ValVTy; |
5518 | InstructionCost MinMaxCost = 0; |
5519 | if (LT.first != 1 && MTy.isVector() && |
5520 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5521 | // Type needs to be split. We need LT.first - 1 operations ops. |
5522 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5523 | NumElts: MTy.getVectorNumElements()); |
5524 | MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF); |
5525 | MinMaxCost *= LT.first - 1; |
5526 | NumVecElts = MTy.getVectorNumElements(); |
5527 | } |
5528 | |
5529 | if (ST->hasBWI()) |
5530 | if (const auto *Entry = CostTableLookup(Table: AVX512BWCostTbl, ISD, Ty: MTy)) |
5531 | return MinMaxCost + Entry->Cost; |
5532 | |
5533 | if (ST->hasAVX()) |
5534 | if (const auto *Entry = CostTableLookup(Table: AVX1CostTbl, ISD, Ty: MTy)) |
5535 | return MinMaxCost + Entry->Cost; |
5536 | |
5537 | if (ST->hasSSE41()) |
5538 | if (const auto *Entry = CostTableLookup(Table: SSE41CostTbl, ISD, Ty: MTy)) |
5539 | return MinMaxCost + Entry->Cost; |
5540 | |
5541 | if (ST->hasSSE2()) |
5542 | if (const auto *Entry = CostTableLookup(Table: SSE2CostTbl, ISD, Ty: MTy)) |
5543 | return MinMaxCost + Entry->Cost; |
5544 | |
5545 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); |
5546 | |
5547 | // Special case power of 2 reductions where the scalar type isn't changed |
5548 | // by type legalization. |
5549 | if (!isPowerOf2_32(Value: ValVTy->getNumElements()) || |
5550 | ScalarSize != MTy.getScalarSizeInBits()) |
5551 | return BaseT::getMinMaxReductionCost(IID, Ty: ValTy, FMF, CostKind); |
5552 | |
5553 | // Now handle reduction with the legal type, taking into account size changes |
5554 | // at each level. |
5555 | while (NumVecElts > 1) { |
5556 | // Determine the size of the remaining vector we need to reduce. |
5557 | unsigned Size = NumVecElts * ScalarSize; |
5558 | NumVecElts /= 2; |
5559 | // If we're reducing from 256/512 bits, use an extract_subvector. |
5560 | if (Size > 128) { |
5561 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
5562 | MinMaxCost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, |
5563 | CostKind, Index: NumVecElts, SubTp: SubTy); |
5564 | Ty = SubTy; |
5565 | } else if (Size == 128) { |
5566 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
5567 | VectorType *ShufTy; |
5568 | if (ValTy->isFloatingPointTy()) |
5569 | ShufTy = |
5570 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValTy->getContext()), NumElts: 2); |
5571 | else |
5572 | ShufTy = FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValTy->getContext()), NumElts: 2); |
5573 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy, |
5574 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
5575 | } else if (Size == 64) { |
5576 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
5577 | FixedVectorType *ShufTy; |
5578 | if (ValTy->isFloatingPointTy()) |
5579 | ShufTy = FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), NumElts: 4); |
5580 | else |
5581 | ShufTy = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValTy->getContext()), NumElts: 4); |
5582 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy, |
5583 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
5584 | } else { |
5585 | // Reducing from smaller size is a shift by immediate. |
5586 | auto *ShiftTy = FixedVectorType::get( |
5587 | ElementType: Type::getIntNTy(C&: ValTy->getContext(), N: Size), NumElts: 128 / Size); |
5588 | MinMaxCost += getArithmeticInstrCost( |
5589 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind: TTI::TCK_RecipThroughput, |
5590 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
5591 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
5592 | } |
5593 | |
5594 | // Add the arithmetic op for this level. |
5595 | MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF); |
5596 | } |
5597 | |
5598 | // Add the final extract element to the cost. |
5599 | return MinMaxCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
5600 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
5601 | } |
5602 | |
5603 | /// Calculate the cost of materializing a 64-bit value. This helper |
5604 | /// method might only calculate a fraction of a larger immediate. Therefore it |
5605 | /// is valid to return a cost of ZERO. |
5606 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { |
5607 | if (Val == 0) |
5608 | return TTI::TCC_Free; |
5609 | |
5610 | if (isInt<32>(x: Val)) |
5611 | return TTI::TCC_Basic; |
5612 | |
5613 | return 2 * TTI::TCC_Basic; |
5614 | } |
5615 | |
5616 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
5617 | TTI::TargetCostKind CostKind) { |
5618 | assert(Ty->isIntegerTy()); |
5619 | |
5620 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
5621 | if (BitSize == 0) |
5622 | return ~0U; |
5623 | |
5624 | // Never hoist constants larger than 128bit, because this might lead to |
5625 | // incorrect code generation or assertions in codegen. |
5626 | // Fixme: Create a cost model for types larger than i128 once the codegen |
5627 | // issues have been fixed. |
5628 | if (BitSize > 128) |
5629 | return TTI::TCC_Free; |
5630 | |
5631 | if (Imm == 0) |
5632 | return TTI::TCC_Free; |
5633 | |
5634 | // Sign-extend all constants to a multiple of 64-bit. |
5635 | APInt ImmVal = Imm; |
5636 | if (BitSize % 64 != 0) |
5637 | ImmVal = Imm.sext(width: alignTo(Value: BitSize, Align: 64)); |
5638 | |
5639 | // Split the constant into 64-bit chunks and calculate the cost for each |
5640 | // chunk. |
5641 | InstructionCost Cost = 0; |
5642 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
5643 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
5644 | int64_t Val = Tmp.getSExtValue(); |
5645 | Cost += getIntImmCost(Val); |
5646 | } |
5647 | // We need at least one instruction to materialize the constant. |
5648 | return std::max<InstructionCost>(a: 1, b: Cost); |
5649 | } |
5650 | |
// Cost of immediate \p Imm when it appears as operand \p Idx of \p Opcode.
// Returns TCC_Free for immediates the instruction encoding can absorb
// directly, so that ConstantHoisting leaves them in place.
InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Operand index at which this opcode can fold an immediate (~0U = none).
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(N: 32))
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    // These opcodes never fold an immediate operand; fall through to the
    // generic materialization cost below.
    break;
  }

  if (Idx == ImmIdx) {
    // If materializing the constant costs no more than one basic unit per
    // 64-bit chunk, the instruction can encode it: report it as free.
    uint64_t NumConstants = divideCeil(Numerator: BitSize, Denominator: 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
5749 | |
5750 | InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
5751 | const APInt &Imm, Type *Ty, |
5752 | TTI::TargetCostKind CostKind) { |
5753 | assert(Ty->isIntegerTy()); |
5754 | |
5755 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
5756 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
5757 | // here, so that constant hoisting will ignore this constant. |
5758 | if (BitSize == 0) |
5759 | return TTI::TCC_Free; |
5760 | |
5761 | switch (IID) { |
5762 | default: |
5763 | return TTI::TCC_Free; |
5764 | case Intrinsic::sadd_with_overflow: |
5765 | case Intrinsic::uadd_with_overflow: |
5766 | case Intrinsic::ssub_with_overflow: |
5767 | case Intrinsic::usub_with_overflow: |
5768 | case Intrinsic::smul_with_overflow: |
5769 | case Intrinsic::umul_with_overflow: |
5770 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 32)) |
5771 | return TTI::TCC_Free; |
5772 | break; |
5773 | case Intrinsic::experimental_stackmap: |
5774 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64))) |
5775 | return TTI::TCC_Free; |
5776 | break; |
5777 | case Intrinsic::experimental_patchpoint_void: |
5778 | case Intrinsic::experimental_patchpoint: |
5779 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64))) |
5780 | return TTI::TCC_Free; |
5781 | break; |
5782 | } |
5783 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
5784 | } |
5785 | |
5786 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, |
5787 | TTI::TargetCostKind CostKind, |
5788 | const Instruction *I) { |
5789 | if (CostKind != TTI::TCK_RecipThroughput) |
5790 | return Opcode == Instruction::PHI ? 0 : 1; |
5791 | // Branches are assumed to be predicted. |
5792 | return 0; |
5793 | } |
5794 | |
5795 | int X86TTIImpl::getGatherOverhead() const { |
5796 | // Some CPUs have more overhead for gather. The specified overhead is relative |
5797 | // to the Load operation. "2" is the number provided by Intel architects. This |
5798 | // parameter is used for cost estimation of Gather Op and comparison with |
5799 | // other alternatives. |
5800 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only |
5801 | // enable gather with a -march. |
5802 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) |
5803 | return 2; |
5804 | |
5805 | return 1024; |
5806 | } |
5807 | |
5808 | int X86TTIImpl::getScatterOverhead() const { |
5809 | if (ST->hasAVX512()) |
5810 | return 2; |
5811 | |
5812 | return 1024; |
5813 | } |
5814 | |
// Return an average cost of Gather / Scatter instruction, maybe improved later.
// Models the operation as: split factor from type legalization (recursing on
// the narrower pieces), then per-lane scalar memory cost plus a fixed
// gather/scatter overhead for the unsplit instruction.
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                            TTI::TargetCostKind CostKind,
                                            Type *SrcVTy, const Value *Ptr,
                                            Align Alignment,
                                            unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost" );
  unsigned VF = cast<FixedVectorType>(Val: SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of distinct base pointers can't use a narrow shared index.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(V: Ptrs))
      return IndexSize;
    // Every variable index must already be a sign-extended value, and at
    // most one variable index is allowed; otherwise keep the full 64 bits.
    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
      if (isa<Constant>(Val: GEP->getOperand(i_nocapture: I)))
        continue;
      Type *IndxTy = GEP->getOperand(i_nocapture: I)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(Val: IndxTy))
        IndxTy = IndexVTy->getElementType();
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(Val: GEP->getOperand(i_nocapture: I))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      ElementType: IntegerType::get(C&: SrcVTy->getContext(), NumBits: IndexSize), NumElts: VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(Ty: IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcVTy);
  // Legalization may split either the index vector or the data vector; take
  // the larger split count.
  InstructionCost::CostType SplitFactor =
      *std::max(a: IdxsLT.first, b: SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    auto *SplitSrcTy =
        FixedVectorType::get(ElementType: SrcVTy->getScalarType(), NumElts: VF / SplitFactor);
    // Recursively cost the narrower operation once per split piece.
    return SplitFactor * getGSVectorCost(Opcode, CostKind, SrcVTy: SplitSrcTy, Ptr,
                                         Alignment, AddressSpace);
  }

  // If we didn't split, this will be a single gather/scatter instruction.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load) ? getGatherOverhead()
                                                       : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(),
                                           Alignment: MaybeAlign(Alignment), AddressSpace,
                                           CostKind);
}
5886 | |
5887 | /// Calculate the cost of Gather / Scatter operation |
5888 | InstructionCost X86TTIImpl::getGatherScatterOpCost( |
5889 | unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, |
5890 | Align Alignment, TTI::TargetCostKind CostKind, |
5891 | const Instruction *I = nullptr) { |
5892 | if ((Opcode == Instruction::Load && |
5893 | (!isLegalMaskedGather(DataType: SrcVTy, Alignment: Align(Alignment)) || |
5894 | forceScalarizeMaskedGather(VTy: cast<VectorType>(Val: SrcVTy), |
5895 | Alignment: Align(Alignment)))) || |
5896 | (Opcode == Instruction::Store && |
5897 | (!isLegalMaskedScatter(DataType: SrcVTy, Alignment: Align(Alignment)) || |
5898 | forceScalarizeMaskedScatter(VTy: cast<VectorType>(Val: SrcVTy), |
5899 | Alignment: Align(Alignment))))) |
5900 | return BaseT::getGatherScatterOpCost(Opcode, DataTy: SrcVTy, Ptr, VariableMask, |
5901 | Alignment, CostKind, I); |
5902 | |
5903 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter" ); |
5904 | PointerType *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
5905 | if (!PtrTy && Ptr->getType()->isVectorTy()) |
5906 | PtrTy = dyn_cast<PointerType>( |
5907 | Val: cast<VectorType>(Val: Ptr->getType())->getElementType()); |
5908 | assert(PtrTy && "Unexpected type for Ptr argument" ); |
5909 | unsigned AddressSpace = PtrTy->getAddressSpace(); |
5910 | return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment, |
5911 | AddressSpace); |
5912 | } |
5913 | |
5914 | bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
5915 | const TargetTransformInfo::LSRCost &C2) { |
5916 | // X86 specific here are "instruction number 1st priority". |
5917 | return std::tie(args: C1.Insns, args: C1.NumRegs, args: C1.AddRecCost, |
5918 | args: C1.NumIVMuls, args: C1.NumBaseAdds, |
5919 | args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
5920 | std::tie(args: C2.Insns, args: C2.NumRegs, args: C2.AddRecCost, |
5921 | args: C2.NumIVMuls, args: C2.NumBaseAdds, |
5922 | args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
5923 | } |
5924 | |
5925 | bool X86TTIImpl::canMacroFuseCmp() { |
5926 | return ST->hasMacroFusion() || ST->hasBranchFusion(); |
5927 | } |
5928 | |
5929 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { |
5930 | Type *ScalarTy = DataTy->getScalarType(); |
5931 | |
5932 | // The backend can't handle a single element vector w/o CFCMOV. |
5933 | if (isa<VectorType>(Val: DataTy) && cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
5934 | return ST->hasCF() && hasConditionalLoadStoreForType(Ty: ScalarTy); |
5935 | |
5936 | if (!ST->hasAVX()) |
5937 | return false; |
5938 | |
5939 | if (ScalarTy->isPointerTy()) |
5940 | return true; |
5941 | |
5942 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
5943 | return true; |
5944 | |
5945 | if (ScalarTy->isHalfTy() && ST->hasBWI()) |
5946 | return true; |
5947 | |
5948 | if (ScalarTy->isBFloatTy() && ST->hasBF16()) |
5949 | return true; |
5950 | |
5951 | if (!ScalarTy->isIntegerTy()) |
5952 | return false; |
5953 | |
5954 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
5955 | return IntWidth == 32 || IntWidth == 64 || |
5956 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); |
5957 | } |
5958 | |
// Masked stores share the exact legality rules of masked loads on x86.
bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
  return isLegalMaskedLoad(DataTy: DataType, Alignment);
}
5962 | |
5963 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { |
5964 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
5965 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 |
5966 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 |
5967 | // (the equivalent stores only require AVX). |
5968 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) |
5969 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); |
5970 | |
5971 | return false; |
5972 | } |
5973 | |
5974 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { |
5975 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
5976 | |
5977 | // SSE4A supports nontemporal stores of float and double at arbitrary |
5978 | // alignment. |
5979 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) |
5980 | return true; |
5981 | |
5982 | // Besides the SSE4A subtarget exception above, only aligned stores are |
5983 | // available nontemporaly on any other subtarget. And only stores with a size |
5984 | // of 4..32 bytes (powers of 2, only) are permitted. |
5985 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || |
5986 | !isPowerOf2_32(Value: DataSize)) |
5987 | return false; |
5988 | |
5989 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent |
5990 | // loads require AVX2). |
5991 | if (DataSize == 32) |
5992 | return ST->hasAVX(); |
5993 | if (DataSize == 16) |
5994 | return ST->hasSSE1(); |
5995 | return true; |
5996 | } |
5997 | |
5998 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, |
5999 | ElementCount NumElements) const { |
6000 | // movddup |
6001 | return ST->hasSSE3() && !NumElements.isScalable() && |
6002 | NumElements.getFixedValue() == 2 && |
6003 | ElementTy == Type::getDoubleTy(C&: ElementTy->getContext()); |
6004 | } |
6005 | |
6006 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) { |
6007 | if (!isa<VectorType>(Val: DataTy)) |
6008 | return false; |
6009 | |
6010 | if (!ST->hasAVX512()) |
6011 | return false; |
6012 | |
6013 | // The backend can't handle a single element vector. |
6014 | if (cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
6015 | return false; |
6016 | |
6017 | Type *ScalarTy = cast<VectorType>(Val: DataTy)->getElementType(); |
6018 | |
6019 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6020 | return true; |
6021 | |
6022 | if (!ScalarTy->isIntegerTy()) |
6023 | return false; |
6024 | |
6025 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6026 | return IntWidth == 32 || IntWidth == 64 || |
6027 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); |
6028 | } |
6029 | |
// Compress-store legality mirrors expand-load: both use the same element-type
// and subtarget restrictions.
bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
  return isLegalMaskedExpandLoad(DataTy, Alignment);
}
6033 | |
6034 | bool X86TTIImpl::supportsGather() const { |
6035 | // Some CPUs have better gather performance than others. |
6036 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only |
6037 | // enable gather with a -march. |
6038 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); |
6039 | } |
6040 | |
6041 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { |
6042 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX |
6043 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend |
6044 | // it to 8 elements, but zeroing upper bits of the mask vector will add more |
6045 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: |
6046 | // Check, maybe the gather/scatter instruction is better in the VariableMask |
6047 | // case. |
6048 | unsigned NumElts = cast<FixedVectorType>(Val: VTy)->getNumElements(); |
6049 | return NumElts == 1 || |
6050 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); |
6051 | } |
6052 | |
6053 | bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) { |
6054 | Type *ScalarTy = DataTy->getScalarType(); |
6055 | if (ScalarTy->isPointerTy()) |
6056 | return true; |
6057 | |
6058 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6059 | return true; |
6060 | |
6061 | if (!ScalarTy->isIntegerTy()) |
6062 | return false; |
6063 | |
6064 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6065 | return IntWidth == 32 || IntWidth == 64; |
6066 | } |
6067 | |
6068 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { |
6069 | if (!supportsGather() || !ST->preferGather()) |
6070 | return false; |
6071 | return isLegalMaskedGatherScatter(DataTy, Alignment); |
6072 | } |
6073 | |
6074 | bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, |
6075 | unsigned Opcode1, |
6076 | const SmallBitVector &OpcodeMask) const { |
6077 | // ADDSUBPS 4xf32 SSE3 |
6078 | // VADDSUBPS 4xf32 AVX |
6079 | // VADDSUBPS 8xf32 AVX2 |
6080 | // ADDSUBPD 2xf64 SSE3 |
6081 | // VADDSUBPD 2xf64 AVX |
6082 | // VADDSUBPD 4xf64 AVX2 |
6083 | |
6084 | unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements(); |
6085 | assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible" ); |
6086 | if (!isPowerOf2_32(Value: NumElements)) |
6087 | return false; |
6088 | // Check the opcode pattern. We apply the mask on the opcode arguments and |
6089 | // then check if it is what we expect. |
6090 | for (int Lane : seq<int>(Begin: 0, End: NumElements)) { |
6091 | unsigned Opc = OpcodeMask.test(Idx: Lane) ? Opcode1 : Opcode0; |
6092 | // We expect FSub for even lanes and FAdd for odd lanes. |
6093 | if (Lane % 2 == 0 && Opc != Instruction::FSub) |
6094 | return false; |
6095 | if (Lane % 2 == 1 && Opc != Instruction::FAdd) |
6096 | return false; |
6097 | } |
6098 | // Now check that the pattern is supported by the target ISA. |
6099 | Type *ElemTy = cast<VectorType>(Val: VecTy)->getElementType(); |
6100 | if (ElemTy->isFloatTy()) |
6101 | return ST->hasSSE3() && NumElements % 4 == 0; |
6102 | if (ElemTy->isDoubleTy()) |
6103 | return ST->hasSSE3() && NumElements % 2 == 0; |
6104 | return false; |
6105 | } |
6106 | |
6107 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { |
6108 | // AVX2 doesn't support scatter |
6109 | if (!ST->hasAVX512() || !ST->preferScatter()) |
6110 | return false; |
6111 | return isLegalMaskedGatherScatter(DataTy: DataType, Alignment); |
6112 | } |
6113 | |
6114 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { |
6115 | EVT VT = TLI->getValueType(DL, Ty: DataType); |
6116 | return TLI->isOperationLegal(Op: IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); |
6117 | } |
6118 | |
6119 | bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) { |
6120 | // FDIV is always expensive, even if it has a very low uop count. |
6121 | // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? |
6122 | if (I->getOpcode() == Instruction::FDiv) |
6123 | return true; |
6124 | |
6125 | return BaseT::isExpensiveToSpeculativelyExecute(I); |
6126 | } |
6127 | |
// An ordered FP compare is never reported as cheaper than a compare against
// zero on X86.
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  return false;
}
6131 | |
6132 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, |
6133 | const Function *Callee) const { |
6134 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
6135 | |
6136 | // Work this as a subsetting of subtarget features. |
6137 | const FeatureBitset &CallerBits = |
6138 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
6139 | const FeatureBitset &CalleeBits = |
6140 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
6141 | |
6142 | // Check whether features are the same (apart from the ignore list). |
6143 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; |
6144 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; |
6145 | if (RealCallerBits == RealCalleeBits) |
6146 | return true; |
6147 | |
6148 | // If the features are a subset, we need to additionally check for calls |
6149 | // that may become ABI-incompatible as a result of inlining. |
6150 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) |
6151 | return false; |
6152 | |
6153 | for (const Instruction &I : instructions(F: Callee)) { |
6154 | if (const auto *CB = dyn_cast<CallBase>(Val: &I)) { |
6155 | // Having more target features is fine for inline ASM. |
6156 | if (CB->isInlineAsm()) |
6157 | continue; |
6158 | |
6159 | SmallVector<Type *, 8> Types; |
6160 | for (Value *Arg : CB->args()) |
6161 | Types.push_back(Elt: Arg->getType()); |
6162 | if (!CB->getType()->isVoidTy()) |
6163 | Types.push_back(Elt: CB->getType()); |
6164 | |
6165 | // Simple types are always ABI compatible. |
6166 | auto IsSimpleTy = [](Type *Ty) { |
6167 | return !Ty->isVectorTy() && !Ty->isAggregateType(); |
6168 | }; |
6169 | if (all_of(Range&: Types, P: IsSimpleTy)) |
6170 | continue; |
6171 | |
6172 | if (Function *NestedCallee = CB->getCalledFunction()) { |
6173 | // Assume that intrinsics are always ABI compatible. |
6174 | if (NestedCallee->isIntrinsic()) |
6175 | continue; |
6176 | |
6177 | // Do a precise compatibility check. |
6178 | if (!areTypesABICompatible(Caller, Callee: NestedCallee, Type: Types)) |
6179 | return false; |
6180 | } else { |
6181 | // We don't know the target features of the callee, |
6182 | // assume it is incompatible. |
6183 | return false; |
6184 | } |
6185 | } |
6186 | } |
6187 | return true; |
6188 | } |
6189 | |
6190 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, |
6191 | const Function *Callee, |
6192 | const ArrayRef<Type *> &Types) const { |
6193 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
6194 | return false; |
6195 | |
6196 | // If we get here, we know the target features match. If one function |
6197 | // considers 512-bit vectors legal and the other does not, consider them |
6198 | // incompatible. |
6199 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
6200 | |
6201 | if (TM.getSubtarget<X86Subtarget>(F: *Caller).useAVX512Regs() == |
6202 | TM.getSubtarget<X86Subtarget>(F: *Callee).useAVX512Regs()) |
6203 | return true; |
6204 | |
6205 | // Consider the arguments compatible if they aren't vectors or aggregates. |
6206 | // FIXME: Look at the size of vectors. |
6207 | // FIXME: Look at the element types of aggregates to see if there are vectors. |
6208 | return llvm::none_of(Range: Types, |
6209 | P: [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); |
6210 | } |
6211 | |
6212 | X86TTIImpl::TTI::MemCmpExpansionOptions |
6213 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
6214 | TTI::MemCmpExpansionOptions Options; |
6215 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
6216 | Options.NumLoadsPerBlock = 2; |
6217 | // All GPR and vector loads can be unaligned. |
6218 | Options.AllowOverlappingLoads = true; |
6219 | if (IsZeroCmp) { |
6220 | // Only enable vector loads for equality comparison. Right now the vector |
6221 | // version is not as fast for three way compare (see #33329). |
6222 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); |
6223 | if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) |
6224 | Options.LoadSizes.push_back(Elt: 64); |
6225 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(Elt: 32); |
6226 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(Elt: 16); |
6227 | } |
6228 | if (ST->is64Bit()) { |
6229 | Options.LoadSizes.push_back(Elt: 8); |
6230 | } |
6231 | Options.LoadSizes.push_back(Elt: 4); |
6232 | Options.LoadSizes.push_back(Elt: 2); |
6233 | Options.LoadSizes.push_back(Elt: 1); |
6234 | return Options; |
6235 | } |
6236 | |
// Vectorized addressing is only worthwhile when hardware gather is usable.
bool X86TTIImpl::prefersVectorizedAddressing() const {
  return supportsGather();
}
6240 | |
// X86 does not report efficient per-element vector load/store support.
bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
  return false;
}
6244 | |
6245 | bool X86TTIImpl::enableInterleavedAccessVectorization() { |
6246 | // TODO: We expect this to be beneficial regardless of arch, |
6247 | // but there are currently some unexplained performance artifacts on Atom. |
6248 | // As a temporary solution, disable on Atom. |
6249 | return !(ST->isAtom()); |
6250 | } |
6251 | |
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(Ty: VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
                                             NumElts: LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  if (UseMaskedMemOp)
    MemOpCost = getMaskedMemoryOpCost(Opcode, SrcTy: SingleMemOpTy, Alignment,
                                      AddressSpace, CostKind);
  else
    MemOpCost = getMemoryOpCost(Opcode, Src: SingleMemOpTy, Alignment: MaybeAlign(Alignment),
                                AddressSpace, CostKind);

  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT =
      MVT::getVectorVT(VT: TLI->getSimpleValueType(DL, Ty: VecTy->getScalarType()), NumElements: VF);

  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Mark which elements of the wide vector are actually demanded by the
    // requested interleave members (index i of a group covers lanes
    // i, i+Factor, i+2*Factor, ...).
    APInt DemandedLoadStoreElts = APInt::getZero(numBits: VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op" );
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(C&: VecTy->getContext());

    // Cost of widening the i1 condition mask to cover the interleave group.
    MaskCost = getReplicationShuffleCost(
        EltTy: I1Type, ReplicationFactor: Factor, VF,
        DemandedDstElts: UseMaskForGaps ? DemandedLoadStoreElts
                       : APInt::getAllOnes(numBits: VecTy->getNumElements()),
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(ElementType: I1Type, NumElts: VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(Opcode: BinaryOperator::And, Ty: MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {.ISD: 3, .Type: MVT::v16i8, .Cost: 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {.ISD: 3, .Type: MVT::v64i8, .Cost: 22}, //(load 192i8 and) deinterleave into 3 x 64i8
    };

    if (const auto *Entry =
            CostTableLookup(Table: AVX512InterleavedLoadTbl, ISD: Factor, Ty: VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        Kind: ShuffleKind, BaseTp: SingleMemOpTy, Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr);

    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
                                          NumElts: VecTy->getNumElements() / Factor);
    InstructionCost NumOfResults =
        getTypeLegalizationCost(Ty: ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max(a: (unsigned)1, b: (unsigned)(NumOfMemOps - 1));

    // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    // Total: shuffles per result, plus mask cost, unfolded loads, and the
    // register moves needed to preserve clobbered sources.
    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point" );
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {.ISD: 3, .Type: MVT::v16i8, .Cost: 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {.ISD: 3, .Type: MVT::v64i8, .Cost: 26}, // interleave 3 x 64i8 into 192i8 (and store)

      {.ISD: 4, .Type: MVT::v8i8, .Cost: 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {.ISD: 4, .Type: MVT::v16i8, .Cost: 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {.ISD: 4, .Type: MVT::v32i8, .Cost: 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {.ISD: 4, .Type: MVT::v64i8, .Cost: 24}  // interleave 4 x 64i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(Table: AVX512InterleavedStoreTbl, ISD: Factor, Ty: VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost = getShuffleCost(
      Kind: TTI::SK_PermuteTwoSrc, BaseTp: SingleMemOpTy, Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_PermuteTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
6409 | |
6410 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( |
6411 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, |
6412 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
6413 | bool UseMaskForCond, bool UseMaskForGaps) { |
6414 | auto *VecTy = cast<FixedVectorType>(Val: BaseTy); |
6415 | |
6416 | auto isSupportedOnAVX512 = [&](Type *VecTy) { |
6417 | Type *EltTy = cast<VectorType>(Val: VecTy)->getElementType(); |
6418 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(Bitwidth: 64) || |
6419 | EltTy->isIntegerTy(Bitwidth: 32) || EltTy->isPointerTy()) |
6420 | return true; |
6421 | if (EltTy->isIntegerTy(Bitwidth: 16) || EltTy->isIntegerTy(Bitwidth: 8) || EltTy->isHalfTy()) |
6422 | return ST->hasBWI(); |
6423 | if (EltTy->isBFloatTy()) |
6424 | return ST->hasBF16(); |
6425 | return false; |
6426 | }; |
6427 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy)) |
6428 | return getInterleavedMemoryOpCostAVX512( |
6429 | Opcode, VecTy, Factor, Indices, Alignment, |
6430 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); |
6431 | |
6432 | if (UseMaskForCond || UseMaskForGaps) |
6433 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6434 | Alignment, AddressSpace, CostKind, |
6435 | UseMaskForCond, UseMaskForGaps); |
6436 | |
6437 | // Get estimation for interleaved load/store operations for SSE-AVX2. |
6438 | // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow |
6439 | // computing the cost using a generic formula as a function of generic |
6440 | // shuffles. We therefore use a lookup table instead, filled according to |
6441 | // the instruction sequences that codegen currently generates. |
6442 | |
6443 | // VecTy for interleave memop is <VF*Factor x Elt>. |
6444 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
6445 | // VecTy = <12 x i32>. |
6446 | MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second; |
6447 | |
6448 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case |
6449 | // the VF=2, while v2i128 is an unsupported MVT vector type |
6450 | // (see MachineValueType.h::getVectorVT()). |
6451 | if (!LegalVT.isVector()) |
6452 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6453 | Alignment, AddressSpace, CostKind); |
6454 | |
6455 | unsigned VF = VecTy->getNumElements() / Factor; |
6456 | Type *ScalarTy = VecTy->getElementType(); |
6457 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. |
6458 | if (!ScalarTy->isIntegerTy()) |
6459 | ScalarTy = |
6460 | Type::getIntNTy(C&: ScalarTy->getContext(), N: DL.getTypeSizeInBits(Ty: ScalarTy)); |
6461 | |
6462 | // Get the cost of all the memory operations. |
6463 | // FIXME: discount dead loads. |
6464 | InstructionCost MemOpCosts = getMemoryOpCost( |
6465 | Opcode, Src: VecTy, Alignment: MaybeAlign(Alignment), AddressSpace, CostKind); |
6466 | |
6467 | auto *VT = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF); |
6468 | EVT ETy = TLI->getValueType(DL, Ty: VT); |
6469 | if (!ETy.isSimple()) |
6470 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6471 | Alignment, AddressSpace, CostKind); |
6472 | |
6473 | // TODO: Complete for other data-types and strides. |
6474 | // Each combination of Stride, element bit width and VF results in a different |
6475 | // sequence; The cost tables are therefore accessed with: |
6476 | // Factor (stride) and VectorType=VFxiN. |
6477 | // The Cost accounts only for the shuffle sequence; |
6478 | // The cost of the loads/stores is accounted for separately. |
6479 | // |
6480 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { |
6481 | {.ISD: 2, .Type: MVT::v2i8, .Cost: 2}, // (load 4i8 and) deinterleave into 2 x 2i8 |
6482 | {.ISD: 2, .Type: MVT::v4i8, .Cost: 2}, // (load 8i8 and) deinterleave into 2 x 4i8 |
6483 | {.ISD: 2, .Type: MVT::v8i8, .Cost: 2}, // (load 16i8 and) deinterleave into 2 x 8i8 |
6484 | {.ISD: 2, .Type: MVT::v16i8, .Cost: 4}, // (load 32i8 and) deinterleave into 2 x 16i8 |
6485 | {.ISD: 2, .Type: MVT::v32i8, .Cost: 6}, // (load 64i8 and) deinterleave into 2 x 32i8 |
6486 | |
6487 | {.ISD: 2, .Type: MVT::v8i16, .Cost: 6}, // (load 16i16 and) deinterleave into 2 x 8i16 |
6488 | {.ISD: 2, .Type: MVT::v16i16, .Cost: 9}, // (load 32i16 and) deinterleave into 2 x 16i16 |
6489 | {.ISD: 2, .Type: MVT::v32i16, .Cost: 18}, // (load 64i16 and) deinterleave into 2 x 32i16 |
6490 | |
6491 | {.ISD: 2, .Type: MVT::v8i32, .Cost: 4}, // (load 16i32 and) deinterleave into 2 x 8i32 |
6492 | {.ISD: 2, .Type: MVT::v16i32, .Cost: 8}, // (load 32i32 and) deinterleave into 2 x 16i32 |
6493 | {.ISD: 2, .Type: MVT::v32i32, .Cost: 16}, // (load 64i32 and) deinterleave into 2 x 32i32 |
6494 | |
6495 | {.ISD: 2, .Type: MVT::v4i64, .Cost: 4}, // (load 8i64 and) deinterleave into 2 x 4i64 |
6496 | {.ISD: 2, .Type: MVT::v8i64, .Cost: 8}, // (load 16i64 and) deinterleave into 2 x 8i64 |
6497 | {.ISD: 2, .Type: MVT::v16i64, .Cost: 16}, // (load 32i64 and) deinterleave into 2 x 16i64 |
6498 | {.ISD: 2, .Type: MVT::v32i64, .Cost: 32}, // (load 64i64 and) deinterleave into 2 x 32i64 |
6499 | |
6500 | {.ISD: 3, .Type: MVT::v2i8, .Cost: 3}, // (load 6i8 and) deinterleave into 3 x 2i8 |
6501 | {.ISD: 3, .Type: MVT::v4i8, .Cost: 3}, // (load 12i8 and) deinterleave into 3 x 4i8 |
6502 | {.ISD: 3, .Type: MVT::v8i8, .Cost: 6}, // (load 24i8 and) deinterleave into 3 x 8i8 |
6503 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 11}, // (load 48i8 and) deinterleave into 3 x 16i8 |
6504 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 14}, // (load 96i8 and) deinterleave into 3 x 32i8 |
6505 | |
6506 | {.ISD: 3, .Type: MVT::v2i16, .Cost: 5}, // (load 6i16 and) deinterleave into 3 x 2i16 |
6507 | {.ISD: 3, .Type: MVT::v4i16, .Cost: 7}, // (load 12i16 and) deinterleave into 3 x 4i16 |
6508 | {.ISD: 3, .Type: MVT::v8i16, .Cost: 9}, // (load 24i16 and) deinterleave into 3 x 8i16 |
6509 | {.ISD: 3, .Type: MVT::v16i16, .Cost: 28}, // (load 48i16 and) deinterleave into 3 x 16i16 |
6510 | {.ISD: 3, .Type: MVT::v32i16, .Cost: 56}, // (load 96i16 and) deinterleave into 3 x 32i16 |
6511 | |
6512 | {.ISD: 3, .Type: MVT::v2i32, .Cost: 3}, // (load 6i32 and) deinterleave into 3 x 2i32 |
6513 | {.ISD: 3, .Type: MVT::v4i32, .Cost: 3}, // (load 12i32 and) deinterleave into 3 x 4i32 |
6514 | {.ISD: 3, .Type: MVT::v8i32, .Cost: 7}, // (load 24i32 and) deinterleave into 3 x 8i32 |
6515 | {.ISD: 3, .Type: MVT::v16i32, .Cost: 14}, // (load 48i32 and) deinterleave into 3 x 16i32 |
6516 | {.ISD: 3, .Type: MVT::v32i32, .Cost: 32}, // (load 96i32 and) deinterleave into 3 x 32i32 |
6517 | |
6518 | {.ISD: 3, .Type: MVT::v2i64, .Cost: 1}, // (load 6i64 and) deinterleave into 3 x 2i64 |
6519 | {.ISD: 3, .Type: MVT::v4i64, .Cost: 5}, // (load 12i64 and) deinterleave into 3 x 4i64 |
6520 | {.ISD: 3, .Type: MVT::v8i64, .Cost: 10}, // (load 24i64 and) deinterleave into 3 x 8i64 |
6521 | {.ISD: 3, .Type: MVT::v16i64, .Cost: 20}, // (load 48i64 and) deinterleave into 3 x 16i64 |
6522 | |
6523 | {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // (load 8i8 and) deinterleave into 4 x 2i8 |
6524 | {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // (load 16i8 and) deinterleave into 4 x 4i8 |
6525 | {.ISD: 4, .Type: MVT::v8i8, .Cost: 12}, // (load 32i8 and) deinterleave into 4 x 8i8 |
6526 | {.ISD: 4, .Type: MVT::v16i8, .Cost: 24}, // (load 64i8 and) deinterleave into 4 x 16i8 |
6527 | {.ISD: 4, .Type: MVT::v32i8, .Cost: 56}, // (load 128i8 and) deinterleave into 4 x 32i8 |
6528 | |
6529 | {.ISD: 4, .Type: MVT::v2i16, .Cost: 6}, // (load 8i16 and) deinterleave into 4 x 2i16 |
6530 | {.ISD: 4, .Type: MVT::v4i16, .Cost: 17}, // (load 16i16 and) deinterleave into 4 x 4i16 |
6531 | {.ISD: 4, .Type: MVT::v8i16, .Cost: 33}, // (load 32i16 and) deinterleave into 4 x 8i16 |
6532 | {.ISD: 4, .Type: MVT::v16i16, .Cost: 75}, // (load 64i16 and) deinterleave into 4 x 16i16 |
6533 | {.ISD: 4, .Type: MVT::v32i16, .Cost: 150}, // (load 128i16 and) deinterleave into 4 x 32i16 |
6534 | |
6535 | {.ISD: 4, .Type: MVT::v2i32, .Cost: 4}, // (load 8i32 and) deinterleave into 4 x 2i32 |
6536 | {.ISD: 4, .Type: MVT::v4i32, .Cost: 8}, // (load 16i32 and) deinterleave into 4 x 4i32 |
6537 | {.ISD: 4, .Type: MVT::v8i32, .Cost: 16}, // (load 32i32 and) deinterleave into 4 x 8i32 |
6538 | {.ISD: 4, .Type: MVT::v16i32, .Cost: 32}, // (load 64i32 and) deinterleave into 4 x 16i32 |
6539 | {.ISD: 4, .Type: MVT::v32i32, .Cost: 68}, // (load 128i32 and) deinterleave into 4 x 32i32 |
6540 | |
6541 | {.ISD: 4, .Type: MVT::v2i64, .Cost: 6}, // (load 8i64 and) deinterleave into 4 x 2i64 |
6542 | {.ISD: 4, .Type: MVT::v4i64, .Cost: 8}, // (load 16i64 and) deinterleave into 4 x 4i64 |
6543 | {.ISD: 4, .Type: MVT::v8i64, .Cost: 20}, // (load 32i64 and) deinterleave into 4 x 8i64 |
6544 | {.ISD: 4, .Type: MVT::v16i64, .Cost: 40}, // (load 64i64 and) deinterleave into 4 x 16i64 |
6545 | |
6546 | {.ISD: 6, .Type: MVT::v2i8, .Cost: 6}, // (load 12i8 and) deinterleave into 6 x 2i8 |
6547 | {.ISD: 6, .Type: MVT::v4i8, .Cost: 14}, // (load 24i8 and) deinterleave into 6 x 4i8 |
6548 | {.ISD: 6, .Type: MVT::v8i8, .Cost: 18}, // (load 48i8 and) deinterleave into 6 x 8i8 |
6549 | {.ISD: 6, .Type: MVT::v16i8, .Cost: 43}, // (load 96i8 and) deinterleave into 6 x 16i8 |
6550 | {.ISD: 6, .Type: MVT::v32i8, .Cost: 82}, // (load 192i8 and) deinterleave into 6 x 32i8 |
6551 | |
6552 | {.ISD: 6, .Type: MVT::v2i16, .Cost: 13}, // (load 12i16 and) deinterleave into 6 x 2i16 |
6553 | {.ISD: 6, .Type: MVT::v4i16, .Cost: 9}, // (load 24i16 and) deinterleave into 6 x 4i16 |
6554 | {.ISD: 6, .Type: MVT::v8i16, .Cost: 39}, // (load 48i16 and) deinterleave into 6 x 8i16 |
6555 | {.ISD: 6, .Type: MVT::v16i16, .Cost: 106}, // (load 96i16 and) deinterleave into 6 x 16i16 |
6556 | {.ISD: 6, .Type: MVT::v32i16, .Cost: 212}, // (load 192i16 and) deinterleave into 6 x 32i16 |
6557 | |
6558 | {.ISD: 6, .Type: MVT::v2i32, .Cost: 6}, // (load 12i32 and) deinterleave into 6 x 2i32 |
6559 | {.ISD: 6, .Type: MVT::v4i32, .Cost: 15}, // (load 24i32 and) deinterleave into 6 x 4i32 |
6560 | {.ISD: 6, .Type: MVT::v8i32, .Cost: 31}, // (load 48i32 and) deinterleave into 6 x 8i32 |
6561 | {.ISD: 6, .Type: MVT::v16i32, .Cost: 64}, // (load 96i32 and) deinterleave into 6 x 16i32 |
6562 | |
6563 | {.ISD: 6, .Type: MVT::v2i64, .Cost: 6}, // (load 12i64 and) deinterleave into 6 x 2i64 |
6564 | {.ISD: 6, .Type: MVT::v4i64, .Cost: 18}, // (load 24i64 and) deinterleave into 6 x 4i64 |
6565 | {.ISD: 6, .Type: MVT::v8i64, .Cost: 36}, // (load 48i64 and) deinterleave into 6 x 8i64 |
6566 | |
6567 | {.ISD: 8, .Type: MVT::v8i32, .Cost: 40} // (load 64i32 and) deinterleave into 8 x 8i32 |
6568 | }; |
6569 | |
6570 | static const CostTblEntry SSSE3InterleavedLoadTbl[] = { |
6571 | {.ISD: 2, .Type: MVT::v4i16, .Cost: 2}, // (load 8i16 and) deinterleave into 2 x 4i16 |
6572 | }; |
6573 | |
6574 | static const CostTblEntry SSE2InterleavedLoadTbl[] = { |
6575 | {.ISD: 2, .Type: MVT::v2i16, .Cost: 2}, // (load 4i16 and) deinterleave into 2 x 2i16 |
6576 | {.ISD: 2, .Type: MVT::v4i16, .Cost: 7}, // (load 8i16 and) deinterleave into 2 x 4i16 |
6577 | |
6578 | {.ISD: 2, .Type: MVT::v2i32, .Cost: 2}, // (load 4i32 and) deinterleave into 2 x 2i32 |
6579 | {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // (load 8i32 and) deinterleave into 2 x 4i32 |
6580 | |
6581 | {.ISD: 2, .Type: MVT::v2i64, .Cost: 2}, // (load 4i64 and) deinterleave into 2 x 2i64 |
6582 | }; |
6583 | |
6584 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { |
6585 | {.ISD: 2, .Type: MVT::v16i8, .Cost: 3}, // interleave 2 x 16i8 into 32i8 (and store) |
6586 | {.ISD: 2, .Type: MVT::v32i8, .Cost: 4}, // interleave 2 x 32i8 into 64i8 (and store) |
6587 | |
6588 | {.ISD: 2, .Type: MVT::v8i16, .Cost: 3}, // interleave 2 x 8i16 into 16i16 (and store) |
6589 | {.ISD: 2, .Type: MVT::v16i16, .Cost: 4}, // interleave 2 x 16i16 into 32i16 (and store) |
6590 | {.ISD: 2, .Type: MVT::v32i16, .Cost: 8}, // interleave 2 x 32i16 into 64i16 (and store) |
6591 | |
6592 | {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // interleave 2 x 4i32 into 8i32 (and store) |
6593 | {.ISD: 2, .Type: MVT::v8i32, .Cost: 4}, // interleave 2 x 8i32 into 16i32 (and store) |
6594 | {.ISD: 2, .Type: MVT::v16i32, .Cost: 8}, // interleave 2 x 16i32 into 32i32 (and store) |
6595 | {.ISD: 2, .Type: MVT::v32i32, .Cost: 16}, // interleave 2 x 32i32 into 64i32 (and store) |
6596 | |
6597 | {.ISD: 2, .Type: MVT::v2i64, .Cost: 2}, // interleave 2 x 2i64 into 4i64 (and store) |
6598 | {.ISD: 2, .Type: MVT::v4i64, .Cost: 4}, // interleave 2 x 4i64 into 8i64 (and store) |
6599 | {.ISD: 2, .Type: MVT::v8i64, .Cost: 8}, // interleave 2 x 8i64 into 16i64 (and store) |
6600 | {.ISD: 2, .Type: MVT::v16i64, .Cost: 16}, // interleave 2 x 16i64 into 32i64 (and store) |
6601 | {.ISD: 2, .Type: MVT::v32i64, .Cost: 32}, // interleave 2 x 32i64 into 64i64 (and store) |
6602 | |
6603 | {.ISD: 3, .Type: MVT::v2i8, .Cost: 4}, // interleave 3 x 2i8 into 6i8 (and store) |
6604 | {.ISD: 3, .Type: MVT::v4i8, .Cost: 4}, // interleave 3 x 4i8 into 12i8 (and store) |
6605 | {.ISD: 3, .Type: MVT::v8i8, .Cost: 6}, // interleave 3 x 8i8 into 24i8 (and store) |
6606 | {.ISD: 3, .Type: MVT::v16i8, .Cost: 11}, // interleave 3 x 16i8 into 48i8 (and store) |
6607 | {.ISD: 3, .Type: MVT::v32i8, .Cost: 13}, // interleave 3 x 32i8 into 96i8 (and store) |
6608 | |
6609 | {.ISD: 3, .Type: MVT::v2i16, .Cost: 4}, // interleave 3 x 2i16 into 6i16 (and store) |
6610 | {.ISD: 3, .Type: MVT::v4i16, .Cost: 6}, // interleave 3 x 4i16 into 12i16 (and store) |
6611 | {.ISD: 3, .Type: MVT::v8i16, .Cost: 12}, // interleave 3 x 8i16 into 24i16 (and store) |
6612 | {.ISD: 3, .Type: MVT::v16i16, .Cost: 27}, // interleave 3 x 16i16 into 48i16 (and store) |
6613 | {.ISD: 3, .Type: MVT::v32i16, .Cost: 54}, // interleave 3 x 32i16 into 96i16 (and store) |
6614 | |
6615 | {.ISD: 3, .Type: MVT::v2i32, .Cost: 4}, // interleave 3 x 2i32 into 6i32 (and store) |
6616 | {.ISD: 3, .Type: MVT::v4i32, .Cost: 5}, // interleave 3 x 4i32 into 12i32 (and store) |
6617 | {.ISD: 3, .Type: MVT::v8i32, .Cost: 11}, // interleave 3 x 8i32 into 24i32 (and store) |
6618 | {.ISD: 3, .Type: MVT::v16i32, .Cost: 22}, // interleave 3 x 16i32 into 48i32 (and store) |
6619 | {.ISD: 3, .Type: MVT::v32i32, .Cost: 48}, // interleave 3 x 32i32 into 96i32 (and store) |
6620 | |
6621 | {.ISD: 3, .Type: MVT::v2i64, .Cost: 4}, // interleave 3 x 2i64 into 6i64 (and store) |
6622 | {.ISD: 3, .Type: MVT::v4i64, .Cost: 6}, // interleave 3 x 4i64 into 12i64 (and store) |
6623 | {.ISD: 3, .Type: MVT::v8i64, .Cost: 12}, // interleave 3 x 8i64 into 24i64 (and store) |
6624 | {.ISD: 3, .Type: MVT::v16i64, .Cost: 24}, // interleave 3 x 16i64 into 48i64 (and store) |
6625 | |
6626 | {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // interleave 4 x 2i8 into 8i8 (and store) |
6627 | {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // interleave 4 x 4i8 into 16i8 (and store) |
6628 | {.ISD: 4, .Type: MVT::v8i8, .Cost: 4}, // interleave 4 x 8i8 into 32i8 (and store) |
6629 | {.ISD: 4, .Type: MVT::v16i8, .Cost: 8}, // interleave 4 x 16i8 into 64i8 (and store) |
6630 | {.ISD: 4, .Type: MVT::v32i8, .Cost: 12}, // interleave 4 x 32i8 into 128i8 (and store) |
6631 | |
6632 | {.ISD: 4, .Type: MVT::v2i16, .Cost: 2}, // interleave 4 x 2i16 into 8i16 (and store) |
6633 | {.ISD: 4, .Type: MVT::v4i16, .Cost: 6}, // interleave 4 x 4i16 into 16i16 (and store) |
6634 | {.ISD: 4, .Type: MVT::v8i16, .Cost: 10}, // interleave 4 x 8i16 into 32i16 (and store) |
6635 | {.ISD: 4, .Type: MVT::v16i16, .Cost: 32}, // interleave 4 x 16i16 into 64i16 (and store) |
6636 | {.ISD: 4, .Type: MVT::v32i16, .Cost: 64}, // interleave 4 x 32i16 into 128i16 (and store) |
6637 | |
6638 | {.ISD: 4, .Type: MVT::v2i32, .Cost: 5}, // interleave 4 x 2i32 into 8i32 (and store) |
6639 | {.ISD: 4, .Type: MVT::v4i32, .Cost: 6}, // interleave 4 x 4i32 into 16i32 (and store) |
6640 | {.ISD: 4, .Type: MVT::v8i32, .Cost: 16}, // interleave 4 x 8i32 into 32i32 (and store) |
6641 | {.ISD: 4, .Type: MVT::v16i32, .Cost: 32}, // interleave 4 x 16i32 into 64i32 (and store) |
6642 | {.ISD: 4, .Type: MVT::v32i32, .Cost: 64}, // interleave 4 x 32i32 into 128i32 (and store) |
6643 | |
6644 | {.ISD: 4, .Type: MVT::v2i64, .Cost: 6}, // interleave 4 x 2i64 into 8i64 (and store) |
6645 | {.ISD: 4, .Type: MVT::v4i64, .Cost: 8}, // interleave 4 x 4i64 into 16i64 (and store) |
6646 | {.ISD: 4, .Type: MVT::v8i64, .Cost: 20}, // interleave 4 x 8i64 into 32i64 (and store) |
6647 | {.ISD: 4, .Type: MVT::v16i64, .Cost: 40}, // interleave 4 x 16i64 into 64i64 (and store) |
6648 | |
6649 | {.ISD: 6, .Type: MVT::v2i8, .Cost: 7}, // interleave 6 x 2i8 into 12i8 (and store) |
6650 | {.ISD: 6, .Type: MVT::v4i8, .Cost: 9}, // interleave 6 x 4i8 into 24i8 (and store) |
6651 | {.ISD: 6, .Type: MVT::v8i8, .Cost: 16}, // interleave 6 x 8i8 into 48i8 (and store) |
6652 | {.ISD: 6, .Type: MVT::v16i8, .Cost: 27}, // interleave 6 x 16i8 into 96i8 (and store) |
6653 | {.ISD: 6, .Type: MVT::v32i8, .Cost: 90}, // interleave 6 x 32i8 into 192i8 (and store) |
6654 | |
6655 | {.ISD: 6, .Type: MVT::v2i16, .Cost: 10}, // interleave 6 x 2i16 into 12i16 (and store) |
6656 | {.ISD: 6, .Type: MVT::v4i16, .Cost: 15}, // interleave 6 x 4i16 into 24i16 (and store) |
6657 | {.ISD: 6, .Type: MVT::v8i16, .Cost: 21}, // interleave 6 x 8i16 into 48i16 (and store) |
6658 | {.ISD: 6, .Type: MVT::v16i16, .Cost: 58}, // interleave 6 x 16i16 into 96i16 (and store) |
6659 | {.ISD: 6, .Type: MVT::v32i16, .Cost: 90}, // interleave 6 x 32i16 into 192i16 (and store) |
6660 | |
6661 | {.ISD: 6, .Type: MVT::v2i32, .Cost: 9}, // interleave 6 x 2i32 into 12i32 (and store) |
6662 | {.ISD: 6, .Type: MVT::v4i32, .Cost: 12}, // interleave 6 x 4i32 into 24i32 (and store) |
6663 | {.ISD: 6, .Type: MVT::v8i32, .Cost: 33}, // interleave 6 x 8i32 into 48i32 (and store) |
6664 | {.ISD: 6, .Type: MVT::v16i32, .Cost: 66}, // interleave 6 x 16i32 into 96i32 (and store) |
6665 | |
6666 | {.ISD: 6, .Type: MVT::v2i64, .Cost: 8}, // interleave 6 x 2i64 into 12i64 (and store) |
6667 | {.ISD: 6, .Type: MVT::v4i64, .Cost: 15}, // interleave 6 x 4i64 into 24i64 (and store) |
6668 | {.ISD: 6, .Type: MVT::v8i64, .Cost: 30}, // interleave 6 x 8i64 into 48i64 (and store) |
6669 | }; |
6670 | |
// Shuffle costs for fully interleaving N small source vectors into one wide
// vector ahead of a single wide store, on targets with only SSE2 available
// (consulted after the AVX2 table misses). Keyed by interleave factor (ISD
// field) and the per-member element vector type; the Cost field is the
// shuffle overhead on top of the memory-operation cost itself.
static const CostTblEntry SSE2InterleavedStoreTbl[] = {
{.ISD: 2, .Type: MVT::v2i8, .Cost: 1}, // interleave 2 x 2i8 into 4i8 (and store)
{.ISD: 2, .Type: MVT::v4i8, .Cost: 1}, // interleave 2 x 4i8 into 8i8 (and store)
{.ISD: 2, .Type: MVT::v8i8, .Cost: 1}, // interleave 2 x 8i8 into 16i8 (and store)

{.ISD: 2, .Type: MVT::v2i16, .Cost: 1}, // interleave 2 x 2i16 into 4i16 (and store)
{.ISD: 2, .Type: MVT::v4i16, .Cost: 1}, // interleave 2 x 4i16 into 8i16 (and store)

{.ISD: 2, .Type: MVT::v2i32, .Cost: 1}, // interleave 2 x 2i32 into 4i32 (and store)
};
6681 | |
6682 | if (Opcode == Instruction::Load) { |
6683 | auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), |
6684 | MemOpCosts](const CostTblEntry *Entry) { |
6685 | // NOTE: this is just an approximation! |
6686 | // It can over/under -estimate the cost! |
6687 | return MemOpCosts + divideCeil(Numerator: NumMembers * Entry->Cost, Denominator: Factor); |
6688 | }; |
6689 | |
6690 | if (ST->hasAVX2()) |
6691 | if (const auto *Entry = CostTableLookup(Table: AVX2InterleavedLoadTbl, ISD: Factor, |
6692 | Ty: ETy.getSimpleVT())) |
6693 | return GetDiscountedCost(Entry); |
6694 | |
6695 | if (ST->hasSSSE3()) |
6696 | if (const auto *Entry = CostTableLookup(Table: SSSE3InterleavedLoadTbl, ISD: Factor, |
6697 | Ty: ETy.getSimpleVT())) |
6698 | return GetDiscountedCost(Entry); |
6699 | |
6700 | if (ST->hasSSE2()) |
6701 | if (const auto *Entry = CostTableLookup(Table: SSE2InterleavedLoadTbl, ISD: Factor, |
6702 | Ty: ETy.getSimpleVT())) |
6703 | return GetDiscountedCost(Entry); |
6704 | } else { |
6705 | assert(Opcode == Instruction::Store && |
6706 | "Expected Store Instruction at this point" ); |
6707 | assert((!Indices.size() || Indices.size() == Factor) && |
6708 | "Interleaved store only supports fully-interleaved groups." ); |
6709 | if (ST->hasAVX2()) |
6710 | if (const auto *Entry = CostTableLookup(Table: AVX2InterleavedStoreTbl, ISD: Factor, |
6711 | Ty: ETy.getSimpleVT())) |
6712 | return MemOpCosts + Entry->Cost; |
6713 | |
6714 | if (ST->hasSSE2()) |
6715 | if (const auto *Entry = CostTableLookup(Table: SSE2InterleavedStoreTbl, ISD: Factor, |
6716 | Ty: ETy.getSimpleVT())) |
6717 | return MemOpCosts + Entry->Cost; |
6718 | } |
6719 | |
6720 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6721 | Alignment, AddressSpace, CostKind, |
6722 | UseMaskForCond, UseMaskForGaps); |
6723 | } |
6724 | |
6725 | InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
6726 | StackOffset BaseOffset, |
6727 | bool HasBaseReg, int64_t Scale, |
6728 | unsigned AddrSpace) const { |
6729 | // Scaling factors are not free at all. |
6730 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), |
6731 | // will take 2 allocations in the out of order engine instead of 1 |
6732 | // for plain addressing mode, i.e. inst (reg1). |
6733 | // E.g., |
6734 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 |
6735 | // Requires two allocations (one for the load, one for the computation) |
6736 | // whereas: |
6737 | // vaddps (%rsi), %ymm0, %ymm1 |
6738 | // Requires just 1 allocation, i.e., freeing allocations for other operations |
6739 | // and having less micro operations to execute. |
6740 | // |
6741 | // For some X86 architectures, this is even worse because for instance for |
6742 | // stores, the complex addressing mode forces the instruction to use the |
6743 | // "load" ports instead of the dedicated "store" port. |
6744 | // E.g., on Haswell: |
6745 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. |
6746 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. |
6747 | TargetLoweringBase::AddrMode AM; |
6748 | AM.BaseGV = BaseGV; |
6749 | AM.BaseOffs = BaseOffset.getFixed(); |
6750 | AM.HasBaseReg = HasBaseReg; |
6751 | AM.Scale = Scale; |
6752 | AM.ScalableOffset = BaseOffset.getScalable(); |
6753 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
6754 | // Scale represents reg2 * scale, thus account for 1 |
6755 | // as soon as we use a second register. |
6756 | return AM.Scale != 0; |
6757 | return -1; |
6758 | } |
6759 | |
6760 | InstructionCost X86TTIImpl::getBranchMispredictPenalty() const { |
6761 | // TODO: Hook MispredictPenalty of SchedMachineModel into this. |
6762 | return 14; |
6763 | } |
6764 | |