1//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file defines the WebAssembly-specific TargetTransformInfo
11/// implementation.
12///
13//===----------------------------------------------------------------------===//
14
15#include "WebAssemblyTargetTransformInfo.h"
16#include "llvm/IR/IntrinsicInst.h"
17#include "llvm/IR/IntrinsicsWebAssembly.h"
18#include "llvm/Transforms/InstCombine/InstCombiner.h"
19
20#include "llvm/CodeGen/CostTable.h"
21using namespace llvm;
22
23#define DEBUG_TYPE "wasmtti"
24
25TargetTransformInfo::PopcntSupportKind
26WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
27 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
28 return TargetTransformInfo::PSK_FastHardware;
29}
30
31unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
32 unsigned Result = BaseT::getNumberOfRegisters(ClassID);
33
34 // For SIMD, use at least 16 registers, as a rough guess.
35 bool Vector = (ClassID == 1);
36 if (Vector)
37 Result = std::max(a: Result, b: 16u);
38
39 return Result;
40}
41
42TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
43 TargetTransformInfo::RegisterKind K) const {
44 switch (K) {
45 case TargetTransformInfo::RGK_Scalar:
46 return TypeSize::getFixed(ExactSize: 64);
47 case TargetTransformInfo::RGK_FixedWidthVector:
48 return TypeSize::getFixed(ExactSize: getST()->hasSIMD128() ? 128 : 64);
49 case TargetTransformInfo::RGK_ScalableVector:
50 return TypeSize::getScalable(MinimumSize: 0);
51 }
52
53 llvm_unreachable("Unsupported register kind");
54}
55
56InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
57 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
58 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
59 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
60
61 if (ST->hasSIMD128()) {
62 static const CostTblEntry ArithCostTbl[]{
63 // extmul + (maybe awkward) shuffle
64 {.ISD: ISD::MUL, .Type: MVT::v8i8, .Cost: 4},
65 // 2x extmul + (okay) shuffle
66 {.ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: 4},
67 // extmul
68 {.ISD: ISD::MUL, .Type: MVT::v4i16, .Cost: 1},
69 // extmul
70 {.ISD: ISD::MUL, .Type: MVT::v2i32, .Cost: 1},
71 };
72 EVT DstVT = TLI->getValueType(DL, Ty);
73 if (DstVT.isSimple()) {
74 int ISD = TLI->InstructionOpcodeToISD(Opcode);
75 if (const auto *Entry =
76 CostTableLookup(Table: ArithCostTbl, ISD, Ty: DstVT.getSimpleVT()))
77 return Entry->Cost;
78 }
79 }
80
81 InstructionCost Cost =
82 BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
83 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
84
85 if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) {
86 switch (Opcode) {
87 case Instruction::LShr:
88 case Instruction::AShr:
89 case Instruction::Shl:
90 // SIMD128's shifts currently only accept a scalar shift count. For each
91 // element, we'll need to extract, op, insert. The following is a rough
92 // approximation.
93 if (!Op2Info.isUniform())
94 Cost =
95 cast<FixedVectorType>(Val: VTy)->getNumElements() *
96 (TargetTransformInfo::TCC_Basic +
97 getArithmeticInstrCost(Opcode, Ty: VTy->getElementType(), CostKind) +
98 TargetTransformInfo::TCC_Basic);
99 break;
100 }
101 }
102 return Cost;
103}
104
105InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
106 unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
107 TTI::TargetCostKind CostKind, const Instruction *I) const {
108 int ISD = TLI->InstructionOpcodeToISD(Opcode);
109 auto SrcTy = TLI->getValueType(DL, Ty: Src);
110 auto DstTy = TLI->getValueType(DL, Ty: Dst);
111
112 if (!SrcTy.isSimple() || !DstTy.isSimple()) {
113 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
114 }
115
116 if (!ST->hasSIMD128()) {
117 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
118 }
119
120 auto DstVT = DstTy.getSimpleVT();
121 auto SrcVT = SrcTy.getSimpleVT();
122
123 if (I && I->hasOneUser()) {
124 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
125 int UserISD = TLI->InstructionOpcodeToISD(Opcode: SingleUser->getOpcode());
126
127 // extmul_low support
128 if (UserISD == ISD::MUL &&
129 (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
130 // Free low extensions.
131 if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
132 (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
133 (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
134 return 0;
135 }
136 // Will require an additional extlow operation for the intermediate
137 // i16/i32 value.
138 if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
139 (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
140 return 1;
141 }
142 }
143 }
144
145 static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
146 // extend_low
147 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1},
148 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1},
149 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1},
150 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1},
151 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1},
152 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1},
153 // 2 x extend_low
154 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2},
155 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2},
156 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2},
157 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2},
158 // extend_low, extend_high
159 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
160 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
161 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
162 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
163 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
164 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
165 // 2x extend_low, extend_high
166 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: 4},
167 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: 4},
168 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: 4},
169 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: 4},
170 // 6x extend_low, extend_high
171 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
172 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
173 // shuffle
174 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i32, .Cost: 2},
175 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i32, .Cost: 4},
176 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 2},
177 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 4},
178 // narrow, and
179 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 2},
180 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 2},
181 // narrow, 2x and
182 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 3},
183 // 3x narrow, 4x and
184 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 7},
185 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 7},
186 // 7x narrow, 8x and
187 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 15},
188 // convert_i32x4
189 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
190 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
191 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
192 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
193 // extend_low, convert
194 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2},
195 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2},
196 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
197 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
198 // extend_low x 2, convert
199 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
200 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
201 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
202 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
203 // several shuffles
204 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
205 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
206 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 10},
207 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
208 /// trunc_sat, const, and, 3x narrow
209 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 6},
210 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 6},
211 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 6},
212 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 6},
213 /// trunc_sat, const, and, narrow
214 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 4},
215 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 4},
216 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 4},
217 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 4},
218 // 2x trunc_sat, const, 2x and, 3x narrow
219 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f32, .Cost: 8},
220 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f32, .Cost: 8},
221 // 2x trunc_sat, const, 2x and, narrow
222 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 6},
223 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 6},
224 };
225
226 if (const auto *Entry =
227 ConvertCostTableLookup(Table: ConversionTbl, ISD, Dst: DstVT, Src: SrcVT)) {
228 return Entry->Cost;
229 }
230
231 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
232}
233
234WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
235WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
236 TTI::MemCmpExpansionOptions Options;
237
238 Options.AllowOverlappingLoads = true;
239
240 if (ST->hasSIMD128())
241 Options.LoadSizes.push_back(Elt: 16);
242
243 Options.LoadSizes.append(IL: {8, 4, 2, 1});
244 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
245 Options.NumLoadsPerBlock = Options.MaxNumLoads;
246
247 return Options;
248}
249
250InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
251 unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
252 TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
253 const Instruction *I) const {
254 // FIXME: Load latency isn't handled here
255 if (!ST->hasSIMD128() || !isa<FixedVectorType>(Val: Ty) ||
256 (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)) {
257 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
258 CostKind);
259 }
260
261 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
262 // Type legalization can't handle structs
263 if (VT == MVT::Other)
264 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
265 CostKind);
266
267 auto LT = getTypeLegalizationCost(Ty);
268 if (!LT.first.isValid())
269 return InstructionCost::getInvalid();
270
271 int ISD = TLI->InstructionOpcodeToISD(Opcode);
272 unsigned width = VT.getSizeInBits();
273 if (ISD == ISD::LOAD) {
274 // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
275 // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
276 // loads are twice as expensive as scalar.
277 switch (width) {
278 default:
279 break;
280 case 32:
281 case 64:
282 case 128:
283 return 2;
284 }
285 } else if (ISD == ISD::STORE) {
286 // For stores, we can use store lane operations.
287 switch (width) {
288 default:
289 break;
290 case 8:
291 case 16:
292 case 32:
293 case 64:
294 case 128:
295 return 2;
296 }
297 }
298
299 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, CostKind);
300}
301
302InstructionCost WebAssemblyTTIImpl::getShuffleCost(
303 TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
304 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
305 VectorType *SubTp, ArrayRef<const Value *> Args,
306 const Instruction *CxtI) const {
307 // Canonicalize the ShuffleKind in case optimizations didn't.
308 // Otherwise, we might end up with the wrong ShuffleKind to match against.
309
310 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
311
312 // Wasm SIMD128 has native splat instructions for all lane types.
313 if (ST->hasSIMD128() && Kind == TTI::SK_Broadcast &&
314 isa<FixedVectorType>(Val: SrcTy))
315 return 1;
316
317 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
318 Args, CxtI);
319}
320
321InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
322 unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
323 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
324 bool UseMaskForCond, bool UseMaskForGaps) const {
325 assert(Factor >= 2 && "Invalid interleave factor");
326
327 auto *VecTy = cast<VectorType>(Val: Ty);
328 if (!ST->hasSIMD128() || !isa<FixedVectorType>(Val: VecTy)) {
329 return InstructionCost::getInvalid();
330 }
331
332 if (UseMaskForCond || UseMaskForGaps)
333 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy: Ty, Factor, Indices,
334 Alignment, AddressSpace, CostKind,
335 UseMaskForCond, UseMaskForGaps);
336
337 constexpr unsigned MaxInterleaveFactor = 4;
338 if (Factor <= MaxInterleaveFactor) {
339 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
340 // Ensure the number of vector elements is greater than 1.
341 if (MinElts < 2 || MinElts % Factor != 0)
342 return InstructionCost::getInvalid();
343
344 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
345 // Ensure the element type is legal.
346 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
347 return InstructionCost::getInvalid();
348
349 if (Factor != 2 && Factor != 4)
350 return InstructionCost::getInvalid();
351
352 auto *SubVecTy =
353 VectorType::get(ElementType: VecTy->getElementType(),
354 EC: VecTy->getElementCount().divideCoefficientBy(RHS: Factor));
355 InstructionCost MemCost =
356 getMemoryOpCost(Opcode, Ty: SubVecTy, Alignment, AddressSpace, CostKind);
357
358 unsigned VecSize = DL.getTypeSizeInBits(Ty: SubVecTy);
359 unsigned MaxVecSize = 128;
360 unsigned NumAccesses =
361 std::max<unsigned>(a: 1, b: (MinElts * ElSize + MaxVecSize - 1) / VecSize);
362
363 // A stride of two is commonly supported via dedicated instructions, so it
364 // should be relatively cheap for all element sizes. A stride of four is
365 // more expensive as it will likely require more shuffles. Using two
366 // simd128 inputs is considered more expensive and we mainly account for
367 // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
368 // arithmetic kernels with smaller (i8/i16) inputs.
369 static const CostTblEntry ShuffleCostTbl[] = {
370 // One reg.
371 {.ISD: 2, .Type: MVT::v2i8, .Cost: 1}, // interleave 2 x 2i8 into 4i8
372 {.ISD: 2, .Type: MVT::v4i8, .Cost: 1}, // interleave 2 x 4i8 into 8i8
373 {.ISD: 2, .Type: MVT::v8i8, .Cost: 1}, // interleave 2 x 8i8 into 16i8
374 {.ISD: 2, .Type: MVT::v2i16, .Cost: 1}, // interleave 2 x 2i16 into 4i16
375 {.ISD: 2, .Type: MVT::v4i16, .Cost: 1}, // interleave 2 x 4i16 into 8i16
376 {.ISD: 2, .Type: MVT::v2i32, .Cost: 1}, // interleave 2 x 2i32 into 4i32
377 {.ISD: 2, .Type: MVT::v2f32, .Cost: 1}, // interleave 2 x 2f32 into 4f32
378
379 // Two regs.
380 {.ISD: 2, .Type: MVT::v16i8, .Cost: 2}, // interleave 2 x 16i8 into 32i8
381 {.ISD: 2, .Type: MVT::v8i16, .Cost: 2}, // interleave 2 x 8i16 into 16i16
382 {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // interleave 2 x 4i32 into 8i32
383 {.ISD: 2, .Type: MVT::v4f32, .Cost: 2}, // interleave 2 x 4f32 into 8f32
384
385 // One reg.
386 {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // interleave 4 x 2i8 into 8i8
387 {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // interleave 4 x 4i8 into 16i8
388 {.ISD: 4, .Type: MVT::v2i16, .Cost: 4}, // interleave 4 x 2i16 into 8i16
389
390 // Two regs.
391 {.ISD: 4, .Type: MVT::v8i8, .Cost: 16}, // interleave 4 x 8i8 into 32i8
392 {.ISD: 4, .Type: MVT::v4i16, .Cost: 8}, // interleave 4 x 4i16 into 16i16
393 {.ISD: 4, .Type: MVT::v2i32, .Cost: 4}, // interleave 4 x 2i32 into 8i32
394 {.ISD: 4, .Type: MVT::v2f32, .Cost: 4}, // interleave 4 x 2f32 into 8f32
395
396 // Four regs.
397 {.ISD: 4, .Type: MVT::v4i32, .Cost: 16}, // interleave 4 x 4i32 into 16i32
398 };
399
400 EVT ETy = TLI->getValueType(DL, Ty: SubVecTy);
401 if (const auto *Entry =
402 CostTableLookup(Table: ShuffleCostTbl, ISD: Factor, Ty: ETy.getSimpleVT()))
403 return Entry->Cost + (NumAccesses * MemCost);
404 }
405
406 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
407 Alignment, AddressSpace, CostKind,
408 UseMaskForCond, UseMaskForGaps);
409}
410
411InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
412 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
413 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
414 InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
415 Opcode, Val, CostKind, Index, Op0, Op1, VIC);
416
417 // SIMD128's insert/extract currently only take constant indices.
418 if (Index == -1u)
419 return Cost + 25 * TargetTransformInfo::TCC_Expensive;
420
421 return Cost;
422}
423
424InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
425 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
426 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
427 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
428 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
429 InstructionCost Invalid = InstructionCost::getInvalid();
430 if (!VF.isFixed() || !ST->hasSIMD128())
431 return Invalid;
432
433 if (CostKind != TTI::TCK_RecipThroughput)
434 return Invalid;
435
436 if (Opcode != Instruction::Add)
437 return Invalid;
438
439 EVT AccumEVT = EVT::getEVT(Ty: AccumType);
440 // TODO: Add i64 accumulator.
441 if (AccumEVT != MVT::i32)
442 return Invalid;
443
444 // Possible options:
445 // - i16x8.extadd_pairwise_i8x16_sx
446 // - i32x4.extadd_pairwise_i16x8_sx
447 // - i32x4.dot_i16x8_s
448 // Only try to support dot, for now.
449
450 EVT InputEVT = EVT::getEVT(Ty: InputTypeA);
451 if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) ||
452 (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) {
453 return Invalid;
454 }
455
456 if (OpAExtend == TTI::PR_None)
457 return Invalid;
458
459 InstructionCost Cost(TTI::TCC_Basic);
460 if (!BinOp)
461 return Cost;
462
463 if (OpAExtend != OpBExtend)
464 return Invalid;
465
466 if (*BinOp != Instruction::Mul)
467 return Invalid;
468
469 if (InputTypeA != InputTypeB)
470 return Invalid;
471
472 // Signed inputs can lower to dot
473 if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
474 return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;
475
476 // Double the size of the lowered sequence.
477 if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
478 return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;
479
480 return Invalid;
481}
482
483TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
484 const IntrinsicInst *II) const {
485
486 switch (II->getIntrinsicID()) {
487 default:
488 break;
489 case Intrinsic::vector_reduce_fadd:
490 return TTI::ReductionShuffle::Pairwise;
491 }
492 return TTI::ReductionShuffle::SplitHalf;
493}
494
495void WebAssemblyTTIImpl::getUnrollingPreferences(
496 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
497 OptimizationRemarkEmitter *ORE) const {
498 // Scan the loop: don't unroll loops with calls. This is a standard approach
499 // for most (all?) targets.
500 for (BasicBlock *BB : L->blocks())
501 for (Instruction &I : *BB)
502 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
503 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
504 if (isLoweredToCall(F))
505 return;
506
507 // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
508 // the various microarchitectures that use the BasicTTI implementation and
509 // has been selected through heuristics across multiple cores and runtimes.
510 UP.Partial = UP.Runtime = UP.UpperBound = true;
511 UP.PartialThreshold = 30;
512
513 // Avoid unrolling when optimizing for size.
514 UP.OptSizeThreshold = 0;
515 UP.PartialOptSizeThreshold = 0;
516
517 // Set number of instructions optimized when "back edge"
518 // becomes "fall through" to default value of 2.
519 UP.BEInsns = 2;
520}
521
522bool WebAssemblyTTIImpl::supportsTailCalls() const {
523 return getST()->hasTailCall();
524}
525
526bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
527 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
528 using namespace llvm::PatternMatch;
529
530 if (!I->getType()->isVectorTy() || !I->isShift())
531 return false;
532
533 Value *V = I->getOperand(i: 1);
534 // We dont need to sink constant splat.
535 if (isa<Constant>(Val: V))
536 return false;
537
538 if (match(V, P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ZeroInt()),
539 v2: m_Value(), mask: m_ZeroMask()))) {
540 // Sink insert
541 Ops.push_back(Elt: &cast<Instruction>(Val: V)->getOperandUse(i: 0));
542 // Sink shuffle
543 Ops.push_back(Elt: &I->getOperandUse(i: 1));
544 return true;
545 }
546
547 return false;
548}
549
550/// Attempt to convert [relaxed_]swizzle to shufflevector if the mask is
551/// constant.
552static Value *simplifyWasmSwizzle(const IntrinsicInst &II,
553 InstCombiner::BuilderTy &Builder,
554 bool IsRelaxed) {
555 auto *V = dyn_cast<Constant>(Val: II.getArgOperand(i: 1));
556 if (!V)
557 return nullptr;
558
559 auto *VecTy = cast<FixedVectorType>(Val: II.getType());
560 unsigned NumElts = VecTy->getNumElements();
561 assert(NumElts == 16);
562
563 // Construct a shuffle mask from constant integers or UNDEFs.
564 int Indexes[16];
565 bool AnyOutOfBounds = false;
566
567 for (unsigned I = 0; I < NumElts; ++I) {
568 Constant *COp = V->getAggregateElement(Elt: I);
569 if (!COp || (!isa<UndefValue>(Val: COp) && !isa<ConstantInt>(Val: COp)))
570 return nullptr;
571
572 if (isa<UndefValue>(Val: COp)) {
573 Indexes[I] = -1;
574 continue;
575 }
576
577 if (IsRelaxed && cast<ConstantInt>(Val: COp)->getSExtValue() >= NumElts) {
578 // The relaxed_swizzle operation always returns 0 if the lane index is
579 // less than 0 when interpreted as a signed value. For lane indices above
580 // 15, however, it can choose between returning 0 or the lane at `Index %
581 // 16`. However, the choice must be made consistently. As the WebAssembly
582 // spec states:
583 //
584 // "The result of relaxed operators are implementation-dependent, because
585 // the set of possible results may depend on properties of the host
586 // environment, such as its hardware. Technically, their behaviour is
587 // controlled by a set of global parameters to the semantics that an
588 // implementation can instantiate in different ways. These choices are
589 // fixed, that is, parameters are constant during the execution of any
590 // given program."
591 //
592 // The WebAssembly runtime may choose differently from us, so we can't
593 // optimize a relaxed swizzle with lane indices above 15.
594 return nullptr;
595 }
596
597 uint64_t Index = cast<ConstantInt>(Val: COp)->getZExtValue();
598 if (Index >= NumElts) {
599 AnyOutOfBounds = true;
600 // If there are out-of-bounds indices, the swizzle instruction returns
601 // zeroes in those lanes. We'll provide an all-zeroes vector as the
602 // second argument to shufflevector and read the first element from it.
603 Indexes[I] = NumElts;
604 continue;
605 }
606
607 Indexes[I] = Index;
608 }
609
610 auto *V1 = II.getArgOperand(i: 0);
611 auto *V2 =
612 AnyOutOfBounds ? Constant::getNullValue(Ty: VecTy) : PoisonValue::get(T: VecTy);
613
614 return Builder.CreateShuffleVector(V1, V2, Mask: ArrayRef(Indexes, NumElts));
615}
616
617std::optional<Instruction *>
618WebAssemblyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
619 IntrinsicInst &II) const {
620 Intrinsic::ID IID = II.getIntrinsicID();
621 switch (IID) {
622 case Intrinsic::wasm_swizzle:
623 case Intrinsic::wasm_relaxed_swizzle:
624 if (Value *V = simplifyWasmSwizzle(
625 II, Builder&: IC.Builder, IsRelaxed: IID == Intrinsic::wasm_relaxed_swizzle)) {
626 return IC.replaceInstUsesWith(I&: II, V);
627 }
628 break;
629 }
630
631 return std::nullopt;
632}
633