//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//

#include "WebAssemblyTargetTransformInfo.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "wasmtti"

TargetTransformInfo::PopcntSupportKind
WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
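  // WebAssembly has dedicated popcount instructions (i32.popcnt / i64.popcnt),
  // so report fast hardware support.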
  return TargetTransformInfo::PSK_FastHardware;
}

unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  unsigned Result = BaseT::getNumberOfRegisters(ClassID);

  // For SIMD, use at least 16 registers, as a rough guess.
  bool Vector = (ClassID == 1);
  if (Vector)
    Result = std::max(Result, 16u);

  return Result;
}

TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
    TargetTransformInfo::RegisterKind K) const {
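  // Scalar value types are at most 64 bits wide (i64/f64). With SIMD128
  // enabled, v128 provides fixed-width 128-bit vectors; WebAssembly has no
  // scalable vector types.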
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(64);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  if (ST->hasSIMD128()) {
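    // Costs for multiplies on narrow vector types that can be lowered with
    // the extmul family of instructions, plus any shuffles needed to narrow
    // the result back, as noted per entry.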
    static const CostTblEntry ArithCostTbl[]{
        // extmul + (maybe awkward) shuffle
        {ISD::MUL, MVT::v8i8, 4},
        // 2x extmul + (okay) shuffle
        {ISD::MUL, MVT::v16i8, 4},
        // extmul
        {ISD::MUL, MVT::v4i16, 1},
        // extmul
        {ISD::MUL, MVT::v2i32, 1},
    };
    EVT DstVT = TLI->getValueType(DL, Ty);
    if (DstVT.isSimple()) {
      int ISD = TLI->InstructionOpcodeToISD(Opcode);
      if (const auto *Entry =
              CostTableLookup(ArithCostTbl, ISD, DstVT.getSimpleVT()))
        return Entry->Cost;
    }
  }

  InstructionCost Cost =
      BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
          Opcode, Ty, CostKind, Op1Info, Op2Info);

  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
    switch (Opcode) {
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::Shl:
      // SIMD128's shifts currently only accept a scalar shift count. For each
      // element, we'll need to extract, op, insert. The following is a rough
      // approximation.
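      // For example, a v4i32 shift with per-lane amounts is modelled as
      // 4 * (extract_lane + scalar shift + replace_lane).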
      if (!Op2Info.isUniform())
        Cost =
            cast<FixedVectorType>(VTy)->getNumElements() *
            (TargetTransformInfo::TCC_Basic +
             getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
             TargetTransformInfo::TCC_Basic);
      break;
    }
  }
  return Cost;
}

InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
    unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
    TTI::TargetCostKind CostKind, const Instruction *I) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  auto SrcTy = TLI->getValueType(DL, Src);
  auto DstTy = TLI->getValueType(DL, Dst);

  if (!SrcTy.isSimple() || !DstTy.isSimple()) {
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  }

  if (!ST->hasSIMD128()) {
    return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  }

  auto DstVT = DstTy.getSimpleVT();
  auto SrcVT = SrcTy.getSimpleVT();

  if (I && I->hasOneUser()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode());

    // extmul_low support
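    // A widening extend whose only user is a multiply can fold into the
    // i16x8/i32x4/i64x2.extmul_low_* instructions, making the extend itself
    // free.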
    if (UserISD == ISD::MUL &&
        (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
      // Free low extensions.
      if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
          (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
          (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
        return 0;
      }
      // Will require an additional extlow operation for the intermediate
      // i16/i32 value.
      if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
          (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
        return 1;
      }
    }
  }

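  // Each cost below approximates the number of SIMD instructions in the
  // lowered sequence described by the accompanying comment (extend_low /
  // extend_high, narrow, shuffle, convert, trunc_sat).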
  static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
      // extend_low
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
      {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
      // 2 x extend_low
      {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
      {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
      // extend_low, extend_high
      {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
      {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
      {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
      {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
      {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
      {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
      // 2x extend_low, extend_high
      {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4},
      {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4},
      {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4},
      {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4},
      // shuffle
      {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2},
      {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4},
      {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2},
      {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4},
      // narrow, and
      {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2},
      {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2},
      // narrow, 2x and
      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3},
      // 3x narrow, 4x and
      {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7},
      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7},
      // 7x narrow, 8x and
      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15},
      // convert_i32x4
      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
      // extend_low, convert
      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
      // extend_low x 2, convert
      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
      // several shuffles
      {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
      {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
      // trunc_sat, const, and, 3x narrow
      {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6},
      {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6},
      {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6},
      {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6},
      // trunc_sat, const, and, narrow
      {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4},
      {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4},
      {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4},
      {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4},
      // 2x trunc_sat, const, 2x and, 3x narrow
      {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8},
      {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8},
      // 2x trunc_sat, const, 2x and, narrow
      {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6},
      {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6},
  };

  if (const auto *Entry =
          ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) {
    return Entry->Cost;
  }

  return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;

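  // Prefer the widest available loads: v128 when SIMD128 is enabled, then
  // 64/32/16/8-bit scalar loads. Allowing overlapping loads lets sizes that
  // are not a multiple of the load width be expanded without a byte-wise tail.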
  Options.AllowOverlappingLoads = true;

  if (ST->hasSIMD128())
    Options.LoadSizes.push_back(16);

  Options.LoadSizes.append({8, 4, 2, 1});
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;

  return Options;
}

InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
    unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
    const Instruction *I) const {
  if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) {
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);
  }

  EVT VT = TLI->getValueType(DL, Ty, /*AllowUnknown=*/true);
  // Type legalization can't handle structs
  if (VT == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
                                  CostKind);

  auto LT = getTypeLegalizationCost(Ty);
  if (!LT.first.isValid())
    return InstructionCost::getInvalid();

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  unsigned width = VT.getSizeInBits();
  if (ISD == ISD::LOAD) {
    // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
    // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
    // loads are twice as expensive as scalar.
    switch (width) {
    default:
      break;
    case 32:
    case 64:
    case 128:
      return 2;
    }
  } else if (ISD == ISD::STORE) {
    // For stores, we can use store lane operations.
    switch (width) {
    default:
      break;
    case 8:
    case 16:
    case 32:
    case 64:
    case 128:
      return 2;
    }
  }

  return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
}

InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) const {
  assert(Factor >= 2 && "Invalid interleave factor");

  auto *VecTy = cast<VectorType>(Ty);
  if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
    return InstructionCost::getInvalid();
  }

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  constexpr unsigned MaxInterleaveFactor = 4;
  if (Factor <= MaxInterleaveFactor) {
    unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
    // Require at least two elements and a count that divides evenly by the
    // interleave factor.
    if (MinElts < 2 || MinElts % Factor != 0)
      return InstructionCost::getInvalid();

    unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
    // Ensure the element type is legal.
    if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
      return InstructionCost::getInvalid();

    if (Factor != 2 && Factor != 4)
      return InstructionCost::getInvalid();

    auto *SubVecTy =
        VectorType::get(VecTy->getElementType(),
                        VecTy->getElementCount().divideCoefficientBy(Factor));
    InstructionCost MemCost =
        getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);

    unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
    unsigned MaxVecSize = 128;
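    // Roughly how many sub-vector sized memory accesses are needed to cover
    // the whole interleaved group (at least one).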
    unsigned NumAccesses =
        std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);

    // A stride of two is commonly supported via dedicated instructions, so it
    // should be relatively cheap for all element sizes. A stride of four is
    // more expensive because it will likely require more shuffles. Groups that
    // need two simd128 inputs are considered more expensive still; we mainly
    // account for shuffling two inputs (32 bytes), but we do model 4 x v4i32
    // to enable arithmetic kernels.
    static const CostTblEntry ShuffleCostTbl[] = {
        // One reg.
        {2, MVT::v2i8, 1},  // interleave 2 x 2i8 into 4i8
        {2, MVT::v4i8, 1},  // interleave 2 x 4i8 into 8i8
        {2, MVT::v8i8, 1},  // interleave 2 x 8i8 into 16i8
        {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
        {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
        {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32

        // Two regs.
        {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
        {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
        {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32

        // One reg.
        {4, MVT::v2i8, 4},  // interleave 4 x 2i8 into 8i8
        {4, MVT::v4i8, 4},  // interleave 4 x 4i8 into 16i8
        {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16

        // Two regs.
        {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
        {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
        {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32

        // Four regs.
        {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
    };

    EVT ETy = TLI->getValueType(DL, SubVecTy);
    if (const auto *Entry =
            CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
      return Entry->Cost + (NumAccesses * MemCost);
  }

  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                           Alignment, AddressSpace, CostKind,
                                           UseMaskForCond, UseMaskForGaps);
}

InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
      Opcode, Val, CostKind, Index, Op0, Op1, VIC);

  // SIMD128's insert/extract currently only take constant indices.
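  // extract_lane / replace_lane encode the lane index as an immediate, so a
  // variable index forces a much more expensive lowering; penalise it heavily.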
  if (Index == -1u)
    return Cost + 25 * TargetTransformInfo::TCC_Expensive;

  return Cost;
}

InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
    unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
    ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
    TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
    TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
  InstructionCost Invalid = InstructionCost::getInvalid();
  if (!VF.isFixed() || !ST->hasSIMD128())
    return Invalid;

  if (CostKind != TTI::TCK_RecipThroughput)
    return Invalid;

  if (Opcode != Instruction::Add)
    return Invalid;

  EVT AccumEVT = EVT::getEVT(AccumType);
  // TODO: Add i64 accumulator.
  if (AccumEVT != MVT::i32)
    return Invalid;

  // Possible options:
  // - i16x8.extadd_pairwise_i8x16_sx
  // - i32x4.extadd_pairwise_i16x8_sx
  // - i32x4.dot_i16x8_s
  // Only try to support dot, for now.

  EVT InputEVT = EVT::getEVT(InputTypeA);
  if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) ||
        (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) {
    return Invalid;
  }

  if (OpAExtend == TTI::PR_None)
    return Invalid;

  InstructionCost Cost(TTI::TCC_Basic);
  if (!BinOp)
    return Cost;

  if (OpAExtend != OpBExtend)
    return Invalid;

  if (*BinOp != Instruction::Mul)
    return Invalid;

  if (InputTypeA != InputTypeB)
    return Invalid;

  // Signed inputs can lower to dot
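  // (i32x4.dot_i16x8_s); there is no unsigned dot instruction, so the
  // unsigned case is modelled as roughly twice as expensive.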
  if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
    return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;

  // Double the size of the lowered sequence.
  if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
    return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;

  return Invalid;
}

InstructionCost
WebAssemblyTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                          TTI::TargetCostKind CostKind) const {
  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_extract_last_active:
    // TODO: Remove once the intrinsic can be lowered without crashes.
    return InstructionCost::getInvalid();
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
    const IntrinsicInst *II) const {

  switch (II->getIntrinsicID()) {
  default:
    break;
  case Intrinsic::vector_reduce_fadd:
    return TTI::ReductionShuffle::Pairwise;
  }
  return TTI::ReductionShuffle::SplitHalf;
}

void WebAssemblyTTIImpl::getUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
    OptimizationRemarkEmitter *ORE) const {
  // Scan the loop: don't unroll loops with calls. This is a standard approach
  // for most (all?) targets.
  for (BasicBlock *BB : L->blocks())
    for (Instruction &I : *BB)
      if (isa<CallInst>(I) || isa<InvokeInst>(I))
        if (const Function *F = cast<CallBase>(I).getCalledFunction())
          if (isLoweredToCall(F))
            return;

  // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
  // the various microarchitectures that use the BasicTTI implementation and
  // has been selected through heuristics across multiple cores and runtimes.
  UP.Partial = UP.Runtime = UP.UpperBound = true;
  UP.PartialThreshold = 30;

  // Avoid unrolling when optimizing for size.
  UP.OptSizeThreshold = 0;
  UP.PartialOptSizeThreshold = 0;

  // Set number of instructions optimized when "back edge"
  // becomes "fall through" to default value of 2.
  UP.BEInsns = 2;
}

bool WebAssemblyTTIImpl::supportsTailCalls() const {
  return getST()->hasTailCall();
}

bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
  using namespace llvm::PatternMatch;

  if (!I->getType()->isVectorTy() || !I->isShift())
    return false;

  Value *V = I->getOperand(1);
  // We don't need to sink a constant splat.
  if (isa<Constant>(V))
    return false;

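  // Sink a splatted shift amount (an insertelement into lane 0 splatted by a
  // shufflevector) down to the shift, so instruction selection can use the
  // scalar amount directly for the SIMD shift.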
  if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                         m_Value(), m_ZeroMask()))) {
    // Sink insert
    Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
    // Sink shuffle
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }

  return false;
}

/// Attempt to convert [relaxed_]swizzle to shufflevector if the mask is
/// constant.
static Value *simplifyWasmSwizzle(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder,
                                  bool IsRelaxed) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert(NumElts == 16);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];
  bool AnyOutOfBounds = false;

  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    if (IsRelaxed && cast<ConstantInt>(COp)->getSExtValue() >= NumElts) {
      // The relaxed_swizzle operation always returns 0 if the lane index is
      // less than 0 when interpreted as a signed value. For lane indices above
      // 15, however, it can choose between returning 0 or the lane at `Index %
      // 16`, and that choice must be made consistently. As the WebAssembly
      // spec states:
      //
      // "The result of relaxed operators are implementation-dependent, because
      // the set of possible results may depend on properties of the host
      // environment, such as its hardware. Technically, their behaviour is
      // controlled by a set of global parameters to the semantics that an
      // implementation can instantiate in different ways. These choices are
      // fixed, that is, parameters are constant during the execution of any
      // given program."
      //
      // The WebAssembly runtime may choose differently from us, so we can't
      // optimize a relaxed swizzle with lane indices above 15.
      return nullptr;
    }

    uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
    if (Index >= NumElts) {
      AnyOutOfBounds = true;
      // If there are out-of-bounds indices, the swizzle instruction returns
      // zeroes in those lanes. We'll provide an all-zeroes vector as the
      // second argument to shufflevector and read the first element from it.
      Indexes[I] = NumElts;
      continue;
    }

    Indexes[I] = Index;
  }

  auto *V1 = II.getArgOperand(0);
  auto *V2 =
      AnyOutOfBounds ? Constant::getNullValue(VecTy) : PoisonValue::get(VecTy);

  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}

std::optional<Instruction *>
WebAssemblyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
                                         IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::wasm_swizzle:
  case Intrinsic::wasm_relaxed_swizzle:
    if (Value *V = simplifyWasmSwizzle(
            II, IC.Builder, IID == Intrinsic::wasm_relaxed_swizzle)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  }

  return std::nullopt;
}