1//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file defines the WebAssembly-specific TargetTransformInfo
11/// implementation.
12///
13//===----------------------------------------------------------------------===//
14
15#include "WebAssemblyTargetTransformInfo.h"
16#include "llvm/IR/IntrinsicInst.h"
17#include "llvm/IR/IntrinsicsWebAssembly.h"
18#include "llvm/Transforms/InstCombine/InstCombiner.h"
19
20#include "llvm/CodeGen/CostTable.h"
21using namespace llvm;
22
23#define DEBUG_TYPE "wasmtti"
24
25TargetTransformInfo::PopcntSupportKind
26WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
27 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
28 return TargetTransformInfo::PSK_FastHardware;
29}
30
31unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
32 unsigned Result = BaseT::getNumberOfRegisters(ClassID);
33
34 // For SIMD, use at least 16 registers, as a rough guess.
35 bool Vector = (ClassID == 1);
36 if (Vector)
37 Result = std::max(a: Result, b: 16u);
38
39 return Result;
40}
41
42TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
43 TargetTransformInfo::RegisterKind K) const {
44 switch (K) {
45 case TargetTransformInfo::RGK_Scalar:
46 return TypeSize::getFixed(ExactSize: 64);
47 case TargetTransformInfo::RGK_FixedWidthVector:
48 return TypeSize::getFixed(ExactSize: getST()->hasSIMD128() ? 128 : 64);
49 case TargetTransformInfo::RGK_ScalableVector:
50 return TypeSize::getScalable(MinimumSize: 0);
51 }
52
53 llvm_unreachable("Unsupported register kind");
54}
55
56InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
57 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
58 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
59 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
60
61 if (ST->hasSIMD128()) {
62 static const CostTblEntry ArithCostTbl[]{
63 // extmul + (maybe awkward) shuffle
64 {.ISD: ISD::MUL, .Type: MVT::v8i8, .Cost: 4},
65 // 2x extmul + (okay) shuffle
66 {.ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: 4},
67 // extmul
68 {.ISD: ISD::MUL, .Type: MVT::v4i16, .Cost: 1},
69 // extmul
70 {.ISD: ISD::MUL, .Type: MVT::v2i32, .Cost: 1},
71 };
72 EVT DstVT = TLI->getValueType(DL, Ty);
73 if (DstVT.isSimple()) {
74 int ISD = TLI->InstructionOpcodeToISD(Opcode);
75 if (const auto *Entry =
76 CostTableLookup(Table: ArithCostTbl, ISD, Ty: DstVT.getSimpleVT()))
77 return Entry->Cost;
78 }
79 }
80
81 InstructionCost Cost =
82 BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
83 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
84
85 if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) {
86 switch (Opcode) {
87 case Instruction::LShr:
88 case Instruction::AShr:
89 case Instruction::Shl:
90 // SIMD128's shifts currently only accept a scalar shift count. For each
91 // element, we'll need to extract, op, insert. The following is a rough
92 // approximation.
93 if (!Op2Info.isUniform())
94 Cost =
95 cast<FixedVectorType>(Val: VTy)->getNumElements() *
96 (TargetTransformInfo::TCC_Basic +
97 getArithmeticInstrCost(Opcode, Ty: VTy->getElementType(), CostKind) +
98 TargetTransformInfo::TCC_Basic);
99 break;
100 }
101 }
102 return Cost;
103}
104
105InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
106 unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
107 TTI::TargetCostKind CostKind, const Instruction *I) const {
108 int ISD = TLI->InstructionOpcodeToISD(Opcode);
109 auto SrcTy = TLI->getValueType(DL, Ty: Src);
110 auto DstTy = TLI->getValueType(DL, Ty: Dst);
111
112 if (!SrcTy.isSimple() || !DstTy.isSimple()) {
113 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
114 }
115
116 if (!ST->hasSIMD128()) {
117 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
118 }
119
120 auto DstVT = DstTy.getSimpleVT();
121 auto SrcVT = SrcTy.getSimpleVT();
122
123 if (I && I->hasOneUser()) {
124 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
125 int UserISD = TLI->InstructionOpcodeToISD(Opcode: SingleUser->getOpcode());
126
127 // extmul_low support
128 if (UserISD == ISD::MUL &&
129 (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
130 // Free low extensions.
131 if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
132 (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
133 (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
134 return 0;
135 }
136 // Will require an additional extlow operation for the intermediate
137 // i16/i32 value.
138 if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
139 (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
140 return 1;
141 }
142 }
143 }
144
145 static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
146 // extend_low
147 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1},
148 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: 1},
149 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1},
150 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: 1},
151 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1},
152 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: 1},
153 // 2 x extend_low
154 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2},
155 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: 2},
156 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2},
157 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: 2},
158 // extend_low, extend_high
159 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
160 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
161 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
162 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
163 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
164 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
165 // 2x extend_low, extend_high
166 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: 4},
167 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: 4},
168 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: 4},
169 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: 4},
170 // 6x extend_low, extend_high
171 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
172 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
173 // shuffle
174 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i32, .Cost: 2},
175 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i32, .Cost: 4},
176 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 2},
177 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 4},
178 // narrow, and
179 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 2},
180 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 2},
181 // narrow, 2x and
182 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 3},
183 // 3x narrow, 4x and
184 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 7},
185 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 7},
186 // 7x narrow, 8x and
187 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 15},
188 // convert_i32x4
189 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
190 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
191 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
192 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
193 // extend_low, convert
194 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2},
195 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 2},
196 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
197 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
198 // extend_low x 2, convert
199 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
200 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
201 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
202 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
203 // several shuffles
204 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
205 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
206 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 10},
207 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
208 /// trunc_sat, const, and, 3x narrow
209 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 6},
210 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 6},
211 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 6},
212 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 6},
213 /// trunc_sat, const, and, narrow
214 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 4},
215 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 4},
216 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 4},
217 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 4},
218 // 2x trunc_sat, const, 2x and, 3x narrow
219 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f32, .Cost: 8},
220 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f32, .Cost: 8},
221 // 2x trunc_sat, const, 2x and, narrow
222 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 6},
223 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: 6},
224 };
225
226 if (const auto *Entry =
227 ConvertCostTableLookup(Table: ConversionTbl, ISD, Dst: DstVT, Src: SrcVT)) {
228 return Entry->Cost;
229 }
230
231 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
232}
233
234WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
235WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
236 TTI::MemCmpExpansionOptions Options;
237
238 Options.AllowOverlappingLoads = true;
239
240 if (ST->hasSIMD128())
241 Options.LoadSizes.push_back(Elt: 16);
242
243 Options.LoadSizes.append(IL: {8, 4, 2, 1});
244 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
245 Options.NumLoadsPerBlock = Options.MaxNumLoads;
246
247 return Options;
248}
249
250InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
251 unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
252 TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
253 const Instruction *I) const {
254 if (!ST->hasSIMD128() || !isa<FixedVectorType>(Val: Ty)) {
255 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
256 CostKind);
257 }
258
259 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
260 // Type legalization can't handle structs
261 if (VT == MVT::Other)
262 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
263 CostKind);
264
265 auto LT = getTypeLegalizationCost(Ty);
266 if (!LT.first.isValid())
267 return InstructionCost::getInvalid();
268
269 int ISD = TLI->InstructionOpcodeToISD(Opcode);
270 unsigned width = VT.getSizeInBits();
271 if (ISD == ISD::LOAD) {
272 // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
273 // can be lowered to load32_zero and load64_zero respectively. Assume SIMD
274 // loads are twice as expensive as scalar.
275 switch (width) {
276 default:
277 break;
278 case 32:
279 case 64:
280 case 128:
281 return 2;
282 }
283 } else if (ISD == ISD::STORE) {
284 // For stores, we can use store lane operations.
285 switch (width) {
286 default:
287 break;
288 case 8:
289 case 16:
290 case 32:
291 case 64:
292 case 128:
293 return 2;
294 }
295 }
296
297 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, CostKind);
298}
299
300InstructionCost WebAssemblyTTIImpl::getShuffleCost(
301 TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
302 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
303 VectorType *SubTp, ArrayRef<const Value *> Args,
304 const Instruction *CxtI) const {
305 // Canonicalize the ShuffleKind in case optimizations didn't.
306 // Otherwise, we might end up with the wrong ShuffleKind to match against.
307
308 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
309
310 // Wasm SIMD128 has native splat instructions for all lane types.
311 if (ST->hasSIMD128() && Kind == TTI::SK_Broadcast &&
312 isa<FixedVectorType>(Val: SrcTy))
313 return 1;
314
315 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
316 Args, CxtI);
317}
318
319InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
320 unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
321 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
322 bool UseMaskForCond, bool UseMaskForGaps) const {
323 assert(Factor >= 2 && "Invalid interleave factor");
324
325 auto *VecTy = cast<VectorType>(Val: Ty);
326 if (!ST->hasSIMD128() || !isa<FixedVectorType>(Val: VecTy)) {
327 return InstructionCost::getInvalid();
328 }
329
330 if (UseMaskForCond || UseMaskForGaps)
331 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy: Ty, Factor, Indices,
332 Alignment, AddressSpace, CostKind,
333 UseMaskForCond, UseMaskForGaps);
334
335 constexpr unsigned MaxInterleaveFactor = 4;
336 if (Factor <= MaxInterleaveFactor) {
337 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
338 // Ensure the number of vector elements is greater than 1.
339 if (MinElts < 2 || MinElts % Factor != 0)
340 return InstructionCost::getInvalid();
341
342 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
343 // Ensure the element type is legal.
344 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
345 return InstructionCost::getInvalid();
346
347 if (Factor != 2 && Factor != 4)
348 return InstructionCost::getInvalid();
349
350 auto *SubVecTy =
351 VectorType::get(ElementType: VecTy->getElementType(),
352 EC: VecTy->getElementCount().divideCoefficientBy(RHS: Factor));
353 InstructionCost MemCost =
354 getMemoryOpCost(Opcode, Ty: SubVecTy, Alignment, AddressSpace, CostKind);
355
356 unsigned VecSize = DL.getTypeSizeInBits(Ty: SubVecTy);
357 unsigned MaxVecSize = 128;
358 unsigned NumAccesses =
359 std::max<unsigned>(a: 1, b: (MinElts * ElSize + MaxVecSize - 1) / VecSize);
360
361 // A stride of two is commonly supported via dedicated instructions, so it
362 // should be relatively cheap for all element sizes. A stride of four is
363 // more expensive as it will likely require more shuffles. Using two
364 // simd128 inputs is considered more expensive and we mainly account for
365 // shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
366 // arithmetic kernels.
367 static const CostTblEntry ShuffleCostTbl[] = {
368 // One reg.
369 {.ISD: 2, .Type: MVT::v2i8, .Cost: 1}, // interleave 2 x 2i8 into 4i8
370 {.ISD: 2, .Type: MVT::v4i8, .Cost: 1}, // interleave 2 x 4i8 into 8i8
371 {.ISD: 2, .Type: MVT::v8i8, .Cost: 1}, // interleave 2 x 8i8 into 16i8
372 {.ISD: 2, .Type: MVT::v2i16, .Cost: 1}, // interleave 2 x 2i16 into 4i16
373 {.ISD: 2, .Type: MVT::v4i16, .Cost: 1}, // interleave 2 x 4i16 into 8i16
374 {.ISD: 2, .Type: MVT::v2i32, .Cost: 1}, // interleave 2 x 2i32 into 4i32
375
376 // Two regs.
377 {.ISD: 2, .Type: MVT::v16i8, .Cost: 2}, // interleave 2 x 16i8 into 32i8
378 {.ISD: 2, .Type: MVT::v8i16, .Cost: 2}, // interleave 2 x 8i16 into 16i16
379 {.ISD: 2, .Type: MVT::v4i32, .Cost: 2}, // interleave 2 x 4i32 into 8i32
380
381 // One reg.
382 {.ISD: 4, .Type: MVT::v2i8, .Cost: 4}, // interleave 4 x 2i8 into 8i8
383 {.ISD: 4, .Type: MVT::v4i8, .Cost: 4}, // interleave 4 x 4i8 into 16i8
384 {.ISD: 4, .Type: MVT::v2i16, .Cost: 4}, // interleave 4 x 2i16 into 8i16
385
386 // Two regs.
387 {.ISD: 4, .Type: MVT::v8i8, .Cost: 16}, // interleave 4 x 8i8 into 32i8
388 {.ISD: 4, .Type: MVT::v4i16, .Cost: 8}, // interleave 4 x 4i16 into 16i16
389 {.ISD: 4, .Type: MVT::v2i32, .Cost: 4}, // interleave 4 x 2i32 into 8i32
390
391 // Four regs.
392 {.ISD: 4, .Type: MVT::v4i32, .Cost: 16}, // interleave 4 x 4i32 into 16i32
393 };
394
395 EVT ETy = TLI->getValueType(DL, Ty: SubVecTy);
396 if (const auto *Entry =
397 CostTableLookup(Table: ShuffleCostTbl, ISD: Factor, Ty: ETy.getSimpleVT()))
398 return Entry->Cost + (NumAccesses * MemCost);
399 }
400
401 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
402 Alignment, AddressSpace, CostKind,
403 UseMaskForCond, UseMaskForGaps);
404}
405
406InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
407 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
408 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
409 InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
410 Opcode, Val, CostKind, Index, Op0, Op1, VIC);
411
412 // SIMD128's insert/extract currently only take constant indices.
413 if (Index == -1u)
414 return Cost + 25 * TargetTransformInfo::TCC_Expensive;
415
416 return Cost;
417}
418
419InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
420 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
421 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
422 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
423 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
424 InstructionCost Invalid = InstructionCost::getInvalid();
425 if (!VF.isFixed() || !ST->hasSIMD128())
426 return Invalid;
427
428 if (CostKind != TTI::TCK_RecipThroughput)
429 return Invalid;
430
431 if (Opcode != Instruction::Add)
432 return Invalid;
433
434 EVT AccumEVT = EVT::getEVT(Ty: AccumType);
435 // TODO: Add i64 accumulator.
436 if (AccumEVT != MVT::i32)
437 return Invalid;
438
439 // Possible options:
440 // - i16x8.extadd_pairwise_i8x16_sx
441 // - i32x4.extadd_pairwise_i16x8_sx
442 // - i32x4.dot_i16x8_s
443 // Only try to support dot, for now.
444
445 EVT InputEVT = EVT::getEVT(Ty: InputTypeA);
446 if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) ||
447 (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) {
448 return Invalid;
449 }
450
451 if (OpAExtend == TTI::PR_None)
452 return Invalid;
453
454 InstructionCost Cost(TTI::TCC_Basic);
455 if (!BinOp)
456 return Cost;
457
458 if (OpAExtend != OpBExtend)
459 return Invalid;
460
461 if (*BinOp != Instruction::Mul)
462 return Invalid;
463
464 if (InputTypeA != InputTypeB)
465 return Invalid;
466
467 // Signed inputs can lower to dot
468 if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
469 return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;
470
471 // Double the size of the lowered sequence.
472 if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
473 return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;
474
475 return Invalid;
476}
477
478TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
479 const IntrinsicInst *II) const {
480
481 switch (II->getIntrinsicID()) {
482 default:
483 break;
484 case Intrinsic::vector_reduce_fadd:
485 return TTI::ReductionShuffle::Pairwise;
486 }
487 return TTI::ReductionShuffle::SplitHalf;
488}
489
490void WebAssemblyTTIImpl::getUnrollingPreferences(
491 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
492 OptimizationRemarkEmitter *ORE) const {
493 // Scan the loop: don't unroll loops with calls. This is a standard approach
494 // for most (all?) targets.
495 for (BasicBlock *BB : L->blocks())
496 for (Instruction &I : *BB)
497 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
498 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
499 if (isLoweredToCall(F))
500 return;
501
502 // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
503 // the various microarchitectures that use the BasicTTI implementation and
504 // has been selected through heuristics across multiple cores and runtimes.
505 UP.Partial = UP.Runtime = UP.UpperBound = true;
506 UP.PartialThreshold = 30;
507
508 // Avoid unrolling when optimizing for size.
509 UP.OptSizeThreshold = 0;
510 UP.PartialOptSizeThreshold = 0;
511
512 // Set number of instructions optimized when "back edge"
513 // becomes "fall through" to default value of 2.
514 UP.BEInsns = 2;
515}
516
517bool WebAssemblyTTIImpl::supportsTailCalls() const {
518 return getST()->hasTailCall();
519}
520
521bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
522 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
523 using namespace llvm::PatternMatch;
524
525 if (!I->getType()->isVectorTy() || !I->isShift())
526 return false;
527
528 Value *V = I->getOperand(i: 1);
529 // We dont need to sink constant splat.
530 if (isa<Constant>(Val: V))
531 return false;
532
533 if (match(V, P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ZeroInt()),
534 v2: m_Value(), mask: m_ZeroMask()))) {
535 // Sink insert
536 Ops.push_back(Elt: &cast<Instruction>(Val: V)->getOperandUse(i: 0));
537 // Sink shuffle
538 Ops.push_back(Elt: &I->getOperandUse(i: 1));
539 return true;
540 }
541
542 return false;
543}
544
545/// Attempt to convert [relaxed_]swizzle to shufflevector if the mask is
546/// constant.
547static Value *simplifyWasmSwizzle(const IntrinsicInst &II,
548 InstCombiner::BuilderTy &Builder,
549 bool IsRelaxed) {
550 auto *V = dyn_cast<Constant>(Val: II.getArgOperand(i: 1));
551 if (!V)
552 return nullptr;
553
554 auto *VecTy = cast<FixedVectorType>(Val: II.getType());
555 unsigned NumElts = VecTy->getNumElements();
556 assert(NumElts == 16);
557
558 // Construct a shuffle mask from constant integers or UNDEFs.
559 int Indexes[16];
560 bool AnyOutOfBounds = false;
561
562 for (unsigned I = 0; I < NumElts; ++I) {
563 Constant *COp = V->getAggregateElement(Elt: I);
564 if (!COp || (!isa<UndefValue>(Val: COp) && !isa<ConstantInt>(Val: COp)))
565 return nullptr;
566
567 if (isa<UndefValue>(Val: COp)) {
568 Indexes[I] = -1;
569 continue;
570 }
571
572 if (IsRelaxed && cast<ConstantInt>(Val: COp)->getSExtValue() >= NumElts) {
573 // The relaxed_swizzle operation always returns 0 if the lane index is
574 // less than 0 when interpreted as a signed value. For lane indices above
575 // 15, however, it can choose between returning 0 or the lane at `Index %
576 // 16`. However, the choice must be made consistently. As the WebAssembly
577 // spec states:
578 //
579 // "The result of relaxed operators are implementation-dependent, because
580 // the set of possible results may depend on properties of the host
581 // environment, such as its hardware. Technically, their behaviour is
582 // controlled by a set of global parameters to the semantics that an
583 // implementation can instantiate in different ways. These choices are
584 // fixed, that is, parameters are constant during the execution of any
585 // given program."
586 //
587 // The WebAssembly runtime may choose differently from us, so we can't
588 // optimize a relaxed swizzle with lane indices above 15.
589 return nullptr;
590 }
591
592 uint64_t Index = cast<ConstantInt>(Val: COp)->getZExtValue();
593 if (Index >= NumElts) {
594 AnyOutOfBounds = true;
595 // If there are out-of-bounds indices, the swizzle instruction returns
596 // zeroes in those lanes. We'll provide an all-zeroes vector as the
597 // second argument to shufflevector and read the first element from it.
598 Indexes[I] = NumElts;
599 continue;
600 }
601
602 Indexes[I] = Index;
603 }
604
605 auto *V1 = II.getArgOperand(i: 0);
606 auto *V2 =
607 AnyOutOfBounds ? Constant::getNullValue(Ty: VecTy) : PoisonValue::get(T: VecTy);
608
609 return Builder.CreateShuffleVector(V1, V2, Mask: ArrayRef(Indexes, NumElts));
610}
611
612std::optional<Instruction *>
613WebAssemblyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
614 IntrinsicInst &II) const {
615 Intrinsic::ID IID = II.getIntrinsicID();
616 switch (IID) {
617 case Intrinsic::wasm_swizzle:
618 case Intrinsic::wasm_relaxed_swizzle:
619 if (Value *V = simplifyWasmSwizzle(
620 II, Builder&: IC.Builder, IsRelaxed: IID == Intrinsic::wasm_relaxed_swizzle)) {
621 return IC.replaceInstUsesWith(I&: II, V);
622 }
623 break;
624 }
625
626 return std::nullopt;
627}
628