WebAssemblyTargetTransformInfo.cpp source code [llvm_projects/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp]

1	//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file defines the WebAssembly-specific TargetTransformInfo
11	/// implementation.
12	///
13	//===----------------------------------------------------------------------===//
14
15	#include "WebAssemblyTargetTransformInfo.h"
16	#include "llvm/IR/IntrinsicInst.h"
17	#include "llvm/IR/IntrinsicsWebAssembly.h"
18	#include "llvm/Transforms/InstCombine/InstCombiner.h"
19
20	#include "llvm/CodeGen/CostTable.h"
21	using namespace llvm;
22
23	#define DEBUG_TYPE "wasmtti"
24
25	TargetTransformInfo::PopcntSupportKind
26	WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
27	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
28	return TargetTransformInfo::PSK_FastHardware;
29	}
30
31	unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
32	unsigned Result = BaseT::getNumberOfRegisters(ClassID);
33
34	// For SIMD, use at least 16 registers, as a rough guess.
35	bool Vector = (ClassID == `1`);
36	if (Vector)
37	Result = std::max(a: Result, b: `16u`);
38
39	return Result;
40	}
41
42	TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
43	TargetTransformInfo::RegisterKind K) const {
44	switch (K) {
45	case TargetTransformInfo::RGK_Scalar:
46	return TypeSize::getFixed(ExactSize: `64`);
47	case TargetTransformInfo::RGK_FixedWidthVector:
48	return TypeSize::getFixed(ExactSize: getST()->hasSIMD128() ? `128` : `64`);
49	case TargetTransformInfo::RGK_ScalableVector:
50	return TypeSize::getScalable(MinimumSize: `0`);
51	}
52
53	llvm_unreachable("Unsupported register kind");
54	}
55
56	InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
57	unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
58	TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
59	ArrayRef<const Value > Args, const* Instruction CxtI) const* {
60
61	if (ST->hasSIMD128()) {
62	static const CostTblEntry ArithCostTbl[]{
63	// extmul + (maybe awkward) shuffle
64	{.ISD: ISD::MUL, .Type: MVT::v8i8, .Cost: `4`},
65	// 2x extmul + (okay) shuffle
66	{.ISD: ISD::MUL, .Type: MVT::v16i8, .Cost: `4`},
67	// extmul
68	{.ISD: ISD::MUL, .Type: MVT::v4i16, .Cost: `1`},
69	// extmul
70	{.ISD: ISD::MUL, .Type: MVT::v2i32, .Cost: `1`},
71	};
72	EVT DstVT = TLI->getValueType(DL, Ty);
73	if (DstVT.isSimple()) {
74	int ISD = TLI->InstructionOpcodeToISD(Opcode);
75	if (const auto *Entry =
76	CostTableLookup(Table: ArithCostTbl, ISD, Ty: DstVT.getSimpleVT()))
77	return Entry->Cost;
78	}
79	}
80
81	InstructionCost Cost =
82	BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
83	Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
84
85	if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) {
86	switch (Opcode) {
87	case Instruction::LShr:
88	case Instruction::AShr:
89	case Instruction::Shl:
90	// SIMD128's shifts currently only accept a scalar shift count. For each
91	// element, we'll need to extract, op, insert. The following is a rough
92	// approximation.
93	if (!Op2Info.isUniform())
94	Cost =
95	cast<FixedVectorType>(Val: VTy)->getNumElements() *
96	(TargetTransformInfo::TCC_Basic +
97	getArithmeticInstrCost(Opcode, Ty: VTy->getElementType(), CostKind) +
98	TargetTransformInfo::TCC_Basic);
99	break;
100	}
101	}
102	return Cost;
103	}
104
105	InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
106	unsigned Opcode, Type Dst, Type Src, TTI::CastContextHint CCH,
107	TTI::TargetCostKind CostKind, const Instruction I) const* {
108	int ISD = TLI->InstructionOpcodeToISD(Opcode);
109	auto SrcTy = TLI->getValueType(DL, Ty: Src);
110	auto DstTy = TLI->getValueType(DL, Ty: Dst);
111
112	if (!SrcTy.isSimple() \|\| !DstTy.isSimple()) {
113	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
114	}
115
116	if (!ST->hasSIMD128()) {
117	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
118	}
119
120	auto DstVT = DstTy.getSimpleVT();
121	auto SrcVT = SrcTy.getSimpleVT();
122
123	if (I && I->hasOneUser()) {
124	auto SingleUser = cast<Instruction>(Val: I->user_begin());
125	int UserISD = TLI->InstructionOpcodeToISD(Opcode: SingleUser->getOpcode());
126
127	// extmul_low support
128	if (UserISD == ISD::MUL &&
129	(ISD == ISD::ZERO_EXTEND \|\| ISD == ISD::SIGN_EXTEND)) {
130	// Free low extensions.
131	if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) \|\|
132	(SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) \|\|
133	(SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
134	return `0`;
135	}
136	// Will require an additional extlow operation for the intermediate
137	// i16/i32 value.
138	if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) \|\|
139	(SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
140	return `1`;
141	}
142	}
143	}
144
145	static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
146	// extend_low
147	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: `1`},
148	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i32, .Cost: `1`},
149	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: `1`},
150	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i16, .Cost: `1`},
151	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: `1`},
152	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i16, .Src: MVT::v8i8, .Cost: `1`},
153	// 2 x extend_low
154	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: `2`},
155	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v2i64, .Src: MVT::v2i16, .Cost: `2`},
156	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: `2`},
157	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i32, .Src: MVT::v4i8, .Cost: `2`},
158	// extend_low, extend_high
159	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: `2`},
160	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: `2`},
161	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: `2`},
162	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: `2`},
163	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: `2`},
164	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: `2`},
165	// 2x extend_low, extend_high
166	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: `4`},
167	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i32, .Cost: `4`},
168	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: `4`},
169	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i16, .Cost: `4`},
170	// shuffle
171	{.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i32, .Cost: `2`},
172	{.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i32, .Cost: `4`},
173	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: `2`},
174	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: `4`},
175	// narrow, and
176	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: `2`},
177	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: `2`},
178	// narrow, 2x and
179	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: `3`},
180	// 3x narrow, 4x and
181	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: `7`},
182	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: `7`},
183	// 7x narrow, 8x and
184	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: `15`},
185	// convert_i32x4
186	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: `1`},
187	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: `1`},
188	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: `1`},
189	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: `1`},
190	// extend_low, convert
191	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: `2`},
192	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: `2`},
193	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: `2`},
194	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: `2`},
195	// extend_low x 2, convert
196	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: `3`},
197	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: `3`},
198	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: `3`},
199	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: `3`},
200	// several shuffles
201	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: `10`},
202	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: `10`},
203	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: `10`},
204	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: `10`},
205	/// trunc_sat, const, and, 3x narrow
206	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: `6`},
207	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: `6`},
208	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: `6`},
209	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: `6`},
210	/// trunc_sat, const, and, narrow
211	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: `4`},
212	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: `4`},
213	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: `4`},
214	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: `4`},
215	// 2x trunc_sat, const, 2x and, 3x narrow
216	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f32, .Cost: `8`},
217	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f32, .Cost: `8`},
218	// 2x trunc_sat, const, 2x and, narrow
219	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: `6`},
220	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f32, .Cost: `6`},
221	};
222
223	if (const auto *Entry =
224	ConvertCostTableLookup(Table: ConversionTbl, ISD, Dst: DstVT, Src: SrcVT)) {
225	return Entry->Cost;
226	}
227
228	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
229	}
230
231	WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
232	WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
233	TTI::MemCmpExpansionOptions Options;
234
235	Options.AllowOverlappingLoads = true;
236
237	if (ST->hasSIMD128())
238	Options.LoadSizes.push_back(Elt: `16`);
239
240	Options.LoadSizes.append(IL: {`8`, `4`, `2`, `1`});
241	Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
242	Options.NumLoadsPerBlock = Options.MaxNumLoads;
243
244	return Options;
245	}
246
247	InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
248	unsigned Opcode, Type Ty, Align Alignment, unsigned* AddressSpace,
249	TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
250	const Instruction I) const* {
251	if (!ST->hasSIMD128() \|\| !isa<FixedVectorType>(Val: Ty)) {
252	return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
253	CostKind);
254	}
255
256	EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
257	// Type legalization can't handle structs
258	if (VT == MVT::Other)
259	return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
260	CostKind);
261
262	auto LT = getTypeLegalizationCost(Ty);
263	if (!LT.first.isValid())
264	return InstructionCost::getInvalid();
265
266	int ISD = TLI->InstructionOpcodeToISD(Opcode);
267	unsigned width = VT.getSizeInBits();
268	if (ISD == ISD::LOAD) {
269	// 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
270	// can be lowered to load32_zero and load64_zero respectively. Assume SIMD
271	// loads are twice as expensive as scalar.
272	switch (width) {
273	default:
274	break;
275	case `32`:
276	case `64`:
277	case `128`:
278	return `2`;
279	}
280	} else if (ISD == ISD::STORE) {
281	// For stores, we can use store lane operations.
282	switch (width) {
283	default:
284	break;
285	case `8`:
286	case `16`:
287	case `32`:
288	case `64`:
289	case `128`:
290	return `2`;
291	}
292	}
293
294	return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, CostKind);
295	}
296
297	InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
298	unsigned Opcode, Type Ty, unsigned* Factor, ArrayRef<unsigned> Indices,
299	Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
300	bool UseMaskForCond, bool UseMaskForGaps) const {
301	assert(Factor >= `2` && "Invalid interleave factor");
302
303	auto *VecTy = cast<VectorType>(Val: Ty);
304	if (!ST->hasSIMD128() \|\| !isa<FixedVectorType>(Val: VecTy)) {
305	return InstructionCost::getInvalid();
306	}
307
308	if (UseMaskForCond \|\| UseMaskForGaps)
309	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy: Ty, Factor, Indices,
310	Alignment, AddressSpace, CostKind,
311	UseMaskForCond, UseMaskForGaps);
312
313	constexpr unsigned MaxInterleaveFactor = `4`;
314	if (Factor <= MaxInterleaveFactor) {
315	unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
316	// Ensure the number of vector elements is greater than 1.
317	if (MinElts < `2` \|\| MinElts % Factor != `0`)
318	return InstructionCost::getInvalid();
319
320	unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
321	// Ensure the element type is legal.
322	if (ElSize != `8` && ElSize != `16` && ElSize != `32` && ElSize != `64`)
323	return InstructionCost::getInvalid();
324
325	if (Factor != `2` && Factor != `4`)
326	return InstructionCost::getInvalid();
327
328	auto *SubVecTy =
329	VectorType::get(ElementType: VecTy->getElementType(),
330	EC: VecTy->getElementCount().divideCoefficientBy(RHS: Factor));
331	InstructionCost MemCost =
332	getMemoryOpCost(Opcode, Ty: SubVecTy, Alignment, AddressSpace, CostKind);
333
334	unsigned VecSize = DL.getTypeSizeInBits(Ty: SubVecTy);
335	unsigned MaxVecSize = `128`;
336	unsigned NumAccesses =
337	std::max<unsigned>(a: `1`, b: (MinElts * ElSize + MaxVecSize - `1`) / VecSize);
338
339	// A stride of two is commonly supported via dedicated instructions, so it
340	// should be relatively cheap for all element sizes. A stride of four is
341	// more expensive as it will likely require more shuffles. Using two
342	// simd128 inputs is considered more expensive and we mainly account for
343	// shuffling two inputs (32 bytes), but we do model 4 x v4i32 to enable
344	// arithmetic kernels.
345	static const CostTblEntry ShuffleCostTbl[] = {
346	// One reg.
347	{.ISD: `2`, .Type: MVT::v2i8, .Cost: `1`}, // interleave 2 x 2i8 into 4i8
348	{.ISD: `2`, .Type: MVT::v4i8, .Cost: `1`}, // interleave 2 x 4i8 into 8i8
349	{.ISD: `2`, .Type: MVT::v8i8, .Cost: `1`}, // interleave 2 x 8i8 into 16i8
350	{.ISD: `2`, .Type: MVT::v2i16, .Cost: `1`}, // interleave 2 x 2i16 into 4i16
351	{.ISD: `2`, .Type: MVT::v4i16, .Cost: `1`}, // interleave 2 x 4i16 into 8i16
352	{.ISD: `2`, .Type: MVT::v2i32, .Cost: `1`}, // interleave 2 x 2i32 into 4i32
353
354	// Two regs.
355	{.ISD: `2`, .Type: MVT::v16i8, .Cost: `2`}, // interleave 2 x 16i8 into 32i8
356	{.ISD: `2`, .Type: MVT::v8i16, .Cost: `2`}, // interleave 2 x 8i16 into 16i16
357	{.ISD: `2`, .Type: MVT::v4i32, .Cost: `2`}, // interleave 2 x 4i32 into 8i32
358
359	// One reg.
360	{.ISD: `4`, .Type: MVT::v2i8, .Cost: `4`}, // interleave 4 x 2i8 into 8i8
361	{.ISD: `4`, .Type: MVT::v4i8, .Cost: `4`}, // interleave 4 x 4i8 into 16i8
362	{.ISD: `4`, .Type: MVT::v2i16, .Cost: `4`}, // interleave 4 x 2i16 into 8i16
363
364	// Two regs.
365	{.ISD: `4`, .Type: MVT::v8i8, .Cost: `16`}, // interleave 4 x 8i8 into 32i8
366	{.ISD: `4`, .Type: MVT::v4i16, .Cost: `8`}, // interleave 4 x 4i16 into 16i16
367	{.ISD: `4`, .Type: MVT::v2i32, .Cost: `4`}, // interleave 4 x 2i32 into 8i32
368
369	// Four regs.
370	{.ISD: `4`, .Type: MVT::v4i32, .Cost: `16`}, // interleave 4 x 4i32 into 16i32
371	};
372
373	EVT ETy = TLI->getValueType(DL, Ty: SubVecTy);
374	if (const auto *Entry =
375	CostTableLookup(Table: ShuffleCostTbl, ISD: Factor, Ty: ETy.getSimpleVT()))
376	return Entry->Cost + (NumAccesses * MemCost);
377	}
378
379	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
380	Alignment, AddressSpace, CostKind,
381	UseMaskForCond, UseMaskForGaps);
382	}
383
384	InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
385	unsigned Opcode, Type Val, TTI::TargetCostKind CostKind, unsigned* Index,
386	const Value Op0, const* Value Op1, TTI::VectorInstrContext VIC) const* {
387	InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
388	Opcode, Val, CostKind, Index, Op0, Op1, VIC);
389
390	// SIMD128's insert/extract currently only take constant indices.
391	if (Index == -`1u`)
392	return Cost + `25` * TargetTransformInfo::TCC_Expensive;
393
394	return Cost;
395	}
396
397	InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
398	unsigned Opcode, Type InputTypeA, Type InputTypeB, Type *AccumType,
399	ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
400	TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
401	TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
402	InstructionCost Invalid = InstructionCost::getInvalid();
403	if (!VF.isFixed() \|\| !ST->hasSIMD128())
404	return Invalid;
405
406	if (CostKind != TTI::TCK_RecipThroughput)
407	return Invalid;
408
409	if (Opcode != Instruction::Add)
410	return Invalid;
411
412	EVT AccumEVT = EVT::getEVT(Ty: AccumType);
413	// TODO: Add i64 accumulator.
414	if (AccumEVT != MVT::i32)
415	return Invalid;
416
417	// Possible options:
418	// - i16x8.extadd_pairwise_i8x16_sx
419	// - i32x4.extadd_pairwise_i16x8_sx
420	// - i32x4.dot_i16x8_s
421	// Only try to support dot, for now.
422
423	EVT InputEVT = EVT::getEVT(Ty: InputTypeA);
424	if (!((InputEVT == MVT::i16 && VF.getFixedValue() == `8`) \|\|
425	(InputEVT == MVT::i8 && VF.getFixedValue() == `16`))) {
426	return Invalid;
427	}
428
429	if (OpAExtend == TTI::PR_None)
430	return Invalid;
431
432	InstructionCost Cost(TTI::TCC_Basic);
433	if (!BinOp)
434	return Cost;
435
436	if (OpAExtend != OpBExtend)
437	return Invalid;
438
439	if (*BinOp != Instruction::Mul)
440	return Invalid;
441
442	if (InputTypeA != InputTypeB)
443	return Invalid;
444
445	// Signed inputs can lower to dot
446	if (InputEVT == MVT::i16 && VF.getFixedValue() == `8`)
447	return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * `2`;
448
449	// Double the size of the lowered sequence.
450	if (InputEVT == MVT::i8 && VF.getFixedValue() == `16`)
451	return OpAExtend == TTI::PR_SignExtend ? Cost * `2` : Cost * `4`;
452
453	return Invalid;
454	}
455
456	TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
457	const IntrinsicInst II) const* {
458
459	switch (II->getIntrinsicID()) {
460	default:
461	break;
462	case Intrinsic::vector_reduce_fadd:
463	return TTI::ReductionShuffle::Pairwise;
464	}
465	return TTI::ReductionShuffle::SplitHalf;
466	}
467
468	void WebAssemblyTTIImpl::getUnrollingPreferences(
469	Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
470	OptimizationRemarkEmitter ORE) const* {
471	// Scan the loop: don't unroll loops with calls. This is a standard approach
472	// for most (all?) targets.
473	for (BasicBlock *BB : L->blocks())
474	for (Instruction &I : *BB)
475	if (isa<CallInst>(Val: I) \|\| isa<InvokeInst>(Val: I))
476	if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
477	if (isLoweredToCall(F))
478	return;
479
480	// The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
481	// the various microarchitectures that use the BasicTTI implementation and
482	// has been selected through heuristics across multiple cores and runtimes.
483	UP.Partial = UP.Runtime = UP.UpperBound = true;
484	UP.PartialThreshold = `30`;
485
486	// Avoid unrolling when optimizing for size.
487	UP.OptSizeThreshold = `0`;
488	UP.PartialOptSizeThreshold = `0`;
489
490	// Set number of instructions optimized when "back edge"
491	// becomes "fall through" to default value of 2.
492	UP.BEInsns = `2`;
493	}
494
495	bool WebAssemblyTTIImpl::supportsTailCalls() const {
496	return getST()->hasTailCall();
497	}
498
499	bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
500	Instruction I, SmallVectorImpl<Use > &Ops) const {
501	using namespace llvm::PatternMatch;
502
503	if (!I->getType()->isVectorTy() \|\| !I->isShift())
504	return false;
505
506	Value *V = I->getOperand(i: `1`);
507	// We dont need to sink constant splat.
508	if (isa<Constant>(Val: V))
509	return false;
510
511	if (match(V, P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ZeroInt()),
512	v2: m_Value(), mask: m_ZeroMask ()))) {
513	// Sink insert
514	Ops.push_back(Elt: &cast<Instruction>(Val: V)->getOperandUse(i: `0`));
515	// Sink shuffle
516	Ops.push_back(Elt: &I->getOperandUse(i: `1`));
517	return true;
518	}
519
520	return false;
521	}
522
523	/// Attempt to convert [relaxed_]swizzle to shufflevector if the mask is
524	/// constant.
525	static Value simplifyWasmSwizzle(const* IntrinsicInst &II,
526	InstCombiner::BuilderTy &Builder,
527	bool IsRelaxed) {
528	auto *V = dyn_cast<Constant>(Val: II.getArgOperand(i: `1`));
529	if (!V)
530	return nullptr;
531
532	auto *VecTy = cast<FixedVectorType>(Val: II.getType());
533	unsigned NumElts = VecTy->getNumElements();
534	assert(NumElts == `16`);
535
536	// Construct a shuffle mask from constant integers or UNDEFs.
537	int Indexes[`16`];
538	bool AnyOutOfBounds = false;
539
540	for (unsigned I = `0`; I < NumElts; ++I) {
541	Constant *COp = V->getAggregateElement(Elt: I);
542	if (!COp \|\| (!isa<UndefValue>(Val: COp) && !isa<ConstantInt>(Val: COp)))
543	return nullptr;
544
545	if (isa<UndefValue>(Val: COp)) {
546	Indexes[I] = -`1`;
547	continue;
548	}
549
550	if (IsRelaxed && cast<ConstantInt>(Val: COp)->getSExtValue() >= NumElts) {
551	// The relaxed_swizzle operation always returns 0 if the lane index is
552	// less than 0 when interpreted as a signed value. For lane indices above
553	// 15, however, it can choose between returning 0 or the lane at `Index %
554	// 16`. However, the choice must be made consistently. As the WebAssembly
555	// spec states:
556	//
557	// "The result of relaxed operators are implementation-dependent, because
558	// the set of possible results may depend on properties of the host
559	// environment, such as its hardware. Technically, their behaviour is
560	// controlled by a set of global parameters to the semantics that an
561	// implementation can instantiate in different ways. These choices are
562	// fixed, that is, parameters are constant during the execution of any
563	// given program."
564	//
565	// The WebAssembly runtime may choose differently from us, so we can't
566	// optimize a relaxed swizzle with lane indices above 15.
567	return nullptr;
568	}
569
570	uint64_t Index = cast<ConstantInt>(Val: COp)->getZExtValue();
571	if (Index >= NumElts) {
572	AnyOutOfBounds = true;
573	// If there are out-of-bounds indices, the swizzle instruction returns
574	// zeroes in those lanes. We'll provide an all-zeroes vector as the
575	// second argument to shufflevector and read the first element from it.
576	Indexes[I] = NumElts;
577	continue;
578	}
579
580	Indexes[I] = Index;
581	}
582
583	auto *V1 = II.getArgOperand(i: `0`);
584	auto *V2 =
585	AnyOutOfBounds ? Constant::getNullValue(Ty: VecTy) : PoisonValue::get(T: VecTy);
586
587	return Builder.CreateShuffleVector(V1, V2, Mask: ArrayRef(Indexes, NumElts));
588	}
589
590	std::optional<Instruction *>
591	WebAssemblyTTIImpl::instCombineIntrinsic(InstCombiner &IC,
592	IntrinsicInst &II) const {
593	Intrinsic::ID IID = II.getIntrinsicID();
594	switch (IID) {
595	case Intrinsic::wasm_swizzle:
596	case Intrinsic::wasm_relaxed_swizzle:
597	if (Value *V = simplifyWasmSwizzle(
598	II, Builder&: IC.Builder, IsRelaxed: IID == Intrinsic::wasm_relaxed_swizzle)) {
599	return IC.replaceInstUsesWith(I&: II, V);
600	}
601	break;
602	}
603
604	return std::nullopt;
605	}
606

Browse the source code of llvm_projects/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp