1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "AArch64TargetTransformInfo.h"
10#include "AArch64ExpandImm.h"
11#include "AArch64PerfectShuffle.h"
12#include "MCTargetDesc/AArch64AddressingModes.h"
13#include "Utils/AArch64SMEAttributes.h"
14#include "llvm/ADT/DenseMap.h"
15#include "llvm/Analysis/LoopInfo.h"
16#include "llvm/Analysis/TargetTransformInfo.h"
17#include "llvm/CodeGen/BasicTTIImpl.h"
18#include "llvm/CodeGen/CostTable.h"
19#include "llvm/CodeGen/TargetLowering.h"
20#include "llvm/IR/DerivedTypes.h"
21#include "llvm/IR/IntrinsicInst.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
24#include "llvm/IR/PatternMatch.h"
25#include "llvm/Support/Debug.h"
26#include "llvm/TargetParser/AArch64TargetParser.h"
27#include "llvm/Transforms/InstCombine/InstCombiner.h"
28#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
29#include <algorithm>
30#include <optional>
31using namespace llvm;
32using namespace llvm::PatternMatch;
33
34#define DEBUG_TYPE "aarch64tti"
35
36static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
37 cl::init(Val: true), cl::Hidden);
38
39static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
40 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
41
42static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(Val: 10),
43 cl::Hidden);
44
45static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
46 cl::init(Val: 10), cl::Hidden);
47
48static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
49 cl::init(Val: 15), cl::Hidden);
50
51static cl::opt<unsigned>
52 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(Val: 10),
53 cl::Hidden);
54
55static cl::opt<unsigned> CallPenaltyChangeSM(
56 "call-penalty-sm-change", cl::init(Val: 5), cl::Hidden,
57 cl::desc(
58 "Penalty of calling a function that requires a change to PSTATE.SM"));
59
60static cl::opt<unsigned> InlineCallPenaltyChangeSM(
61 "inline-call-penalty-sm-change", cl::init(Val: 10), cl::Hidden,
62 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
63
64static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
65 cl::init(Val: true), cl::Hidden);
66
67static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
68 cl::init(Val: true), cl::Hidden);
69
70// A complete guess as to a reasonable cost.
71static cl::opt<unsigned>
72 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(Val: 8), cl::Hidden,
73 cl::desc("The cost of a histcnt instruction"));
74
75static cl::opt<unsigned> DMBLookaheadThreshold(
76 "dmb-lookahead-threshold", cl::init(Val: 10), cl::Hidden,
77 cl::desc("The number of instructions to search for a redundant dmb"));
78
79namespace {
80class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (disabled|all|simple|default)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents the
  // additional flags we're enabling, and DisableBits the flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
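  // For example, "-sve-tail-folding=default+noreverse" leaves NeedsDefault set
  // and records Reverse in DisableBits, so getBits() below returns the CPU's
  // default flags with the reverse-predicate flag cleared.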
89 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
90 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
91 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
92
93 // This value needs to be initialised to true in case the user does not
94 // explicitly set the -sve-tail-folding option.
95 bool NeedsDefault = true;
96
97 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
98
99 void setNeedsDefault(bool V) { NeedsDefault = V; }
100
101 void setEnableBit(TailFoldingOpts Bit) {
102 EnableBits |= Bit;
103 DisableBits &= ~Bit;
104 }
105
106 void setDisableBit(TailFoldingOpts Bit) {
107 EnableBits &= ~Bit;
108 DisableBits |= Bit;
109 }
110
111 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
112 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
113
114 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
115 "Initial bits should only include one of "
116 "(disabled|all|simple|default)");
117 Bits = NeedsDefault ? DefaultBits : InitialBits;
118 Bits |= EnableBits;
119 Bits &= ~DisableBits;
120
121 return Bits;
122 }
123
124 void reportError(std::string Opt) {
125 errs() << "invalid argument '" << Opt
126 << "' to -sve-tail-folding=; the option should be of the form\n"
127 " (disabled|all|default|simple)[+(reductions|recurrences"
128 "|reverse|noreductions|norecurrences|noreverse)]\n";
129 report_fatal_error(reason: "Unrecognised tail-folding option");
130 }
131
132public:
133
134 void operator=(const std::string &Val) {
135 // If the user explicitly sets -sve-tail-folding= then treat as an error.
136 if (Val.empty()) {
137 reportError(Opt: "");
138 return;
139 }
140
141 // Since the user is explicitly setting the option we don't automatically
142 // need the default unless they require it.
143 setNeedsDefault(false);
144
145 SmallVector<StringRef, 4> TailFoldTypes;
146 StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false);
147
148 unsigned StartIdx = 1;
149 if (TailFoldTypes[0] == "disabled")
150 setInitialBits(TailFoldingOpts::Disabled);
151 else if (TailFoldTypes[0] == "all")
152 setInitialBits(TailFoldingOpts::All);
153 else if (TailFoldTypes[0] == "default")
154 setNeedsDefault(true);
155 else if (TailFoldTypes[0] == "simple")
156 setInitialBits(TailFoldingOpts::Simple);
157 else {
158 StartIdx = 0;
159 setInitialBits(TailFoldingOpts::Disabled);
160 }
161
162 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
163 if (TailFoldTypes[I] == "reductions")
164 setEnableBit(TailFoldingOpts::Reductions);
165 else if (TailFoldTypes[I] == "recurrences")
166 setEnableBit(TailFoldingOpts::Recurrences);
167 else if (TailFoldTypes[I] == "reverse")
168 setEnableBit(TailFoldingOpts::Reverse);
169 else if (TailFoldTypes[I] == "noreductions")
170 setDisableBit(TailFoldingOpts::Reductions);
171 else if (TailFoldTypes[I] == "norecurrences")
172 setDisableBit(TailFoldingOpts::Recurrences);
173 else if (TailFoldTypes[I] == "noreverse")
174 setDisableBit(TailFoldingOpts::Reverse);
175 else
176 reportError(Opt: Val);
177 }
178 }
179
180 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
181 return (getBits(DefaultBits) & Required) == Required;
182 }
183};
184} // namespace
185
186TailFoldingOption TailFoldingOptionLoc;
187
188static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
189 "sve-tail-folding",
190 cl::desc(
191 "Control the use of vectorisation using tail-folding for SVE where the"
192 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
193 "\ndisabled (Initial) No loop types will vectorize using "
194 "tail-folding"
195 "\ndefault (Initial) Uses the default tail-folding settings for "
196 "the target CPU"
197 "\nall (Initial) All legal loop types will vectorize using "
198 "tail-folding"
199 "\nsimple (Initial) Use tail-folding for simple loops (not "
200 "reductions or recurrences)"
201 "\nreductions Use tail-folding for loops containing reductions"
202 "\nnoreductions Inverse of above"
203 "\nrecurrences Use tail-folding for loops containing fixed order "
204 "recurrences"
205 "\nnorecurrences Inverse of above"
206 "\nreverse Use tail-folding for loops requiring reversed "
207 "predicates"
208 "\nnoreverse Inverse of above"),
209 cl::location(L&: TailFoldingOptionLoc));
210
211// Experimental option that will only be fully functional when the
212// code-generator is changed to use SVE instead of NEON for all fixed-width
213// operations.
214static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
215 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
216
217// Experimental option that will only be fully functional when the cost-model
218// and code-generator have been changed to avoid using scalable vector
219// instructions that are not legal in streaming SVE mode.
220static cl::opt<bool> EnableScalableAutovecInStreamingMode(
221 "enable-scalable-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
222
223static bool isSMEABIRoutineCall(const CallInst &CI) {
224 const auto *F = CI.getCalledFunction();
225 return F && StringSwitch<bool>(F->getName())
226 .Case(S: "__arm_sme_state", Value: true)
227 .Case(S: "__arm_tpidr2_save", Value: true)
228 .Case(S: "__arm_tpidr2_restore", Value: true)
229 .Case(S: "__arm_za_disable", Value: true)
230 .Default(Value: false);
231}
232
233/// Returns true if the function has explicit operations that can only be
234/// lowered using incompatible instructions for the selected mode. This also
235/// returns true if the function F may use or modify ZA state.
236static bool hasPossibleIncompatibleOps(const Function *F) {
237 for (const BasicBlock &BB : *F) {
238 for (const Instruction &I : BB) {
239 // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
241 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
242 // all native LLVM instructions can be lowered to compatible instructions.
243 if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() &&
244 (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) ||
245 isSMEABIRoutineCall(CI: cast<CallInst>(Val: I))))
246 return true;
247 }
248 }
249 return false;
250}
251
252uint64_t AArch64TTIImpl::getFeatureMask(const Function &F) const {
253 StringRef AttributeStr =
254 isMultiversionedFunction(F) ? "fmv-features" : "target-features";
255 StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString();
256 SmallVector<StringRef, 8> Features;
257 FeatureStr.split(A&: Features, Separator: ",");
258 return AArch64::getFMVPriority(Features);
259}
260
261bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
262 return F.hasFnAttribute(Kind: "fmv-features");
263}
264
265const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
266 AArch64::FeatureExecuteOnly,
267};
268
269bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
270 const Function *Callee) const {
271 SMECallAttrs CallAttrs(*Caller, *Callee);
272
273 // When inlining, we should consider the body of the function, not the
274 // interface.
275 if (CallAttrs.callee().hasStreamingBody()) {
276 CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false);
277 CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true);
278 }
279
280 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
281 return false;
282
283 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
284 CallAttrs.requiresPreservingZT0() ||
285 CallAttrs.requiresPreservingAllZAState()) {
286 if (hasPossibleIncompatibleOps(F: Callee))
287 return false;
288 }
289
290 const TargetMachine &TM = getTLI()->getTargetMachine();
291 const FeatureBitset &CallerBits =
292 TM.getSubtargetImpl(*Caller)->getFeatureBits();
293 const FeatureBitset &CalleeBits =
294 TM.getSubtargetImpl(*Callee)->getFeatureBits();
295 // Adjust the feature bitsets by inverting some of the bits. This is needed
296 // for target features that represent restrictions rather than capabilities,
297 // for example a "+execute-only" callee can be inlined into a caller without
298 // "+execute-only", but not vice versa.
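  // After the flip, a callee without "+execute-only" appears to require a bit
  // that an execute-only caller no longer has, so the subset check below
  // rejects that direction while still allowing the opposite one.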
299 FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
300 FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
301
302 return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
303}
304
305bool AArch64TTIImpl::areTypesABICompatible(
306 const Function *Caller, const Function *Callee,
307 const ArrayRef<Type *> &Types) const {
308 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
309 return false;
310
311 // We need to ensure that argument promotion does not attempt to promote
312 // pointers to fixed-length vector types larger than 128 bits like
313 // <8 x float> (and pointers to aggregate types which have such fixed-length
314 // vector type members) into the values of the pointees. Such vector types
315 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
316 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
317 // types can be safely treated as 128-bit NEON types and they cannot be
318 // distinguished in IR.
319 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range: Types, P: [](Type *Ty) {
320 auto FVTy = dyn_cast<FixedVectorType>(Val: Ty);
321 return FVTy &&
322 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
323 }))
324 return false;
325
326 return true;
327}
328
329unsigned
330AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
331 unsigned DefaultCallPenalty) const {
332 // This function calculates a penalty for executing Call in F.
333 //
334 // There are two ways this function can be called:
335 // (1) F:
336 // call from F -> G (the call here is Call)
337 //
338 // For (1), Call.getCaller() == F, so it will always return a high cost if
339 // a streaming-mode change is required (thus promoting the need to inline the
340 // function)
341 //
342 // (2) F:
343 // call from F -> G (the call here is not Call)
344 // G:
345 // call from G -> H (the call here is Call)
346 //
347 // For (2), if after inlining the body of G into F the call to H requires a
348 // streaming-mode change, and the call to G from F would also require a
349 // streaming-mode change, then there is benefit to do the streaming-mode
350 // change only once and avoid inlining of G into F.
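  // For example, with the default option values above, case (1) scales
  // DefaultCallPenalty by call-penalty-sm-change (5) and case (2) scales it by
  // inline-call-penalty-sm-change (10).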
351
352 SMEAttrs FAttrs(*F);
353 SMECallAttrs CallAttrs(Call);
354
355 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
356 if (F == Call.getCaller()) // (1)
357 return CallPenaltyChangeSM * DefaultCallPenalty;
358 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
359 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
360 }
361
362 return DefaultCallPenalty;
363}
364
365bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
366 TargetTransformInfo::RegisterKind K) const {
367 assert(K != TargetTransformInfo::RGK_Scalar);
368 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
369 ST->isNeonAvailable());
370}
371
372/// Calculate the cost of materializing a 64-bit value. This helper
373/// method might only calculate a fraction of a larger immediate. Therefore it
374/// is valid to return a cost of ZERO.
375InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
376 // Check if the immediate can be encoded within an instruction.
377 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64))
378 return 0;
379
380 if (Val < 0)
381 Val = ~Val;
382
383 // Calculate how many moves we will need to materialize this constant.
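  // For example, a value such as 0x12345678 needs a MOVZ plus one MOVK (cost
  // 2), whereas a 64-bit logical immediate such as 0xFF00FF00FF00FF00 was
  // already matched above and costs 0.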
384 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
385 AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn);
386 return Insn.size();
387}
388
389/// Calculate the cost of materializing the given constant.
390InstructionCost
391AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
392 TTI::TargetCostKind CostKind) const {
393 assert(Ty->isIntegerTy());
394
395 unsigned BitSize = Ty->getPrimitiveSizeInBits();
396 if (BitSize == 0)
397 return ~0U;
398
399 // Sign-extend all constants to a multiple of 64-bit.
400 APInt ImmVal = Imm;
401 if (BitSize & 0x3f)
402 ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU);
403
404 // Split the constant into 64-bit chunks and calculate the cost for each
405 // chunk.
406 InstructionCost Cost = 0;
407 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
408 APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64);
409 int64_t Val = Tmp.getSExtValue();
410 Cost += getIntImmCost(Val);
411 }
  // We need at least one instruction to materialize the constant.
413 return std::max<InstructionCost>(a: 1, b: Cost);
414}
415
416InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
417 const APInt &Imm, Type *Ty,
418 TTI::TargetCostKind CostKind,
419 Instruction *Inst) const {
420 assert(Ty->isIntegerTy());
421
422 unsigned BitSize = Ty->getPrimitiveSizeInBits();
423 // There is no cost model for constants with a bit size of 0. Return TCC_Free
424 // here, so that constant hoisting will ignore this constant.
425 if (BitSize == 0)
426 return TTI::TCC_Free;
427
428 unsigned ImmIdx = ~0U;
429 switch (Opcode) {
430 default:
431 return TTI::TCC_Free;
432 case Instruction::GetElementPtr:
433 // Always hoist the base address of a GetElementPtr.
434 if (Idx == 0)
435 return 2 * TTI::TCC_Basic;
436 return TTI::TCC_Free;
437 case Instruction::Store:
438 ImmIdx = 0;
439 break;
440 case Instruction::Add:
441 case Instruction::Sub:
442 case Instruction::Mul:
443 case Instruction::UDiv:
444 case Instruction::SDiv:
445 case Instruction::URem:
446 case Instruction::SRem:
447 case Instruction::And:
448 case Instruction::Or:
449 case Instruction::Xor:
450 case Instruction::ICmp:
451 ImmIdx = 1;
452 break;
453 // Always return TCC_Free for the shift value of a shift instruction.
454 case Instruction::Shl:
455 case Instruction::LShr:
456 case Instruction::AShr:
457 if (Idx == 1)
458 return TTI::TCC_Free;
459 break;
460 case Instruction::Trunc:
461 case Instruction::ZExt:
462 case Instruction::SExt:
463 case Instruction::IntToPtr:
464 case Instruction::PtrToInt:
465 case Instruction::BitCast:
466 case Instruction::PHI:
467 case Instruction::Call:
468 case Instruction::Select:
469 case Instruction::Ret:
470 case Instruction::Load:
471 break;
472 }
473
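  // If the immediate is in the operand slot that the instruction can typically
  // encode, report cheap-to-materialize constants as free so that constant
  // hoisting leaves them in place; only expensive immediates report their full
  // materialization cost.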
474 if (Idx == ImmIdx) {
475 int NumConstants = (BitSize + 63) / 64;
476 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
477 return (Cost <= NumConstants * TTI::TCC_Basic)
478 ? static_cast<int>(TTI::TCC_Free)
479 : Cost;
480 }
481 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
482}
483
484InstructionCost
485AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
486 const APInt &Imm, Type *Ty,
487 TTI::TargetCostKind CostKind) const {
488 assert(Ty->isIntegerTy());
489
490 unsigned BitSize = Ty->getPrimitiveSizeInBits();
491 // There is no cost model for constants with a bit size of 0. Return TCC_Free
492 // here, so that constant hoisting will ignore this constant.
493 if (BitSize == 0)
494 return TTI::TCC_Free;
495
496 // Most (all?) AArch64 intrinsics do not support folding immediates into the
497 // selected instruction, so we compute the materialization cost for the
498 // immediate directly.
499 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
500 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
501
502 switch (IID) {
503 default:
504 return TTI::TCC_Free;
505 case Intrinsic::sadd_with_overflow:
506 case Intrinsic::uadd_with_overflow:
507 case Intrinsic::ssub_with_overflow:
508 case Intrinsic::usub_with_overflow:
509 case Intrinsic::smul_with_overflow:
510 case Intrinsic::umul_with_overflow:
511 if (Idx == 1) {
512 int NumConstants = (BitSize + 63) / 64;
513 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
514 return (Cost <= NumConstants * TTI::TCC_Basic)
515 ? static_cast<int>(TTI::TCC_Free)
516 : Cost;
517 }
518 break;
519 case Intrinsic::experimental_stackmap:
520 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
521 return TTI::TCC_Free;
522 break;
523 case Intrinsic::experimental_patchpoint_void:
524 case Intrinsic::experimental_patchpoint:
525 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
526 return TTI::TCC_Free;
527 break;
528 case Intrinsic::experimental_gc_statepoint:
529 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue())))
530 return TTI::TCC_Free;
531 break;
532 }
533 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
534}
535
536TargetTransformInfo::PopcntSupportKind
537AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
538 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
539 if (TyWidth == 32 || TyWidth == 64)
540 return TTI::PSK_FastHardware;
541 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
542 return TTI::PSK_Software;
543}
544
545static bool isUnpackedVectorVT(EVT VecVT) {
546 return VecVT.isScalableVector() &&
547 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
548}
549
550static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
551 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
552 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
553 unsigned TotalHistCnts = 1;
554
555 unsigned EltSize = EltTy->getScalarSizeInBits();
556 // Only allow (up to 64b) integers or pointers
557 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
558 return InstructionCost::getInvalid();
559
560 // FIXME: We should be able to generate histcnt for fixed-length vectors
561 // using ptrue with a specific VL.
562 if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) {
563 unsigned EC = VTy->getElementCount().getKnownMinValue();
564 if (!isPowerOf2_64(Value: EC) || !VTy->isScalableTy())
565 return InstructionCost::getInvalid();
566
567 // HistCnt only supports 32b and 64b element types
568 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
569
570 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
571 return InstructionCost(BaseHistCntCost);
572
573 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
574 TotalHistCnts = EC / NaturalVectorWidth;
575 }
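  // For example, <vscale x 8 x ptr> bucket pointers with i32 elements use a
  // legal element size of 32, giving a natural vector width of four, so the
  // operation is costed as two HISTCNTs (2 * BaseHistCntCost).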
576
577 return InstructionCost(BaseHistCntCost * TotalHistCnts);
578}
579
580InstructionCost
581AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
582 TTI::TargetCostKind CostKind) const {
583 // The code-generator is currently not able to handle scalable vectors
584 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
585 // it. This change will be removed when code-generation for these types is
586 // sufficiently reliable.
587 auto *RetTy = ICA.getReturnType();
588 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy))
589 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
590 return InstructionCost::getInvalid();
591
592 switch (ICA.getID()) {
593 case Intrinsic::experimental_vector_histogram_add:
594 if (!ST->hasSVE2())
595 return InstructionCost::getInvalid();
596 return getHistogramCost(ICA);
597 case Intrinsic::umin:
598 case Intrinsic::umax:
599 case Intrinsic::smin:
600 case Intrinsic::smax: {
601 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
602 MVT::v8i16, MVT::v2i32, MVT::v4i32,
603 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
604 MVT::nxv2i64};
605 auto LT = getTypeLegalizationCost(Ty: RetTy);
606 // v2i64 types get converted to cmp+bif hence the cost of 2
607 if (LT.second == MVT::v2i64)
608 return LT.first * 2;
609 if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }))
610 return LT.first;
611 break;
612 }
613 case Intrinsic::sadd_sat:
614 case Intrinsic::ssub_sat:
615 case Intrinsic::uadd_sat:
616 case Intrinsic::usub_sat: {
617 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
618 MVT::v8i16, MVT::v2i32, MVT::v4i32,
619 MVT::v2i64};
620 auto LT = getTypeLegalizationCost(Ty: RetTy);
621 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
622 // need to extend the type, as it uses shr(qadd(shl, shl)).
623 unsigned Instrs =
624 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
625 if (any_of(Range: ValidSatTys, P: [&LT](MVT M) { return M == LT.second; }))
626 return LT.first * Instrs;
627 break;
628 }
629 case Intrinsic::abs: {
630 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
631 MVT::v8i16, MVT::v2i32, MVT::v4i32,
632 MVT::v2i64};
633 auto LT = getTypeLegalizationCost(Ty: RetTy);
634 if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }))
635 return LT.first;
636 break;
637 }
638 case Intrinsic::bswap: {
639 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
640 MVT::v4i32, MVT::v2i64};
641 auto LT = getTypeLegalizationCost(Ty: RetTy);
642 if (any_of(Range: ValidAbsTys, P: [&LT](MVT M) { return M == LT.second; }) &&
643 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
644 return LT.first;
645 break;
646 }
647 case Intrinsic::stepvector: {
648 InstructionCost Cost = 1; // Cost of the `index' instruction
649 auto LT = getTypeLegalizationCost(Ty: RetTy);
650 // Legalisation of illegal vectors involves an `index' instruction plus
651 // (LT.first - 1) vector adds.
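    // For example, <vscale x 8 x i64> splits into four legal parts, so the
    // cost is one `index' plus three vector adds.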
652 if (LT.first > 1) {
653 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext());
654 InstructionCost AddCost =
655 getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind);
656 Cost += AddCost * (LT.first - 1);
657 }
658 return Cost;
659 }
660 case Intrinsic::vector_extract:
661 case Intrinsic::vector_insert: {
662 // If both the vector and subvector types are legal types and the index
663 // is 0, then this should be a no-op or simple operation; return a
664 // relatively low cost.
665
666 // If arguments aren't actually supplied, then we cannot determine the
667 // value of the index. We also want to skip predicate types.
668 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
669 ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1))
670 break;
671
672 LLVMContext &C = RetTy->getContext();
673 EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
674 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
675 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy)
676 : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]);
677 // Skip this if either the vector or subvector types are unpacked
678 // SVE types; they may get lowered to stack stores and loads.
679 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT))
680 break;
681
682 TargetLoweringBase::LegalizeKind SubVecLK =
683 getTLI()->getTypeConversion(Context&: C, VT: SubVecVT);
684 TargetLoweringBase::LegalizeKind VecLK =
685 getTLI()->getTypeConversion(Context&: C, VT: VecVT);
686 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
687 const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx);
688 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
689 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
690 return TTI::TCC_Free;
691 break;
692 }
693 case Intrinsic::bitreverse: {
694 static const CostTblEntry BitreverseTbl[] = {
695 {.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: 1},
696 {.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: 1},
697 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: 1},
698 {.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: 1},
699 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: 2},
700 {.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: 2},
701 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: 2},
702 {.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: 2},
703 {.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: 2},
704 {.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: 2},
705 };
706 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
707 const auto *Entry =
708 CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second);
709 if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
712 if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 ||
713 TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16)
714 return LegalisationCost.first * Entry->Cost + 1;
715
716 return LegalisationCost.first * Entry->Cost;
717 }
718 break;
719 }
720 case Intrinsic::ctpop: {
721 if (!ST->hasNEON()) {
722 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
723 return getTypeLegalizationCost(Ty: RetTy).first * 12;
724 }
725 static const CostTblEntry CtpopCostTbl[] = {
726 {.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: 4},
727 {.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: 3},
728 {.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: 2},
729 {.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: 1},
730 {.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: 4},
731 {.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: 3},
732 {.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: 2},
733 {.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: 1},
734 {.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: 5},
735 };
736 auto LT = getTypeLegalizationCost(Ty: RetTy);
737 MVT MTy = LT.second;
738 if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) {
739 // Extra cost of +1 when illegal vector types are legalized by promoting
740 // the integer type.
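      // For example, <2 x i16> is promoted to v2i32, so its cost is the v2i32
      // table entry plus 1.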
741 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
742 RetTy->getScalarSizeInBits()
743 ? 1
744 : 0;
745 return LT.first * Entry->Cost + ExtraCost;
746 }
747 break;
748 }
749 case Intrinsic::sadd_with_overflow:
750 case Intrinsic::uadd_with_overflow:
751 case Intrinsic::ssub_with_overflow:
752 case Intrinsic::usub_with_overflow:
753 case Intrinsic::smul_with_overflow:
754 case Intrinsic::umul_with_overflow: {
755 static const CostTblEntry WithOverflowCostTbl[] = {
756 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: 3},
757 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: 3},
758 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: 3},
759 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: 3},
760 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: 1},
761 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: 1},
762 {.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: 1},
763 {.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: 1},
764 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: 3},
765 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: 3},
766 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: 3},
767 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: 3},
768 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: 1},
769 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: 1},
770 {.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: 1},
771 {.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: 1},
772 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: 5},
773 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: 4},
774 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: 5},
775 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: 4},
776 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;tst
777 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: 2}, // eg umull;cmp sxtw
778 {.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;smulh;cmp
779 {.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: 3}, // eg mul;umulh;cmp asr
780 };
781 EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true);
782 if (MTy.isSimple())
783 if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(),
784 Ty: MTy.getSimpleVT()))
785 return Entry->Cost;
786 break;
787 }
788 case Intrinsic::fptosi_sat:
789 case Intrinsic::fptoui_sat: {
790 if (ICA.getArgTypes().empty())
791 break;
792 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
793 auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]);
794 EVT MTy = TLI->getValueType(DL, Ty: RetTy);
795 // Check for the legal types, which are where the size of the input and the
796 // output are the same, or we are using cvt f64->i32 or f32->i64.
797 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
798 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
799 LT.second == MVT::v2f64)) {
800 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
801 (LT.second == MVT::f64 && MTy == MVT::i32) ||
802 (LT.second == MVT::f32 && MTy == MVT::i64)))
803 return LT.first;
804 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
805 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
806 MTy.getScalarSizeInBits() == 64)
807 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
808 }
809 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
810 // f32.
811 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
812 return LT.first + getIntrinsicInstrCost(
813 ICA: {ICA.getID(),
814 RetTy,
815 {ICA.getArgTypes()[0]->getWithNewType(
816 EltTy: Type::getFloatTy(C&: RetTy->getContext()))}},
817 CostKind);
818 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
819 (LT.second == MVT::f16 && MTy == MVT::i64) ||
820 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
821 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
822 return LT.first;
823 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
824 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
825 MTy.getScalarSizeInBits() == 32)
826 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
    // Extending vector types v8f16->v8i32. These currently scalarize but the
828 // codegen could be better.
829 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
830 MTy.getScalarSizeInBits() == 64)
831 return MTy.getVectorNumElements() * 3;
832
    // If we can, use a legal convert followed by a min+max.
834 if ((LT.second.getScalarType() == MVT::f32 ||
835 LT.second.getScalarType() == MVT::f64 ||
836 LT.second.getScalarType() == MVT::f16) &&
837 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
838 Type *LegalTy =
839 Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits());
840 if (LT.second.isVector())
841 LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount());
842 InstructionCost Cost = 1;
843 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
844 LegalTy, {LegalTy, LegalTy});
845 Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
846 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
847 LegalTy, {LegalTy, LegalTy});
848 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
849 return LT.first * Cost +
850 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
851 : 1);
852 }
853 // Otherwise we need to follow the default expansion that clamps the value
    // using a float min/max, with an fcmp+sel for NaN handling when signed.
855 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
856 RetTy = RetTy->getScalarType();
857 if (LT.second.isVector()) {
858 FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount());
859 RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount());
860 }
861 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
862 InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
863 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
864 Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
865 Cost +=
866 getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
867 Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
868 if (IsSigned) {
869 Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: 1);
870 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
871 VecPred: CmpInst::FCMP_UNO, CostKind);
872 Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
873 VecPred: CmpInst::FCMP_UNO, CostKind);
874 }
875 return LT.first * Cost;
876 }
877 case Intrinsic::fshl:
878 case Intrinsic::fshr: {
879 if (ICA.getArgs().empty())
880 break;
881
882 // TODO: Add handling for fshl where third argument is not a constant.
883 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]);
884 if (!OpInfoZ.isConstant())
885 break;
886
887 const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
888 if (OpInfoZ.isUniform()) {
889 static const CostTblEntry FshlTbl[] = {
890 {.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: 2}, // shl + usra
891 {.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: 2},
892 {.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: 2},
893 {.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: 2}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: 2}};
894 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
895 // to avoid having to duplicate the costs.
896 const auto *Entry =
897 CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second);
898 if (Entry)
899 return LegalisationCost.first * Entry->Cost;
900 }
901
902 auto TyL = getTypeLegalizationCost(Ty: RetTy);
903 if (!RetTy->isIntegerTy())
904 break;
905
906 // Estimate cost manually, as types like i8 and i16 will get promoted to
907 // i32 and CostTableLookup will ignore the extra conversion cost.
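    // For example, an i16 funnel shift by a constant is promoted to i32 and
    // pays one extra instruction on top of the base cost, whereas i32 and i64
    // lower to a single extr.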
908 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
909 RetTy->getScalarSizeInBits() < 64) ||
910 (RetTy->getScalarSizeInBits() % 64 != 0);
911 unsigned ExtraCost = HigherCost ? 1 : 0;
912 if (RetTy->getScalarSizeInBits() == 32 ||
913 RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
915 // extr instruction.
916 else if (HigherCost)
917 ExtraCost = 1;
918 else
919 break;
920 return TyL.first + ExtraCost;
921 }
922 case Intrinsic::get_active_lane_mask: {
923 auto *RetTy = dyn_cast<FixedVectorType>(Val: ICA.getReturnType());
924 if (RetTy) {
925 EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy);
926 EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
927 if (!getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT) &&
928 !getTLI()->isTypeLegal(VT: RetVT)) {
929 // We don't have enough context at this point to determine if the mask
930 // is going to be kept live after the block, which will force the vXi1
931 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
932 // For now, we just assume the vectorizer created this intrinsic and
933 // the result will be the input for a PHI. In this case the cost will
934 // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic compared to the code actually generated. In reality
        // there are two instructions generated per lane.
938 return RetTy->getNumElements() * 2;
939 }
940 }
941 break;
942 }
943 case Intrinsic::experimental_vector_match: {
944 auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[1]);
945 EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
946 unsigned SearchSize = NeedleTy->getNumElements();
947 if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) {
948 // Base cost for MATCH instructions. At least on the Neoverse V2 and
949 // Neoverse V3, these are cheap operations with the same latency as a
950 // vector ADD. In most cases, however, we also need to do an extra DUP.
      // For fixed-length vectors we currently need an extra five to six
952 // instructions besides the MATCH.
953 InstructionCost Cost = 4;
954 if (isa<FixedVectorType>(Val: RetTy))
955 Cost += 10;
956 return Cost;
957 }
958 break;
959 }
960 case Intrinsic::experimental_cttz_elts: {
961 EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]);
962 if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) {
      // This will consist of an SVE brkb and a cntp instruction. These
964 // typically have the same latency and half the throughput as a vector
965 // add instruction.
966 return 4;
967 }
968 break;
969 }
970 default:
971 break;
972 }
973 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
974}
975
/// This function removes redundant reinterpret casts in the presence of
/// control flow.
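/// For example (illustrative), when a convert.from.svbool takes
///   %phi = phi <vscale x 16 x i1> [ %a.bool, %bb0 ], [ %b.bool, %bb1 ]
/// and every incoming value is a convert.to.svbool of the intrinsic's result
/// type, the phi is rebuilt over the original narrower predicates and the
/// intrinsic is replaced with that new phi.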
978static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
979 IntrinsicInst &II) {
980 SmallVector<Instruction *, 32> Worklist;
981 auto RequiredType = II.getType();
982
983 auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0));
984 assert(PN && "Expected Phi Node!");
985
986 // Don't create a new Phi unless we can remove the old one.
987 if (!PN->hasOneUse())
988 return std::nullopt;
989
990 for (Value *IncValPhi : PN->incoming_values()) {
991 auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi);
992 if (!Reinterpret ||
993 Reinterpret->getIntrinsicID() !=
994 Intrinsic::aarch64_sve_convert_to_svbool ||
995 RequiredType != Reinterpret->getArgOperand(i: 0)->getType())
996 return std::nullopt;
997 }
998
999 // Create the new Phi
1000 IC.Builder.SetInsertPoint(PN);
1001 PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues());
1002 Worklist.push_back(Elt: PN);
1003
1004 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1005 auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I));
1006 NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I));
1007 Worklist.push_back(Elt: Reinterpret);
1008 }
1009
1010 // Cleanup Phi Node and reinterprets
1011 return IC.replaceInstUsesWith(I&: II, V: NPN);
1012}
1013
1014// A collection of properties common to SVE intrinsics that allow for combines
1015// to be written without needing to know the specific intrinsic.
1016struct SVEIntrinsicInfo {
1017 //
1018 // Helper routines for common intrinsic definitions.
1019 //
1020
1021 // e.g. llvm.aarch64.sve.add pg, op1, op2
1022 // with IID ==> llvm.aarch64.sve.add_u
1023 static SVEIntrinsicInfo
1024 defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1025 return SVEIntrinsicInfo()
1026 .setGoverningPredicateOperandIdx(0)
1027 .setOperandIdxInactiveLanesTakenFrom(1)
1028 .setMatchingUndefIntrinsic(IID);
1029 }
1030
1031 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1032 static SVEIntrinsicInfo defaultMergingUnaryOp() {
1033 return SVEIntrinsicInfo()
1034 .setGoverningPredicateOperandIdx(1)
1035 .setOperandIdxInactiveLanesTakenFrom(0)
1036 .setOperandIdxWithNoActiveLanes(0);
1037 }
1038
1039 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1040 static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1041 return SVEIntrinsicInfo()
1042 .setGoverningPredicateOperandIdx(1)
1043 .setOperandIdxInactiveLanesTakenFrom(0);
1044 }
1045
1046 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1047 static SVEIntrinsicInfo defaultUndefOp() {
1048 return SVEIntrinsicInfo()
1049 .setGoverningPredicateOperandIdx(0)
1050 .setInactiveLanesAreNotDefined();
1051 }
1052
1053 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1054 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1055 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1056 return SVEIntrinsicInfo()
1057 .setGoverningPredicateOperandIdx(GPIndex)
1058 .setInactiveLanesAreUnused();
1059 }
1060
1061 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1062 // llvm.aarch64.sve.ld1 pg, ptr
1063 static SVEIntrinsicInfo defaultZeroingOp() {
1064 return SVEIntrinsicInfo()
1065 .setGoverningPredicateOperandIdx(0)
1066 .setInactiveLanesAreUnused()
1067 .setResultIsZeroInitialized();
1068 }
1069
  // All properties relate to predication and thus having a governing predicate
1071 // is the minimum requirement to say there is intrinsic info to act on.
1072 explicit operator bool() const { return hasGoverningPredicate(); }
1073
1074 //
1075 // Properties relating to the governing predicate.
1076 //
1077
1078 bool hasGoverningPredicate() const {
1079 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1080 }
1081
1082 unsigned getGoverningPredicateOperandIdx() const {
    assert(hasGoverningPredicate() && "Property not set!");
1084 return GoverningPredicateIdx;
1085 }
1086
1087 SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1088 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1089 GoverningPredicateIdx = Index;
1090 return *this;
1091 }
1092
1093 //
1094 // Properties relating to operations the intrinsic could be transformed into.
1095 // NOTE: This does not mean such a transformation is always possible, but the
1096 // knowledge makes it possible to reuse existing optimisations without needing
1097 // to embed specific handling for each intrinsic. For example, instruction
1098 // simplification can be used to optimise an intrinsic's active lanes.
1099 //
1100
1101 bool hasMatchingUndefIntrinsic() const {
1102 return UndefIntrinsic != Intrinsic::not_intrinsic;
1103 }
1104
1105 Intrinsic::ID getMatchingUndefIntrinsic() const {
    assert(hasMatchingUndefIntrinsic() && "Property not set!");
1107 return UndefIntrinsic;
1108 }
1109
1110 SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1111 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1112 UndefIntrinsic = IID;
1113 return *this;
1114 }
1115
1116 bool hasMatchingIROpode() const { return IROpcode != 0; }
1117
1118 unsigned getMatchingIROpode() const {
    assert(hasMatchingIROpode() && "Property not set!");
1120 return IROpcode;
1121 }
1122
1123 SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1124 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1125 IROpcode = Opcode;
1126 return *this;
1127 }
1128
1129 //
1130 // Properties relating to the result of inactive lanes.
1131 //
1132
1133 bool inactiveLanesTakenFromOperand() const {
1134 return ResultLanes == InactiveLanesTakenFromOperand;
1135 }
1136
1137 unsigned getOperandIdxInactiveLanesTakenFrom() const {
    assert(inactiveLanesTakenFromOperand() && "Property not set!");
1139 return OperandIdxForInactiveLanes;
1140 }
1141
1142 SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1143 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1144 ResultLanes = InactiveLanesTakenFromOperand;
1145 OperandIdxForInactiveLanes = Index;
1146 return *this;
1147 }
1148
1149 bool inactiveLanesAreNotDefined() const {
1150 return ResultLanes == InactiveLanesAreNotDefined;
1151 }
1152
1153 SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1154 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1155 ResultLanes = InactiveLanesAreNotDefined;
1156 return *this;
1157 }
1158
1159 bool inactiveLanesAreUnused() const {
1160 return ResultLanes == InactiveLanesAreUnused;
1161 }
1162
1163 SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1164 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1165 ResultLanes = InactiveLanesAreUnused;
1166 return *this;
1167 }
1168
1169 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1170 // inactiveLanesAreZeroed =
1171 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1172 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1173
1174 SVEIntrinsicInfo &setResultIsZeroInitialized() {
1175 ResultIsZeroInitialized = true;
1176 return *this;
1177 }
1178
1179 //
1180 // The first operand of unary merging operations is typically only used to
1181 // set the result for inactive lanes. Knowing this allows us to deadcode the
1182 // operand when we can prove there are no inactive lanes.
1183 //
1184
1185 bool hasOperandWithNoActiveLanes() const {
1186 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1187 }
1188
1189 unsigned getOperandIdxWithNoActiveLanes() const {
    assert(hasOperandWithNoActiveLanes() && "Property not set!");
1191 return OperandIdxWithNoActiveLanes;
1192 }
1193
1194 SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1195 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1196 OperandIdxWithNoActiveLanes = Index;
1197 return *this;
1198 }
1199
1200private:
1201 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1202
1203 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1204 unsigned IROpcode = 0;
1205
1206 enum PredicationStyle {
1207 Uninitialized,
1208 InactiveLanesTakenFromOperand,
1209 InactiveLanesAreNotDefined,
1210 InactiveLanesAreUnused
1211 } ResultLanes = Uninitialized;
1212
1213 bool ResultIsZeroInitialized = false;
1214 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1215 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1216};
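// constructSVEIntrinsicInfo below builds this description for each intrinsic.
// For example, aarch64_sve_add is modelled as
// defaultMergingOp(aarch64_sve_add_u) with matching IR opcode Add, which gives
// combines enough information to use the undef form or a plain add when the
// governing predicate allows it.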
1217
1218static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1219 // Some SVE intrinsics do not use scalable vector types, but since they are
1220 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1221 if (!isa<ScalableVectorType>(Val: II.getType()) &&
1222 all_of(Range: II.args(), P: [&](const Value *V) {
1223 return !isa<ScalableVectorType>(Val: V->getType());
1224 }))
1225 return SVEIntrinsicInfo();
1226
1227 Intrinsic::ID IID = II.getIntrinsicID();
1228 switch (IID) {
1229 default:
1230 break;
1231 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1232 case Intrinsic::aarch64_sve_fcvt_f16f32:
1233 case Intrinsic::aarch64_sve_fcvt_f16f64:
1234 case Intrinsic::aarch64_sve_fcvt_f32f16:
1235 case Intrinsic::aarch64_sve_fcvt_f32f64:
1236 case Intrinsic::aarch64_sve_fcvt_f64f16:
1237 case Intrinsic::aarch64_sve_fcvt_f64f32:
1238 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1239 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1240 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1241 case Intrinsic::aarch64_sve_fcvtzs:
1242 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1243 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1244 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1245 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1246 case Intrinsic::aarch64_sve_fcvtzu:
1247 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1248 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1249 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1250 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1251 case Intrinsic::aarch64_sve_scvtf:
1252 case Intrinsic::aarch64_sve_scvtf_f16i32:
1253 case Intrinsic::aarch64_sve_scvtf_f16i64:
1254 case Intrinsic::aarch64_sve_scvtf_f32i64:
1255 case Intrinsic::aarch64_sve_scvtf_f64i32:
1256 case Intrinsic::aarch64_sve_ucvtf:
1257 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1258 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1259 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1260 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1261 return SVEIntrinsicInfo::defaultMergingUnaryOp();
1262
1263 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1264 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1265 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1266 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1267 return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1268
1269 case Intrinsic::aarch64_sve_fabd:
1270 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u);
1271 case Intrinsic::aarch64_sve_fadd:
1272 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u)
1273 .setMatchingIROpcode(Instruction::FAdd);
1274 case Intrinsic::aarch64_sve_fdiv:
1275 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u)
1276 .setMatchingIROpcode(Instruction::FDiv);
1277 case Intrinsic::aarch64_sve_fmax:
1278 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u);
1279 case Intrinsic::aarch64_sve_fmaxnm:
1280 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u);
1281 case Intrinsic::aarch64_sve_fmin:
1282 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u);
1283 case Intrinsic::aarch64_sve_fminnm:
1284 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u);
1285 case Intrinsic::aarch64_sve_fmla:
1286 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u);
1287 case Intrinsic::aarch64_sve_fmls:
1288 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u);
1289 case Intrinsic::aarch64_sve_fmul:
1290 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u)
1291 .setMatchingIROpcode(Instruction::FMul);
1292 case Intrinsic::aarch64_sve_fmulx:
1293 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u);
1294 case Intrinsic::aarch64_sve_fnmla:
1295 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u);
1296 case Intrinsic::aarch64_sve_fnmls:
1297 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u);
1298 case Intrinsic::aarch64_sve_fsub:
1299 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u)
1300 .setMatchingIROpcode(Instruction::FSub);
1301 case Intrinsic::aarch64_sve_add:
1302 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u)
1303 .setMatchingIROpcode(Instruction::Add);
1304 case Intrinsic::aarch64_sve_mla:
1305 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u);
1306 case Intrinsic::aarch64_sve_mls:
1307 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u);
1308 case Intrinsic::aarch64_sve_mul:
1309 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u)
1310 .setMatchingIROpcode(Instruction::Mul);
1311 case Intrinsic::aarch64_sve_sabd:
1312 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u);
1313 case Intrinsic::aarch64_sve_sdiv:
1314 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u)
1315 .setMatchingIROpcode(Instruction::SDiv);
1316 case Intrinsic::aarch64_sve_smax:
1317 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u);
1318 case Intrinsic::aarch64_sve_smin:
1319 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u);
1320 case Intrinsic::aarch64_sve_smulh:
1321 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u);
1322 case Intrinsic::aarch64_sve_sub:
1323 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u)
1324 .setMatchingIROpcode(Instruction::Sub);
1325 case Intrinsic::aarch64_sve_uabd:
1326 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u);
1327 case Intrinsic::aarch64_sve_udiv:
1328 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u)
1329 .setMatchingIROpcode(Instruction::UDiv);
1330 case Intrinsic::aarch64_sve_umax:
1331 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u);
1332 case Intrinsic::aarch64_sve_umin:
1333 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u);
1334 case Intrinsic::aarch64_sve_umulh:
1335 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u);
1336 case Intrinsic::aarch64_sve_asr:
1337 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u)
1338 .setMatchingIROpcode(Instruction::AShr);
1339 case Intrinsic::aarch64_sve_lsl:
1340 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u)
1341 .setMatchingIROpcode(Instruction::Shl);
1342 case Intrinsic::aarch64_sve_lsr:
1343 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u)
1344 .setMatchingIROpcode(Instruction::LShr);
1345 case Intrinsic::aarch64_sve_and:
1346 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u)
1347 .setMatchingIROpcode(Instruction::And);
1348 case Intrinsic::aarch64_sve_bic:
1349 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u);
1350 case Intrinsic::aarch64_sve_eor:
1351 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u)
1352 .setMatchingIROpcode(Instruction::Xor);
1353 case Intrinsic::aarch64_sve_orr:
1354 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u)
1355 .setMatchingIROpcode(Instruction::Or);
1356 case Intrinsic::aarch64_sve_sqsub:
1357 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u);
1358 case Intrinsic::aarch64_sve_uqsub:
1359 return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u);
1360
1361 case Intrinsic::aarch64_sve_add_u:
1362 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1363 Instruction::Add);
1364 case Intrinsic::aarch64_sve_and_u:
1365 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1366 Instruction::And);
1367 case Intrinsic::aarch64_sve_asr_u:
1368 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1369 Instruction::AShr);
1370 case Intrinsic::aarch64_sve_eor_u:
1371 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1372 Instruction::Xor);
1373 case Intrinsic::aarch64_sve_fadd_u:
1374 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1375 Instruction::FAdd);
1376 case Intrinsic::aarch64_sve_fdiv_u:
1377 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1378 Instruction::FDiv);
1379 case Intrinsic::aarch64_sve_fmul_u:
1380 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1381 Instruction::FMul);
1382 case Intrinsic::aarch64_sve_fsub_u:
1383 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1384 Instruction::FSub);
1385 case Intrinsic::aarch64_sve_lsl_u:
1386 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1387 Instruction::Shl);
1388 case Intrinsic::aarch64_sve_lsr_u:
1389 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1390 Instruction::LShr);
1391 case Intrinsic::aarch64_sve_mul_u:
1392 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1393 Instruction::Mul);
1394 case Intrinsic::aarch64_sve_orr_u:
1395 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1396 Instruction::Or);
1397 case Intrinsic::aarch64_sve_sdiv_u:
1398 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1399 Instruction::SDiv);
1400 case Intrinsic::aarch64_sve_sub_u:
1401 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1402 Instruction::Sub);
1403 case Intrinsic::aarch64_sve_udiv_u:
1404 return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1405 Instruction::UDiv);
1406
1407 case Intrinsic::aarch64_sve_addqv:
1408 case Intrinsic::aarch64_sve_and_z:
1409 case Intrinsic::aarch64_sve_bic_z:
1410 case Intrinsic::aarch64_sve_brka_z:
1411 case Intrinsic::aarch64_sve_brkb_z:
1412 case Intrinsic::aarch64_sve_brkn_z:
1413 case Intrinsic::aarch64_sve_brkpa_z:
1414 case Intrinsic::aarch64_sve_brkpb_z:
1415 case Intrinsic::aarch64_sve_cntp:
1416 case Intrinsic::aarch64_sve_compact:
1417 case Intrinsic::aarch64_sve_eor_z:
1418 case Intrinsic::aarch64_sve_eorv:
1419 case Intrinsic::aarch64_sve_eorqv:
1420 case Intrinsic::aarch64_sve_nand_z:
1421 case Intrinsic::aarch64_sve_nor_z:
1422 case Intrinsic::aarch64_sve_orn_z:
1423 case Intrinsic::aarch64_sve_orr_z:
1424 case Intrinsic::aarch64_sve_orv:
1425 case Intrinsic::aarch64_sve_orqv:
1426 case Intrinsic::aarch64_sve_pnext:
1427 case Intrinsic::aarch64_sve_rdffr_z:
1428 case Intrinsic::aarch64_sve_saddv:
1429 case Intrinsic::aarch64_sve_uaddv:
1430 case Intrinsic::aarch64_sve_umaxv:
1431 case Intrinsic::aarch64_sve_umaxqv:
1432 case Intrinsic::aarch64_sve_cmpeq:
1433 case Intrinsic::aarch64_sve_cmpeq_wide:
1434 case Intrinsic::aarch64_sve_cmpge:
1435 case Intrinsic::aarch64_sve_cmpge_wide:
1436 case Intrinsic::aarch64_sve_cmpgt:
1437 case Intrinsic::aarch64_sve_cmpgt_wide:
1438 case Intrinsic::aarch64_sve_cmphi:
1439 case Intrinsic::aarch64_sve_cmphi_wide:
1440 case Intrinsic::aarch64_sve_cmphs:
1441 case Intrinsic::aarch64_sve_cmphs_wide:
1442 case Intrinsic::aarch64_sve_cmple_wide:
1443 case Intrinsic::aarch64_sve_cmplo_wide:
1444 case Intrinsic::aarch64_sve_cmpls_wide:
1445 case Intrinsic::aarch64_sve_cmplt_wide:
1446 case Intrinsic::aarch64_sve_cmpne:
1447 case Intrinsic::aarch64_sve_cmpne_wide:
1448 case Intrinsic::aarch64_sve_facge:
1449 case Intrinsic::aarch64_sve_facgt:
1450 case Intrinsic::aarch64_sve_fcmpeq:
1451 case Intrinsic::aarch64_sve_fcmpge:
1452 case Intrinsic::aarch64_sve_fcmpgt:
1453 case Intrinsic::aarch64_sve_fcmpne:
1454 case Intrinsic::aarch64_sve_fcmpuo:
1455 case Intrinsic::aarch64_sve_ld1:
1456 case Intrinsic::aarch64_sve_ld1_gather:
1457 case Intrinsic::aarch64_sve_ld1_gather_index:
1458 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1459 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1460 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1461 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1462 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1463 case Intrinsic::aarch64_sve_ld1q_gather_index:
1464 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1465 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1466 case Intrinsic::aarch64_sve_ld1ro:
1467 case Intrinsic::aarch64_sve_ld1rq:
1468 case Intrinsic::aarch64_sve_ld1udq:
1469 case Intrinsic::aarch64_sve_ld1uwq:
1470 case Intrinsic::aarch64_sve_ld2_sret:
1471 case Intrinsic::aarch64_sve_ld2q_sret:
1472 case Intrinsic::aarch64_sve_ld3_sret:
1473 case Intrinsic::aarch64_sve_ld3q_sret:
1474 case Intrinsic::aarch64_sve_ld4_sret:
1475 case Intrinsic::aarch64_sve_ld4q_sret:
1476 case Intrinsic::aarch64_sve_ldff1:
1477 case Intrinsic::aarch64_sve_ldff1_gather:
1478 case Intrinsic::aarch64_sve_ldff1_gather_index:
1479 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1480 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1481 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1482 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1483 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1484 case Intrinsic::aarch64_sve_ldnf1:
1485 case Intrinsic::aarch64_sve_ldnt1:
1486 case Intrinsic::aarch64_sve_ldnt1_gather:
1487 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1488 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1489 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1490 return SVEIntrinsicInfo::defaultZeroingOp();
1491
1492 case Intrinsic::aarch64_sve_prf:
1493 case Intrinsic::aarch64_sve_prfb_gather_index:
1494 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1495 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1496 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1497 case Intrinsic::aarch64_sve_prfd_gather_index:
1498 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1499 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1500 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1501 case Intrinsic::aarch64_sve_prfh_gather_index:
1502 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1503 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1504 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1505 case Intrinsic::aarch64_sve_prfw_gather_index:
1506 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1507 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1508 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1509 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 0);
1510
1511 case Intrinsic::aarch64_sve_st1_scatter:
1512 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1513 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1514 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1515 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1516 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1517 case Intrinsic::aarch64_sve_st1dq:
1518 case Intrinsic::aarch64_sve_st1q_scatter_index:
1519 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1520 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1521 case Intrinsic::aarch64_sve_st1wq:
1522 case Intrinsic::aarch64_sve_stnt1:
1523 case Intrinsic::aarch64_sve_stnt1_scatter:
1524 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1525 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1526 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1527 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 1);
1528 case Intrinsic::aarch64_sve_st2:
1529 case Intrinsic::aarch64_sve_st2q:
1530 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 2);
1531 case Intrinsic::aarch64_sve_st3:
1532 case Intrinsic::aarch64_sve_st3q:
1533 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 3);
1534 case Intrinsic::aarch64_sve_st4:
1535 case Intrinsic::aarch64_sve_st4q:
1536 return SVEIntrinsicInfo::defaultVoidOp(GPIndex: 4);
1537 }
1538
1539 return SVEIntrinsicInfo();
1540}
1541
1542static bool isAllActivePredicate(Value *Pred) {
1543 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1544 Value *UncastedPred;
1545 if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1546 Op0: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1547 Op0: m_Value(V&: UncastedPred)))))
1548 // If the predicate has the same number of lanes as the uncasted
1549 // predicate, or fewer, the casting has no effect.
1550 if (cast<ScalableVectorType>(Val: Pred->getType())->getMinNumElements() <=
1551 cast<ScalableVectorType>(Val: UncastedPred->getType())->getMinNumElements())
1552 Pred = UncastedPred;
1553 auto *C = dyn_cast<Constant>(Val: Pred);
1554 return (C && C->isAllOnesValue());
1555}
1556
1557// Simplify `V` by only considering the operations that affect active lanes.
1558// This function should only return existing Values or newly created Constants.
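// For example (illustrative sketch of the transform below): given
//   %s = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(
//            <vscale x 4 x i32> %inactive, <vscale x 4 x i1> %pg, i32 4)
// only the lanes selected by %pg matter to a user predicated on %pg, so %s
// can be treated as splat(i32 4) during simplification.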
1559static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1560 auto *Dup = dyn_cast<IntrinsicInst>(Val: V);
1561 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1562 Dup->getOperand(i_nocapture: 1) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: 2)))
1563 return ConstantVector::getSplat(
1564 EC: cast<VectorType>(Val: V->getType())->getElementCount(),
1565 Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: 2)));
1566
1567 return V;
1568}
1569
1570static std::optional<Instruction *>
1571simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1572 const SVEIntrinsicInfo &IInfo) {
1573 const unsigned Opc = IInfo.getMatchingIROpode();
1574 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1575
1576 Value *Pg = II.getOperand(i_nocapture: 0);
1577 Value *Op1 = II.getOperand(i_nocapture: 1);
1578 Value *Op2 = II.getOperand(i_nocapture: 2);
1579 const DataLayout &DL = II.getDataLayout();
1580
1581 // Canonicalise constants to the RHS.
1582 if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() &&
1583 isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) {
1584 IC.replaceOperand(I&: II, OpNum: 1, V: Op2);
1585 IC.replaceOperand(I&: II, OpNum: 2, V: Op1);
1586 return &II;
1587 }
1588
1589 // Only active lanes matter when simplifying the operation.
1590 Op1 = stripInactiveLanes(V: Op1, Pg);
1591 Op2 = stripInactiveLanes(V: Op2, Pg);
1592
1593 Value *SimpleII;
1594 if (auto FII = dyn_cast<FPMathOperator>(Val: &II))
1595 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL);
1596 else
1597 SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL);
1598
1599 // An SVE intrinsic's result is always defined. However, this is not the case
1600 // for its equivalent IR instruction (e.g. when shifting by an amount more
1601 // than the data's bitwidth). Simplifications to an undefined result must be
1602 // ignored to preserve the intrinsic's expected behaviour.
1603 if (!SimpleII || isa<UndefValue>(Val: SimpleII))
1604 return std::nullopt;
1605
1606 if (IInfo.inactiveLanesAreNotDefined())
1607 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1608
1609 Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom());
1610
1611 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1612 if (SimpleII == Inactive)
1613 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1614
1615 // Inactive lanes must be preserved.
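// For example (illustrative): sve.and(pg, A, splat(0)) folds to zero for the
// active lanes, but the inactive lanes must still come from A, so the result
// becomes select(pg, splat(0), A).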
1616 SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive);
1617 return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1618}
1619
1620// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1621// to operations with less strict inactive lane requirements.
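// For example (illustrative):
//   sve.add(pfalse, A, B) --> A           (no active lanes; the result takes
//                                          its value from the inactive operand)
//   sve.add(ptrue, A, B)  --> sve.add.u   (no inactive lanes; switch to the
//                                          less constrained "_u" form)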
1622static std::optional<Instruction *>
1623simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1624 const SVEIntrinsicInfo &IInfo) {
1625 if (!IInfo.hasGoverningPredicate())
1626 return std::nullopt;
1627
1628 auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx());
1629
1630 // If there are no active lanes.
1631 if (match(V: OpPredicate, P: m_ZeroInt())) {
1632 if (IInfo.inactiveLanesTakenFromOperand())
1633 return IC.replaceInstUsesWith(
1634 I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()));
1635
1636 if (IInfo.inactiveLanesAreUnused()) {
1637 if (IInfo.resultIsZeroInitialized())
1638 IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1639
1640 return IC.eraseInstFromFunction(I&: II);
1641 }
1642 }
1643
1644 // If there are no inactive lanes.
1645 if (isAllActivePredicate(Pred: OpPredicate)) {
1646 if (IInfo.hasOperandWithNoActiveLanes()) {
1647 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1648 if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx)))
1649 return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType()));
1650 }
1651
1652 if (IInfo.hasMatchingUndefIntrinsic()) {
1653 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1654 M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()});
1655 II.setCalledFunction(NewDecl);
1656 return &II;
1657 }
1658 }
1659
1660 // Operation specific simplifications.
1661 if (IInfo.hasMatchingIROpode() &&
1662 Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode()))
1663 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1664
1665 return std::nullopt;
1666}
1667
1668 // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1669// => (binop (pred) (from_svbool _) (from_svbool _))
1670//
1671// The above transformation eliminates a `to_svbool` in the predicate
1672// operand of bitwise operation `binop` by narrowing the vector width of
1673// the operation. For example, it would convert a `<vscale x 16 x i1>
1674// and` into a `<vscale x 4 x i1> and`. This is profitable because
1675// to_svbool must zero the new lanes during widening, whereas
1676// from_svbool is free.
1677static std::optional<Instruction *>
1678tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1679 auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0));
1680 if (!BinOp)
1681 return std::nullopt;
1682
1683 auto IntrinsicID = BinOp->getIntrinsicID();
1684 switch (IntrinsicID) {
1685 case Intrinsic::aarch64_sve_and_z:
1686 case Intrinsic::aarch64_sve_bic_z:
1687 case Intrinsic::aarch64_sve_eor_z:
1688 case Intrinsic::aarch64_sve_nand_z:
1689 case Intrinsic::aarch64_sve_nor_z:
1690 case Intrinsic::aarch64_sve_orn_z:
1691 case Intrinsic::aarch64_sve_orr_z:
1692 break;
1693 default:
1694 return std::nullopt;
1695 }
1696
1697 auto BinOpPred = BinOp->getOperand(i_nocapture: 0);
1698 auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1);
1699 auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2);
1700
1701 auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred);
1702 if (!PredIntr ||
1703 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1704 return std::nullopt;
1705
1706 auto PredOp = PredIntr->getOperand(i_nocapture: 0);
1707 auto PredOpTy = cast<VectorType>(Val: PredOp->getType());
1708 if (PredOpTy != II.getType())
1709 return std::nullopt;
1710
1711 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1712 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1713 ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1});
1714 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1715 if (BinOpOp1 == BinOpOp2)
1716 NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1717 else
1718 NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic(
1719 ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2}));
1720
1721 auto NarrowedBinOp =
1722 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs);
1723 return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp);
1724}
1725
1726static std::optional<Instruction *>
1727instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1728 // If the reinterpret instruction operand is a PHI Node
1729 if (isa<PHINode>(Val: II.getArgOperand(i: 0)))
1730 return processPhiNode(IC, II);
1731
1732 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1733 return BinOpCombine;
1734
1735 // Ignore converts to/from svcount_t.
1736 if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) ||
1737 isa<TargetExtType>(Val: II.getType()))
1738 return std::nullopt;
1739
1740 SmallVector<Instruction *, 32> CandidatesForRemoval;
1741 Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr;
1742
1743 const auto *IVTy = cast<VectorType>(Val: II.getType());
1744
1745 // Walk the chain of conversions.
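// For example (illustrative), given the chain
//   %a = ... <vscale x 4 x i1> ...
//   %b = to_svbool(%a)     ; <vscale x 16 x i1>
//   %c = from_svbool(%b)   ; <vscale x 4 x i1>
//   %d = to_svbool(%c)     ; <vscale x 16 x i1>
//   %II = from_svbool(%d)  ; <vscale x 4 x i1>
// no step narrows below the result's lane count, so %II can be replaced by
// %a, the earliest value of the same type.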
1746 while (Cursor) {
1747 // If the type of the cursor has fewer lanes than the final result, zeroing
1748 // must take place, which breaks the equivalence chain.
1749 const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType());
1750 if (CursorVTy->getElementCount().getKnownMinValue() <
1751 IVTy->getElementCount().getKnownMinValue())
1752 break;
1753
1754 // If the cursor has the same type as I, it is a viable replacement.
1755 if (Cursor->getType() == IVTy)
1756 EarliestReplacement = Cursor;
1757
1758 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor);
1759
1760 // If this is not an SVE conversion intrinsic, this is the end of the chain.
1761 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
1762 Intrinsic::aarch64_sve_convert_to_svbool ||
1763 IntrinsicCursor->getIntrinsicID() ==
1764 Intrinsic::aarch64_sve_convert_from_svbool))
1765 break;
1766
1767 CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor);
1768 Cursor = IntrinsicCursor->getOperand(i_nocapture: 0);
1769 }
1770
1771 // If no viable replacement in the conversion chain was found, there is
1772 // nothing to do.
1773 if (!EarliestReplacement)
1774 return std::nullopt;
1775
1776 return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement);
1777}
1778
1779static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1780 IntrinsicInst &II) {
1781 // svsel(ptrue, x, y) => x
1782 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1783 if (isAllActivePredicate(Pred: OpPredicate))
1784 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
1785
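// Otherwise lower svsel to a plain IR select; the lane-wise semantics match.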
1786 auto Select =
1787 IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2));
1788 return IC.replaceInstUsesWith(I&: II, V: Select);
1789}
1790
1791static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1792 IntrinsicInst &II) {
1793 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
1794 if (!Pg)
1795 return std::nullopt;
1796
1797 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1798 return std::nullopt;
1799
1800 const auto PTruePattern =
1801 cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue();
1802 if (PTruePattern != AArch64SVEPredPattern::vl1)
1803 return std::nullopt;
1804
1805 // The intrinsic is inserting into lane zero so use an insert instead.
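// i.e. (illustrative) sve.dup(V, ptrue(vl1), X) --> insertelement V, X, i64 0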
1806 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
1807 auto *Insert = InsertElementInst::Create(
1808 Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: ConstantInt::get(Ty: IdxTy, V: 0));
1809 Insert->insertBefore(InsertPos: II.getIterator());
1810 Insert->takeName(V: &II);
1811
1812 return IC.replaceInstUsesWith(I&: II, V: Insert);
1813}
1814
1815static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1816 IntrinsicInst &II) {
1817 // Replace DupX with a regular IR splat.
1818 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
1819 Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
1820 V: II.getArgOperand(i: 0));
1821 Splat->takeName(V: &II);
1822 return IC.replaceInstUsesWith(I&: II, V: Splat);
1823}
1824
1825static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1826 IntrinsicInst &II) {
1827 LLVMContext &Ctx = II.getContext();
1828
1829 if (!isAllActivePredicate(Pred: II.getArgOperand(i: 0)))
1830 return std::nullopt;
1831
1832 // Check that we have a compare of zero..
1833 auto *SplatValue =
1834 dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2)));
1835 if (!SplatValue || !SplatValue->isZero())
1836 return std::nullopt;
1837
1838 // ..against a dupq
1839 auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1));
1840 if (!DupQLane ||
1841 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1842 return std::nullopt;
1843
1844 // Where the dupq is a lane 0 replicate of a vector insert
1845 auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1));
1846 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
1847 return std::nullopt;
1848
1849 auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0));
1850 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1851 return std::nullopt;
1852
1853 // Where the vector insert is a fixed constant vector insert into undef at
1854 // index zero
1855 if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0)))
1856 return std::nullopt;
1857
1858 if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero())
1859 return std::nullopt;
1860
1861 auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1));
1862 if (!ConstVec)
1863 return std::nullopt;
1864
1865 auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType());
1866 auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType());
1867 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1868 return std::nullopt;
1869
1870 unsigned NumElts = VecTy->getNumElements();
1871 unsigned PredicateBits = 0;
1872
1873 // Expand the intrinsic operands into a 16-bit, byte-level predicate mask.
1874 for (unsigned I = 0; I < NumElts; ++I) {
1875 auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I));
1876 if (!Arg)
1877 return std::nullopt;
1878 if (!Arg->isZero())
1879 PredicateBits |= 1 << (I * (16 / NumElts));
1880 }
1881
1882 // If all bits are zero, bail out early with an all-false predicate.
1883 if (PredicateBits == 0) {
1884 auto *PFalse = Constant::getNullValue(Ty: II.getType());
1885 PFalse->takeName(V: &II);
1886 return IC.replaceInstUsesWith(I&: II, V: PFalse);
1887 }
1888
1889 // Calculate largest predicate type used (where byte predicate is largest)
1890 unsigned Mask = 8;
1891 for (unsigned I = 0; I < 16; ++I)
1892 if ((PredicateBits & (1 << I)) != 0)
1893 Mask |= (I % 8);
1894
1895 unsigned PredSize = Mask & -Mask;
1896 auto *PredType = ScalableVectorType::get(
1897 ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8));
1898
1899 // Ensure all relevant bits are set
1900 for (unsigned I = 0; I < 16; I += PredSize)
1901 if ((PredicateBits & (1 << I)) == 0)
1902 return std::nullopt;
1903
1904 auto *PTruePat =
1905 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
1906 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
1907 Types: {PredType}, Args: {PTruePat});
1908 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1909 ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue});
1910 auto *ConvertFromSVBool =
1911 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
1912 Types: {II.getType()}, Args: {ConvertToSVBool});
1913
1914 ConvertFromSVBool->takeName(V: &II);
1915 return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool);
1916}
1917
1918static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1919 IntrinsicInst &II) {
1920 Value *Pg = II.getArgOperand(i: 0);
1921 Value *Vec = II.getArgOperand(i: 1);
1922 auto IntrinsicID = II.getIntrinsicID();
1923 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1924
1925 // lastX(splat(X)) --> X
1926 if (auto *SplatVal = getSplatValue(V: Vec))
1927 return IC.replaceInstUsesWith(I&: II, V: SplatVal);
1928
1929 // If x and/or y is a splat value then:
1930 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1931 Value *LHS, *RHS;
1932 if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) {
1933 if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) {
1934 auto *OldBinOp = cast<BinaryOperator>(Val: Vec);
1935 auto OpC = OldBinOp->getOpcode();
1936 auto *NewLHS =
1937 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS});
1938 auto *NewRHS =
1939 IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS});
1940 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1941 Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator());
1942 return IC.replaceInstUsesWith(I&: II, V: NewBinOp);
1943 }
1944 }
1945
1946 auto *C = dyn_cast<Constant>(Val: Pg);
1947 if (IsAfter && C && C->isNullValue()) {
1948 // The intrinsic is extracting lane 0 so use an extract instead.
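// e.g. (illustrative) lasta(pfalse, V) --> extractelement V, i64 0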
1949 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
1950 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: 0));
1951 Extract->insertBefore(InsertPos: II.getIterator());
1952 Extract->takeName(V: &II);
1953 return IC.replaceInstUsesWith(I&: II, V: Extract);
1954 }
1955
1956 auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg);
1957 if (!IntrPG)
1958 return std::nullopt;
1959
1960 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1961 return std::nullopt;
1962
1963 const auto PTruePattern =
1964 cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue();
1965
1966 // Can the intrinsic's predicate be converted to a known constant index?
1967 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern);
1968 if (!MinNumElts)
1969 return std::nullopt;
1970
1971 unsigned Idx = MinNumElts - 1;
1972 // Increment the index if extracting the element after the last active
1973 // predicate element.
1974 if (IsAfter)
1975 ++Idx;
1976
1977 // Ignore extracts whose index is larger than the known minimum vector
1978 // length. NOTE: This is an artificial constraint where we prefer to
1979 // maintain what the user asked for until an alternative is proven faster.
1980 auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType());
1981 if (Idx >= PgVTy->getMinNumElements())
1982 return std::nullopt;
1983
1984 // The intrinsic is extracting a fixed lane so use an extract instead.
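// e.g. (illustrative) lastb(ptrue(vl4), V) --> extractelement V, i64 3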
1985 auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
1986 auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
1987 Extract->insertBefore(InsertPos: II.getIterator());
1988 Extract->takeName(V: &II);
1989 return IC.replaceInstUsesWith(I&: II, V: Extract);
1990}
1991
1992static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1993 IntrinsicInst &II) {
1994 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1995 // integer variant across a variety of micro-architectures. Replace scalar
1996 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1997 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1998 // depending on the micro-architecture, but has been observed as generally
1999 // being faster, particularly when the CLAST[AB] op is a loop-carried
2000 // dependency.
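// For example (illustrative, 32-bit integer case):
//   clasta.n(pg, i32 %fb, <vscale x 4 x i32> %v)
//     --> bitcast (clasta.n(pg, bitcast %fb to float,
//                           bitcast %v to <vscale x 4 x float>)) to i32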
2001 Value *Pg = II.getArgOperand(i: 0);
2002 Value *Fallback = II.getArgOperand(i: 1);
2003 Value *Vec = II.getArgOperand(i: 2);
2004 Type *Ty = II.getType();
2005
2006 if (!Ty->isIntegerTy())
2007 return std::nullopt;
2008
2009 Type *FPTy;
2010 switch (cast<IntegerType>(Val: Ty)->getBitWidth()) {
2011 default:
2012 return std::nullopt;
2013 case 16:
2014 FPTy = IC.Builder.getHalfTy();
2015 break;
2016 case 32:
2017 FPTy = IC.Builder.getFloatTy();
2018 break;
2019 case 64:
2020 FPTy = IC.Builder.getDoubleTy();
2021 break;
2022 }
2023
2024 Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy);
2025 auto *FPVTy = VectorType::get(
2026 ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount());
2027 Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy);
2028 auto *FPII = IC.Builder.CreateIntrinsic(
2029 ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec});
2030 Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType());
2031 return IC.replaceInstUsesWith(I&: II, V: FPIItoInt);
2032}
2033
2034static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2035 IntrinsicInst &II) {
2036 LLVMContext &Ctx = II.getContext();
2037 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2038 // can work with RDFFR_PP for ptest elimination.
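// i.e. (illustrative) rdffr() --> rdffr.z(ptrue(all))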
2039 auto *AllPat =
2040 ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2041 auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2042 Types: {II.getType()}, Args: {AllPat});
2043 auto *RDFFR =
2044 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue});
2045 RDFFR->takeName(V: &II);
2046 return IC.replaceInstUsesWith(I&: II, V: RDFFR);
2047}
2048
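// For example (illustrative): cntw(all) --> vscale * 4, and cntw(vl4) --> 4,
// since four 32-bit elements always fit within the minimum vector length.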
2049static std::optional<Instruction *>
2050instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2051 const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
2052
2053 if (Pattern == AArch64SVEPredPattern::all) {
2054 Value *Cnt = IC.Builder.CreateElementCount(
2055 Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts));
2056 Cnt->takeName(V: &II);
2057 return IC.replaceInstUsesWith(I&: II, V: Cnt);
2058 }
2059
2060 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2061
2062 return MinNumElts && NumElts >= MinNumElts
2063 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2064 I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
2065 : std::nullopt;
2066}
2067
2068static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2069 IntrinsicInst &II) {
2070 Value *PgVal = II.getArgOperand(i: 0);
2071 Value *OpVal = II.getArgOperand(i: 1);
2072
2073 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2074 // Later optimizations prefer this form.
2075 if (PgVal == OpVal &&
2076 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2077 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2078 Value *Ops[] = {PgVal, OpVal};
2079 Type *Tys[] = {PgVal->getType()};
2080
2081 auto *PTest =
2082 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops);
2083 PTest->takeName(V: &II);
2084
2085 return IC.replaceInstUsesWith(I&: II, V: PTest);
2086 }
2087
2088 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
2089 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);
2090
2091 if (!Pg || !Op)
2092 return std::nullopt;
2093
2094 Intrinsic::ID OpIID = Op->getIntrinsicID();
2095
2096 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2097 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2098 Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) {
2099 Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)};
2100 Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()};
2101
2102 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
2103
2104 PTest->takeName(V: &II);
2105 return IC.replaceInstUsesWith(I&: II, V: PTest);
2106 }
2107
2108 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
2109 // Later optimizations may rewrite the sequence to use the flag-setting
2110 // variant of instruction X to remove the PTEST.
2111 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2112 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2113 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2114 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2115 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2116 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2117 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2118 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2119 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2120 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2121 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2122 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2123 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2124 Value *Ops[] = {Pg->getArgOperand(i: 0), Pg};
2125 Type *Tys[] = {Pg->getType()};
2126
2127 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
2128 PTest->takeName(V: &II);
2129
2130 return IC.replaceInstUsesWith(I&: II, V: PTest);
2131 }
2132
2133 return std::nullopt;
2134}
2135
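// Fold a predicated multiply feeding a predicated add/sub into a single fused
// multiply-accumulate intrinsic (subject to the fast-math checks below), e.g.
// (illustrative):
//   fadd(p, a, fmul(p, b, c)) --> fmla(p, a, b, c)
//   fadd(p, fmul(p, b, c), a) --> fmad(p, b, c, a)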
2136template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
2137static std::optional<Instruction *>
2138instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2139 bool MergeIntoAddendOp) {
2140 Value *P = II.getOperand(i_nocapture: 0);
2141 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2142 if (MergeIntoAddendOp) {
2143 AddendOp = II.getOperand(i_nocapture: 1);
2144 Mul = II.getOperand(i_nocapture: 2);
2145 } else {
2146 AddendOp = II.getOperand(i_nocapture: 2);
2147 Mul = II.getOperand(i_nocapture: 1);
2148 }
2149
2150 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
2151 m_Value(V&: MulOp1))))
2152 return std::nullopt;
2153
2154 if (!Mul->hasOneUse())
2155 return std::nullopt;
2156
2157 Instruction *FMFSource = nullptr;
2158 if (II.getType()->isFPOrFPVectorTy()) {
2159 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2160 // Stop the combine when the flags on the inputs differ, in case dropping
2161 // flags would cause us to miss more beneficial optimizations.
2162 if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
2163 return std::nullopt;
2164 if (!FAddFlags.allowContract())
2165 return std::nullopt;
2166 FMFSource = &II;
2167 }
2168
2169 CallInst *Res;
2170 if (MergeIntoAddendOp)
2171 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
2172 Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2173 else
2174 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
2175 Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2176
2177 return IC.replaceInstUsesWith(I&: II, V: Res);
2178}
2179
2180static std::optional<Instruction *>
2181instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2182 Value *Pred = II.getOperand(i_nocapture: 0);
2183 Value *PtrOp = II.getOperand(i_nocapture: 1);
2184 Type *VecTy = II.getType();
2185
2186 if (isAllActivePredicate(Pred)) {
2187 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
2188 Load->copyMetadata(SrcInst: II);
2189 return IC.replaceInstUsesWith(I&: II, V: Load);
2190 }
2191
2192 CallInst *MaskedLoad =
2193 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
2194 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
2195 MaskedLoad->copyMetadata(SrcInst: II);
2196 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2197}
2198
2199static std::optional<Instruction *>
2200instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2201 Value *VecOp = II.getOperand(i_nocapture: 0);
2202 Value *Pred = II.getOperand(i_nocapture: 1);
2203 Value *PtrOp = II.getOperand(i_nocapture: 2);
2204
2205 if (isAllActivePredicate(Pred)) {
2206 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
2207 Store->copyMetadata(SrcInst: II);
2208 return IC.eraseInstFromFunction(I&: II);
2209 }
2210
2211 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2212 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
2213 MaskedStore->copyMetadata(SrcInst: II);
2214 return IC.eraseInstFromFunction(I&: II);
2215}
2216
2217static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2218 switch (Intrinsic) {
2219 case Intrinsic::aarch64_sve_fmul_u:
2220 return Instruction::BinaryOps::FMul;
2221 case Intrinsic::aarch64_sve_fadd_u:
2222 return Instruction::BinaryOps::FAdd;
2223 case Intrinsic::aarch64_sve_fsub_u:
2224 return Instruction::BinaryOps::FSub;
2225 default:
2226 return Instruction::BinaryOpsEnd;
2227 }
2228}
2229
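// With an all-active predicate the unpredicated-form FP intrinsics behave like
// plain IR instructions, e.g. (illustrative)
//   fmul.u(ptrue(all), A, B) --> fmul A, B   (fast-math flags preserved)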
2230static std::optional<Instruction *>
2231instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2232 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2233 if (II.isStrictFP())
2234 return std::nullopt;
2235
2236 auto *OpPredicate = II.getOperand(i_nocapture: 0);
2237 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
2238 if (BinOpCode == Instruction::BinaryOpsEnd ||
2239 !isAllActivePredicate(Pred: OpPredicate))
2240 return std::nullopt;
2241 auto BinOp = IC.Builder.CreateBinOpFMF(
2242 Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2), FMFSource: II.getFastMathFlags());
2243 return IC.replaceInstUsesWith(I&: II, V: BinOp);
2244}
2245
2246static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2247 IntrinsicInst &II) {
2248 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2249 Intrinsic::aarch64_sve_mla>(
2250 IC, II, MergeIntoAddendOp: true))
2251 return MLA;
2252 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2253 Intrinsic::aarch64_sve_mad>(
2254 IC, II, MergeIntoAddendOp: false))
2255 return MAD;
2256 return std::nullopt;
2257}
2258
2259static std::optional<Instruction *>
2260instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2261 if (auto FMLA =
2262 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2263 Intrinsic::aarch64_sve_fmla>(IC, II,
2264 MergeIntoAddendOp: true))
2265 return FMLA;
2266 if (auto FMAD =
2267 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2268 Intrinsic::aarch64_sve_fmad>(IC, II,
2269 MergeIntoAddendOp: false))
2270 return FMAD;
2271 if (auto FMLA =
2272 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2273 Intrinsic::aarch64_sve_fmla>(IC, II,
2274 MergeIntoAddendOp: true))
2275 return FMLA;
2276 return std::nullopt;
2277}
2278
2279static std::optional<Instruction *>
2280instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2281 if (auto FMLA =
2282 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2283 Intrinsic::aarch64_sve_fmla>(IC, II,
2284 MergeIntoAddendOp: true))
2285 return FMLA;
2286 if (auto FMAD =
2287 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2288 Intrinsic::aarch64_sve_fmad>(IC, II,
2289 MergeIntoAddendOp: false))
2290 return FMAD;
2291 if (auto FMLA_U =
2292 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2293 Intrinsic::aarch64_sve_fmla_u>(
2294 IC, II, MergeIntoAddendOp: true))
2295 return FMLA_U;
2296 return instCombineSVEVectorBinOp(IC, II);
2297}
2298
2299static std::optional<Instruction *>
2300instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2301 if (auto FMLS =
2302 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2303 Intrinsic::aarch64_sve_fmls>(IC, II,
2304 MergeIntoAddendOp: true))
2305 return FMLS;
2306 if (auto FMSB =
2307 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2308 Intrinsic::aarch64_sve_fnmsb>(
2309 IC, II, MergeIntoAddendOp: false))
2310 return FMSB;
2311 if (auto FMLS =
2312 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2313 Intrinsic::aarch64_sve_fmls>(IC, II,
2314 MergeIntoAddendOp: true))
2315 return FMLS;
2316 return std::nullopt;
2317}
2318
2319static std::optional<Instruction *>
2320instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2321 if (auto FMLS =
2322 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2323 Intrinsic::aarch64_sve_fmls>(IC, II,
2324 MergeIntoAddendOp: true))
2325 return FMLS;
2326 if (auto FMSB =
2327 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2328 Intrinsic::aarch64_sve_fnmsb>(
2329 IC, II, MergeIntoAddendOp: false))
2330 return FMSB;
2331 if (auto FMLS_U =
2332 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2333 Intrinsic::aarch64_sve_fmls_u>(
2334 IC, II, MergeIntoAddendOp: true))
2335 return FMLS_U;
2336 return instCombineSVEVectorBinOp(IC, II);
2337}
2338
2339static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2340 IntrinsicInst &II) {
2341 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2342 Intrinsic::aarch64_sve_mls>(
2343 IC, II, MergeIntoAddendOp: true))
2344 return MLS;
2345 return std::nullopt;
2346}
2347
2348static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2349 IntrinsicInst &II) {
2350 Value *UnpackArg = II.getArgOperand(i: 0);
2351 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2352 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2353 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2354
2355 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2356 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2357 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
2358 ScalarArg =
2359 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
2360 Value *NewVal =
2361 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
2362 NewVal->takeName(V: &II);
2363 return IC.replaceInstUsesWith(I&: II, V: NewVal);
2364 }
2365
2366 return std::nullopt;
2367}
2368static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2369 IntrinsicInst &II) {
2370 auto *OpVal = II.getOperand(i_nocapture: 0);
2371 auto *OpIndices = II.getOperand(i_nocapture: 1);
2372 VectorType *VTy = cast<VectorType>(Val: II.getType());
2373
2374 // Check whether OpIndices is a constant splat value less than the minimum
2375 // element count of the result.
2376 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
2377 if (!SplatValue ||
2378 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
2379 return std::nullopt;
2380
2381 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
2382 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2383 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2384 auto *VectorSplat =
2385 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
2386
2387 VectorSplat->takeName(V: &II);
2388 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
2389}
2390
2391static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2392 IntrinsicInst &II) {
2393 Value *A, *B;
2394 Type *RetTy = II.getType();
2395 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2396 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2397
2398 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2399 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2400 if ((match(V: II.getArgOperand(i: 0),
2401 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
2402 match(V: II.getArgOperand(i: 1),
2403 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) ||
2404 (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
2405 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
2406 auto *TyA = cast<ScalableVectorType>(Val: A->getType());
2407 if (TyA == B->getType() &&
2408 RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
2409 auto *SubVec = IC.Builder.CreateInsertVector(
2410 DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(0));
2411 auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B,
2412 Idx: TyA->getMinNumElements());
2413 ConcatVec->takeName(V: &II);
2414 return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
2415 }
2416 }
2417
2418 return std::nullopt;
2419}
2420
2421static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2422 IntrinsicInst &II) {
2423 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2424 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2425 Value *A, *B;
2426 if (match(V: II.getArgOperand(i: 0),
2427 P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
2428 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2429 Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
2430 return IC.replaceInstUsesWith(
2431 I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2432
2433 return std::nullopt;
2434}
2435
2436static std::optional<Instruction *>
2437instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2438 Value *Mask = II.getOperand(i_nocapture: 0);
2439 Value *BasePtr = II.getOperand(i_nocapture: 1);
2440 Value *Index = II.getOperand(i_nocapture: 2);
2441 Type *Ty = II.getType();
2442 Value *PassThru = ConstantAggregateZero::get(Ty);
2443
2444 // Contiguous gather => masked load.
2445 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2446 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2447 Value *IndexBase;
2448 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2449 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2450 Align Alignment =
2451 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2452
2453 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2454 Ptr: BasePtr, IdxList: IndexBase);
2455 CallInst *MaskedLoad =
2456 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2457 MaskedLoad->takeName(V: &II);
2458 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2459 }
2460
2461 return std::nullopt;
2462}
2463
2464static std::optional<Instruction *>
2465instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2466 Value *Val = II.getOperand(i_nocapture: 0);
2467 Value *Mask = II.getOperand(i_nocapture: 1);
2468 Value *BasePtr = II.getOperand(i_nocapture: 2);
2469 Value *Index = II.getOperand(i_nocapture: 3);
2470 Type *Ty = Val->getType();
2471
2472 // Contiguous scatter => masked store.
2473 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2474 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2475 Value *IndexBase;
2476 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2477 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
2478 Align Alignment =
2479 BasePtr->getPointerAlignment(DL: II.getDataLayout());
2480
2481 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2482 Ptr: BasePtr, IdxList: IndexBase);
2483 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2484
2485 return IC.eraseInstFromFunction(I&: II);
2486 }
2487
2488 return std::nullopt;
2489}
2490
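// Fold a signed divide by a power-of-two splat into ASRD (plus a NEG for
// negative divisors), e.g. (illustrative):
//   sdiv(pg, V, splat(8))  --> asrd(pg, V, 3)
//   sdiv(pg, V, splat(-8)) --> neg(pg, asrd(pg, V, 3))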
2491static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2492 IntrinsicInst &II) {
2493 Type *Int32Ty = IC.Builder.getInt32Ty();
2494 Value *Pred = II.getOperand(i_nocapture: 0);
2495 Value *Vec = II.getOperand(i_nocapture: 1);
2496 Value *DivVec = II.getOperand(i_nocapture: 2);
2497
2498 Value *SplatValue = getSplatValue(V: DivVec);
2499 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
2500 if (!SplatConstantInt)
2501 return std::nullopt;
2502
2503 APInt Divisor = SplatConstantInt->getValue();
2504 const int64_t DivisorValue = Divisor.getSExtValue();
2505 if (DivisorValue == -1)
2506 return std::nullopt;
2507 if (DivisorValue == 1)
2508 IC.replaceInstUsesWith(I&: II, V: Vec);
2509
2510 if (Divisor.isPowerOf2()) {
2511 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2512 auto ASRD = IC.Builder.CreateIntrinsic(
2513 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2514 return IC.replaceInstUsesWith(I&: II, V: ASRD);
2515 }
2516 if (Divisor.isNegatedPowerOf2()) {
2517 Divisor.negate();
2518 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2519 auto ASRD = IC.Builder.CreateIntrinsic(
2520 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2521 auto NEG = IC.Builder.CreateIntrinsic(
2522 ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
2523 return IC.replaceInstUsesWith(I&: II, V: NEG);
2524 }
2525
2526 return std::nullopt;
2527}
2528
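// Reduce a repeating value pattern to its repeating prefix, e.g. (a, b, a, b)
// becomes (a, b). Null entries stand for poison lanes; when AllowPoison is
// set, a null lane in the kept half is filled from the matching lane in the
// other half.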
2529bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2530 size_t VecSize = Vec.size();
2531 if (VecSize == 1)
2532 return true;
2533 if (!isPowerOf2_64(Value: VecSize))
2534 return false;
2535 size_t HalfVecSize = VecSize / 2;
2536
2537 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2538 RHS != Vec.end(); LHS++, RHS++) {
2539 if (*LHS != nullptr && *RHS != nullptr) {
2540 if (*LHS == *RHS)
2541 continue;
2542 else
2543 return false;
2544 }
2545 if (!AllowPoison)
2546 return false;
2547 if (*LHS == nullptr && *RHS != nullptr)
2548 *LHS = *RHS;
2549 }
2550
2551 Vec.resize(N: HalfVecSize);
2552 SimplifyValuePattern(Vec, AllowPoison);
2553 return true;
2554}
2555
2556// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2557// to dupqlane(f64(C)) where C is A concatenated with B
2558static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2559 IntrinsicInst &II) {
2560 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2561 if (!match(V: II.getOperand(i_nocapture: 0),
2562 P: m_Intrinsic<Intrinsic::vector_insert>(
2563 Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) ||
2564 !isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
2565 return std::nullopt;
2566 auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());
2567
2568 // Insert the scalars into a container ordered by InsertElement index
2569 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2570 while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
2571 auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
2572 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
2573 CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
2574 }
2575
2576 bool AllowPoison =
2577 isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
2578 if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
2579 return std::nullopt;
2580
2581 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2582 Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
2583 for (size_t I = 0; I < Elts.size(); I++) {
2584 if (Elts[I] == nullptr)
2585 continue;
2586 InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
2587 Idx: IC.Builder.getInt64(C: I));
2588 }
2589 if (InsertEltChain == nullptr)
2590 return std::nullopt;
2591
2592 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2593 // value or (f16 a, f16 b) as one i32 value. This requires the
2594 // InsertSubvector to be bitcast to a type wide enough to fit the sequence,
2595 // splatted, and then narrowed back to the original type.
2596 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2597 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2598 IIScalableTy->getMinNumElements() /
2599 PatternWidth;
2600
2601 IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
2602 auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
2603 auto *WideShuffleMaskTy =
2604 ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);
2605
2606 auto InsertSubvector = IC.Builder.CreateInsertVector(
2607 DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain,
2608 Idx: uint64_t(0));
2609 auto WideBitcast =
2610 IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
2611 auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
2612 auto WideShuffle = IC.Builder.CreateShuffleVector(
2613 V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
2614 auto NarrowBitcast =
2615 IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());
2616
2617 return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
2618}
2619
2620static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2621 IntrinsicInst &II) {
2622 Value *A = II.getArgOperand(i: 0);
2623 Value *B = II.getArgOperand(i: 1);
2624 if (A == B)
2625 return IC.replaceInstUsesWith(I&: II, V: A);
2626
2627 return std::nullopt;
2628}
2629
2630static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2631 IntrinsicInst &II) {
2632 Value *Pred = II.getOperand(i_nocapture: 0);
2633 Value *Vec = II.getOperand(i_nocapture: 1);
2634 Value *Shift = II.getOperand(i_nocapture: 2);
2635
2636 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2637 Value *AbsPred, *MergedValue;
2638 if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2639 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
2640 !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2641 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))
2643 return std::nullopt;
2644
2645 // Transform is valid if any of the following are true:
2646 // * The ABS merge value is an undef or non-negative
2647 // * The ABS predicate is all active
2648 // * The ABS predicate and the SRSHL predicates are the same
2649 if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
2650 AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
2651 return std::nullopt;
2652
2653 // Only valid when the shift amount is non-negative, otherwise the rounding
2654 // behaviour of SRSHL cannot be ignored.
2655 if (!match(V: Shift, P: m_NonNegative()))
2656 return std::nullopt;
2657
2658 auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
2659 Types: {II.getType()}, Args: {Pred, Vec, Shift});
2660
2661 return IC.replaceInstUsesWith(I&: II, V: LSL);
2662}
2663
2664static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2665 IntrinsicInst &II) {
2666 Value *Vec = II.getOperand(i_nocapture: 0);
2667
2668 if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: 1))
2669 return IC.replaceInstUsesWith(I&: II, V: Vec);
2670
2671 return std::nullopt;
2672}
2673
2674static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2675 IntrinsicInst &II) {
2676 // If this barrier is post-dominated by an identical barrier, we can remove it.
2677 auto *NI = II.getNextNonDebugInstruction();
2678 unsigned LookaheadThreshold = DMBLookaheadThreshold;
2679 auto CanSkipOver = [](Instruction *I) {
2680 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2681 };
2682 while (LookaheadThreshold-- && CanSkipOver(NI)) {
2683 auto *NIBB = NI->getParent();
2684 NI = NI->getNextNonDebugInstruction();
2685 if (!NI) {
2686 if (auto *SuccBB = NIBB->getUniqueSuccessor())
2687 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2688 else
2689 break;
2690 }
2691 }
2692 auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI);
2693 if (NextII && II.isIdenticalTo(I: NextII))
2694 return IC.eraseInstFromFunction(I&: II);
2695
2696 return std::nullopt;
2697}
2698
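// A ptrue with the `all` pattern is just an all-true predicate constant,
// e.g. (illustrative) ptrue(all) --> splat(i1 true).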
2699static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2700 IntrinsicInst &II) {
2701 if (match(V: II.getOperand(i_nocapture: 0), P: m_ConstantInt<AArch64SVEPredPattern::all>()))
2702 return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType()));
2703 return std::nullopt;
2704}
2705
2706static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2707 IntrinsicInst &II,
2708 unsigned NumBits) {
2709 Value *Passthru = II.getOperand(i_nocapture: 0);
2710 Value *Pg = II.getOperand(i_nocapture: 1);
2711 Value *Op = II.getOperand(i_nocapture: 2);
2712
2713 // Convert UXT[BHW] to AND.
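// e.g. (illustrative) uxtb(undef, pg, V) on i32 elements
//   --> and.u(pg, V, splat(i32 0xff))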
2714 if (isa<UndefValue>(Val: Passthru) || isAllActivePredicate(Pred: Pg)) {
2715 auto *Ty = cast<VectorType>(Val: II.getType());
2716 auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits);
2717 auto *Mask = ConstantInt::get(Ty, V: MaskValue);
2718 auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty},
2719 Args: {Pg, Op, Mask});
2720 return IC.replaceInstUsesWith(I&: II, V: And);
2721 }
2722
2723 return std::nullopt;
2724}
2725
2726std::optional<Instruction *>
2727AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2728 IntrinsicInst &II) const {
2729 const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
2730 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2731 return I;
2732
2733 Intrinsic::ID IID = II.getIntrinsicID();
2734 switch (IID) {
2735 default:
2736 break;
2737 case Intrinsic::aarch64_dmb:
2738 return instCombineDMB(IC, II);
2739 case Intrinsic::aarch64_neon_fmaxnm:
2740 case Intrinsic::aarch64_neon_fminnm:
2741 return instCombineMaxMinNM(IC, II);
2742 case Intrinsic::aarch64_sve_convert_from_svbool:
2743 return instCombineConvertFromSVBool(IC, II);
2744 case Intrinsic::aarch64_sve_dup:
2745 return instCombineSVEDup(IC, II);
2746 case Intrinsic::aarch64_sve_dup_x:
2747 return instCombineSVEDupX(IC, II);
2748 case Intrinsic::aarch64_sve_cmpne:
2749 case Intrinsic::aarch64_sve_cmpne_wide:
2750 return instCombineSVECmpNE(IC, II);
2751 case Intrinsic::aarch64_sve_rdffr:
2752 return instCombineRDFFR(IC, II);
2753 case Intrinsic::aarch64_sve_lasta:
2754 case Intrinsic::aarch64_sve_lastb:
2755 return instCombineSVELast(IC, II);
2756 case Intrinsic::aarch64_sve_clasta_n:
2757 case Intrinsic::aarch64_sve_clastb_n:
2758 return instCombineSVECondLast(IC, II);
2759 case Intrinsic::aarch64_sve_cntd:
2760 return instCombineSVECntElts(IC, II, NumElts: 2);
2761 case Intrinsic::aarch64_sve_cntw:
2762 return instCombineSVECntElts(IC, II, NumElts: 4);
2763 case Intrinsic::aarch64_sve_cnth:
2764 return instCombineSVECntElts(IC, II, NumElts: 8);
2765 case Intrinsic::aarch64_sve_cntb:
2766 return instCombineSVECntElts(IC, II, NumElts: 16);
2767 case Intrinsic::aarch64_sve_ptest_any:
2768 case Intrinsic::aarch64_sve_ptest_first:
2769 case Intrinsic::aarch64_sve_ptest_last:
2770 return instCombineSVEPTest(IC, II);
2771 case Intrinsic::aarch64_sve_fadd:
2772 return instCombineSVEVectorFAdd(IC, II);
2773 case Intrinsic::aarch64_sve_fadd_u:
2774 return instCombineSVEVectorFAddU(IC, II);
2775 case Intrinsic::aarch64_sve_fmul_u:
2776 return instCombineSVEVectorBinOp(IC, II);
2777 case Intrinsic::aarch64_sve_fsub:
2778 return instCombineSVEVectorFSub(IC, II);
2779 case Intrinsic::aarch64_sve_fsub_u:
2780 return instCombineSVEVectorFSubU(IC, II);
2781 case Intrinsic::aarch64_sve_add:
2782 return instCombineSVEVectorAdd(IC, II);
2783 case Intrinsic::aarch64_sve_add_u:
2784 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2785 Intrinsic::aarch64_sve_mla_u>(
2786 IC, II, MergeIntoAddendOp: true);
2787 case Intrinsic::aarch64_sve_sub:
2788 return instCombineSVEVectorSub(IC, II);
2789 case Intrinsic::aarch64_sve_sub_u:
2790 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2791 Intrinsic::aarch64_sve_mls_u>(
2792 IC, II, MergeIntoAddendOp: true);
2793 case Intrinsic::aarch64_sve_tbl:
2794 return instCombineSVETBL(IC, II);
2795 case Intrinsic::aarch64_sve_uunpkhi:
2796 case Intrinsic::aarch64_sve_uunpklo:
2797 case Intrinsic::aarch64_sve_sunpkhi:
2798 case Intrinsic::aarch64_sve_sunpklo:
2799 return instCombineSVEUnpack(IC, II);
2800 case Intrinsic::aarch64_sve_uzp1:
2801 return instCombineSVEUzp1(IC, II);
2802 case Intrinsic::aarch64_sve_zip1:
2803 case Intrinsic::aarch64_sve_zip2:
2804 return instCombineSVEZip(IC, II);
2805 case Intrinsic::aarch64_sve_ld1_gather_index:
2806 return instCombineLD1GatherIndex(IC, II);
2807 case Intrinsic::aarch64_sve_st1_scatter_index:
2808 return instCombineST1ScatterIndex(IC, II);
2809 case Intrinsic::aarch64_sve_ld1:
2810 return instCombineSVELD1(IC, II, DL);
2811 case Intrinsic::aarch64_sve_st1:
2812 return instCombineSVEST1(IC, II, DL);
2813 case Intrinsic::aarch64_sve_sdiv:
2814 return instCombineSVESDIV(IC, II);
2815 case Intrinsic::aarch64_sve_sel:
2816 return instCombineSVESel(IC, II);
2817 case Intrinsic::aarch64_sve_srshl:
2818 return instCombineSVESrshl(IC, II);
2819 case Intrinsic::aarch64_sve_dupq_lane:
2820 return instCombineSVEDupqLane(IC, II);
2821 case Intrinsic::aarch64_sve_insr:
2822 return instCombineSVEInsr(IC, II);
2823 case Intrinsic::aarch64_sve_ptrue:
2824 return instCombinePTrue(IC, II);
2825 case Intrinsic::aarch64_sve_uxtb:
2826 return instCombineSVEUxt(IC, II, NumBits: 8);
2827 case Intrinsic::aarch64_sve_uxth:
2828 return instCombineSVEUxt(IC, II, NumBits: 16);
2829 case Intrinsic::aarch64_sve_uxtw:
2830 return instCombineSVEUxt(IC, II, NumBits: 32);
2831 }
2832
2833 return std::nullopt;
2834}
2835
2836std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2837 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2838 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2839 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2840 SimplifyAndSetOp) const {
2841 switch (II.getIntrinsicID()) {
2842 default:
2843 break;
2844 case Intrinsic::aarch64_neon_fcvtxn:
2845 case Intrinsic::aarch64_neon_rshrn:
2846 case Intrinsic::aarch64_neon_sqrshrn:
2847 case Intrinsic::aarch64_neon_sqrshrun:
2848 case Intrinsic::aarch64_neon_sqshrn:
2849 case Intrinsic::aarch64_neon_sqshrun:
2850 case Intrinsic::aarch64_neon_sqxtn:
2851 case Intrinsic::aarch64_neon_sqxtun:
2852 case Intrinsic::aarch64_neon_uqrshrn:
2853 case Intrinsic::aarch64_neon_uqshrn:
2854 case Intrinsic::aarch64_neon_uqxtn:
2855 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2856 break;
2857 }
2858
2859 return std::nullopt;
2860}
2861
2862bool AArch64TTIImpl::enableScalableVectorization() const {
2863 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2864 EnableScalableAutovecInStreamingMode);
2865}
2866
2867TypeSize
2868AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2869 switch (K) {
2870 case TargetTransformInfo::RGK_Scalar:
2871 return TypeSize::getFixed(ExactSize: 64);
2872 case TargetTransformInfo::RGK_FixedWidthVector:
2873 if (ST->useSVEForFixedLengthVectors() &&
2874 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2875 return TypeSize::getFixed(
2876 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
2877 else if (ST->isNeonAvailable())
2878 return TypeSize::getFixed(ExactSize: 128);
2879 else
2880 return TypeSize::getFixed(ExactSize: 0);
2881 case TargetTransformInfo::RGK_ScalableVector:
2882 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2883 EnableScalableAutovecInStreamingMode))
2884 return TypeSize::getScalable(MinimumSize: 128);
2885 else
2886 return TypeSize::getScalable(MinimumSize: 0);
2887 }
2888 llvm_unreachable("Unsupported register kind");
2889}
2890
2891bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2892 ArrayRef<const Value *> Args,
2893 Type *SrcOverrideTy) const {
2894 // A helper that returns a vector type based on the given type: the scalar
2895 // type comes from ArgTy and the element count from DstTy.
2896 auto toVectorTy = [&](Type *ArgTy) {
2897 return VectorType::get(ElementType: ArgTy->getScalarType(),
2898 EC: cast<VectorType>(Val: DstTy)->getElementCount());
2899 };
2900
2901 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2902 // i32, i64]. SVE doesn't generally have the same set of instructions to
2903 // perform an extend with the add/sub/mul. There are SMULLB style
2904 // instructions, but they operate on top/bottom, requiring some sort of lane
2905 // interleaving to be used with zext/sext.
2906 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2907 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
2908 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2909 return false;
2910
2911 // Determine if the operation has a widening variant. We consider both the
2912 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2913 // instructions.
2914 //
2915 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2916 // verify that their extending operands are eliminated during code
2917 // generation.
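// As an illustrative sketch (types chosen only for illustration), the
// extends in
//   %a = zext <8 x i8> %x to <8 x i16>
//   %b = zext <8 x i8> %y to <8 x i16>
//   %s = add <8 x i16> %a, %b
// are folded into a single uaddl, so they are effectively free.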
2918 Type *SrcTy = SrcOverrideTy;
2919 switch (Opcode) {
2920 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2921 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2922 // The second operand needs to be an extend
2923 if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
2924 if (!SrcTy)
2925 SrcTy =
2926 toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
2927 } else
2928 return false;
2929 break;
2930 case Instruction::Mul: { // SMULL(2), UMULL(2)
2931 // Both operands need to be extends of the same type.
2932 if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
2933 (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
2934 if (!SrcTy)
2935 SrcTy =
2936 toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
2937 } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) {
2938 // If one of the operands is a Zext and the other has enough zero bits to
2939 // be treated as unsigned, we can still generate a umull, meaning the zext
2940 // is free.
2941 KnownBits Known =
2942 computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
2943 if (Args[0]->getType()->getScalarSizeInBits() -
2944 Known.Zero.countLeadingOnes() >
2945 DstTy->getScalarSizeInBits() / 2)
2946 return false;
2947 if (!SrcTy)
2948 SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(),
2949 N: DstTy->getScalarSizeInBits() / 2));
2950 } else
2951 return false;
2952 break;
2953 }
2954 default:
2955 return false;
2956 }
2957
2958 // Legalize the destination type and ensure it can be used in a widening
2959 // operation.
2960 auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
2961 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2962 return false;
2963
2964 // Legalize the source type and ensure it can be used in a widening
2965 // operation.
2966 assert(SrcTy && "Expected some SrcTy");
2967 auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
2968 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2969 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2970 return false;
2971
2972 // Get the total number of vector elements in the legalized types.
2973 InstructionCost NumDstEls =
2974 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2975 InstructionCost NumSrcEls =
2976 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2977
2978 // Return true if the legalized types have the same number of vector elements
2979 // and the destination element type size is twice that of the source type.
2980 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2981}
2982
2983// s/urhadd instructions implement the following pattern, making the
2984// extends free:
2985// %x = add ((zext i8 -> i16), 1)
2986// %y = (zext i8 -> i16)
2987// trunc i16 (lshr (add %x, %y), 1) -> i8
2988//
2989bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2990 Type *Src) const {
2991 // The source should be a legal vector type.
2992 if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
2993 (Src->isScalableTy() && !ST->hasSVE2()))
2994 return false;
2995
2996 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2997 return false;
2998
2999 // Look for trunc/shl/add before trying to match the pattern.
3000 const Instruction *Add = ExtUser;
3001 auto *AddUser =
3002 dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3003 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3004 Add = AddUser;
3005
3006 auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3007 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3008 return false;
3009
3010 auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
3011 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3012 Src->getScalarSizeInBits() !=
3013 cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
3014 return false;
3015
3016 // Try to match the whole pattern. Ext could be either the first or second
3017 // m_ZExtOrSExt matched.
3018 Instruction *Ex1, *Ex2;
3019 if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
3020 R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1))))))
3021 return false;
3022
3023 // Ensure both extends are of the same type
3024 if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
3025 Ex1->getOpcode() == Ex2->getOpcode())
3026 return true;
3027
3028 return false;
3029}
3030
3031InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3032 Type *Src,
3033 TTI::CastContextHint CCH,
3034 TTI::TargetCostKind CostKind,
3035 const Instruction *I) const {
3036 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3037 assert(ISD && "Invalid opcode");
3038 // If the cast is observable, and it is used by a widening instruction (e.g.,
3039 // uaddl, saddw, etc.), it may be free.
3040 if (I && I->hasOneUser()) {
3041 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
3042 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3043 if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) {
3044 // For adds, the second operand's extend is always free; the first is
3045 // only free when both operands are extends of the same kind (i.e. in
3046 // add(sext, zext) only one of the two extends is free).
3047 if (SingleUser->getOpcode() == Instruction::Add) {
3048 if (I == SingleUser->getOperand(i: 1) ||
3049 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
3050 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
3051 return 0;
3052 } else // Others are free so long as isWideningInstruction returned true.
3053 return 0;
3054 }
3055
3056 // The cast will be free for the s/urhadd instructions
3057 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
3058 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
3059 return 0;
3060 }
3061
3062 // TODO: Allow non-throughput costs that aren't binary.
3063 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
3064 if (CostKind != TTI::TCK_RecipThroughput)
3065 return Cost == 0 ? 0 : 1;
3066 return Cost;
3067 };
3068
3069 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
3070 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
3071
3072 if (!SrcTy.isSimple() || !DstTy.isSimple())
3073 return AdjustCost(
3074 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3075
3076 static const TypeConversionCostTblEntry BF16Tbl[] = {
3077 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 1}, // bfcvt
3078 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 1}, // bfcvt
3079 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 1}, // bfcvtn
3080 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 2}, // bfcvtn+bfcvtn2
3081 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 2}, // bfcvtn+fcvtn
3082 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtl2+bfcvtn
3083 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3084 };
3085
3086 if (ST->hasBF16())
3087 if (const auto *Entry = ConvertCostTableLookup(
3088 Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3089 return AdjustCost(Entry->Cost);
3090
3091 // Symbolic constants for the SVE sitofp/uitofp entries in the table below.
3092 // The cost of unpacking twice is artificially increased for now in order
3093 // to avoid regressions against NEON, which will use tbl instructions directly
3094 // instead of multiple layers of [s|u]unpk[lo|hi].
3095 // We use the unpacks in cases where the destination type is illegal and
3096 // requires splitting of the input, even if the input type itself is legal.
3097 const unsigned int SVE_EXT_COST = 1;
3098 const unsigned int SVE_FCVT_COST = 1;
3099 const unsigned int SVE_UNPACK_ONCE = 4;
3100 const unsigned int SVE_UNPACK_TWICE = 16;
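// For instance, the nxv16i8 -> nxv16f32 entries below are modelled as two
// rounds of [s|u]unpk[lo|hi] (SVE_UNPACK_TWICE) followed by four [s|u]cvtf
// instructions (4 * SVE_FCVT_COST); this is a costing sketch rather than a
// statement about the exact lowering.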
3101
3102 static const TypeConversionCostTblEntry ConversionTbl[] = {
3103 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn
3104 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn
3105 {.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn
3106 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn
3107 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1
3108 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn
3109 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn
3110 {.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1
3111 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn
3112 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn
3113 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn
3114 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1
3115 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1
3116 {.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1
3117 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1
3118 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1
3119 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1
3120 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1
3121 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1
3122 {.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1
3123
3124 // Truncations on nxvmiN
3125 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: 2},
3126 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 2},
3127 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 2},
3128 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 2},
3129 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: 2},
3130 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 2},
3131 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 2},
3132 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 5},
3133 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: 2},
3134 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 2},
3135 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 5},
3136 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 11},
3137 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 2},
3138 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: 0},
3139 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: 0},
3140 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: 0},
3141 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 0},
3142 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: 0},
3143 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 0},
3144 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: 0},
3145 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: 0},
3146 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: 1},
3147 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 0},
3148 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: 1},
3149 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 1},
3150 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: 0},
3151 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: 1},
3152 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: 3},
3153 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 1},
3154 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: 3},
3155 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: 1},
3156 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: 3},
3157 {.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: 7},
3158
3159 // The number of shll instructions for the extension.
3160 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3161 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3},
3162 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3163 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2},
3164 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3165 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3},
3166 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3167 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2},
3168 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3169 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7},
3170 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3171 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6},
3172 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3173 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2},
3174 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3175 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6},
3176
3177 // FP Ext and trunc
3178 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: 1}, // fcvt
3179 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: 1}, // fcvtl
3180 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: 2}, // fcvtl+fcvtl2
3181 // FP16
3182 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: 1}, // fcvt
3183 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: 1}, // fcvt
3184 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: 1}, // fcvtl
3185 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: 2}, // fcvtl+fcvtl2
3186 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: 2}, // fcvtl+fcvtl
3187 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: 3}, // fcvtl+fcvtl2+fcvtl
3188 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: 6}, // 2 * fcvtl+fcvtl2+fcvtl
3189 // BF16 (uses shift)
3190 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: 1}, // shl
3191 {.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: 2}, // shl+fcvt
3192 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: 1}, // shll
3193 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: 2}, // shll+shll2
3194 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: 2}, // shll+fcvtl
3195 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: 3}, // shll+fcvtl+fcvtl2
3196 {.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: 6}, // 2 * shll+fcvtl+fcvtl2
3197 // FP Ext and trunc
3198 {.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: 1}, // fcvt
3199 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: 1}, // fcvtn
3200 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: 2}, // fcvtn+fcvtn2
3201 // FP16
3202 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: 1}, // fcvt
3203 {.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: 1}, // fcvt
3204 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: 1}, // fcvtn
3205 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: 2}, // fcvtn+fcvtn2
3206 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: 2}, // fcvtn+fcvtn
3207 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: 3}, // fcvtn+fcvtn2+fcvtn
3208 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: 6}, // 2 * fcvtn+fcvtn2+fcvtn
3209 // BF16 (more complex; the +bf16 case is handled above)
3210 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: 8}, // Expansion is ~8 insns
3211 {.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: 9}, // fcvtn + above
3212 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: 8},
3213 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: 8},
3214 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: 15},
3215 {.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: 9},
3216 {.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: 10},
3217 {.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: 19},
3218
3219 // LowerVectorINT_TO_FP:
3220 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3221 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3222 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3223 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1},
3224 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1},
3225 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1},
3226
3227 // SVE: to nxv2f16
3228 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3229 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3230 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3231 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3232 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3233 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3234 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3235 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3236 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3237 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3238
3239 // SVE: to nxv4f16
3240 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3241 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3242 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3243 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3244 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3245 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3246 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3247 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3248
3249 // SVE: to nxv8f16
3250 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3251 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3252 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3253 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3254 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3255 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3256
3257 // SVE: to nxv16f16
3258 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3259 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3260 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3261 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3262
3263 // Complex: to v2f32
3264 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3265 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3266 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3},
3267 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3},
3268
3269 // SVE: to nxv2f32
3270 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3271 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3272 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3273 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3274 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3275 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3276 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3277 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3278 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3279 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3280
3281 // Complex: to v4f32
3282 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4},
3283 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3284 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3},
3285 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2},
3286
3287 // SVE: to nxv4f32
3288 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3289 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3290 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3291 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3292 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3293 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3294 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3295 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3296
3297 // Complex: to v8f32
3298 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3299 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3300 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10},
3301 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4},
3302
3303 // SVE: to nxv8f32
3304 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3305 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3306 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3307 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3308 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3309 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3310 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3311 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3312
3313 // SVE: to nxv16f32
3314 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3315 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3316 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3317 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3318
3319 // Complex: to v16f32
3320 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3321 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21},
3322
3323 // Complex: to v2f64
3324 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3325 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3326 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3327 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4},
3328 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4},
3329 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2},
3330
3331 // SVE: to nxv2f64
3332 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3333 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3334 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3335 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3336 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3337 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3338 .Cost: SVE_EXT_COST + SVE_FCVT_COST},
3339 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3340 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3341 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3342
3343 // Complex: to v4f64
3344 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3345 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4},
3346
3347 // SVE: to nxv4f64
3348 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3349 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3350 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3351 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3352 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3353 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3354 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3355 .Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3356 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3357 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3358 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3359 .Cost: SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3360
3361 // SVE: to nxv8f64
3362 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3363 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3364 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3365 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3366 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3367 .Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3368 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3369 .Cost: SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3370
3371 // LowerVectorFP_TO_INT
3372 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3373 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3374 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3375 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1},
3376 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1},
3377 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1},
3378
3379 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3380 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3381 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3382 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3383 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2},
3384 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1},
3385 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1},
3386
3387 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3388 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3389 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3390 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2},
3391 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2},
3392
3393 // Complex, from nxv2f32.
3394 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3395 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3396 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3397 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3398 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1},
3399 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1},
3400 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1},
3401 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1},
3402
3403 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3404 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3405 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3406 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3407 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2},
3408 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2},
3409 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2},
3410
3411 // Complex, from nxv2f64.
3412 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3413 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3414 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3415 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3416 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3417 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1},
3418 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1},
3419 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1},
3420 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1},
3421 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: 1},
3422
3423 // Complex, from nxv4f32.
3424 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3425 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3426 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3427 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3428 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3429 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4},
3430 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1},
3431 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1},
3432 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1},
3433 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: 1},
3434
3435 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3436 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3437 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3438 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7},
3439 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7},
3440
3441 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3442 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3443 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3444 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3445 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3},
3446 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3},
3447 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3},
3448
3449 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3450 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3451 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3452 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3},
3453 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3},
3454
3455 // Complex, from nxv8f16.
3456 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3457 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3458 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3459 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3460 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3461 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10},
3462 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4},
3463 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1},
3464 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1},
3465 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: 1},
3466
3467 // Complex, from nxv4f16.
3468 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3469 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3470 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3471 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3472 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4},
3473 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1},
3474 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1},
3475 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1},
3476
3477 // Complex, from nxv2f16.
3478 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3479 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3480 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3481 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3482 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1},
3483 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1},
3484 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1},
3485 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1},
3486
3487 // Truncate from nxvmf32 to nxvmf16.
3488 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1},
3489 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1},
3490 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3},
3491
3492 // Truncate from nxvmf64 to nxvmf16.
3493 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1},
3494 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3},
3495 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7},
3496
3497 // Truncate from nxvmf64 to nxvmf32.
3498 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1},
3499 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3},
3500 {.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6},
3501
3502 // Extend from nxvmf16 to nxvmf32.
3503 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1},
3504 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1},
3505 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2},
3506
3507 // Extend from nxvmf16 to nxvmf64.
3508 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1},
3509 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2},
3510 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4},
3511
3512 // Extend from nxvmf32 to nxvmf64.
3513 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1},
3514 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2},
3515 {.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6},
3516
3517 // Bitcasts from integer to float
3518 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0},
3519 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0},
3520 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0},
3521
3522 // Bitcasts from float to integer
3523 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0},
3524 {.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0},
3525 {.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0},
3526
3527 // Add cost for extending to illegal (too wide) scalable vectors.
3528 // zero/sign extend are implemented by multiple unpack operations,
3529 // where each operation has a cost of 1.
3530 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3531 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3532 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3533 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3534 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3535 {.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3536
3537 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
3538 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
3539 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
3540 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
3541 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
3542 {.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
3543 };
3544
3545 // We have to estimate the cost of a fixed-length operation performed on
3546 // SVE registers by scaling the cost of the equivalent scalable operation
3547 // with the number of SVE registers required to hold the fixed-width type.
3548 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
3549 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3550 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3551 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
3552 std::pair<InstructionCost, MVT> LT =
3553 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
3554 unsigned NumElements =
3555 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3556 return AdjustCost(
3557 LT.first *
3558 getCastInstrCost(
3559 Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
3560 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
3561 CostKind, I));
3562 }
3563
3564 if (const auto *Entry = ConvertCostTableLookup(
3565 Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3566 return AdjustCost(Entry->Cost);
3567
3568 static const TypeConversionCostTblEntry FP16Tbl[] = {
3569 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3570 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1},
3571 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
3572 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1},
3573 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs
3574 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2},
3575 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn
3576 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2},
3577 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs
3578 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1},
3579 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs
3580 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4},
3581 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn
3582 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3},
3583 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs
3584 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2},
3585 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs
3586 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8},
3587 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf
3588 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf
3589 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf
3590 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf
3591 };
3592
3593 if (ST->hasFullFP16())
3594 if (const auto *Entry = ConvertCostTableLookup(
3595 Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3596 return AdjustCost(Entry->Cost);
3597
3598 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3599 // double-rounding issues.
3600 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
3601 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
3602 isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src))
3603 return AdjustCost(
3604 cast<FixedVectorType>(Val: Dst)->getNumElements() *
3605 getCastInstrCost(Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(),
3606 CCH, CostKind) +
3607 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false, Extract: true,
3608 CostKind) +
3609 BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true, Extract: false,
3610 CostKind));
3611
3612 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3613 CCH == TTI::CastContextHint::Masked &&
3614 ST->isSVEorStreamingSVEAvailable() &&
3615 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
3616 TargetLowering::TypePromoteInteger &&
3617 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
3618 TargetLowering::TypeSplitVector) {
3619 // The standard behaviour in the backend for these cases is to split the
3620 // extend up into two parts:
3621 // 1. Perform an extending load or masked load up to the legal type.
3622 // 2. Extend the loaded data to the final type.
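    // For example, combining the two steps above (a sketch, not the exact
    // legalization): a masked load of nxv8i8 zero-extended to nxv8i64 would
    // be costed as an extending masked load up to the promoted legal type
    // (e.g. nxv8i16) plus a separate nxv8i16 -> nxv8i64 extend.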
3623 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
3624 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
3625 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3626 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
3627 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3628 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
3629 return Part1 + Part2;
3630 }
3631
3632 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3633 // but we want to include the TTI::CastContextHint::Masked case as well.
3634 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
3635 CCH == TTI::CastContextHint::Masked &&
3636 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
3637 CCH = TTI::CastContextHint::Normal;
3638
3639 return AdjustCost(
3640 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
3641}
3642
3643InstructionCost
3644AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
3645 VectorType *VecTy, unsigned Index,
3646 TTI::TargetCostKind CostKind) const {
3647
3648 // Make sure we were given a valid extend opcode.
3649 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
3650 "Invalid opcode");
3651
3652 // We are extending an element we extract from a vector, so the source type
3653 // of the extend is the element type of the vector.
3654 auto *Src = VecTy->getElementType();
3655
3656 // Sign- and zero-extends are for integer types only.
3657 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
3658
3659 // Get the cost for the extract. We compute the cost (if any) for the extend
3660 // below.
3661 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
3662 CostKind, Index, Op0: nullptr, Op1: nullptr);
3663
3664 // Legalize the types.
3665 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
3666 auto DstVT = TLI->getValueType(DL, Ty: Dst);
3667 auto SrcVT = TLI->getValueType(DL, Ty: Src);
3668
3669 // If the resulting type is still a vector and the destination type is legal,
3670 // we may get the extension for free. If not, get the default cost for the
3671 // extend.
3672 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
3673 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3674 CostKind);
3675
3676 // The destination type should be larger than the element type. If not, get
3677 // the default cost for the extend.
3678 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
3679 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3680 CostKind);
3681
3682 switch (Opcode) {
3683 default:
3684 llvm_unreachable("Opcode should be either SExt or ZExt");
3685
3686 // For sign-extends, we only need a smov, which performs the extension
3687 // automatically.
3688 case Instruction::SExt:
3689 return Cost;
3690
3691 // For zero-extends, the extend is performed automatically by a umov unless
3692 // the destination type is i64 and the element type is i8 or i16.
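  // (Sketch: "umov w0, v0.h[1]" already yields a zero-extended 32-bit
  // result, so only the i8/i16 -> i64 case is charged an extra extend here.)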
3693 case Instruction::ZExt:
3694 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
3695 return Cost;
3696 }
3697
3698 // If we are unable to perform the extend for free, get the default cost.
3699 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
3700 CostKind);
3701}
3702
3703InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
3704 TTI::TargetCostKind CostKind,
3705 const Instruction *I) const {
3706 if (CostKind != TTI::TCK_RecipThroughput)
3707 return Opcode == Instruction::PHI ? 0 : 1;
3708 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
3709 // Branches are assumed to be predicted.
3710 return 0;
3711}
3712
3713InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
3714 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3715 bool HasRealUse, const Instruction *I, Value *Scalar,
3716 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3717 assert(Val->isVectorTy() && "This must be a vector type");
3718
3719 if (Index != -1U) {
3720 // Legalize the type.
3721 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
3722
3723 // This type is legalized to a scalar type.
3724 if (!LT.second.isVector())
3725 return 0;
3726
3727 // The type may be split. For fixed-width vectors we can normalize the
3728 // index to the new type.
3729 if (LT.second.isFixedLengthVector()) {
3730 unsigned Width = LT.second.getVectorNumElements();
3731 Index = Index % Width;
3732 }
3733
3734 // The element at index zero is already inside the vector.
3735 // - For a physical (HasRealUse==true) insert-element or extract-element
3736 // instruction that extracts integers, an explicit FPR -> GPR move is
3737 // needed. So it has non-zero cost.
3738 // - For the remaining cases (a virtual instruction or a float element type),
3739 // consider the instruction free.
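    // For example (sketch): extracting lane 0 of a <4 x i32> that is really
    // used still needs an "fmov w0, s0" style FPR -> GPR move, whereas lane 0
    // of a float vector is already in the right register class.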
3740 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3741 return 0;
3742
3743 // This recognises an LD1 "single structure to one lane" instruction.
3744 // I.e., if this is an `insertelement` instruction and its second
3745 // operand is a load, then we will generate an LD1, which is an
3746 // expensive instruction.
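    // e.g. (sketch) "%v2 = insertelement <4 x i32> %v, i32 %loaded, i64 1"
    // typically becomes "ld1 { v0.s }[1], [x0]".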
3747 if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1)))
3748 return CostKind == TTI::TCK_CodeSize
3749 ? 0
3750 : ST->getVectorInsertExtractBaseCost() + 1;
3751
3752 // i1 inserts and extracts will include an extra cset or cmp of the vector
3753 // value. Increase the cost by 1 to account for this.
3754 if (Val->getScalarSizeInBits() == 1)
3755 return CostKind == TTI::TCK_CodeSize
3756 ? 2
3757 : ST->getVectorInsertExtractBaseCost() + 1;
3758
3759 // FIXME:
3760 // If the extract-element and insert-element instructions could be
3761 // simplified away (e.g., could be combined into users by looking at use-def
3762 // context), they have no cost. This is not done in the first place for
3763 // compile-time considerations.
3764 }
3765
3766 // In the case of Neon, if there exists an extractelement from a lane != 0
3767 // such that
3768 // 1. the extractelement does not necessitate a move from vector_reg -> GPR,
3769 // 2. the extractelement result feeds into an fmul, and
3770 // 3. the other operand of the fmul is an extractelement from lane 0 or a
3771 // lane equivalent to 0,
3772 // then the extractelement can be merged with the fmul and incurs no cost.
3773 // e.g.
3774 // define double @foo(<2 x double> %a) {
3775 // %1 = extractelement <2 x double> %a, i32 0
3776 // %2 = extractelement <2 x double> %a, i32 1
3777 // %res = fmul double %1, %2
3778 // ret double %res
3779 // }
3780 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
3781 auto ExtractCanFuseWithFmul = [&]() {
3782 // We bail out if the extract is from lane 0.
3783 if (Index == 0)
3784 return false;
3785
3786 // Check if the scalar element type of the vector operand of ExtractElement
3787 // instruction is one of the allowed types.
3788 auto IsAllowedScalarTy = [&](const Type *T) {
3789 return T->isFloatTy() || T->isDoubleTy() ||
3790 (T->isHalfTy() && ST->hasFullFP16());
3791 };
3792
3793 // Check if the extractelement user is scalar fmul.
3794 auto IsUserFMulScalarTy = [](const Value *EEUser) {
3795 // Check if the user is scalar fmul.
3796 const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser);
3797 return BO && BO->getOpcode() == BinaryOperator::FMul &&
3798 !BO->getType()->isVectorTy();
3799 };
3800
3801 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
3802 // certain scalar type and a certain vector register width.
3803 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
3804 auto RegWidth =
3805 getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
3806 .getFixedValue();
3807 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
3808 };
3809
3810 // Check that the type constraints on the input vector type and the result
3811 // scalar type of the extractelement instruction are satisfied.
3812 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
3813 return false;
3814
3815 if (Scalar) {
3816 DenseMap<User *, unsigned> UserToExtractIdx;
3817 for (auto *U : Scalar->users()) {
3818 if (!IsUserFMulScalarTy(U))
3819 return false;
3820 // Creating an entry for the user is what matters here; the index value is
3821 // filled in by the loop below.
3822 UserToExtractIdx[U];
3823 }
3824 if (UserToExtractIdx.empty())
3825 return false;
3826 for (auto &[S, U, L] : ScalarUserAndIdx) {
3827 for (auto *U : S->users()) {
3828 if (UserToExtractIdx.contains(Val: U)) {
3829 auto *FMul = cast<BinaryOperator>(Val: U);
3830 auto *Op0 = FMul->getOperand(i_nocapture: 0);
3831 auto *Op1 = FMul->getOperand(i_nocapture: 1);
3832 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
3833 UserToExtractIdx[U] = L;
3834 break;
3835 }
3836 }
3837 }
3838 }
3839 for (auto &[U, L] : UserToExtractIdx) {
3840 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
3841 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
3842 return false;
3843 }
3844 } else {
3845 const auto *EE = cast<ExtractElementInst>(Val: I);
3846
3847 const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand());
3848 if (!IdxOp)
3849 return false;
3850
3851 return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) {
3852 if (!IsUserFMulScalarTy(U))
3853 return false;
3854
3855 // Check if the other operand of extractelement is also extractelement
3856 // from lane equivalent to 0.
3857 const auto *BO = cast<BinaryOperator>(Val: U);
3858 const auto *OtherEE = dyn_cast<ExtractElementInst>(
3859 Val: BO->getOperand(i_nocapture: 0) == EE ? BO->getOperand(i_nocapture: 1) : BO->getOperand(i_nocapture: 0));
3860 if (OtherEE) {
3861 const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand());
3862 if (!IdxOp)
3863 return false;
3864 return IsExtractLaneEquivalentToZero(
3865 cast<ConstantInt>(Val: OtherEE->getIndexOperand())
3866 ->getValue()
3867 .getZExtValue(),
3868 OtherEE->getType()->getScalarSizeInBits());
3869 }
3870 return true;
3871 });
3872 }
3873 return true;
3874 };
3875
3876 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
3877 ExtractCanFuseWithFmul())
3878 return 0;
3879
3880 // All other insert/extracts cost this much.
3881 return CostKind == TTI::TCK_CodeSize ? 1
3882 : ST->getVectorInsertExtractBaseCost();
3883}
3884
3885InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3886 TTI::TargetCostKind CostKind,
3887 unsigned Index,
3888 const Value *Op0,
3889 const Value *Op1) const {
3890 bool HasRealUse =
3891 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0);
3892 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
3893}
3894
3895InstructionCost AArch64TTIImpl::getVectorInstrCost(
3896 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3897 Value *Scalar,
3898 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3899 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse: false, I: nullptr,
3900 Scalar, ScalarUserAndIdx);
3901}
3902
3903InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3904 Type *Val,
3905 TTI::TargetCostKind CostKind,
3906 unsigned Index) const {
3907 return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index,
3908 HasRealUse: true /* HasRealUse */, I: &I);
3909}
3910
3911InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3912 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3913 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
3914 ArrayRef<Value *> VL) const {
3915 if (isa<ScalableVectorType>(Val: Ty))
3916 return InstructionCost::getInvalid();
3917 if (Ty->getElementType()->isFloatingPointTy())
3918 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
3919 CostKind);
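// Illustrative example of the formula below: inserting all four lanes of a
// <4 x i32> (Insert == true, Extract == false) costs
// 4 * VecInstCost, i.e. 4 * getVectorInsertExtractBaseCost() by default.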
3920 unsigned VecInstCost =
3921 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
3922 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
3923}
3924
3925InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3926 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3927 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3928 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
3929
3930 // The code-generator is currently not able to handle scalable vectors
3931 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3932 // it. This change will be removed when code-generation for these types is
3933 // sufficiently reliable.
3934 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
3935 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
3936 return InstructionCost::getInvalid();
3937
3938 // TODO: Handle more cost kinds.
3939 if (CostKind != TTI::TCK_RecipThroughput)
3940 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3941 Opd2Info: Op2Info, Args, CxtI);
3942
3943 // Legalize the type.
3944 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3945 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3946
3947 switch (ISD) {
3948 default:
3949 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3950 Opd2Info: Op2Info);
3951 case ISD::SREM:
3952 case ISD::SDIV:
3953 /*
3954 Notes for sdiv/srem specific costs:
3955 1. This only considers the cases where the divisor is constant, uniform and
3956 (pow-of-2/non-pow-of-2). Other cases are not important since they either
3957 result in some form of (ldr + adrp), corresponding to constant vectors, or
3958 scalarization of the division operation.
3959 2. Constant divisors that are wholly or partially negative don't result in
3960 significantly different codegen compared to positive constant divisors,
3961 so we don't consider negative divisors separately.
3962 3. If the codegen is significantly different with SVE, it has been indicated
3963 using comments at appropriate places.
3964
3965 sdiv specific cases:
3966 -----------------------------------------------------------------------
3967 codegen | pow-of-2 | Type
3968 -----------------------------------------------------------------------
3969 add + cmp + csel + asr | Y | i64
3970 add + cmp + csel + asr | Y | i32
3971 -----------------------------------------------------------------------
3972
3973 srem specific cases:
3974 -----------------------------------------------------------------------
3975 codegen | pow-of-2 | Type
3976 -----------------------------------------------------------------------
3977 negs + and + and + csneg | Y | i64
3978 negs + and + and + csneg | Y | i32
3979 -----------------------------------------------------------------------
3980
3981 other sdiv/srem cases:
3982 -------------------------------------------------------------------------
3983 common codegen | + srem | + sdiv | pow-of-2 | Type
3984 -------------------------------------------------------------------------
3985 smulh + asr + add + add | - | - | N | i64
3986 smull + lsr + add + add | - | - | N | i32
3987 usra | and + sub | sshr | Y | <2 x i64>
3988 2 * (scalar code) | - | - | N | <2 x i64>
3989 usra | bic + sub | sshr + neg | Y | <4 x i32>
3990 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3991 + sshr + usra | | | |
3992 -------------------------------------------------------------------------
3993 */
3994 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3995 InstructionCost AddCost =
3996 getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
3997 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3998 InstructionCost AsrCost =
3999 getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
4000 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4001 InstructionCost MulCost =
4002 getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
4003 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4004 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4005 // have similar cost.
4006 auto VT = TLI->getValueType(DL, Ty);
4007 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4008 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4009 // Neg can be folded into the asr instruction.
4010 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4011 : (3 * AsrCost + AddCost);
4012 } else {
4013 return MulCost + AsrCost + 2 * AddCost;
4014 }
4015 } else if (VT.isVector()) {
4016 InstructionCost UsraCost = 2 * AsrCost;
4017 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4018 // Division with scalable types corresponds to native 'asrd'
4019 // instruction when SVE is available.
4020 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4021
4022 // One more for the negation in SDIV
4023 InstructionCost Cost =
4024 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4025 if (Ty->isScalableTy() && ST->hasSVE())
4026 Cost += 2 * AsrCost;
4027 else {
4028 Cost +=
4029 UsraCost +
4030 (ISD == ISD::SDIV
4031 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4032 : 2 * AddCost);
4033 }
4034 return Cost;
4035 } else if (LT.second == MVT::v2i64) {
4036 return VT.getVectorNumElements() *
4037 getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind,
4038 Op1Info: Op1Info.getNoProps(),
4039 Op2Info: Op2Info.getNoProps());
4040 } else {
4041 // When SVE is available, we get:
4042 // smulh + lsr + add/sub + asr + add/sub.
4043 if (Ty->isScalableTy() && ST->hasSVE())
4044 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4045 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4046 }
4047 }
4048 }
4049 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4050 LT.second.isFixedLengthVector()) {
4051 // FIXME: When the constant vector is non-uniform, this may result in
4052 // loading the vector from constant pool or in some cases, may also result
4053 // in scalarization. For now, we are approximating this with the
4054 // scalarization cost.
4055 auto ExtractCost = 2 * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
4056 CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
4057 auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty,
4058 CostKind, Index: -1, Op0: nullptr, Op1: nullptr);
4059 unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
4060 return ExtractCost + InsertCost +
4061 NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(),
4062 CostKind, Op1Info: Op1Info.getNoProps(),
4063 Op2Info: Op2Info.getNoProps());
4064 }
4065 [[fallthrough]];
4066 case ISD::UDIV:
4067 case ISD::UREM: {
4068 auto VT = TLI->getValueType(DL, Ty);
4069 if (Op2Info.isConstant()) {
4070 // If the operand is a power of 2 we can use the shift or and cost.
4071 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4072 return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind,
4073 Op1Info: Op1Info.getNoProps(),
4074 Op2Info: Op2Info.getNoProps());
4075 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4076 return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind,
4077 Op1Info: Op1Info.getNoProps(),
4078 Op2Info: Op2Info.getNoProps());
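// Illustrative example (assumed codegen): `udiv i32 %x, 16` lowers to a
// single `lsr w0, w0, #4`, and `urem i32 %x, 16` to `and w0, w0, #0xf`,
// which is why only the shift/and cost is returned above.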
4079
4080 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4081 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4082 // The MULHU will be expanded to UMULL for the types not listed below,
4083 // and will become a UMULL + UMULL2 pair for 128-bit vectors.
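// Illustrative example: `udiv i32 %x, 7` is expanded by the backend into a
// multiply-high by a precomputed "magic" constant followed by shift/add
// fix-ups; the MulCost/AddCost/ShrCost sum below approximates that sequence.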
4084 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4085 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4086 LT.second == MVT::nxv16i8;
4087 bool Is128bit = LT.second.is128BitVector();
4088
4089 InstructionCost MulCost =
4090 getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
4091 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4092 InstructionCost AddCost =
4093 getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
4094 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4095 InstructionCost ShrCost =
4096 getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
4097 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4098 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4099 (HasMULH ? 0 : ShrCost) + // UMULL shift
4100 AddCost * 2 + ShrCost;
4101 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4102 }
4103 }
4104
4105 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4106 // emitted by the backend even when those functions are not declared in the
4107 // module.
4108 if (!VT.isVector() && VT.getSizeInBits() > 64)
4109 return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
4110
4111 InstructionCost Cost = BaseT::getArithmeticInstrCost(
4112 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
4113 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4114 if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
4115 // SDIV/UDIV operations are lowered using SVE, so the cost is
4116 // lower.
4117 if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty)
4118 ->getPrimitiveSizeInBits()
4119 .getFixedValue() < 128) {
4120 EVT VT = TLI->getValueType(DL, Ty);
4121 static const CostTblEntry DivTbl[]{
4122 {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8},
4123 {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5},
4124 {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1},
4125 {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8},
4126 {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5},
4127 {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}};
4128
4129 const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
4130 if (nullptr != Entry)
4131 return Entry->Cost;
4132 }
4133 // For 8/16-bit elements, the cost is higher because the type
4134 // requires promotion and possibly splitting:
4135 if (LT.second.getScalarType() == MVT::i8)
4136 Cost *= 8;
4137 else if (LT.second.getScalarType() == MVT::i16)
4138 Cost *= 4;
4139 return Cost;
4140 } else {
4141 // If one of the operands is a uniform constant, the cost for each
4142 // element is the cost of insertion, extraction and division:
4143 // insertion cost = 2, extraction cost = 2, and division = the cost of
4144 // the operation on the scalar type.
4145 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4146 (Op2Info.isConstant() && Op2Info.isUniform())) {
4147 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
4148 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4149 Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
4150 return (4 + DivCost) * VTy->getNumElements();
4151 }
4152 }
4153 // On AArch64, without SVE, vector divisions are expanded into a
4154 // scalar division for each pair of elements.
4155 Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind,
4156 Index: -1, Op0: nullptr, Op1: nullptr);
4157 Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
4158 Op0: nullptr, Op1: nullptr);
4159 }
4160
4161 // TODO: if one of the arguments is scalar, then it's not necessary to
4162 // double the cost of handling the vector elements.
4163 Cost += Cost;
4164 }
4165 return Cost;
4166 }
4167 case ISD::MUL:
4168 // When SVE is available, we can lower the v2i64 operation using the
4169 // SVE mul instruction, which has a lower cost.
4170 if (LT.second == MVT::v2i64 && ST->hasSVE())
4171 return LT.first;
4172
4173 // When SVE is not available, there is no MUL.2d instruction,
4174 // which means mul <2 x i64> is expensive as elements are extracted
4175 // from the vectors and the muls scalarized.
4176 // As getScalarizationOverhead is a bit too pessimistic, we
4177 // estimate the cost for an i64 vector directly here, which is:
4178 // - four 2-cost i64 extracts,
4179 // - two 2-cost i64 inserts, and
4180 // - two 1-cost muls.
4181 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4182 // LT.first = 2 the cost is 28. If both operands are extensions it will not
4183 // need to scalarize, so the cost can be cheaper (smull or umull).
4185 if (LT.second != MVT::v2i64 || isWideningInstruction(DstTy: Ty, Opcode, Args))
4186 return LT.first;
4187 return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() *
4188 (getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) +
4189 getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -1,
4190 Op0: nullptr, Op1: nullptr) *
4191 2 +
4192 getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -1,
4193 Op0: nullptr, Op1: nullptr));
4194 case ISD::ADD:
4195 case ISD::XOR:
4196 case ISD::OR:
4197 case ISD::AND:
4198 case ISD::SRL:
4199 case ISD::SRA:
4200 case ISD::SHL:
4201 // These nodes are marked as 'custom' for combining purposes only.
4202 // We know that they are legal. See LowerAdd in ISelLowering.
4203 return LT.first;
4204
4205 case ISD::FNEG:
4206 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
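// Illustrative example (assumed fold): for
//   %m = fmul double %a, %b
//   %n = fneg double %m
// the backend emits a single `fnmul d0, d0, d1`, so the fneg is free here.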
4207 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4208 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4209 CxtI &&
4210 ((CxtI->hasOneUse() &&
4211 match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) ||
4212 match(V: CxtI->getOperand(i: 0), P: m_FMul(L: m_Value(), R: m_Value()))))
4213 return 0;
4214 [[fallthrough]];
4215 case ISD::FADD:
4216 case ISD::FSUB:
4217 // Increase the cost for half and bfloat types if not architecturally
4218 // supported.
4219 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
4220 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
4221 return 2 * LT.first;
4222 if (!Ty->getScalarType()->isFP128Ty())
4223 return LT.first;
4224 [[fallthrough]];
4225 case ISD::FMUL:
4226 case ISD::FDIV:
4227 // These nodes are marked as 'custom' just to lower them to SVE.
4228 // We know said lowering will incur no additional cost.
4229 if (!Ty->getScalarType()->isFP128Ty())
4230 return 2 * LT.first;
4231
4232 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4233 Opd2Info: Op2Info);
4234 case ISD::FREM:
4235 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4236 // those functions are not declared in the module.
4237 if (!Ty->isVectorTy())
4238 return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
4239 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4240 Opd2Info: Op2Info);
4241 }
4242}
4243
4244InstructionCost
4245AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
4246 const SCEV *Ptr) const {
4247 // Address computations in vectorized code with non-consecutive addresses will
4248 // likely result in more instructions compared to scalar code where the
4249 // computation can more often be merged into the index mode. The resulting
4250 // extra micro-ops can significantly decrease throughput.
4251 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4252 int MaxMergeDistance = 64;
4253
4254 if (Ty->isVectorTy() && SE &&
4255 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
4256 return NumVectorInstToHideOverhead;
4257
4258 // In many cases the address computation is not merged into the instruction
4259 // addressing mode.
4260 return 1;
4261}
4262
4263InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4264 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4265 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4266 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
4267 // TODO: Handle other cost kinds.
4268 if (CostKind != TTI::TCK_RecipThroughput)
4269 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4270 Op1Info, Op2Info, I);
4271
4272 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4273 // We don't lower some vector selects well that are wider than the register
4274 // width.
4275 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) {
4276 // We would need this many instructions to hide the scalarization happening.
4277 const int AmortizationCost = 20;
4278
4279 // If VecPred is not set, check if we can get a predicate from the context
4280 // instruction, if its type matches the requested ValTy.
4281 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4282 CmpPredicate CurrentPred;
4283 if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
4284 R: m_Value())))
4285 VecPred = CurrentPred;
4286 }
4287 // Check if we have a compare/select chain that can be lowered using
4288 // a (F)CMxx & BFI pair.
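// Illustrative example (assumed lowering): a vector `fcmp` feeding a vector
// `select` on <4 x float> can lower to one FCMxx mask plus one bitwise select
// (BSL/BIF), which is why legal types below simply return LT.first.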
4289 if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
4290 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4291 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4292 VecPred == CmpInst::FCMP_UNE) {
4293 static const auto ValidMinMaxTys = {
4294 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4295 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4296 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4297
4298 auto LT = getTypeLegalizationCost(Ty: ValTy);
4299 if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }) ||
4300 (ST->hasFullFP16() &&
4301 any_of(Range: ValidFP16MinMaxTys, P: [&LT](MVT M) { return M == LT.second; })))
4302 return LT.first;
4303 }
4304
4305 static const TypeConversionCostTblEntry
4306 VectorSelectTbl[] = {
4307 { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2 },
4308 { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2 },
4309 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2 },
4310 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2 },
4311 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2 },
4312 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16 },
4313 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8 },
4314 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16 },
4315 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost },
4316 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost },
4317 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost }
4318 };
4319
4320 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
4321 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
4322 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4323 if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD,
4324 Dst: SelCondTy.getSimpleVT(),
4325 Src: SelValTy.getSimpleVT()))
4326 return Entry->Cost;
4327 }
4328 }
4329
4330 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) {
4331 Type *ValScalarTy = ValTy->getScalarType();
4332 if ((ValScalarTy->isHalfTy() && !ST->hasFullFP16()) ||
4333 ValScalarTy->isBFloatTy()) {
4334 auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
4335
4336 // Without dedicated instructions we promote [b]f16 compares to f32.
4337 auto *PromotedTy =
4338 VectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), Other: ValVTy);
4339
4340 InstructionCost Cost = 0;
4341 // Promote operands to float vectors.
4342 Cost += 2 * getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: ValTy,
4343 CCH: TTI::CastContextHint::None, CostKind);
4344 // Compare float vectors.
4345 Cost += getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred, CostKind,
4346 Op1Info, Op2Info);
4347 // During codegen we'll truncate the vector result from i32 to i16.
4348 Cost +=
4349 getCastInstrCost(Opcode: Instruction::Trunc, Dst: VectorType::getInteger(VTy: ValVTy),
4350 Src: VectorType::getInteger(VTy: PromotedTy),
4351 CCH: TTI::CastContextHint::None, CostKind);
4352 return Cost;
4353 }
4354 }
4355
4356 // Treat the icmp in icmp(and, 0), or in icmp(and, -1/1) when it can be
4357 // folded to icmp(and, 0), as free, since we can make use of ands, but only
4358 // if the comparison is not unsigned.
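// Illustrative example (assumed fold): for `icmp eq i32 (and i32 %a, %b), 0`
// the `and` and the compare combine into a single flag-setting `tst w0, w1`,
// so the compare itself is modeled as free.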
4359 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
4360 !CmpInst::isUnsigned(predicate: VecPred) &&
4361 TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
4362 match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) {
4363 if (match(V: I->getOperand(i: 1), P: m_Zero()))
4364 return 0;
4365
4366 // x >= 1 / x < 1 -> x > 0 / x <= 0
4367 if (match(V: I->getOperand(i: 1), P: m_One()) &&
4368 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4369 return 0;
4370
4371 // x <= -1 / x > -1 -> x > 0 / x <= 0
4372 if (match(V: I->getOperand(i: 1), P: m_AllOnes()) &&
4373 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4374 return 0;
4375 }
4376
4377 // The base case handles scalable vectors fine for now, since it treats the
4378 // cost as 1 * legalization cost.
4379 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4380 Op1Info, Op2Info, I);
4381}
4382
4383AArch64TTIImpl::TTI::MemCmpExpansionOptions
4384AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4385 TTI::MemCmpExpansionOptions Options;
4386 if (ST->requiresStrictAlign()) {
4387 // TODO: Add cost modeling for strict align. Misaligned loads expand to
4388 // a bunch of instructions when strict align is enabled.
4389 return Options;
4390 }
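// Illustrative example: with overlapping loads allowed, a 15-byte memcmp can
// be expanded as two 8-byte loads per buffer (at offsets 0 and 7) instead of
// an 8 + 4 + 2 + 1 byte load sequence.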
4391 Options.AllowOverlappingLoads = true;
4392 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4393 Options.NumLoadsPerBlock = Options.MaxNumLoads;
4394 // TODO: Though vector loads usually perform well on AArch64, in some targets
4395 // they may wake up the FP unit, which raises the power consumption. Perhaps
4396 // they could be used with no holds barred (-O3).
4397 Options.LoadSizes = {8, 4, 2, 1};
4398 Options.AllowedTailExpansions = {3, 5, 6};
4399 return Options;
4400}
4401
4402bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4403 return ST->hasSVE();
4404}
4405
4406InstructionCost
4407AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
4408 Align Alignment, unsigned AddressSpace,
4409 TTI::TargetCostKind CostKind) const {
4410 if (useNeonVector(Ty: Src))
4411 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
4412 CostKind);
4413 auto LT = getTypeLegalizationCost(Ty: Src);
4414 if (!LT.first.isValid())
4415 return InstructionCost::getInvalid();
4416
4417 // Return an invalid cost for element types that we are unable to lower.
4418 auto *VT = cast<VectorType>(Val: Src);
4419 if (VT->getElementType()->isIntegerTy(Bitwidth: 1))
4420 return InstructionCost::getInvalid();
4421
4422 // The code-generator is currently not able to handle scalable vectors
4423 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4424 // it. This change will be removed when code-generation for these types is
4425 // sufficiently reliable.
4426 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
4427 return InstructionCost::getInvalid();
4428
4429 return LT.first;
4430}
4431
4432 // This function returns the gather/scatter overhead, either from the
4433 // user-provided value or from per-target specialized values in \p ST.
4434static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4435 const AArch64Subtarget *ST) {
4436 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
4437 "Should be called on only load or stores.");
4438 switch (Opcode) {
4439 case Instruction::Load:
4440 if (SVEGatherOverhead.getNumOccurrences() > 0)
4441 return SVEGatherOverhead;
4442 return ST->getGatherOverhead();
4443 break;
4444 case Instruction::Store:
4445 if (SVEScatterOverhead.getNumOccurrences() > 0)
4446 return SVEScatterOverhead;
4447 return ST->getScatterOverhead();
4448 break;
4449 default:
4450 llvm_unreachable("Shouldn't have reached here");
4451 }
4452}
4453
4454InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
4455 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
4456 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
4457 if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
4458 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
4459 Alignment, CostKind, I);
4460 auto *VT = cast<VectorType>(Val: DataTy);
4461 auto LT = getTypeLegalizationCost(Ty: DataTy);
4462 if (!LT.first.isValid())
4463 return InstructionCost::getInvalid();
4464
4465 // Return an invalid cost for element types that we are unable to lower.
4466 if (!LT.second.isVector() ||
4467 !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) ||
4468 VT->getElementType()->isIntegerTy(Bitwidth: 1))
4469 return InstructionCost::getInvalid();
4470
4471 // The code-generator is currently not able to handle scalable vectors
4472 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4473 // it. This change will be removed when code-generation for these types is
4474 // sufficiently reliable.
4475 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
4476 return InstructionCost::getInvalid();
4477
4478 ElementCount LegalVF = LT.second.getVectorElementCount();
4479 InstructionCost MemOpCost =
4480 getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
4481 OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
4482 // Add on an overhead cost for using gathers/scatters.
4483 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4484 return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
4485}
4486
4487bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
4488 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
4489}
4490
4491InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4492 Align Alignment,
4493 unsigned AddressSpace,
4494 TTI::TargetCostKind CostKind,
4495 TTI::OperandValueInfo OpInfo,
4496 const Instruction *I) const {
4497 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
4498 // Type legalization can't handle structs
4499 if (VT == MVT::Other)
4500 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
4501 CostKind);
4502
4503 auto LT = getTypeLegalizationCost(Ty);
4504 if (!LT.first.isValid())
4505 return InstructionCost::getInvalid();
4506
4507 // The code-generator is currently not able to handle scalable vectors
4508 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4509 // it. This change will be removed when code-generation for these types is
4510 // sufficiently reliable.
4511 // We also only support full register predicate loads and stores.
4512 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
4513 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) ||
4514 (VTy->getElementType()->isIntegerTy(Bitwidth: 1) &&
4515 !VTy->getElementCount().isKnownMultipleOf(
4516 RHS: ElementCount::getScalable(MinVal: 16))))
4517 return InstructionCost::getInvalid();
4518
4519 // TODO: consider latency as well for TCK_SizeAndLatency.
4520 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
4521 return LT.first;
4522
4523 if (CostKind != TTI::TCK_RecipThroughput)
4524 return 1;
4525
4526 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
4527 LT.second.is128BitVector() && Alignment < Align(16)) {
4528 // Unaligned stores are extremely inefficient. We don't split all
4529 // unaligned 128-bit stores because of the negative impact that has been
4530 // shown in practice on inlined block copy code.
4531 // We make such stores expensive so that we will only vectorize if there
4532 // are 6 other instructions getting vectorized.
4533 const int AmortizationCost = 6;
4534
4535 return LT.first * 2 * AmortizationCost;
4536 }
4537
4538 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
4539 if (Ty->isPtrOrPtrVectorTy())
4540 return LT.first;
4541
4542 if (useNeonVector(Ty)) {
4543 // Check truncating stores and extending loads.
4544 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
4545 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
4546 if (VT == MVT::v4i8)
4547 return 2;
4548 // Otherwise we need to scalarize.
4549 return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
4550 }
4551 EVT EltVT = VT.getVectorElementType();
4552 unsigned EltSize = EltVT.getScalarSizeInBits();
4553 if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
4554 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
4555 return LT.first;
4556 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
4557 // widening to v4i8, which produces suboptimal results.
4558 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
4559 return LT.first;
4560
4561 // Check non-power-of-2 loads/stores for legal vector element types with
4562 // NEON. Non-power-of-2 memory ops will get broken down to a set of
4563 // operations on smaller power-of-2 ops, including ld1/st1.
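// Illustrative example of the splitting below: a v7i8 store is costed as
// v4i8 + v2i8 + v1i8 (cost 3), and a v3i16 load as v2i16 + v1i16 (cost 2).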
4564 LLVMContext &C = Ty->getContext();
4565 InstructionCost Cost(0);
4566 SmallVector<EVT> TypeWorklist;
4567 TypeWorklist.push_back(Elt: VT);
4568 while (!TypeWorklist.empty()) {
4569 EVT CurrVT = TypeWorklist.pop_back_val();
4570 unsigned CurrNumElements = CurrVT.getVectorNumElements();
4571 if (isPowerOf2_32(Value: CurrNumElements)) {
4572 Cost += 1;
4573 continue;
4574 }
4575
4576 unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
4577 TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
4578 TypeWorklist.push_back(
4579 Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
4580 }
4581 return Cost;
4582 }
4583
4584 return LT.first;
4585}
4586
4587InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
4588 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
4589 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
4590 bool UseMaskForCond, bool UseMaskForGaps) const {
4591 assert(Factor >= 2 && "Invalid interleave factor");
4592 auto *VecVTy = cast<VectorType>(Val: VecTy);
4593
4594 if (VecTy->isScalableTy() && !ST->hasSVE())
4595 return InstructionCost::getInvalid();
4596
4597 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
4598 // only have lowering for power-of-2 factors.
4599 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
4600 // InterleavedAccessPass for ld3/st3
4601 if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor))
4602 return InstructionCost::getInvalid();
4603
4604 // Vectorization for masked interleaved accesses is only enabled for scalable
4605 // VF.
4606 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
4607 return InstructionCost::getInvalid();
4608
4609 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4610 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4611 auto *SubVecTy =
4612 VectorType::get(ElementType: VecVTy->getElementType(),
4613 EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));
4614
4615 // ldN/stN only support legal vector types of size 64 or 128 in bits.
4616 // Accesses having vector types that are a multiple of 128 bits can be
4617 // matched to more than one ldN/stN instruction.
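// Illustrative example: an interleaved load of <8 x i32> with Factor == 2
// uses SubVecTy <4 x i32> (one 128-bit register), which maps to a single
// ld2, so the cost returned below is 2 * 1.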
4618 bool UseScalable;
4619 if (MinElts % Factor == 0 &&
4620 TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
4621 return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
4622 }
4623
4624 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
4625 Alignment, AddressSpace, CostKind,
4626 UseMaskForCond, UseMaskForGaps);
4627}
4628
4629InstructionCost
4630AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
4631 InstructionCost Cost = 0;
4632 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4633 for (auto *I : Tys) {
4634 if (!I->isVectorTy())
4635 continue;
4636 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
4637 128)
4638 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
4639 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
4640 }
4641 return Cost;
4642}
4643
4644unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
4645 return ST->getMaxInterleaveFactor();
4646}
4647
4648// For Falkor, we want to avoid having too many strided loads in a loop since
4649// that can exhaust the HW prefetcher resources. We adjust the unroller
4650// MaxCount preference below to attempt to ensure unrolling doesn't create too
4651// many strided loads.
4652static void
4653getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
4654 TargetTransformInfo::UnrollingPreferences &UP) {
4655 enum { MaxStridedLoads = 7 };
4656 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
4657 int StridedLoads = 0;
4658 // FIXME? We could make this more precise by looking at the CFG and
4659 // e.g. not counting loads in each side of an if-then-else diamond.
4660 for (const auto BB : L->blocks()) {
4661 for (auto &I : *BB) {
4662 LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
4663 if (!LMemI)
4664 continue;
4665
4666 Value *PtrValue = LMemI->getPointerOperand();
4667 if (L->isLoopInvariant(V: PtrValue))
4668 continue;
4669
4670 const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
4671 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
4672 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
4673 continue;
4674
4675 // FIXME? We could take pairing of unrolled load copies into account
4676 // by looking at the AddRec, but we would probably have to limit this
4677 // to loops with no stores or other memory optimization barriers.
4678 ++StridedLoads;
4679 // We've seen enough strided loads that seeing more won't make a
4680 // difference.
4681 if (StridedLoads > MaxStridedLoads / 2)
4682 return StridedLoads;
4683 }
4684 }
4685 return StridedLoads;
4686 };
4687
4688 int StridedLoads = countStridedLoads(L, SE);
4689 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
4690 << " strided loads\n");
4691 // Pick the largest power of 2 unroll count that won't result in too many
4692 // strided loads.
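// Illustrative example: with MaxStridedLoads == 7 and 2 strided loads
// detected, 7 / 2 == 3 and Log2_32(3) == 1, so MaxCount is set to 1 << 1 == 2.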
4693 if (StridedLoads) {
4694 UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
4695 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
4696 << UP.MaxCount << '\n');
4697 }
4698}
4699
4700// This function returns true if the loop:
4701// 1. Has a valid cost, and
4702// 2. Has a cost within the supplied budget.
4703// Otherwise it returns false.
4704static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI,
4705 InstructionCost Budget,
4706 unsigned *FinalSize) {
4707 // Estimate the size of the loop.
4708 InstructionCost LoopCost = 0;
4709
4710 for (auto *BB : L->getBlocks()) {
4711 for (auto &I : *BB) {
4712 SmallVector<const Value *, 4> Operands(I.operand_values());
4713 InstructionCost Cost =
4714 TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize);
4715 // This can happen with intrinsics that don't currently have a cost model
4716 // or for some operations that require SVE.
4717 if (!Cost.isValid())
4718 return false;
4719
4720 LoopCost += Cost;
4721 if (LoopCost > Budget)
4722 return false;
4723 }
4724 }
4725
4726 if (FinalSize)
4727 *FinalSize = LoopCost.getValue();
4728 return true;
4729}
4730
4731static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
4732 const AArch64TTIImpl &TTI) {
4733 // Only consider loops with unknown trip counts for which we can determine
4734 // a symbolic expression. Multi-exit loops with small known trip counts will
4735 // likely be unrolled anyway.
4736 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4737 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC))
4738 return false;
4739
4740 // It might not be worth unrolling loops with low max trip counts. Restrict
4741 // this to max trip counts > 32 for now.
4742 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
4743 if (MaxTC > 0 && MaxTC <= 32)
4744 return false;
4745
4746 // Make sure the loop size is <= 5.
4747 if (!isLoopSizeWithinBudget(L, TTI, Budget: 5, FinalSize: nullptr))
4748 return false;
4749
4750 // Small search loops with multiple exits can be highly beneficial to unroll.
4751 // We only care about loops with exactly two exiting blocks, although each
4752 // block could jump to the same exit block.
4753 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
4754 if (Blocks.size() != 2)
4755 return false;
4756
4757 if (any_of(Range&: Blocks, P: [](BasicBlock *BB) {
4758 return !isa<BranchInst>(Val: BB->getTerminator());
4759 }))
4760 return false;
4761
4762 return true;
4763}
4764
4765 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
4766 /// OOO engine's wide instruction window and various predictors.
4767static void
4768getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
4769 TargetTransformInfo::UnrollingPreferences &UP,
4770 const AArch64TTIImpl &TTI) {
4771 // Limit to loops with structure that is highly likely to benefit from runtime
4772 // unrolling; that is, we exclude outer loops and loops with many blocks (i.e.
4773 // likely with complex control flow). Note that the heuristics here may be
4774 // overly conservative: we err on the side of avoiding runtime unrolling rather
4775 // than unrolling excessively. They are all subject to further refinement.
4776 if (!L->isInnermost() || L->getNumBlocks() > 8)
4777 return;
4778
4779 // Loops with multiple exits are handled by common code.
4780 if (!L->getExitBlock())
4781 return;
4782
4783 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
4784 if (isa<SCEVConstant>(Val: BTC) || isa<SCEVCouldNotCompute>(Val: BTC) ||
4785 (SE.getSmallConstantMaxTripCount(L) > 0 &&
4786 SE.getSmallConstantMaxTripCount(L) <= 32))
4787 return;
4788
4789 if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized"))
4790 return;
4791
4792 if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
4793 return;
4794
4795 // Limit to loops with trip counts that are cheap to expand.
4796 UP.SCEVExpansionBudget = 1;
4797
4798 // Try to unroll small, single-block loops, if they have load/store
4799 // dependencies, to expose more parallel memory access streams.
4800 BasicBlock *Header = L->getHeader();
4801 if (Header == L->getLoopLatch()) {
4802 // Estimate the size of the loop.
4803 unsigned Size;
4804 if (!isLoopSizeWithinBudget(L, TTI, Budget: 8, FinalSize: &Size))
4805 return;
4806
4807 SmallPtrSet<Value *, 8> LoadedValues;
4808 SmallVector<StoreInst *> Stores;
4809 for (auto *BB : L->blocks()) {
4810 for (auto &I : *BB) {
4811 Value *Ptr = getLoadStorePointerOperand(V: &I);
4812 if (!Ptr)
4813 continue;
4814 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
4815 if (SE.isLoopInvariant(S: PtrSCEV, L))
4816 continue;
4817 if (isa<LoadInst>(Val: &I))
4818 LoadedValues.insert(Ptr: &I);
4819 else
4820 Stores.push_back(Elt: cast<StoreInst>(Val: &I));
4821 }
4822 }
4823
4824 // Try to find an unroll count that maximizes the use of the instruction
4825 // window, i.e. trying to fetch as many instructions per cycle as possible.
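// Illustrative example: with an estimated loop Size of 16, the search below
// settles on BestUC == 3, since 3 * 16 == 48 instructions still fits the
// 48-instruction budget and is a multiple of MaxInstsPerLine.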
4826 unsigned MaxInstsPerLine = 16;
4827 unsigned UC = 1;
4828 unsigned BestUC = 1;
4829 unsigned SizeWithBestUC = BestUC * Size;
4830 while (UC <= 8) {
4831 unsigned SizeWithUC = UC * Size;
4832 if (SizeWithUC > 48)
4833 break;
4834 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
4835 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
4836 BestUC = UC;
4837 SizeWithBestUC = BestUC * Size;
4838 }
4839 UC++;
4840 }
4841
4842 if (BestUC == 1 || none_of(Range&: Stores, P: [&LoadedValues](StoreInst *SI) {
4843 return LoadedValues.contains(Ptr: SI->getOperand(i_nocapture: 0));
4844 }))
4845 return;
4846
4847 UP.Runtime = true;
4848 UP.DefaultUnrollRuntimeCount = BestUC;
4849 return;
4850 }
4851
4852 // Try to runtime-unroll loops with early-continues depending on loop-varying
4853 // loads; this helps with branch-prediction for the early-continues.
4854 auto *Term = dyn_cast<BranchInst>(Val: Header->getTerminator());
4855 auto *Latch = L->getLoopLatch();
4856 SmallVector<BasicBlock *> Preds(predecessors(BB: Latch));
4857 if (!Term || !Term->isConditional() || Preds.size() == 1 ||
4858 !llvm::is_contained(Range&: Preds, Element: Header) ||
4859 none_of(Range&: Preds, P: [L](BasicBlock *Pred) { return L->contains(BB: Pred); }))
4860 return;
4861
4862 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
4863 [&](Instruction *I, unsigned Depth) -> bool {
4864 if (isa<PHINode>(Val: I) || L->isLoopInvariant(V: I) || Depth > 8)
4865 return false;
4866
4867 if (isa<LoadInst>(Val: I))
4868 return true;
4869
4870 return any_of(Range: I->operands(), P: [&](Value *V) {
4871 auto *I = dyn_cast<Instruction>(Val: V);
4872 return I && DependsOnLoopLoad(I, Depth + 1);
4873 });
4874 };
4875 CmpPredicate Pred;
4876 Instruction *I;
4877 if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(),
4878 F: m_Value())) &&
4879 DependsOnLoopLoad(I, 0)) {
4880 UP.Runtime = true;
4881 }
4882}
4883
4884void AArch64TTIImpl::getUnrollingPreferences(
4885 Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
4886 OptimizationRemarkEmitter *ORE) const {
4887 // Enable partial unrolling and runtime unrolling.
4888 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
4889
4890 UP.UpperBound = true;
4891
4892 // An inner loop is more likely to be hot, and its runtime check can be
4893 // hoisted out by the LICM pass, so the overhead is lower; try a larger
4894 // threshold to unroll more loops.
4895 if (L->getLoopDepth() > 1)
4896 UP.PartialThreshold *= 2;
4897
4898 // Disable partial & runtime unrolling on -Os.
4899 UP.PartialOptSizeThreshold = 0;
4900
4901 // Scan the loop: don't unroll loops with calls as this could prevent
4902 // inlining. Don't unroll vector loops either, as they don't benefit much from
4903 // unrolling.
4904 for (auto *BB : L->getBlocks()) {
4905 for (auto &I : *BB) {
4906 // Don't unroll vectorized loops.
4907 if (I.getType()->isVectorTy())
4908 return;
4909
4910 if (isa<CallBase>(Val: I)) {
4911 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I))
4912 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
4913 if (!isLoweredToCall(F))
4914 continue;
4915 return;
4916 }
4917 }
4918 }
4919
4920 // Apply subtarget-specific unrolling preferences.
4921 switch (ST->getProcFamily()) {
4922 case AArch64Subtarget::AppleA14:
4923 case AArch64Subtarget::AppleA15:
4924 case AArch64Subtarget::AppleA16:
4925 case AArch64Subtarget::AppleM4:
4926 getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this);
4927 break;
4928 case AArch64Subtarget::Falkor:
4929 if (EnableFalkorHWPFUnrollFix)
4930 getFalkorUnrollingPreferences(L, SE, UP);
4931 break;
4932 default:
4933 break;
4934 }
4935
4936 // If this is a small, multi-exit loop similar to something like std::find,
4937 // then there is typically a performance improvement achieved by unrolling.
4938 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) {
4939 UP.RuntimeUnrollMultiExit = true;
4940 UP.Runtime = true;
4941 // Limit unroll count.
4942 UP.DefaultUnrollRuntimeCount = 4;
4943 // Allow slightly more costly trip-count expansion to catch search loops
4944 // with pointer inductions.
4945 UP.SCEVExpansionBudget = 5;
4946 return;
4947 }
4948
4949 // Enable runtime unrolling for in-order models.
4950 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic, so
4951 // by checking for that case, we can ensure that the default behaviour is
4952 // unchanged.
4953 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
4954 !ST->getSchedModel().isOutOfOrder()) {
4955 UP.Runtime = true;
4956 UP.Partial = true;
4957 UP.UnrollRemainder = true;
4958 UP.DefaultUnrollRuntimeCount = 4;
4959
4960 UP.UnrollAndJam = true;
4961 UP.UnrollAndJamInnerLoopThreshold = 60;
4962 }
4963}
4964
4965void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
4966 TTI::PeelingPreferences &PP) const {
4967 BaseT::getPeelingPreferences(L, SE, PP);
4968}
4969
4970Value *
4971AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
4972 Type *ExpectedType) const {
4973 switch (Inst->getIntrinsicID()) {
4974 default:
4975 return nullptr;
4976 case Intrinsic::aarch64_neon_st2:
4977 case Intrinsic::aarch64_neon_st3:
4978 case Intrinsic::aarch64_neon_st4: {
4979 // The expected type needs to be a struct whose elements match the stored values.
4980 StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
4981 if (!ST)
4982 return nullptr;
4983 unsigned NumElts = Inst->arg_size() - 1;
4984 if (ST->getNumElements() != NumElts)
4985 return nullptr;
4986 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4987 if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
4988 return nullptr;
4989 }
4990 Value *Res = PoisonValue::get(T: ExpectedType);
4991 IRBuilder<> Builder(Inst);
4992 for (unsigned i = 0, e = NumElts; i != e; ++i) {
4993 Value *L = Inst->getArgOperand(i);
4994 Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
4995 }
4996 return Res;
4997 }
4998 case Intrinsic::aarch64_neon_ld2:
4999 case Intrinsic::aarch64_neon_ld3:
5000 case Intrinsic::aarch64_neon_ld4:
5001 if (Inst->getType() == ExpectedType)
5002 return Inst;
5003 return nullptr;
5004 }
5005}
5006
5007bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5008 MemIntrinsicInfo &Info) const {
5009 switch (Inst->getIntrinsicID()) {
5010 default:
5011 break;
5012 case Intrinsic::aarch64_neon_ld2:
5013 case Intrinsic::aarch64_neon_ld3:
5014 case Intrinsic::aarch64_neon_ld4:
5015 Info.ReadMem = true;
5016 Info.WriteMem = false;
5017 Info.PtrVal = Inst->getArgOperand(i: 0);
5018 break;
5019 case Intrinsic::aarch64_neon_st2:
5020 case Intrinsic::aarch64_neon_st3:
5021 case Intrinsic::aarch64_neon_st4:
5022 Info.ReadMem = false;
5023 Info.WriteMem = true;
5024 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
5025 break;
5026 }
5027
5028 switch (Inst->getIntrinsicID()) {
5029 default:
5030 return false;
5031 case Intrinsic::aarch64_neon_ld2:
5032 case Intrinsic::aarch64_neon_st2:
5033 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5034 break;
5035 case Intrinsic::aarch64_neon_ld3:
5036 case Intrinsic::aarch64_neon_st3:
5037 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5038 break;
5039 case Intrinsic::aarch64_neon_ld4:
5040 case Intrinsic::aarch64_neon_st4:
5041 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5042 break;
5043 }
5044 return true;
5045}
5046
5047 /// See if \p I should be considered for address type promotion. We check if
5048 /// \p I is a sext with the right type that is used in memory accesses. If it
5049 /// is used in a "complex" getelementptr, we allow it to be promoted without
5050 /// finding other sext instructions that sign extended the same initial value.
5051 /// A getelementptr is considered "complex" if it has more than 2 operands.
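/// For example (illustrative): a `sext i32 %i to i64` feeding
/// `getelementptr [16 x i32], ptr %p, i64 0, i64 %idx` (three operands) is
/// allowed to be promoted without a common header.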
5052bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5053 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5054 bool Considerable = false;
5055 AllowPromotionWithoutCommonHeader = false;
5056 if (!isa<SExtInst>(Val: &I))
5057 return false;
5058 Type *ConsideredSExtType =
5059 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
5060 if (I.getType() != ConsideredSExtType)
5061 return false;
5062 // See if the sext is the one with the right type and used in at least one
5063 // GetElementPtrInst.
5064 for (const User *U : I.users()) {
5065 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
5066 Considerable = true;
5067 // A getelementptr is considered as "complex" if it has more than 2
5068 // operands. We will promote a SExt used in such complex GEP as we
5069 // expect some computation to be merged if they are done on 64 bits.
5070 if (GEPInst->getNumOperands() > 2) {
5071 AllowPromotionWithoutCommonHeader = true;
5072 break;
5073 }
5074 }
5075 }
5076 return Considerable;
5077}
5078
5079bool AArch64TTIImpl::isLegalToVectorizeReduction(
5080 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5081 if (!VF.isScalable())
5082 return true;
5083
5084 Type *Ty = RdxDesc.getRecurrenceType();
5085 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5086 return false;
5087
5088 switch (RdxDesc.getRecurrenceKind()) {
5089 case RecurKind::Add:
5090 case RecurKind::FAdd:
5091 case RecurKind::And:
5092 case RecurKind::Or:
5093 case RecurKind::Xor:
5094 case RecurKind::SMin:
5095 case RecurKind::SMax:
5096 case RecurKind::UMin:
5097 case RecurKind::UMax:
5098 case RecurKind::FMin:
5099 case RecurKind::FMax:
5100 case RecurKind::FMulAdd:
5101 case RecurKind::AnyOf:
5102 return true;
5103 default:
5104 return false;
5105 }
5106}
5107
5108InstructionCost
5109AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5110 FastMathFlags FMF,
5111 TTI::TargetCostKind CostKind) const {
5112 // The code-generator is currently not able to handle scalable vectors
5113 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5114 // it. This change will be removed when code-generation for these types is
5115 // sufficiently reliable.
5116 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5117 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5118 return InstructionCost::getInvalid();
5119
5120 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5121
5122 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5123 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5124
5125 InstructionCost LegalizationCost = 0;
5126 if (LT.first > 1) {
5127 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
5128 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5129 LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
5130 }
5131
5132 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5133}
5134
5135InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5136 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
5137 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5138 InstructionCost LegalizationCost = 0;
5139 if (LT.first > 1) {
5140 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
5141 LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
5142 LegalizationCost *= LT.first - 1;
5143 }
5144
5145 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5146 assert(ISD && "Invalid opcode");
5147 // Add the final reduction cost for the legal horizontal reduction
5148 switch (ISD) {
5149 case ISD::ADD:
5150 case ISD::AND:
5151 case ISD::OR:
5152 case ISD::XOR:
5153 case ISD::FADD:
5154 return LegalizationCost + 2;
5155 default:
5156 return InstructionCost::getInvalid();
5157 }
5158}
5159
5160InstructionCost
5161AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5162 std::optional<FastMathFlags> FMF,
5163 TTI::TargetCostKind CostKind) const {
5164 // The code-generator is currently not able to handle scalable vectors
5165 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5166 // it. This change will be removed when code-generation for these types is
5167 // sufficiently reliable.
5168 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
5169 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
5170 return InstructionCost::getInvalid();
5171
5172 if (TTI::requiresOrderedReduction(FMF)) {
5173 if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
5174 InstructionCost BaseCost =
5175 BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5176 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5177 // end up vectorizing for more computationally intensive loops.
5178 return BaseCost + FixedVTy->getNumElements();
5179 }
5180
5181 if (Opcode != Instruction::FAdd)
5182 return InstructionCost::getInvalid();
5183
5184 auto *VTy = cast<ScalableVectorType>(Val: ValTy);
5185 InstructionCost Cost =
5186 getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
5187 Cost *= getMaxNumElements(VF: VTy->getElementCount());
5188 return Cost;
5189 }
5190
5191 if (isa<ScalableVectorType>(Val: ValTy))
5192 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5193
5194 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5195 MVT MTy = LT.second;
5196 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5197 assert(ISD && "Invalid opcode");
5198
5199 // Horizontal adds can use the 'addv' instruction. We model the cost of these
5200 // instructions as twice a normal vector add, plus 1 for each additional
5201 // legalization step (LT.first - 1). This is the only arithmetic vector
5202 // reduction operation for which we have an instruction.
5203 // OR, XOR and AND costs should match the codegen from:
5204 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5205 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5206 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
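// Illustrative worked example (assuming the table and formula below): a
// v16i32 add reduction legalizes to four v4i32 parts (LT.first == 4), so its
// cost is (4 - 1) extra legalization steps plus the table entry of 2, i.e. 5.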
5207 static const CostTblEntry CostTblNoPairwise[]{
5208 {.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: 2},
5209 {.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: 2},
5210 {.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: 2},
5211 {.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: 2},
5212 {.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: 2},
5213 {.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: 2},
5214 {.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: 15},
5215 {.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: 17},
5216 {.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: 7},
5217 {.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: 9},
5218 {.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: 3},
5219 {.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: 5},
5220 {.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: 3},
5221 {.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: 15},
5222 {.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: 17},
5223 {.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: 7},
5224 {.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: 9},
5225 {.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: 3},
5226 {.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: 5},
5227 {.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: 3},
5228 {.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: 15},
5229 {.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: 17},
5230 {.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: 7},
5231 {.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: 9},
5232 {.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: 3},
5233 {.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: 5},
5234 {.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: 3},
5235 };
5236 switch (ISD) {
5237 default:
5238 break;
5239 case ISD::FADD:
5240 if (Type *EltTy = ValTy->getScalarType();
5241 // FIXME: For half types without fullfp16 support, this could extend and
5242 // use a fp32 faddp reduction but current codegen unrolls.
5243 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
5244 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
5245 const unsigned NElts = MTy.getVectorNumElements();
5246 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
5247 isPowerOf2_32(Value: NElts))
5248 // Reduction corresponding to series of fadd instructions is lowered to
5249 // series of faddp instructions. faddp has latency/throughput that
5250 // matches fadd instruction and hence, every faddp instruction can be
5251 // considered to have a relative cost = 1 with
5252 // CostKind = TCK_RecipThroughput.
5253 // An faddp pairwise-adds the vector elements, so the size of the input
5254 // vector halves on every step, requiring
5255 // #(faddp instructions) = Log2_32(NElts).
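// Illustrative example: a <8 x float> fadd reduction legalizes to v4f32
// (LT.first == 2, NElts == 4), giving a cost of (2 - 1) + Log2_32(4) == 3.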
5256 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(Value: NElts);
5257 }
5258 break;
5259 case ISD::ADD:
5260 if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy))
5261 return (LT.first - 1) + Entry->Cost;
5262 break;
5263 case ISD::XOR:
5264 case ISD::AND:
5265 case ISD::OR:
5266 const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy);
5267 if (!Entry)
5268 break;
5269 auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
5270 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5271 isPowerOf2_32(Value: ValVTy->getNumElements())) {
5272 InstructionCost ExtraCost = 0;
5273 if (LT.first != 1) {
5274 // Type needs to be split, so there is an extra cost of LT.first - 1
5275 // arithmetic ops.
5276 auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
5277 NumElts: MTy.getVectorNumElements());
5278 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5279 ExtraCost *= LT.first - 1;
5280 }
5281 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5282 auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost;
5283 return Cost + ExtraCost;
5284 }
5285 break;
5286 }
5287 return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5288}
5289
5290InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5291 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
5292 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5293 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5294 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5295
5296 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5297 VecVT.getSizeInBits() >= 64) {
5298 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5299
5300 // The legal cases are:
5301 // UADDLV 8/16/32->32
5302 // UADDLP 32->64
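// Illustrative example: add-reducing zext(<16 x i8>) to an i32 result fits
// the UADDLV 8->32 case, so it is modelled as the flat cost of 2 below (plus
// 2 per extra legalization step).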
5303 unsigned RevVTSize = ResVT.getSizeInBits();
5304 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5305 RevVTSize <= 32) ||
5306 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
5307 RevVTSize <= 32) ||
5308 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
5309 RevVTSize <= 64))
5310 return (LT.first - 1) * 2 + 2;
5311 }
5312
5313 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF,
5314 CostKind);
5315}
5316
5317InstructionCost
5318AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
5319 VectorType *VecTy,
5320 TTI::TargetCostKind CostKind) const {
5321 EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5322 EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5323
5324 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple()) {
5325 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5326
5327 // The legal cases with dotprod are
5328 // UDOT 8->32,
5329 // which requires an additional uaddv to sum the i32 values.
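// Illustrative example: reducing mul(zext(<16 x i8>), zext(<16 x i8>)) into
// an i32 can be lowered as a udot into a v4i32 accumulator plus an addv,
// which is what the LT.first + 2 cost below models.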
5330 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
5331 ResVT == MVT::i32)
5332 return LT.first + 2;
5333 }
5334
5335 return BaseT::getMulAccReductionCost(IsUnsigned, ResTy, Ty: VecTy, CostKind);
5336}
5337
5338InstructionCost
5339AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index,
5340 TTI::TargetCostKind CostKind) const {
5341 static const CostTblEntry ShuffleTbl[] = {
5342 { .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: 1 },
5343 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: 1 },
5344 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: 1 },
5345 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: 1 },
5346 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: 1 },
5347 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: 1 },
5348 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: 1 },
5349 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: 1 },
5350 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: 1 },
5351 { .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: 1 },
5352 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: 1 },
5353 { .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: 1 },
5354 { .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: 1 },
5355 };
5356
5357 // The code-generator is currently not able to handle scalable vectors
5358 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5359 // it. This change will be removed when code-generation for these types is
5360 // sufficiently reliable.
5361 if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1))
5362 return InstructionCost::getInvalid();
5363
5364 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
5365 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext());
5366 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5367 ? TLI->getPromotedVTForPredicate(VT: EVT(LT.second))
5368 : LT.second;
5369 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext());
5370 InstructionCost LegalizationCost = 0;
5371 if (Index < 0) {
5372 LegalizationCost =
5373 getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy,
5374 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5375 getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy,
5376 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
5377 }
5378
5379 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
5380 // The cost is computed on the promoted type.
5381 if (LT.second.getScalarType() == MVT::i1) {
5382 LegalizationCost +=
5383 getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy,
5384 CCH: TTI::CastContextHint::None, CostKind) +
5385 getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy,
5386 CCH: TTI::CastContextHint::None, CostKind);
5387 }
5388 const auto *Entry =
5389 CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT());
5390 assert(Entry && "Illegal Type for Splice");
5391 LegalizationCost += Entry->Cost;
5392 return LegalizationCost * LT.first;
5393}
5394
5395InstructionCost AArch64TTIImpl::getPartialReductionCost(
5396 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
5397 ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5398 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5399 TTI::TargetCostKind CostKind) const {
5400 InstructionCost Invalid = InstructionCost::getInvalid();
5401 InstructionCost Cost(TTI::TCC_Basic);
5402
5403 if (CostKind != TTI::TCK_RecipThroughput)
5404 return Invalid;
5405
5406 // Sub opcodes currently only occur in chained cases.
5407 // Independent partial reduction subtractions are still costed as an add.
5408 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) ||
5409 OpAExtend == TTI::PR_None)
5410 return Invalid;
5411
5412 // We only support multiply binary operations for now, and for muls we
5413 // require the types being extended to be the same.
5414 // NOTE: For muls AArch64 supports lowering mixed extensions to a usdot but
5415 // only if the i8mm or sve/streaming features are available.
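// Illustrative example (assuming dotprod is available): a partial reduction
// that accumulates mul(zext(i8), zext(i8)) with VF 16 into a <4 x i32>
// accumulator corresponds to a single udot and gets the basic cost below.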
5416 if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB ||
5417 OpBExtend == TTI::PR_None ||
5418 (OpAExtend != OpBExtend && !ST->hasMatMulInt8() &&
5419 !ST->isSVEorStreamingSVEAvailable())))
5420 return Invalid;
5421 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5422 "Unexpected values for OpBExtend or InputTypeB");
5423
5424 EVT InputEVT = EVT::getEVT(Ty: InputTypeA);
5425 EVT AccumEVT = EVT::getEVT(Ty: AccumType);
5426
5427 unsigned VFMinValue = VF.getKnownMinValue();
5428
5429 if (VF.isScalable()) {
5430 if (!ST->isSVEorStreamingSVEAvailable())
5431 return Invalid;
5432
5433 // Don't accept a partial reduction if the scaled accumulator is vscale x 1,
5434 // since we can't lower that type.
5435 unsigned Scale =
5436 AccumEVT.getScalarSizeInBits() / InputEVT.getScalarSizeInBits();
5437 if (VFMinValue == Scale)
5438 return Invalid;
5439 }
5440 if (VF.isFixed() &&
5441 (!ST->isNeonAvailable() || !ST->hasDotProd() || AccumEVT == MVT::i64))
5442 return Invalid;
5443
5444 if (InputEVT == MVT::i8) {
5445 switch (VFMinValue) {
5446 default:
5447 return Invalid;
5448 case 8:
5449 if (AccumEVT == MVT::i32)
5450 Cost *= 2;
5451 else if (AccumEVT != MVT::i64)
5452 return Invalid;
5453 break;
5454 case 16:
5455 if (AccumEVT == MVT::i64)
5456 Cost *= 2;
5457 else if (AccumEVT != MVT::i32)
5458 return Invalid;
5459 break;
5460 }
5461 } else if (InputEVT == MVT::i16) {
5462 // FIXME: Allow i32 accumulator but increase cost, as we would extend
5463 // it to i64.
5464 if (VFMinValue != 8 || AccumEVT != MVT::i64)
5465 return Invalid;
5466 } else
5467 return Invalid;
5468
5469 return Cost;
5470}
5471
5472InstructionCost
5473AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
5474 VectorType *SrcTy, ArrayRef<int> Mask,
5475 TTI::TargetCostKind CostKind, int Index,
5476 VectorType *SubTp, ArrayRef<const Value *> Args,
5477 const Instruction *CxtI) const {
5478 assert((Mask.empty() || DstTy->isScalableTy() ||
5479 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
5480 "Expected the Mask to match the return size if given");
5481 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
5482 "Expected the same scalar types");
5483 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
5484
5485 // If we have a Mask and the legalized type is split into multiple vectors,
5486 // split the Mask into smaller chunks and sum the cost of each sub-shuffle.
5487 if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() &&
5488 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
5489 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
5490 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
5491 // Check for LD3/LD4 instructions, which are represented in llvm IR as
5492 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
5493 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
5494 // cost than just the load.
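// For example (illustrative), a mask of <0, 3, 6, 9, ...> applied to a loaded
// vector selects every third element, i.e. a factor-3 deinterleave that can
// become one of the results of an ld3.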
5495 if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) &&
5496 (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) ||
5497 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4)))
5498 return std::max<InstructionCost>(a: 1, b: LT.first / 4);
5499
5500 // Check for ST3/ST4 instructions, which are represented in llvm IR as
5501 // store(interleaving-shuffle). The shuffle cost could potentially be free,
5502 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
5503 // cost than just the store.
5504 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
5505 (ShuffleVectorInst::isInterleaveMask(
5506 Mask, Factor: 4, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2) ||
5507 ShuffleVectorInst::isInterleaveMask(
5508 Mask, Factor: 3, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * 2)))
5509 return LT.first;
5510
5511 unsigned TpNumElts = Mask.size();
5512 unsigned LTNumElts = LT.second.getVectorNumElements();
5513 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
5514 VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(),
5515 EC: LT.second.getVectorElementCount());
5516 InstructionCost Cost;
5517 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
5518 PreviousCosts;
5519 for (unsigned N = 0; N < NumVecs; N++) {
5520 SmallVector<int> NMask;
5521 // Split the existing mask into chunks of size LTNumElts. Track the source
5522 // sub-vectors to ensure the result has at most 2 inputs.
5523 unsigned Source1 = -1U, Source2 = -1U;
5524 unsigned NumSources = 0;
5525 for (unsigned E = 0; E < LTNumElts; E++) {
5526 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
5527 : PoisonMaskElem;
5528 if (MaskElt < 0) {
5529 NMask.push_back(Elt: PoisonMaskElem);
5530 continue;
5531 }
5532
5533 // Calculate which source from the input this comes from and whether it
5534 // is new to us.
5535 unsigned Source = MaskElt / LTNumElts;
5536 if (NumSources == 0) {
5537 Source1 = Source;
5538 NumSources = 1;
5539 } else if (NumSources == 1 && Source != Source1) {
5540 Source2 = Source;
5541 NumSources = 2;
5542 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
5543 NumSources++;
5544 }
5545
5546 // Add to the new mask. For the NumSources>2 case these are not correct,
5547 // but are only used for the modular lane number.
5548 if (Source == Source1)
5549 NMask.push_back(Elt: MaskElt % LTNumElts);
5550 else if (Source == Source2)
5551 NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts);
5552 else
5553 NMask.push_back(Elt: MaskElt % LTNumElts);
5554 }
5555 // Check if we have already generated this sub-shuffle, which means we
5556 // will have already generated the output. For example a <16 x i32> splat
5557 // will be the same sub-splat 4 times, which only needs to be generated
5558 // once and reused.
5559 auto Result =
5560 PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), 0});
5561 // Check if it was already in the map (already costed).
5562 if (!Result.second)
5563 continue;
5564 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
5565 // getShuffleCost. If not then cost it using the worst case as the number
5566 // of element moves into a new vector.
5567 InstructionCost NCost =
5568 NumSources <= 2
5569 ? getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc
5570 : TTI::SK_PermuteTwoSrc,
5571 DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args,
5572 CxtI)
5573 : LTNumElts;
5574 Result.first->second = NCost;
5575 Cost += NCost;
5576 }
5577 return Cost;
5578 }
5579
5580 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
5581 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
5582 // A subvector extract can be implemented with an ext (or trivial extract, if
5583 // from lane 0). This currently only handles low or high extracts to prevent
5584 // SLP vectorizer regressions.
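// For example (illustrative), extracting the high <2 x i32> half of a
// <4 x i32> vector needs a single ext (or equivalent lane move), while the
// low half is a plain subregister read and is free.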
5585 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
5586 if (LT.second.is128BitVector() &&
5587 cast<FixedVectorType>(Val: SubTp)->getNumElements() ==
5588 LT.second.getVectorNumElements() / 2) {
5589 if (Index == 0)
5590 return 0;
5591 if (Index == (int)LT.second.getVectorNumElements() / 2)
5592 return 1;
5593 }
5594 Kind = TTI::SK_PermuteSingleSrc;
5595 }
5596 // FIXME: This was added to keep the costs equal when adding DstTys. Update
5597 // the code to handle length-changing shuffles.
5598 if (Kind == TTI::SK_InsertSubvector) {
5599 LT = getTypeLegalizationCost(Ty: DstTy);
5600 SrcTy = DstTy;
5601 }
5602
5603 // Segmented shuffle matching.
5604 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) &&
5605 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
5606 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
5607 RHS: AArch64::SVEBitsPerBlock)) {
5608
5609 FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy);
5610 unsigned Segments =
5611 VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
5612 unsigned SegmentElts = VTy->getNumElements() / Segments;
5613
5614 // dupq zd.t, zn.t[idx]
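// (Illustrative: for a fixed <8 x i32>, i.e. two 128-bit segments of four
// elements, a mask such as <1,1,1,1,5,5,5,5> broadcasts lane 1 within each
// segment and may be selectable as a single dupq.)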
5615 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
5616 ST->isSVEorStreamingSVEAvailable() &&
5617 isDUPQMask(Mask, Segments, SegmentSize: SegmentElts))
5618 return LT.first;
5619
5620 // mov zd.q, vn
5621 if (ST->isSVEorStreamingSVEAvailable() &&
5622 isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts))
5623 return LT.first;
5624 }
5625
5626 // Check for broadcast loads, which are supported by the LD1R instruction.
5627 // In terms of code-size, the shuffle vector is free when a load + dup get
5628 // folded into a LD1R. That's what we check and return here. For performance
5629 // and reciprocal throughput, a LD1R is not completely free. In this case, we
5630 // return the cost for the broadcast below (i.e. 1 for most/all types), so
5631 // that we model the load + dup sequence slightly higher because LD1R is a
5632 // high latency instruction.
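// For example (illustrative), a scalar load splatted to <4 x i32> can fold
// into a single "ld1r { v0.4s }, [x0]", which is why the shuffle is free for
// code size here.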
5633 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
5634 bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]);
5635 if (IsLoad && LT.second.isVector() &&
5636 isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(),
5637 NumElements: LT.second.getVectorElementCount()))
5638 return 0;
5639 }
5640
5641 // If we have 4 elements for the shuffle and a Mask, get the cost straight
5642 // from the perfect shuffle tables.
5643 if (Mask.size() == 4 &&
5644 SrcTy->getElementCount() == ElementCount::getFixed(MinVal: 4) &&
5645 (SrcTy->getScalarSizeInBits() == 16 ||
5646 SrcTy->getScalarSizeInBits() == 32) &&
5647 all_of(Range&: Mask, P: [](int E) { return E < 8; }))
5648 return getPerfectShuffleCost(M: Mask);
5649
5650 // Check for identity masks, which we can treat as free.
5651 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
5652 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5653 all_of(Range: enumerate(First&: Mask), P: [](const auto &M) {
5654 return M.value() < 0 || M.value() == (int)M.index();
5655 }))
5656 return 0;
5657
5658 // Check for other shuffles that are not SK_ kinds but we have native
5659 // instructions for, for example ZIP and UZP.
5660 unsigned Unused;
5661 if (LT.second.isFixedLengthVector() &&
5662 LT.second.getVectorNumElements() == Mask.size() &&
5663 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
5664 (isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) ||
5665 isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) ||
5666 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
5667 NumElts: LT.second.getVectorNumElements(), BlockSize: 16) ||
5668 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
5669 NumElts: LT.second.getVectorNumElements(), BlockSize: 32) ||
5670 isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
5671 NumElts: LT.second.getVectorNumElements(), BlockSize: 64) ||
5672 // Check for non-zero lane splats
5673 all_of(Range: drop_begin(RangeOrContainer&: Mask),
5674 P: [&Mask](int M) { return M < 0 || M == Mask[0]; })))
5675 return 1;
5676
5677 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
5678 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
5679 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
5680 static const CostTblEntry ShuffleTbl[] = {
5681 // Broadcast shuffle kinds can be performed with 'dup'.
5682 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: 1},
5683 {.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: 1},
5684 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: 1},
5685 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: 1},
5686 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: 1},
5687 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: 1},
5688 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: 1},
5689 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: 1},
5690 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: 1},
5691 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: 1},
5692 {.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: 1},
5693 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: 1},
5694 {.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: 1},
5695 {.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: 1},
5696 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
5697 // 'zip1/zip2' instructions.
5698 {.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: 1},
5699 {.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: 1},
5700 {.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: 1},
5701 {.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: 1},
5702 {.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: 1},
5703 {.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: 1},
5704 {.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: 1},
5705 {.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: 1},
5706 {.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: 1},
5707 {.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: 1},
5708 {.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: 1},
5709 {.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: 1},
5710 {.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: 1},
5711 {.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: 1},
5712 // Select shuffle kinds.
5713 // TODO: handle vXi8/vXi16.
5714 {.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: 1}, // mov.
5715 {.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: 2}, // rev+trn (or similar).
5716 {.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: 1}, // mov.
5717 {.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: 1}, // mov.
5718 {.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: 2}, // rev+trn (or similar).
5719 {.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: 1}, // mov.
5720 // PermuteSingleSrc shuffle kinds.
5721 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: 1}, // mov.
5722 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: 3}, // perfectshuffle worst case.
5723 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: 1}, // mov.
5724 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: 1}, // mov.
5725 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: 3}, // perfectshuffle worst case.
5726 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: 1}, // mov.
5727 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: 3}, // perfectshuffle worst case.
5728 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: 3}, // perfectshuffle worst case.
5729 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: 3}, // same
5730 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: 8}, // constpool + load + tbl
5731 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: 8}, // constpool + load + tbl
5732 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: 8}, // constpool + load + tbl
5733 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: 8}, // constpool + load + tbl
5734 {.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: 8}, // constpool + load + tbl
5735 // Reverse can be lowered with `rev`.
5736 {.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: 1}, // REV64
5737 {.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: 2}, // REV64; EXT
5738 {.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: 1}, // EXT
5739 {.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: 1}, // REV64
5740 {.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: 2}, // REV64; EXT
5741 {.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: 1}, // EXT
5742 {.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: 2}, // REV64; EXT
5743 {.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: 2}, // REV64; EXT
5744 {.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: 2}, // REV64; EXT
5745 {.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: 2}, // REV64; EXT
5746 {.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: 1}, // REV64
5747 {.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: 1}, // REV64
5748 {.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: 1}, // REV64
5749 {.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: 1}, // REV64
5750 // Splice can all be lowered as `ext`.
5751 {.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: 1},
5752 {.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: 1},
5753 {.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: 1},
5754 {.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: 1},
5755 {.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: 1},
5756 {.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: 1},
5757 {.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: 1},
5758 {.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: 1},
5759 {.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: 1},
5760 {.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: 1},
5761 {.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: 1},
5762 {.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: 1},
5763 {.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: 1},
5764 {.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: 1},
5765 // Broadcast shuffle kinds for scalable vectors
5766 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: 1},
5767 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: 1},
5768 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: 1},
5769 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: 1},
5770 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: 1},
5771 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: 1},
5772 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: 1},
5773 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: 1},
5774 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: 1},
5775 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: 1},
5776 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: 1},
5777 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: 1},
5778 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: 1},
5779 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: 1},
5780 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: 1},
5781 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: 1},
5782 {.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: 1},
5783 // Handle the cases for vector.reverse with scalable vectors
5784 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: 1},
5785 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: 1},
5786 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: 1},
5787 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: 1},
5788 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: 1},
5789 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: 1},
5790 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: 1},
5791 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: 1},
5792 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: 1},
5793 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: 1},
5794 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: 1},
5795 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: 1},
5796 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: 1},
5797 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: 1},
5798 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: 1},
5799 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: 1},
5800 {.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: 1},
5801 };
5802 if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second))
5803 return LT.first * Entry->Cost;
5804 }
5805
5806 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy))
5807 return getSpliceCost(Tp: SrcTy, Index, CostKind);
5808
5809 // Inserting a subvector can often be done with either a D, S or H register
5810 // move, so long as the inserted vector is "aligned".
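// For example (illustrative), inserting a <2 x i32> subvector at index 0 or 2
// of a <4 x i32> vector is a single 64-bit register move, so we only return
// SubLT.first below.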
5811 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
5812 LT.second.getSizeInBits() <= 128 && SubTp) {
5813 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
5814 if (SubLT.second.isVector()) {
5815 int NumElts = LT.second.getVectorNumElements();
5816 int NumSubElts = SubLT.second.getVectorNumElements();
5817 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
5818 return SubLT.first;
5819 }
5820 }
5821
5822 // Restore optimal kind.
5823 if (IsExtractSubvector)
5824 Kind = TTI::SK_ExtractSubvector;
5825 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
5826 Args, CxtI);
5827}
5828
5829static bool containsDecreasingPointers(Loop *TheLoop,
5830 PredicatedScalarEvolution *PSE) {
5831 const auto &Strides = DenseMap<Value *, const SCEV *>();
5832 for (BasicBlock *BB : TheLoop->blocks()) {
5833 // Scan the instructions in the block and look for addresses that are
5834 // consecutive and decreasing.
5835 for (Instruction &I : *BB) {
5836 if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) {
5837 Value *Ptr = getLoadStorePointerOperand(V: &I);
5838 Type *AccessTy = getLoadStoreType(I: &I);
5839 if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides, /*Assume=*/true,
5840 /*ShouldCheckWrap=*/false)
5841 .value_or(u: 0) < 0)
5842 return true;
5843 }
5844 }
5845 }
5846 return false;
5847}
5848
5849bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost() const {
5850 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
5851 return SVEPreferFixedOverScalableIfEqualCost;
5852 return ST->useFixedOverScalableIfEqualCost();
5853}
5854
5855unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
5856 return ST->getEpilogueVectorizationMinVF();
5857}
5858
5859bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const {
5860 if (!ST->hasSVE())
5861 return false;
5862
5863 // We don't currently support vectorisation with interleaving for SVE - with
5864 // such loops we're better off not using tail-folding. This gives us a chance
5865 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
5866 if (TFI->IAI->hasGroups())
5867 return false;
5868
5869 TailFoldingOpts Required = TailFoldingOpts::Disabled;
5870 if (TFI->LVL->getReductionVars().size())
5871 Required |= TailFoldingOpts::Reductions;
5872 if (TFI->LVL->getFixedOrderRecurrences().size())
5873 Required |= TailFoldingOpts::Recurrences;
5874
5875 // We call this to discover whether any load/store pointers in the loop have
5876 // negative strides. This will require extra work to reverse the loop
5877 // predicate, which may be expensive.
5878 if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(),
5879 PSE: TFI->LVL->getPredicatedScalarEvolution()))
5880 Required |= TailFoldingOpts::Reverse;
5881 if (Required == TailFoldingOpts::Disabled)
5882 Required |= TailFoldingOpts::Simple;
5883
5884 if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(),
5885 Required))
5886 return false;
5887
5888 // Don't tail-fold for tight loops where we would be better off interleaving
5889 // with an unpredicated loop.
5890 unsigned NumInsns = 0;
5891 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
5892 NumInsns += BB->sizeWithoutDebug();
5893 }
5894
5895 // We expect 4 of these to be an IV PHI, an IV add, an IV compare and a branch.
5896 return NumInsns >= SVETailFoldInsnThreshold;
5897}
5898
5899InstructionCost
5900AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
5901 StackOffset BaseOffset, bool HasBaseReg,
5902 int64_t Scale, unsigned AddrSpace) const {
5903 // Scaling factors are not free at all.
5904 // Operands                     | Rt Latency
5905 // -------------------------------------------
5906 // Rt, [Xn, Xm]                 | 4
5907 // -------------------------------------------
5908 // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
5909 // Rt, [Xn, Wm, <extend> #imm]  |
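// For example (illustrative), "ldr w0, [x1, x2, lsl #2]" uses Scale == 4 and
// is charged a cost of 1 below, whereas the unscaled "ldr w0, [x1, x2]"
// (Scale == 1) is treated as free.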
5910 TargetLoweringBase::AddrMode AM;
5911 AM.BaseGV = BaseGV;
5912 AM.BaseOffs = BaseOffset.getFixed();
5913 AM.HasBaseReg = HasBaseReg;
5914 AM.Scale = Scale;
5915 AM.ScalableOffset = BaseOffset.getScalable();
5916 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace))
5917 // Scale represents reg2 * scale, so charge a cost of 1 when the scale is
5918 // neither 0 nor 1.
5919 return AM.Scale != 0 && AM.Scale != 1;
5920 return InstructionCost::getInvalid();
5921}
5922
5923bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
5924 const Instruction *I) const {
5925 if (EnableOrLikeSelectOpt) {
5926 // For the binary operators (e.g. or) we need to be more careful than for
5927 // selects; here we only transform them if they are already at a natural
5928 // break point in the code, i.e. the end of a block with an unconditional
5929 // terminator.
5930 if (I->getOpcode() == Instruction::Or &&
5931 isa<BranchInst>(Val: I->getNextNode()) &&
5932 cast<BranchInst>(Val: I->getNextNode())->isUnconditional())
5933 return true;
5934
5935 if (I->getOpcode() == Instruction::Add ||
5936 I->getOpcode() == Instruction::Sub)
5937 return true;
5938 }
5939 return BaseT::shouldTreatInstructionLikeSelect(I);
5940}
5941
5942bool AArch64TTIImpl::isLSRCostLess(
5943 const TargetTransformInfo::LSRCost &C1,
5944 const TargetTransformInfo::LSRCost &C2) const {
5945 // What is AArch64-specific here is adding the number of instructions to the
5946 // comparison (though not as the first consideration, as some targets do),
5947 // along with changing the priority of the base additions.
5948 // TODO: Maybe a more nuanced tradeoff between instruction count
5949 // and number of registers? To be investigated at a later date.
5950 if (EnableLSRCostOpt)
5951 return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost,
5952 args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
5953 std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost,
5954 args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
5955
5956 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
5957}
5958
5959static bool isSplatShuffle(Value *V) {
5960 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
5961 return all_equal(Range: Shuf->getShuffleMask());
5962 return false;
5963}
5964
5965/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
5966/// or upper half of the vector elements.
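/// For example (illustrative):
///   %s1 = shufflevector <4 x i16> %a, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
///   %s2 = shufflevector <4 x i16> %b, <4 x i16> poison, <2 x i32> <i32 2, i32 3>
/// both extract the upper half of their input, so the pair is accepted.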
5967static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
5968 bool AllowSplat = false) {
5969 // Scalable types can't be extract shuffle vectors.
5970 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
5971 return false;
5972
5973 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
5974 auto *FullTy = FullV->getType();
5975 auto *HalfTy = HalfV->getType();
5976 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
5977 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
5978 };
5979
5980 auto extractHalf = [](Value *FullV, Value *HalfV) {
5981 auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
5982 auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
5983 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
5984 };
5985
5986 ArrayRef<int> M1, M2;
5987 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
5988 if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) ||
5989 !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2))))
5990 return false;
5991
5992 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
5993 // it is not checked as an extract below.
5994 if (AllowSplat && isSplatShuffle(V: Op1))
5995 S1Op1 = nullptr;
5996 if (AllowSplat && isSplatShuffle(V: Op2))
5997 S2Op1 = nullptr;
5998
5999 // Check that the operands are half as wide as the result and we extract
6000 // half of the elements of the input vectors.
6001 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6002 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6003 return false;
6004
6005 // Check the mask extracts either the lower or upper half of vector
6006 // elements.
6007 int M1Start = 0;
6008 int M2Start = 0;
6009 int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2;
6010 if ((S1Op1 &&
6011 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) ||
6012 (S2Op1 &&
6013 !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
6014 return false;
6015
6016 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6017 (M2Start != 0 && M2Start != (NumElements / 2)))
6018 return false;
6019 if (S1Op1 && S2Op1 && M1Start != M2Start)
6020 return false;
6021
6022 return true;
6023}
6024
6025/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6026/// of the vector elements.
6027static bool areExtractExts(Value *Ext1, Value *Ext2) {
6028 auto areExtDoubled = [](Instruction *Ext) {
6029 return Ext->getType()->getScalarSizeInBits() ==
6030 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits();
6031 };
6032
6033 if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) ||
6034 !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) ||
6035 !areExtDoubled(cast<Instruction>(Val: Ext1)) ||
6036 !areExtDoubled(cast<Instruction>(Val: Ext2)))
6037 return false;
6038
6039 return true;
6040}
6041
6042/// Check if Op could be used with vmull_high_p64 intrinsic.
6043static bool isOperandOfVmullHighP64(Value *Op) {
6044 Value *VectorOperand = nullptr;
6045 ConstantInt *ElementIndex = nullptr;
6046 return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
6047 Idx: m_ConstantInt(CI&: ElementIndex))) &&
6048 ElementIndex->getValue() == 1 &&
6049 isa<FixedVectorType>(Val: VectorOperand->getType()) &&
6050 cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2;
6051}
6052
6053/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6054static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6055 return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
6056}
6057
6058static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
6059 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6060 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
6061 if (!GEP || GEP->getNumOperands() != 2)
6062 return false;
6063
6064 Value *Base = GEP->getOperand(i_nocapture: 0);
6065 Value *Offsets = GEP->getOperand(i_nocapture: 1);
6066
6067 // We only care about scalar_base+vector_offsets.
6068 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6069 return false;
6070
6071 // Sink extends that would allow us to use 32-bit offset vectors.
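// For example (illustrative), a masked gather whose pointers are
//   getelementptr float, ptr %base, <vscale x 4 x i64> zext(<vscale x 4 x i32> %idx)
// can use a 32-bit unpacked-offset addressing form once the zext is sunk next
// to the GEP.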
6072 if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) {
6073 auto *OffsetsInst = cast<Instruction>(Val: Offsets);
6074 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6075 OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32)
6076 Ops.push_back(Elt: &GEP->getOperandUse(i: 1));
6077 }
6078
6079 // Sink the GEP.
6080 return true;
6081}
6082
6083/// We want to sink the following cases:
6084/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
6085/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
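/// For example (illustrative):
///   %vs  = call i64 @llvm.vscale.i64()
///   %off = shl i64 %vs, 4
///   %gep = getelementptr i8, ptr %base, i64 %off
/// Sinking %off (and %vs) next to the gep lets instruction selection fold the
/// address into a vscale-scaled addressing mode.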
6086static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
6087 if (match(V: Op, P: m_VScale()))
6088 return true;
6089 if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) ||
6090 match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
6091 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
6092 return true;
6093 }
6094 if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) ||
6095 match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
6096 Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0);
6097 Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0));
6098 Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0));
6099 return true;
6100 }
6101 return false;
6102}
6103
6104/// Check if sinking \p I's operands to I's basic block is profitable, because
6105/// the operands can be folded into a target instruction, e.g.
6106/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
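/// For example (illustrative), when both operands of an
/// @llvm.aarch64.neon.umull call are shufflevectors extracting the high
/// halves of their inputs, sinking those shuffles next to the call lets
/// instruction selection form a single umull2.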
6107bool AArch64TTIImpl::isProfitableToSinkOperands(
6108 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
6109 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
6110 switch (II->getIntrinsicID()) {
6111 case Intrinsic::aarch64_neon_smull:
6112 case Intrinsic::aarch64_neon_umull:
6113 if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1),
6114 /*AllowSplat=*/true)) {
6115 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6116 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6117 return true;
6118 }
6119 [[fallthrough]];
6120
6121 case Intrinsic::fma:
6122 case Intrinsic::fmuladd:
6123 if (isa<VectorType>(Val: I->getType()) &&
6124 cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6125 !ST->hasFullFP16())
6126 return false;
6127 [[fallthrough]];
6128 case Intrinsic::aarch64_neon_sqdmull:
6129 case Intrinsic::aarch64_neon_sqdmulh:
6130 case Intrinsic::aarch64_neon_sqrdmulh:
6131 // Sink splats for index lane variants
6132 if (isSplatShuffle(V: II->getOperand(i_nocapture: 0)))
6133 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6134 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6135 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6136 return !Ops.empty();
6137 case Intrinsic::aarch64_neon_fmlal:
6138 case Intrinsic::aarch64_neon_fmlal2:
6139 case Intrinsic::aarch64_neon_fmlsl:
6140 case Intrinsic::aarch64_neon_fmlsl2:
6141 // Sink splats for index lane variants
6142 if (isSplatShuffle(V: II->getOperand(i_nocapture: 1)))
6143 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6144 if (isSplatShuffle(V: II->getOperand(i_nocapture: 2)))
6145 Ops.push_back(Elt: &II->getOperandUse(i: 2));
6146 return !Ops.empty();
6147 case Intrinsic::aarch64_sve_ptest_first:
6148 case Intrinsic::aarch64_sve_ptest_last:
6149 if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0)))
6150 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6151 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6152 return !Ops.empty();
6153 case Intrinsic::aarch64_sme_write_horiz:
6154 case Intrinsic::aarch64_sme_write_vert:
6155 case Intrinsic::aarch64_sme_writeq_horiz:
6156 case Intrinsic::aarch64_sme_writeq_vert: {
6157 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1));
6158 if (!Idx || Idx->getOpcode() != Instruction::Add)
6159 return false;
6160 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6161 return true;
6162 }
6163 case Intrinsic::aarch64_sme_read_horiz:
6164 case Intrinsic::aarch64_sme_read_vert:
6165 case Intrinsic::aarch64_sme_readq_horiz:
6166 case Intrinsic::aarch64_sme_readq_vert:
6167 case Intrinsic::aarch64_sme_ld1b_vert:
6168 case Intrinsic::aarch64_sme_ld1h_vert:
6169 case Intrinsic::aarch64_sme_ld1w_vert:
6170 case Intrinsic::aarch64_sme_ld1d_vert:
6171 case Intrinsic::aarch64_sme_ld1q_vert:
6172 case Intrinsic::aarch64_sme_st1b_vert:
6173 case Intrinsic::aarch64_sme_st1h_vert:
6174 case Intrinsic::aarch64_sme_st1w_vert:
6175 case Intrinsic::aarch64_sme_st1d_vert:
6176 case Intrinsic::aarch64_sme_st1q_vert:
6177 case Intrinsic::aarch64_sme_ld1b_horiz:
6178 case Intrinsic::aarch64_sme_ld1h_horiz:
6179 case Intrinsic::aarch64_sme_ld1w_horiz:
6180 case Intrinsic::aarch64_sme_ld1d_horiz:
6181 case Intrinsic::aarch64_sme_ld1q_horiz:
6182 case Intrinsic::aarch64_sme_st1b_horiz:
6183 case Intrinsic::aarch64_sme_st1h_horiz:
6184 case Intrinsic::aarch64_sme_st1w_horiz:
6185 case Intrinsic::aarch64_sme_st1d_horiz:
6186 case Intrinsic::aarch64_sme_st1q_horiz: {
6187 auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3));
6188 if (!Idx || Idx->getOpcode() != Instruction::Add)
6189 return false;
6190 Ops.push_back(Elt: &II->getOperandUse(i: 3));
6191 return true;
6192 }
6193 case Intrinsic::aarch64_neon_pmull:
6194 if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1)))
6195 return false;
6196 Ops.push_back(Elt: &II->getOperandUse(i: 0));
6197 Ops.push_back(Elt: &II->getOperandUse(i: 1));
6198 return true;
6199 case Intrinsic::aarch64_neon_pmull64:
6200 if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0),
6201 Op2: II->getArgOperand(i: 1)))
6202 return false;
6203 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
6204 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
6205 return true;
6206 case Intrinsic::masked_gather:
6207 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops))
6208 return false;
6209 Ops.push_back(Elt: &II->getArgOperandUse(i: 0));
6210 return true;
6211 case Intrinsic::masked_scatter:
6212 if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops))
6213 return false;
6214 Ops.push_back(Elt: &II->getArgOperandUse(i: 1));
6215 return true;
6216 default:
6217 return false;
6218 }
6219 }
6220
6221 auto ShouldSinkCondition = [](Value *Cond) -> bool {
6222 auto *II = dyn_cast<IntrinsicInst>(Val: Cond);
6223 return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
6224 isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: 0)->getType());
6225 };
6226
6227 switch (I->getOpcode()) {
6228 case Instruction::GetElementPtr:
6229 case Instruction::Add:
6230 case Instruction::Sub:
6231 // Sink vscales closer to uses for better isel
6232 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
6233 if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
6234 Ops.push_back(Elt: &I->getOperandUse(i: Op));
6235 return true;
6236 }
6237 }
6238 break;
6239 case Instruction::Select: {
6240 if (!ShouldSinkCondition(I->getOperand(i: 0)))
6241 return false;
6242
6243 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6244 return true;
6245 }
6246 case Instruction::Br: {
6247 if (cast<BranchInst>(Val: I)->isUnconditional())
6248 return false;
6249
6250 if (!ShouldSinkCondition(cast<BranchInst>(Val: I)->getCondition()))
6251 return false;
6252
6253 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6254 return true;
6255 }
6256 default:
6257 break;
6258 }
6259
6260 if (!I->getType()->isVectorTy())
6261 return false;
6262
6263 switch (I->getOpcode()) {
6264 case Instruction::Sub:
6265 case Instruction::Add: {
6266 if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1)))
6267 return false;
6268
6269 // If the exts' operands extract either the lower or upper elements, we
6270 // can sink them too.
6271 auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0));
6272 auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1));
6273 if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) {
6274 Ops.push_back(Elt: &Ext1->getOperandUse(i: 0));
6275 Ops.push_back(Elt: &Ext2->getOperandUse(i: 0));
6276 }
6277
6278 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6279 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6280
6281 return true;
6282 }
6283 case Instruction::Or: {
6284 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6285 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
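// Illustrative IR shape of the pattern:
//   %nm = xor <16 x i8> %mask, splat (i8 -1)
//   %t0 = and <16 x i8> %nm, %a
//   %t1 = and <16 x i8> %mask, %b
//   %or = or <16 x i8> %t0, %t1
// Sinking the two ands and the not next to the or lets instruction selection
// form a single bsl.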
6286 if (ST->hasNEON()) {
6287 Instruction *OtherAnd, *IA, *IB;
6288 Value *MaskValue;
6289 // MainAnd refers to the And instruction that has 'Not' as one of its operands.
6290 if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
6291 R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
6292 R: m_Instruction(I&: IA)))))) {
6293 if (match(V: OtherAnd,
6294 P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
6295 Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd
6296 ? cast<Instruction>(Val: I->getOperand(i: 1))
6297 : cast<Instruction>(Val: I->getOperand(i: 0));
6298
6299 // Both Ands should be in the same basic block as the Or.
6300 if (I->getParent() != MainAnd->getParent() ||
6301 I->getParent() != OtherAnd->getParent())
6302 return false;
6303
6304 // Non-mask operands of both Ands should also be in the same basic block.
6305 if (I->getParent() != IA->getParent() ||
6306 I->getParent() != IB->getParent())
6307 return false;
6308
6309 Ops.push_back(
6310 Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0));
6311 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6312 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6313
6314 return true;
6315 }
6316 }
6317 }
6318
6319 return false;
6320 }
6321 case Instruction::Mul: {
6322 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6323 auto *Ty = cast<VectorType>(Val: V->getType());
6324 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6325 if (Ty->isScalableTy())
6326 return false;
6327
6328 // Indexed variants of Mul exist for i16 and i32 element types only.
6329 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
6330 };
6331
6332 int NumZExts = 0, NumSExts = 0;
6333 for (auto &Op : I->operands()) {
6334 // Make sure we are not already sinking this operand
6335 if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; }))
6336 continue;
6337
6338 if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) {
6339 auto *Ext = cast<Instruction>(Val&: Op);
6340 auto *ExtOp = Ext->getOperand(i: 0);
6341 if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
6342 Ops.push_back(Elt: &Ext->getOperandUse(i: 0));
6343 Ops.push_back(Elt: &Op);
6344
6345 if (isa<SExtInst>(Val: Ext))
6346 NumSExts++;
6347 else
6348 NumZExts++;
6349
6350 continue;
6351 }
6352
6353 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
6354 if (!Shuffle)
6355 continue;
6356
6357 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
6358 // operand and the s/zext can help create indexed s/umull. This is
6359 // especially useful to prevent i64 mul being scalarized.
6360 if (isSplatShuffle(V: Shuffle) &&
6361 match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) {
6362 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
6363 Ops.push_back(Elt: &Op);
6364 if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value())))
6365 NumSExts++;
6366 else
6367 NumZExts++;
6368 continue;
6369 }
6370
6371 Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0);
6372 InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
6373 if (!Insert)
6374 continue;
6375
6376 Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1));
6377 if (!OperandInstr)
6378 continue;
6379
6380 ConstantInt *ElementConstant =
6381 dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2));
6382 // Check that the insertelement is inserting into element 0
6383 if (!ElementConstant || !ElementConstant->isZero())
6384 continue;
6385
6386 unsigned Opcode = OperandInstr->getOpcode();
6387 if (Opcode == Instruction::SExt)
6388 NumSExts++;
6389 else if (Opcode == Instruction::ZExt)
6390 NumZExts++;
6391 else {
6392 // If we find that the top bits are known 0, then we can sink and allow
6393 // the backend to generate a umull.
6394 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
6395 APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2);
6396 if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL))
6397 continue;
6398 NumZExts++;
6399 }
6400
6401 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
6402 // the And, just to hoist it again back to the load.
6403 if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value())))
6404 Ops.push_back(Elt: &Insert->getOperandUse(i: 1));
6405 Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0));
6406 Ops.push_back(Elt: &Op);
6407 }
6408
6409 // It is profitable to sink if we found two of the same type of extends.
6410 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
6411 return true;
6412
6413 // Otherwise, see if we should sink splats for indexed variants.
6414 if (!ShouldSinkSplatForIndexedVariant(I))
6415 return false;
6416
6417 Ops.clear();
6418 if (isSplatShuffle(V: I->getOperand(i: 0)))
6419 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6420 if (isSplatShuffle(V: I->getOperand(i: 1)))
6421 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6422
6423 return !Ops.empty();
6424 }
6425 case Instruction::FMul: {
6426 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6427 if (I->getType()->isScalableTy())
6428 return false;
6429
6430 if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6431 !ST->hasFullFP16())
6432 return false;
6433
6434 // Sink splats for index lane variants
6435 if (isSplatShuffle(V: I->getOperand(i: 0)))
6436 Ops.push_back(Elt: &I->getOperandUse(i: 0));
6437 if (isSplatShuffle(V: I->getOperand(i: 1)))
6438 Ops.push_back(Elt: &I->getOperandUse(i: 1));
6439 return !Ops.empty();
6440 }
6441 default:
6442 return false;
6443 }
6444 return false;
6445}
6446