AArch64TargetTransformInfo.cpp source code [llvm_projects/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp]

1	//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "AArch64TargetTransformInfo.h"
10	#include "AArch64ExpandImm.h"
11	#include "AArch64PerfectShuffle.h"
12	#include "AArch64SMEAttributes.h"
13	#include "MCTargetDesc/AArch64AddressingModes.h"
14	#include "llvm/ADT/DenseMap.h"
15	#include "llvm/Analysis/LoopInfo.h"
16	#include "llvm/Analysis/TargetTransformInfo.h"
17	#include "llvm/CodeGen/BasicTTIImpl.h"
18	#include "llvm/CodeGen/CostTable.h"
19	#include "llvm/CodeGen/TargetLowering.h"
20	#include "llvm/IR/DerivedTypes.h"
21	#include "llvm/IR/IntrinsicInst.h"
22	#include "llvm/IR/Intrinsics.h"
23	#include "llvm/IR/IntrinsicsAArch64.h"
24	#include "llvm/IR/PatternMatch.h"
25	#include "llvm/Support/Debug.h"
26	#include "llvm/TargetParser/AArch64TargetParser.h"
27	#include "llvm/Transforms/InstCombine/InstCombiner.h"
28	#include "llvm/Transforms/Utils/UnrollLoop.h"
29	#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
30	#include <algorithm>
31	#include <optional>
32	using namespace llvm;
33	using namespace llvm::PatternMatch;
34
35	#define DEBUG_TYPE "aarch64tti"
36
37	static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38	cl::init(Val: true), cl::Hidden);
39
40	static cl::opt<bool> SVEPreferFixedOverScalableIfEqualCost(
41	"sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43	static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(Val: `10`),
44	cl::Hidden);
45
46	static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47	cl::init(Val: `10`), cl::Hidden);
48
49	static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50	cl::init(Val: `15`), cl::Hidden);
51
52	static cl::opt<unsigned>
53	NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(Val: `10`),
54	cl::Hidden);
55
56	static cl::opt<unsigned> CallPenaltyChangeSM(
57	"call-penalty-sm-change", cl::init(Val: `5`), cl::Hidden,
58	cl::desc (
59	"Penalty of calling a function that requires a change to PSTATE.SM"));
60
61	static cl::opt<unsigned> InlineCallPenaltyChangeSM(
62	"inline-call-penalty-sm-change", cl::init(Val: `10`), cl::Hidden,
63	cl::desc ("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
65	static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66	cl::init(Val: true), cl::Hidden);
67
68	static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69	cl::init(Val: true), cl::Hidden);
70
71	// A complete guess as to a reasonable cost.
72	static cl::opt<unsigned>
73	BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(Val: `8`), cl::Hidden,
74	cl::desc ("The cost of a histcnt instruction"));
75
76	static cl::opt<unsigned> DMBLookaheadThreshold(
77	"dmb-lookahead-threshold", cl::init(Val: `10`), cl::Hidden,
78	cl::desc ("The number of instructions to search for a redundant dmb"));
79
80	static cl::opt<int> Aarch64ForceUnrollThreshold(
81	"aarch64-force-unroll-threshold", cl::init(Val: `0`), cl::Hidden,
82	cl::desc ("Threshold for forced unrolling of small loops in AArch64"));
83
84	namespace {
85	class TailFoldingOption {
86	// These bitfields will only ever be set to something non-zero in operator=,
87	// when setting the -sve-tail-folding option. This option should always be of
88	// the form (default\|simple\|all\|disable)[+(Flag1\|Flag2\|etc)], where here
89	// InitialBits is one of (disabled\|all\|simple). EnableBits represents
90	// additional flags we're enabling, and DisableBits for those flags we're
91	// disabling. The default flag is tracked in the variable NeedsDefault, since
92	// at the time of setting the option we may not know what the default value
93	// for the CPU is.
94	TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
95	TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
96	TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
97
98	// This value needs to be initialised to true in case the user does not
99	// explicitly set the -sve-tail-folding option.
100	bool NeedsDefault = true;
101
102	void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104	void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106	void setEnableBit(TailFoldingOpts Bit) {
107	EnableBits \|= Bit;
108	DisableBits &= ~Bit;
109	}
110
111	void setDisableBit(TailFoldingOpts Bit) {
112	EnableBits &= ~Bit;
113	DisableBits \|= Bit;
114	}
115
116	TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117	TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119	assert((InitialBits == TailFoldingOpts::Disabled \|\| !NeedsDefault) &&
120	"Initial bits should only include one of "
121	"(disabled\|all\|simple\|default)");
122	Bits = NeedsDefault ? DefaultBits : InitialBits;
123	Bits \|= EnableBits;
124	Bits &= ~DisableBits;
125
126	return Bits;
127	}
128
129	void reportError(std::string Opt) {
130	errs() << "invalid argument '" << Opt
131	<< "' to -sve-tail-folding=; the option should be of the form\n"
132	" (disabled\|all\|default\|simple)[+(reductions\|recurrences"
133	"\|reverse\|noreductions\|norecurrences\|noreverse)]\n";
134	report_fatal_error(reason: "Unrecognised tail-folding option");
135	}
136
137	public:
138
139	void operator=(const std::string &Val) {
140	// If the user explicitly sets -sve-tail-folding= then treat as an error.
141	if (Val.empty()) {
142	reportError(Opt: "");
143	return;
144	}
145
146	// Since the user is explicitly setting the option we don't automatically
147	// need the default unless they require it.
148	setNeedsDefault(false);
149
150	SmallVector<StringRef, `4`> TailFoldTypes;
151	StringRef (Val).split(A&: TailFoldTypes, Separator: `'+'`, MaxSplit: -`1`, KeepEmpty: false);
152
153	unsigned StartIdx = `1`;
154	if (TailFoldTypes [`0`] == "disabled")
155	setInitialBits(TailFoldingOpts::Disabled);
156	else if (TailFoldTypes [`0`] == "all")
157	setInitialBits(TailFoldingOpts::All);
158	else if (TailFoldTypes [`0`] == "default")
159	setNeedsDefault(true);
160	else if (TailFoldTypes [`0`] == "simple")
161	setInitialBits(TailFoldingOpts::Simple);
162	else {
163	StartIdx = `0`;
164	setInitialBits(TailFoldingOpts::Disabled);
165	}
166
167	for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168	if (TailFoldTypes [I] == "reductions")
169	setEnableBit(TailFoldingOpts::Reductions);
170	else if (TailFoldTypes [I] == "recurrences")
171	setEnableBit(TailFoldingOpts::Recurrences);
172	else if (TailFoldTypes [I] == "reverse")
173	setEnableBit(TailFoldingOpts::Reverse);
174	else if (TailFoldTypes [I] == "noreductions")
175	setDisableBit(TailFoldingOpts::Reductions);
176	else if (TailFoldTypes [I] == "norecurrences")
177	setDisableBit(TailFoldingOpts::Recurrences);
178	else if (TailFoldTypes [I] == "noreverse")
179	setDisableBit(TailFoldingOpts::Reverse);
180	else
181	reportError(Opt: Val);
182	}
183	}
184
185	bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186	return (getBits(DefaultBits) & Required) == Required;
187	}
188	};
189	} // namespace
190
191	TailFoldingOption TailFoldingOptionLoc;
192
193	static cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
194	"sve-tail-folding",
195	cl::desc (
196	"Control the use of vectorisation using tail-folding for SVE where the"
197	" option is specified in the form (Initial)[+(Flag1\|Flag2\|...)]:"
198	"\ndisabled (Initial) No loop types will vectorize using "
199	"tail-folding"
200	"\ndefault (Initial) Uses the default tail-folding settings for "
201	"the target CPU"
202	"\nall (Initial) All legal loop types will vectorize using "
203	"tail-folding"
204	"\nsimple (Initial) Use tail-folding for simple loops (not "
205	"reductions or recurrences)"
206	"\nreductions Use tail-folding for loops containing reductions"
207	"\nnoreductions Inverse of above"
208	"\nrecurrences Use tail-folding for loops containing fixed order "
209	"recurrences"
210	"\nnorecurrences Inverse of above"
211	"\nreverse Use tail-folding for loops requiring reversed "
212	"predicates"
213	"\nnoreverse Inverse of above"),
214	cl::location(L&: TailFoldingOptionLoc));
215
216	// Experimental option that will only be fully functional when the
217	// code-generator is changed to use SVE instead of NEON for all fixed-width
218	// operations.
219	static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
220	"enable-fixedwidth-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
221
222	// Experimental option that will only be fully functional when the cost-model
223	// and code-generator have been changed to avoid using scalable vector
224	// instructions that are not legal in streaming SVE mode.
225	static cl::opt<bool> EnableScalableAutovecInStreamingMode(
226	"enable-scalable-autovec-in-streaming-mode", cl::init(Val: false), cl::Hidden);
227
228	static bool isSMEABIRoutineCall(const CallInst &CI,
229	const AArch64TargetLowering &TLI) {
230	const auto *F = CI.getCalledFunction();
231	return F &&
232	SMEAttrs (F->getName(), TLI.getRuntimeLibcallsInfo()).isSMEABIRoutine();
233	}
234
235	/// Returns true if the function has explicit operations that can only be
236	/// lowered using incompatible instructions for the selected mode. This also
237	/// returns true if the function F may use or modify ZA state.
238	static bool hasPossibleIncompatibleOps(const Function *F,
239	const AArch64TargetLowering &TLI) {
240	for (const BasicBlock &BB : *F) {
241	for (const Instruction &I : BB) {
242	// Be conservative for now and assume that any call to inline asm or to
243	// intrinsics could could result in non-streaming ops (e.g. calls to
244	// @llvm.aarch64. or @llvm.gather/scatter intrinsics). We can assume that*
245	// all native LLVM instructions can be lowered to compatible instructions.
246	if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() &&
247	(cast<CallInst>(Val: I).isInlineAsm() \|\| isa<IntrinsicInst>(Val: I) \|\|
248	isSMEABIRoutineCall(CI: cast<CallInst>(Val: I), TLI)))
249	return true;
250	}
251	}
252	return false;
253	}
254
255	static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI,
256	SmallVectorImpl<StringRef> &Features) {
257	StringRef AttributeStr =
258	TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259	StringRef FeatureStr = F.getFnAttribute(Kind: AttributeStr).getValueAsString();
260	FeatureStr.split(A&: Features, Separator: ",");
261	}
262
263	APInt AArch64TTIImpl::getFeatureMask(const Function &F) const {
264	SmallVector<StringRef, `8`> Features;
265	extractAttrFeatures(F, TTI: this, Features);
266	return AArch64::getCpuSupportsMask(Features);
267	}
268
269	APInt AArch64TTIImpl::getPriorityMask(const Function &F) const {
270	SmallVector<StringRef, `8`> Features;
271	extractAttrFeatures(F, TTI: this, Features);
272	return AArch64::getFMVPriority(Features);
273	}
274
275	bool AArch64TTIImpl::isMultiversionedFunction(const Function &F) const {
276	return F.hasFnAttribute(Kind: "fmv-features");
277	}
278
279	const FeatureBitset AArch64TTIImpl::InlineInverseFeatures = {
280	AArch64::FeatureExecuteOnly,
281	};
282
283	bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
284	const Function Callee) const* {
285	SMECallAttrs CallAttrs(Caller, Callee);
286
287	// Never inline a function explicitly marked as being streaming,
288	// into a non-streaming function. Assume it was marked as streaming
289	// for a reason.
290	if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
291	CallAttrs.callee().hasStreamingInterfaceOrBody())
292	return false;
293
294	// When inlining, we should consider the body of the function, not the
295	// interface.
296	if (CallAttrs.callee().hasStreamingBody()) {
297	CallAttrs.callee().set(M: SMEAttrs::SM_Compatible, Enable: false);
298	CallAttrs.callee().set(M: SMEAttrs::SM_Enabled, Enable: true);
299	}
300
301	if (CallAttrs.callee().isNewZA() \|\| CallAttrs.callee().isNewZT0())
302	return false;
303
304	if (CallAttrs.requiresLazySave() \|\| CallAttrs.requiresSMChange() \|\|
305	CallAttrs.requiresPreservingZT0() \|\|
306	CallAttrs.requiresPreservingAllZAState()) {
307	if (hasPossibleIncompatibleOps(F: Callee, TLI: *getTLI()))
308	return false;
309	}
310
311	const TargetMachine &TM = getTLI()->getTargetMachine();
312	const FeatureBitset &CallerBits =
313	TM.getSubtargetImpl(*Caller)->getFeatureBits();
314	const FeatureBitset &CalleeBits =
315	TM.getSubtargetImpl(*Callee)->getFeatureBits();
316	// Adjust the feature bitsets by inverting some of the bits. This is needed
317	// for target features that represent restrictions rather than capabilities,
318	// for example a "+execute-only" callee can be inlined into a caller without
319	// "+execute-only", but not vice versa.
320	FeatureBitset EffectiveCallerBits = CallerBits ^ InlineInverseFeatures;
321	FeatureBitset EffectiveCalleeBits = CalleeBits ^ InlineInverseFeatures;
322
323	return (EffectiveCallerBits & EffectiveCalleeBits) == EffectiveCalleeBits;
324	}
325
326	bool AArch64TTIImpl::areTypesABICompatible(const Function *Caller,
327	const Function *Callee,
328	ArrayRef<Type > Types) const* {
329	if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
330	return false;
331
332	// We need to ensure that argument promotion does not attempt to promote
333	// pointers to fixed-length vector types larger than 128 bits like
334	// <8 x float> (and pointers to aggregate types which have such fixed-length
335	// vector type members) into the values of the pointees. Such vector types
336	// are used for SVE VLS but there is no ABI for SVE VLS arguments and the
337	// backend cannot lower such value arguments. The 128-bit fixed-length SVE
338	// types can be safely treated as 128-bit NEON types and they cannot be
339	// distinguished in IR.
340	if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range&: Types, P: [](Type *Ty) {
341	auto FVTy = dyn_cast<FixedVectorType>(Val: Ty);
342	return FVTy &&
343	FVTy->getScalarSizeInBits() * FVTy->getNumElements() > `128`;
344	}))
345	return false;
346
347	return true;
348	}
349
350	unsigned
351	AArch64TTIImpl::getInlineCallPenalty(const Function F, const* CallBase &Call,
352	unsigned DefaultCallPenalty) const {
353	// This function calculates a penalty for executing Call in F.
354	//
355	// There are two ways this function can be called:
356	// (1) F:
357	// call from F -> G (the call here is Call)
358	//
359	// For (1), Call.getCaller() == F, so it will always return a high cost if
360	// a streaming-mode change is required (thus promoting the need to inline the
361	// function)
362	//
363	// (2) F:
364	// call from F -> G (the call here is not Call)
365	// G:
366	// call from G -> H (the call here is Call)
367	//
368	// For (2), if after inlining the body of G into F the call to H requires a
369	// streaming-mode change, and the call to G from F would also require a
370	// streaming-mode change, then there is benefit to do the streaming-mode
371	// change only once and avoid inlining of G into F.
372
373	SMEAttrs FAttrs(*F);
374	SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
375
376	if (SMECallAttrs (FAttrs, CallAttrs.callee()).requiresSMChange()) {
377	if (F == Call.getCaller()) // (1)
378	return CallPenaltyChangeSM * DefaultCallPenalty;
379	if (SMECallAttrs (FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
380	return InlineCallPenaltyChangeSM * DefaultCallPenalty;
381	}
382
383	return DefaultCallPenalty;
384	}
385
386	bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
387	TargetTransformInfo::RegisterKind K) const {
388	assert(K != TargetTransformInfo::RGK_Scalar);
389
390	if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
391	return true;
392
393	return K == TargetTransformInfo::RGK_ScalableVector &&
394	ST->isSVEorStreamingSVEAvailable() &&
395	!ST->disableMaximizeScalableBandwidth();
396	}
397
398	/// Calculate the cost of materializing a 64-bit value. This helper
399	/// method might only calculate a fraction of a larger immediate. Therefore it
400	/// is valid to return a cost of ZERO.
401	InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) const {
402	// Check if the immediate can be encoded within an instruction.
403	if (Val == `0` \|\| AArch64_AM::isLogicalImmediate(imm: Val, regSize: `64`))
404	return `0`;
405
406	if (Val < `0`)
407	Val = ~Val;
408
409	// Calculate how many moves we will need to materialize this constant.
410	SmallVector<AArch64_IMM::ImmInsnModel, `4`> Insn;
411	AArch64_IMM::expandMOVImm(Imm: Val, BitSize: `64`, Insn);
412	return Insn.size();
413	}
414
415	/// Calculate the cost of materializing the given constant.
416	InstructionCost
417	AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
418	TTI::TargetCostKind CostKind) const {
419	assert(Ty->isIntegerTy());
420
421	unsigned BitSize = Ty->getPrimitiveSizeInBits();
422	if (BitSize == `0`)
423	return ~`0U`;
424
425	// Sign-extend all constants to a multiple of 64-bit.
426	APInt ImmVal = Imm;
427	if (BitSize & `0x3f`)
428	ImmVal = Imm.sext(width: (BitSize + `63`) & ~`0x3fU`);
429
430	// Split the constant into 64-bit chunks and calculate the cost for each
431	// chunk.
432	InstructionCost Cost = `0`;
433	for (unsigned ShiftVal = `0`; ShiftVal < BitSize; ShiftVal += `64`) {
434	APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: `64`);
435	int64_t Val = Tmp.getSExtValue();
436	Cost += getIntImmCost(Val);
437	}
438	// We need at least one instruction to materialze the constant.
439	return std::max<InstructionCost>(a: `1`, b: Cost);
440	}
441
442	InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
443	const APInt &Imm, Type *Ty,
444	TTI::TargetCostKind CostKind,
445	Instruction Inst) const* {
446	assert(Ty->isIntegerTy());
447
448	unsigned BitSize = Ty->getPrimitiveSizeInBits();
449	// There is no cost model for constants with a bit size of 0. Return TCC_Free
450	// here, so that constant hoisting will ignore this constant.
451	if (BitSize == `0`)
452	return TTI::TCC_Free;
453
454	unsigned ImmIdx = ~`0U`;
455	switch (Opcode) {
456	default:
457	return TTI::TCC_Free;
458	case Instruction::GetElementPtr:
459	// Always hoist the base address of a GetElementPtr.
460	if (Idx == `0`)
461	return `2` * TTI::TCC_Basic;
462	return TTI::TCC_Free;
463	case Instruction::Store:
464	ImmIdx = `0`;
465	break;
466	case Instruction::Add:
467	case Instruction::Sub:
468	case Instruction::Mul:
469	case Instruction::UDiv:
470	case Instruction::SDiv:
471	case Instruction::URem:
472	case Instruction::SRem:
473	case Instruction::And:
474	case Instruction::Or:
475	case Instruction::Xor:
476	case Instruction::ICmp:
477	ImmIdx = `1`;
478	break;
479	// Always return TCC_Free for the shift value of a shift instruction.
480	case Instruction::Shl:
481	case Instruction::LShr:
482	case Instruction::AShr:
483	if (Idx == `1`)
484	return TTI::TCC_Free;
485	break;
486	case Instruction::Trunc:
487	case Instruction::ZExt:
488	case Instruction::SExt:
489	case Instruction::IntToPtr:
490	case Instruction::PtrToInt:
491	case Instruction::BitCast:
492	case Instruction::PHI:
493	case Instruction::Call:
494	case Instruction::Select:
495	case Instruction::Ret:
496	case Instruction::Load:
497	break;
498	}
499
500	if (Idx == ImmIdx) {
501	int NumConstants = (BitSize + `63`) / `64`;
502	InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
503	return (Cost <= NumConstants * TTI::TCC_Basic)
504	? static_cast<int>(TTI::TCC_Free)
505	: Cost;
506	}
507	return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
508	}
509
510	InstructionCost
511	AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
512	const APInt &Imm, Type *Ty,
513	TTI::TargetCostKind CostKind) const {
514	assert(Ty->isIntegerTy());
515
516	unsigned BitSize = Ty->getPrimitiveSizeInBits();
517	// There is no cost model for constants with a bit size of 0. Return TCC_Free
518	// here, so that constant hoisting will ignore this constant.
519	if (BitSize == `0`)
520	return TTI::TCC_Free;
521
522	// Most (all?) AArch64 intrinsics do not support folding immediates into the
523	// selected instruction, so we compute the materialization cost for the
524	// immediate directly.
525	if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
526	return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
527
528	switch (IID) {
529	default:
530	return TTI::TCC_Free;
531	case Intrinsic::sadd_with_overflow:
532	case Intrinsic::uadd_with_overflow:
533	case Intrinsic::ssub_with_overflow:
534	case Intrinsic::usub_with_overflow:
535	case Intrinsic::smul_with_overflow:
536	case Intrinsic::umul_with_overflow:
537	if (Idx == `1`) {
538	int NumConstants = (BitSize + `63`) / `64`;
539	InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
540	return (Cost <= NumConstants * TTI::TCC_Basic)
541	? static_cast<int>(TTI::TCC_Free)
542	: Cost;
543	}
544	break;
545	case Intrinsic::experimental_stackmap:
546	if ((Idx < `2`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
547	return TTI::TCC_Free;
548	break;
549	case Intrinsic::experimental_patchpoint_void:
550	case Intrinsic::experimental_patchpoint:
551	if ((Idx < `4`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
552	return TTI::TCC_Free;
553	break;
554	case Intrinsic::experimental_gc_statepoint:
555	if ((Idx < `5`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
556	return TTI::TCC_Free;
557	break;
558	}
559	return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
560	}
561
562	TargetTransformInfo::PopcntSupportKind
563	AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
564	assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
565	if (TyWidth == `32` \|\| TyWidth == `64`)
566	return TTI::PSK_FastHardware;
567	// TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
568	return TTI::PSK_Software;
569	}
570
571	static bool isUnpackedVectorVT(EVT VecVT) {
572	return VecVT.isScalableVector() &&
573	VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
574	}
575
576	static InstructionCost getHistogramCost(const AArch64Subtarget *ST,
577	const IntrinsicCostAttributes &ICA) {
578	// We need to know at least the number of elements in the vector of buckets
579	// and the size of each element to update.
580	if (ICA.getArgTypes().size() < `2`)
581	return InstructionCost::getInvalid();
582
583	// Only interested in costing for the hardware instruction from SVE2.
584	if (!ST->hasSVE2())
585	return InstructionCost::getInvalid();
586
587	Type BucketPtrsTy = ICA.getArgTypes()[`0`]; // Type of vector of pointers*
588	Type EltTy = ICA.getArgTypes()[`1`]; // Type of bucket elements*
589	unsigned TotalHistCnts = `1`;
590
591	unsigned EltSize = EltTy->getScalarSizeInBits();
592	// Only allow (up to 64b) integers or pointers
593	if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) \|\| EltSize > `64`)
594	return InstructionCost::getInvalid();
595
596	// FIXME: We should be able to generate histcnt for fixed-length vectors
597	// using ptrue with a specific VL.
598	if (VectorType *VTy = dyn_cast<VectorType>(Val: BucketPtrsTy)) {
599	unsigned EC = VTy->getElementCount().getKnownMinValue();
600	if (!isPowerOf2_64(Value: EC) \|\| !VTy->isScalableTy())
601	return InstructionCost::getInvalid();
602
603	// HistCnt only supports 32b and 64b element types
604	unsigned LegalEltSize = EltSize <= `32` ? `32` : `64`;
605
606	if (EC == `2` \|\| (LegalEltSize == `32` && EC == `4`))
607	return InstructionCost (BaseHistCntCost);
608
609	unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
610	TotalHistCnts = EC / NaturalVectorWidth;
611
612	return InstructionCost (BaseHistCntCost * TotalHistCnts);
613	}
614
615	return InstructionCost::getInvalid();
616	}
617
618	InstructionCost
619	AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
620	TTI::TargetCostKind CostKind) const {
621	// The code-generator is currently not able to handle scalable vectors
622	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
623	// it. This change will be removed when code-generation for these types is
624	// sufficiently reliable.
625	auto *RetTy = ICA.getReturnType();
626	if (auto *VTy = dyn_cast<ScalableVectorType>(Val: RetTy))
627	if (VTy->getElementCount() == ElementCount::getScalable(MinVal: `1`))
628	return InstructionCost::getInvalid();
629
630	switch (ICA.getID()) {
631	case Intrinsic::experimental_vector_histogram_add: {
632	InstructionCost HistCost = getHistogramCost(ST, ICA);
633	// If the cost isn't valid, we may still be able to scalarize
634	if (HistCost.isValid())
635	return HistCost;
636	break;
637	}
638	case Intrinsic::umin:
639	case Intrinsic::umax:
640	case Intrinsic::smin:
641	case Intrinsic::smax: {
642	static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
643	MVT::v8i16, MVT::v2i32, MVT::v4i32,
644	MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
645	MVT::nxv2i64};
646	auto LT = getTypeLegalizationCost(Ty: RetTy);
647	// v2i64 types get converted to cmp+bif hence the cost of 2
648	if (LT.second == MVT::v2i64)
649	return LT.first * `2`;
650	if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)))
651	return LT.first;
652	break;
653	}
654	case Intrinsic::scmp:
655	case Intrinsic::ucmp: {
656	static const CostTblEntry BitreverseTbl[] = {
657	{.ISD: Intrinsic::scmp, .Type: MVT::i32, .Cost: `3`}, // cmp+cset+csinv
658	{.ISD: Intrinsic::scmp, .Type: MVT::i64, .Cost: `3`}, // cmp+cset+csinv
659	{.ISD: Intrinsic::scmp, .Type: MVT::v8i8, .Cost: `3`}, // cmgt+cmgt+sub
660	{.ISD: Intrinsic::scmp, .Type: MVT::v16i8, .Cost: `3`}, // cmgt+cmgt+sub
661	{.ISD: Intrinsic::scmp, .Type: MVT::v4i16, .Cost: `3`}, // cmgt+cmgt+sub
662	{.ISD: Intrinsic::scmp, .Type: MVT::v8i16, .Cost: `3`}, // cmgt+cmgt+sub
663	{.ISD: Intrinsic::scmp, .Type: MVT::v2i32, .Cost: `3`}, // cmgt+cmgt+sub
664	{.ISD: Intrinsic::scmp, .Type: MVT::v4i32, .Cost: `3`}, // cmgt+cmgt+sub
665	{.ISD: Intrinsic::scmp, .Type: MVT::v1i64, .Cost: `3`}, // cmgt+cmgt+sub
666	{.ISD: Intrinsic::scmp, .Type: MVT::v2i64, .Cost: `3`}, // cmgt+cmgt+sub
667	};
668	const auto LT = getTypeLegalizationCost(Ty: RetTy);
669	const auto *Entry =
670	CostTableLookup(Table: BitreverseTbl, ISD: Intrinsic::scmp, Ty: LT.second);
671	if (Entry)
672	return Entry->Cost * LT.first;
673	break;
674	}
675	case Intrinsic::sadd_sat:
676	case Intrinsic::ssub_sat:
677	case Intrinsic::uadd_sat:
678	case Intrinsic::usub_sat: {
679	static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
680	MVT::v8i16, MVT::v2i32, MVT::v4i32,
681	MVT::v2i64};
682	auto LT = getTypeLegalizationCost(Ty: RetTy);
683	// This is a base cost of 1 for the vadd, plus 3 extract shifts if we
684	// need to extend the type, as it uses shr(qadd(shl, shl)).
685	unsigned Instrs =
686	LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? `1` : `4`;
687	if (any_of(Range: ValidSatTys, P: equal_to(Arg&: LT.second)))
688	return LT.first * Instrs;
689
690	TypeSize TS = getDataLayout().getTypeSizeInBits(Ty: RetTy);
691	uint64_t VectorSize = TS.getKnownMinValue();
692
693	if (ST->isSVEAvailable() && VectorSize >= `128` && isPowerOf2_64(Value: VectorSize))
694	return LT.first * Instrs;
695
696	break;
697	}
698	case Intrinsic::abs: {
699	static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
700	MVT::v8i16, MVT::v2i32, MVT::v4i32,
701	MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
702	MVT::nxv4i32, MVT::nxv2i64};
703	auto LT = getTypeLegalizationCost(Ty: RetTy);
704	if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)))
705	return LT.first;
706	break;
707	}
708	case Intrinsic::bswap: {
709	static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
710	MVT::v4i32, MVT::v2i64};
711	auto LT = getTypeLegalizationCost(Ty: RetTy);
712	if (any_of(Range: ValidAbsTys, P: equal_to(Arg&: LT.second)) &&
713	LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
714	return LT.first;
715	break;
716	}
717	case Intrinsic::fma:
718	case Intrinsic::fmuladd: {
719	// Given a fma or fmuladd, cost it the same as a fmul instruction which are
720	// usually the same for costs. TODO: Add fp16 and bf16 expansion costs.
721	Type *EltTy = RetTy->getScalarType();
722	if (EltTy->isFloatTy() \|\| EltTy->isDoubleTy() \|\|
723	(EltTy->isHalfTy() && ST->hasFullFP16()))
724	return getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
725	break;
726	}
727	case Intrinsic::stepvector: {
728	InstructionCost Cost = `1`; // Cost of the `index' instruction
729	auto LT = getTypeLegalizationCost(Ty: RetTy);
730	// Legalisation of illegal vectors involves an `index' instruction plus
731	// (LT.first - 1) vector adds.
732	if (LT.first > `1`) {
733	Type *LegalVTy = EVT (LT.second).getTypeForEVT(Context&: RetTy->getContext());
734	InstructionCost AddCost =
735	getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind);
736	Cost += AddCost * (LT.first - `1`);
737	}
738	return Cost;
739	}
740	case Intrinsic::vector_extract:
741	case Intrinsic::vector_insert: {
742	// If both the vector and subvector types are legal types and the index
743	// is 0, then this should be a no-op or simple operation; return a
744	// relatively low cost.
745
746	// If arguments aren't actually supplied, then we cannot determine the
747	// value of the index. We also want to skip predicate types.
748	if (ICA.getArgs().size() != ICA.getArgTypes().size() \|\|
749	ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: `1`))
750	break;
751
752	LLVMContext &C = RetTy->getContext();
753	EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[`0`]);
754	bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
755	EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy)
756	: getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[`1`]);
757	// Skip this if either the vector or subvector types are unpacked
758	// SVE types; they may get lowered to stack stores and loads.
759	if (isUnpackedVectorVT(VecVT) \|\| isUnpackedVectorVT(VecVT: SubVecVT))
760	break;
761
762	TargetLoweringBase::LegalizeKind SubVecLK =
763	getTLI()->getTypeConversion(Context&: C, VT: SubVecVT);
764	TargetLoweringBase::LegalizeKind VecLK =
765	getTLI()->getTypeConversion(Context&: C, VT: VecVT);
766	const Value *Idx = IsExtract ? ICA.getArgs()[`1`] : ICA.getArgs()[`2`];
767	const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx);
768	if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
769	VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
770	return TTI::TCC_Free;
771	break;
772	}
773	case Intrinsic::bitreverse: {
774	static const CostTblEntry BitreverseTbl[] = {
775	{.ISD: Intrinsic::bitreverse, .Type: MVT::i32, .Cost: `1`},
776	{.ISD: Intrinsic::bitreverse, .Type: MVT::i64, .Cost: `1`},
777	{.ISD: Intrinsic::bitreverse, .Type: MVT::v8i8, .Cost: `1`},
778	{.ISD: Intrinsic::bitreverse, .Type: MVT::v16i8, .Cost: `1`},
779	{.ISD: Intrinsic::bitreverse, .Type: MVT::v4i16, .Cost: `2`},
780	{.ISD: Intrinsic::bitreverse, .Type: MVT::v8i16, .Cost: `2`},
781	{.ISD: Intrinsic::bitreverse, .Type: MVT::v2i32, .Cost: `2`},
782	{.ISD: Intrinsic::bitreverse, .Type: MVT::v4i32, .Cost: `2`},
783	{.ISD: Intrinsic::bitreverse, .Type: MVT::v1i64, .Cost: `2`},
784	{.ISD: Intrinsic::bitreverse, .Type: MVT::v2i64, .Cost: `2`},
785	};
786	const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
787	const auto *Entry =
788	CostTableLookup(Table: BitreverseTbl, ISD: ICA.getID(), Ty: LegalisationCost.second);
789	if (Entry) {
790	// Cost Model is using the legal type(i32) that i8 and i16 will be
791	// converted to +1 so that we match the actual lowering cost
792	if (TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i8 \|\|
793	TLI->getValueType(DL, Ty: RetTy, AllowUnknown: true) == MVT::i16)
794	return LegalisationCost.first * Entry->Cost + `1`;
795
796	return LegalisationCost.first * Entry->Cost;
797	}
798	break;
799	}
800	case Intrinsic::ctpop: {
801	if (!ST->hasNEON()) {
802	// 32-bit or 64-bit ctpop without NEON is 12 instructions.
803	return getTypeLegalizationCost(Ty: RetTy).first * `12`;
804	}
805	static const CostTblEntry CtpopCostTbl[] = {
806	{.ISD: ISD::CTPOP, .Type: MVT::v2i64, .Cost: `4`},
807	{.ISD: ISD::CTPOP, .Type: MVT::v4i32, .Cost: `3`},
808	{.ISD: ISD::CTPOP, .Type: MVT::v8i16, .Cost: `2`},
809	{.ISD: ISD::CTPOP, .Type: MVT::v16i8, .Cost: `1`},
810	{.ISD: ISD::CTPOP, .Type: MVT::i64, .Cost: `4`},
811	{.ISD: ISD::CTPOP, .Type: MVT::v2i32, .Cost: `3`},
812	{.ISD: ISD::CTPOP, .Type: MVT::v4i16, .Cost: `2`},
813	{.ISD: ISD::CTPOP, .Type: MVT::v8i8, .Cost: `1`},
814	{.ISD: ISD::CTPOP, .Type: MVT::i32, .Cost: `5`},
815	};
816	auto LT = getTypeLegalizationCost(Ty: RetTy);
817	MVT MTy = LT.second;
818	if (const auto *Entry = CostTableLookup(Table: CtpopCostTbl, ISD: ISD::CTPOP, Ty: MTy)) {
819	// Extra cost of +1 when illegal vector types are legalized by promoting
820	// the integer type.
821	int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
822	RetTy->getScalarSizeInBits()
823	? `1`
824	: `0`;
825	return LT.first * Entry->Cost + ExtraCost;
826	}
827	break;
828	}
829	case Intrinsic::sadd_with_overflow:
830	case Intrinsic::uadd_with_overflow:
831	case Intrinsic::ssub_with_overflow:
832	case Intrinsic::usub_with_overflow:
833	case Intrinsic::smul_with_overflow:
834	case Intrinsic::umul_with_overflow: {
835	static const CostTblEntry WithOverflowCostTbl[] = {
836	{.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i8, .Cost: `3`},
837	{.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i8, .Cost: `3`},
838	{.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i16, .Cost: `3`},
839	{.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i16, .Cost: `3`},
840	{.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i32, .Cost: `1`},
841	{.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i32, .Cost: `1`},
842	{.ISD: Intrinsic::sadd_with_overflow, .Type: MVT::i64, .Cost: `1`},
843	{.ISD: Intrinsic::uadd_with_overflow, .Type: MVT::i64, .Cost: `1`},
844	{.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i8, .Cost: `3`},
845	{.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i8, .Cost: `3`},
846	{.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i16, .Cost: `3`},
847	{.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i16, .Cost: `3`},
848	{.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i32, .Cost: `1`},
849	{.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i32, .Cost: `1`},
850	{.ISD: Intrinsic::ssub_with_overflow, .Type: MVT::i64, .Cost: `1`},
851	{.ISD: Intrinsic::usub_with_overflow, .Type: MVT::i64, .Cost: `1`},
852	{.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i8, .Cost: `5`},
853	{.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i8, .Cost: `4`},
854	{.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i16, .Cost: `5`},
855	{.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i16, .Cost: `4`},
856	{.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i32, .Cost: `2`}, // eg umull;tst
857	{.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i32, .Cost: `2`}, // eg umull;cmp sxtw
858	{.ISD: Intrinsic::smul_with_overflow, .Type: MVT::i64, .Cost: `3`}, // eg mul;smulh;cmp
859	{.ISD: Intrinsic::umul_with_overflow, .Type: MVT::i64, .Cost: `3`}, // eg mul;umulh;cmp asr
860	};
861	EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: `0`), AllowUnknown: true);
862	if (MTy.isSimple())
863	if (const auto *Entry = CostTableLookup(Table: WithOverflowCostTbl, ISD: ICA.getID(),
864	Ty: MTy.getSimpleVT()))
865	return Entry->Cost;
866	break;
867	}
868	case Intrinsic::fptosi_sat:
869	case Intrinsic::fptoui_sat: {
870	if (ICA.getArgTypes().empty())
871	break;
872	bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
873	auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[`0`]);
874	EVT MTy = TLI->getValueType(DL, Ty: RetTy);
875	// Check for the legal types, which are where the size of the input and the
876	// output are the same, or we are using cvt f64->i32 or f32->i64.
877	if ((LT.second == MVT::f32 \|\| LT.second == MVT::f64 \|\|
878	LT.second == MVT::v2f32 \|\| LT.second == MVT::v4f32 \|\|
879	LT.second == MVT::v2f64)) {
880	if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() \|\|
881	(LT.second == MVT::f64 && MTy == MVT::i32) \|\|
882	(LT.second == MVT::f32 && MTy == MVT::i64)))
883	return LT.first;
884	// Extending vector types v2f32->v2i64, fcvtl2 + fcvt2
885	if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
886	MTy.getScalarSizeInBits() == `64`)
887	return LT.first * (MTy.getVectorNumElements() > `2` ? `4` : `2`);
888	}
889	// Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
890	// f32.
891	if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
892	return LT.first + getIntrinsicInstrCost(
893	ICA: {ICA.getID(),
894	RetTy,
895	{ICA.getArgTypes()[`0`]->getWithNewType(
896	EltTy: Type::getFloatTy(C&: RetTy->getContext()))}},
897	CostKind);
898	if ((LT.second == MVT::f16 && MTy == MVT::i32) \|\|
899	(LT.second == MVT::f16 && MTy == MVT::i64) \|\|
900	((LT.second == MVT::v4f16 \|\| LT.second == MVT::v8f16) &&
901	(LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
902	return LT.first;
903	// Extending vector types v8f16->v8i32, fcvtl2 + fcvt2
904	if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
905	MTy.getScalarSizeInBits() == `32`)
906	return LT.first * (MTy.getVectorNumElements() > `4` ? `4` : `2`);
907	// Extending vector types v8f16->v8i64. These currently scalarize but the
908	// codegen could be better.
909	if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
910	MTy.getScalarSizeInBits() == `64`)
911	return MTy.getVectorNumElements() * `3`;
912
913	// If we can we use a legal convert followed by a min+max
914	if ((LT.second.getScalarType() == MVT::f32 \|\|
915	LT.second.getScalarType() == MVT::f64 \|\|
916	LT.second.getScalarType() == MVT::f16) &&
917	LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
918	Type *LegalTy =
919	Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits());
920	if (LT.second.isVector())
921	LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount());
922	InstructionCost Cost = `1`;
923	IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
924	LegalTy, {LegalTy, LegalTy});
925	Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind);
926	IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
927	LegalTy, {LegalTy, LegalTy});
928	Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
929	return LT.first * Cost +
930	((LT.second.getScalarType() != MVT::f16 \|\| ST->hasFullFP16()) ? `0`
931	: `1`);
932	}
933	// Otherwise we need to follow the default expansion that clamps the value
934	// using a float min/max with a fcmp+sel for nan handling when signed.
935	Type *FPTy = ICA.getArgTypes()[`0`]->getScalarType();
936	RetTy = RetTy->getScalarType();
937	if (LT.second.isVector()) {
938	FPTy = VectorType::get(ElementType: FPTy, EC: LT.second.getVectorElementCount());
939	RetTy = VectorType::get(ElementType: RetTy, EC: LT.second.getVectorElementCount());
940	}
941	IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
942	InstructionCost Cost = getIntrinsicInstrCost(ICA: Attrs1, CostKind);
943	IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
944	Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind);
945	Cost +=
946	getCastInstrCost(Opcode: IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
947	Dst: RetTy, Src: FPTy, CCH: TTI::CastContextHint::None, CostKind);
948	if (IsSigned) {
949	Type *CondTy = RetTy->getWithNewBitWidth(NewBitWidth: `1`);
950	Cost += getCmpSelInstrCost(Opcode: BinaryOperator::FCmp, ValTy: FPTy, CondTy,
951	VecPred: CmpInst::FCMP_UNO, CostKind);
952	Cost += getCmpSelInstrCost(Opcode: BinaryOperator::Select, ValTy: RetTy, CondTy,
953	VecPred: CmpInst::FCMP_UNO, CostKind);
954	}
955	return LT.first * Cost;
956	}
957	case Intrinsic::fshl:
958	case Intrinsic::fshr: {
959	if (ICA.getArgs().empty())
960	break;
961
962	const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[`2`]);
963
964	// ROTR / ROTL is a funnel shift with equal first and second operand. For
965	// ROTR on integer registers (i32/i64) this can be done in a single ror
966	// instruction. A fshl with a non-constant shift uses a neg + ror.
967	if (RetTy->isIntegerTy() && ICA.getArgs()[`0`] == ICA.getArgs()[`1`] &&
968	(RetTy->getPrimitiveSizeInBits() == `32` \|\|
969	RetTy->getPrimitiveSizeInBits() == `64`)) {
970	InstructionCost NegCost =
971	(ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? `1` : `0`;
972	return `1` + NegCost;
973	}
974
975	// TODO: Add handling for fshl where third argument is not a constant.
976	if (!OpInfoZ.isConstant())
977	break;
978
979	const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy);
980	if (OpInfoZ.isUniform()) {
981	static const CostTblEntry FshlTbl[] = {
982	{.ISD: Intrinsic::fshl, .Type: MVT::v4i32, .Cost: `2`}, // shl + usra
983	{.ISD: Intrinsic::fshl, .Type: MVT::v2i64, .Cost: `2`}, {.ISD: Intrinsic::fshl, .Type: MVT::v16i8, .Cost: `2`},
984	{.ISD: Intrinsic::fshl, .Type: MVT::v8i16, .Cost: `2`}, {.ISD: Intrinsic::fshl, .Type: MVT::v2i32, .Cost: `2`},
985	{.ISD: Intrinsic::fshl, .Type: MVT::v8i8, .Cost: `2`}, {.ISD: Intrinsic::fshl, .Type: MVT::v4i16, .Cost: `2`}};
986	// Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
987	// to avoid having to duplicate the costs.
988	const auto *Entry =
989	CostTableLookup(Table: FshlTbl, ISD: Intrinsic::fshl, Ty: LegalisationCost.second);
990	if (Entry)
991	return LegalisationCost.first * Entry->Cost;
992	}
993
994	auto TyL = getTypeLegalizationCost(Ty: RetTy);
995	if (!RetTy->isIntegerTy())
996	break;
997
998	// Estimate cost manually, as types like i8 and i16 will get promoted to
999	// i32 and CostTableLookup will ignore the extra conversion cost.
1000	bool HigherCost = (RetTy->getScalarSizeInBits() != `32` &&
1001	RetTy->getScalarSizeInBits() < `64`) \|\|
1002	(RetTy->getScalarSizeInBits() % `64` != `0`);
1003	unsigned ExtraCost = HigherCost ? `1` : `0`;
1004	if (RetTy->getScalarSizeInBits() == `32` \|\|
1005	RetTy->getScalarSizeInBits() == `64`)
1006	ExtraCost = `0`; // fhsl/fshr for i32 and i64 can be lowered to a single
1007	// extr instruction.
1008	else if (HigherCost)
1009	ExtraCost = `1`;
1010	else
1011	break;
1012	return TyL.first + ExtraCost;
1013	}
1014	case Intrinsic::get_active_lane_mask: {
1015	auto RetTy = cast<VectorType>(Val: ICA.getReturnType());
1016	EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy);
1017	EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[`0`]);
1018	if (getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT))
1019	break;
1020
1021	if (RetTy->isScalableTy()) {
1022	if (TLI->getTypeAction(Context&: RetTy->getContext(), VT: RetVT) !=
1023	TargetLowering::TypeSplitVector)
1024	break;
1025
1026	auto LT = getTypeLegalizationCost(Ty: RetTy);
1027	InstructionCost Cost = LT.first;
1028	// When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1029	// as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1030	// nxv32i1 = get_active_lane_mask(base, idx) ->
1031	// {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1032	if (ST->hasSVE2p1() \|\| ST->hasSME2()) {
1033	Cost /= `2`;
1034	if (Cost == `1`)
1035	return Cost;
1036	}
1037
1038	// If more than one whilelo intrinsic is required, include the extra cost
1039	// required by the saturating add & select required to increment the
1040	// start value after the first intrinsic call.
1041	Type *OpTy = ICA.getArgTypes()[`0`];
1042	IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1043	InstructionCost SplitCost = getIntrinsicInstrCost(ICA: AddAttrs, CostKind);
1044	Type *CondTy = OpTy->getWithNewBitWidth(NewBitWidth: `1`);
1045	SplitCost += getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: OpTy, CondTy,
1046	VecPred: CmpInst::ICMP_UGT, CostKind);
1047	return Cost + (SplitCost * (Cost - `1`));
1048	} else if (!getTLI()->isTypeLegal(VT: RetVT)) {
1049	// We don't have enough context at this point to determine if the mask
1050	// is going to be kept live after the block, which will force the vXi1
1051	// type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1052	// For now, we just assume the vectorizer created this intrinsic and
1053	// the result will be the input for a PHI. In this case the cost will
1054	// be extremely high for fixed-width vectors.
1055	// NOTE: getScalarizationOverhead returns a cost that's far too
1056	// pessimistic for the actual generated codegen. In reality there are
1057	// two instructions generated per lane.
1058	return cast<FixedVectorType>(Val: RetTy)->getNumElements() * `2`;
1059	}
1060	break;
1061	}
1062	case Intrinsic::experimental_vector_match: {
1063	auto *NeedleTy = cast<FixedVectorType>(Val: ICA.getArgTypes()[`1`]);
1064	EVT SearchVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[`0`]);
1065	unsigned SearchSize = NeedleTy->getNumElements();
1066	if (!getTLI()->shouldExpandVectorMatch(VT: SearchVT, SearchSize)) {
1067	// Base cost for MATCH instructions. At least on the Neoverse V2 and
1068	// Neoverse V3, these are cheap operations with the same latency as a
1069	// vector ADD. In most cases, however, we also need to do an extra DUP.
1070	// For fixed-length vectors we currently need an extra five--six
1071	// instructions besides the MATCH.
1072	InstructionCost Cost = `4`;
1073	if (isa<FixedVectorType>(Val: RetTy))
1074	Cost += `10`;
1075	return Cost;
1076	}
1077	break;
1078	}
1079	case Intrinsic::experimental_cttz_elts: {
1080	EVT ArgVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[`0`]);
1081	if (!getTLI()->shouldExpandCttzElements(VT: ArgVT)) {
1082	// This will consist of a SVE brkb and a cntp instruction. These
1083	// typically have the same latency and half the throughput as a vector
1084	// add instruction.
1085	return `4`;
1086	}
1087	break;
1088	}
1089	case Intrinsic::loop_dependence_raw_mask:
1090	case Intrinsic::loop_dependence_war_mask: {
1091	// The whilewr/rw instructions require SVE2 or SME.
1092	if (ST->hasSVE2() \|\| ST->hasSME()) {
1093	EVT VecVT = getTLI()->getValueType(DL, Ty: RetTy);
1094	unsigned EltSizeInBytes =
1095	cast<ConstantInt>(Val: ICA.getArgs()[`2`])->getZExtValue();
1096	if (!is_contained(Set: {`1u`, `2u`, `4u`, `8u`}, Element: EltSizeInBytes) \|\|
1097	VecVT.getVectorMinNumElements() != (`16` / EltSizeInBytes))
1098	break;
1099	// For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1100	return isa<FixedVectorType>(Val: RetTy) ? `2` : `1`;
1101	}
1102	break;
1103	}
1104	case Intrinsic::experimental_vector_extract_last_active:
1105	if (ST->isSVEorStreamingSVEAvailable()) {
1106	auto [LegalCost, _] = getTypeLegalizationCost(Ty: ICA.getArgTypes()[`0`]);
1107	// This should turn into chained clastb instructions.
1108	return LegalCost;
1109	}
1110	break;
1111	case Intrinsic::pow: {
1112	// For scalar calls we know the target has the libcall, and for fixed-width
1113	// vectors we know for the worst case it can be scalarised.
1114	EVT VT = getTLI()->getValueType(DL, Ty: RetTy);
1115	RTLIB::Libcall LC = RTLIB::getPOW(RetVT: VT);
1116	bool HasLibcall = getTLI()->getLibcallImpl(Call: LC) != RTLIB::Unsupported;
1117	bool CanLowerWithLibcalls = !isa<ScalableVectorType>(Val: RetTy) \|\| HasLibcall;
1118
1119	// If we know that the call can be lowered with libcalls then it's safe to
1120	// reduce the costs in some cases. This is important for scalable vectors,
1121	// since we cannot scalarize the call in the absence of a vector math
1122	// library.
1123	if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1124	// If we know the fast math flags and the exponent is a constant then the
1125	// cost may be less for some exponents like 0.25 and 0.75.
1126	const Constant *ExpC = dyn_cast<Constant>(Val: ICA.getArgs()[`1`]);
1127	if (ExpC && isa<VectorType>(Val: ExpC->getType()))
1128	ExpC = ExpC->getSplatValue();
1129	if (auto *ExpF = dyn_cast_or_null<ConstantFP>(Val: ExpC)) {
1130	// The argument must be a FP constant.
1131	bool Is025 = ExpF->getValueAPF().isExactlyValue(V: `0.25`);
1132	bool Is075 = ExpF->getValueAPF().isExactlyValue(V: `0.75`);
1133	FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1134	if ((Is025 \|\| Is075) && FMF.noInfs() && FMF.approxFunc() &&
1135	(!Is025 \|\| FMF.noSignedZeros())) {
1136	IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1137	InstructionCost Sqrt = getIntrinsicInstrCost(ICA: Attrs, CostKind);
1138	if (Is025)
1139	return `2` * Sqrt;
1140	InstructionCost FMul =
1141	getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: RetTy, CostKind);
1142	return (Sqrt * `2`) + FMul;
1143	}
1144	// TODO: For 1/3 exponents we expect the cbrt call to be slightly
1145	// cheaper than pow.
1146	}
1147	}
1148
1149	if (HasLibcall)
1150	return getCallInstrCost(F: nullptr, RetTy, Tys: ICA.getArgTypes(), CostKind);
1151	break;
1152	}
1153	case Intrinsic::sqrt:
1154	case Intrinsic::fabs:
1155	case Intrinsic::ceil:
1156	case Intrinsic::floor:
1157	case Intrinsic::nearbyint:
1158	case Intrinsic::round:
1159	case Intrinsic::rint:
1160	case Intrinsic::roundeven:
1161	case Intrinsic::trunc:
1162	case Intrinsic::minnum:
1163	case Intrinsic::maxnum:
1164	case Intrinsic::minimum:
1165	case Intrinsic::maximum: {
1166	if (isa<ScalableVectorType>(Val: RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1167	auto LT = getTypeLegalizationCost(Ty: RetTy);
1168	return LT.first;
1169	}
1170	break;
1171	}
1172	default:
1173	break;
1174	}
1175	return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1176	}
1177
1178	/// The function will remove redundant reinterprets casting in the presence
1179	/// of the control flow
1180	static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1181	IntrinsicInst &II) {
1182	SmallVector<Instruction *, `32`> Worklist;
1183	auto RequiredType = II.getType();
1184
1185	auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: `0`));
1186	assert(PN && "Expected Phi Node!");
1187
1188	// Don't create a new Phi unless we can remove the old one.
1189	if (!PN->hasOneUse())
1190	return std::nullopt;
1191
1192	for (Value *IncValPhi : PN->incoming_values()) {
1193	auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi);
1194	if (!Reinterpret \|\|
1195	Reinterpret->getIntrinsicID() !=
1196	Intrinsic::aarch64_sve_convert_to_svbool \|\|
1197	RequiredType != Reinterpret->getArgOperand(i: `0`)->getType())
1198	return std::nullopt;
1199	}
1200
1201	// Create the new Phi
1202	IC.Builder.SetInsertPoint(PN);
1203	PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues());
1204	Worklist.push_back(Elt: PN);
1205
1206	for (unsigned I = `0`; I < PN->getNumIncomingValues(); I++) {
1207	auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I));
1208	NPN->addIncoming(V: Reinterpret->getOperand(i: `0`), BB: PN->getIncomingBlock(i: I));
1209	Worklist.push_back(Elt: Reinterpret);
1210	}
1211
1212	// Cleanup Phi Node and reinterprets
1213	return IC.replaceInstUsesWith(I&: II, V: NPN);
1214	}
1215
1216	// A collection of properties common to SVE intrinsics that allow for combines
1217	// to be written without needing to know the specific intrinsic.
1218	struct SVEIntrinsicInfo {
1219	//
1220	// Helper routines for common intrinsic definitions.
1221	//
1222
1223	// e.g. llvm.aarch64.sve.add pg, op1, op2
1224	// with IID ==> llvm.aarch64.sve.add_u
1225	static SVEIntrinsicInfo
1226	defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
1227	return SVEIntrinsicInfo ()
1228	.setGoverningPredicateOperandIdx(`0`)
1229	.setOperandIdxInactiveLanesTakenFrom(`1`)
1230	.setMatchingUndefIntrinsic(IID);
1231	}
1232
1233	// e.g. llvm.aarch64.sve.neg inactive, pg, op
1234	static SVEIntrinsicInfo defaultMergingUnaryOp() {
1235	return SVEIntrinsicInfo ()
1236	.setGoverningPredicateOperandIdx(`1`)
1237	.setOperandIdxInactiveLanesTakenFrom(`0`)
1238	.setOperandIdxWithNoActiveLanes(`0`);
1239	}
1240
1241	// e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1242	static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
1243	return SVEIntrinsicInfo ()
1244	.setGoverningPredicateOperandIdx(`1`)
1245	.setOperandIdxInactiveLanesTakenFrom(`0`);
1246	}
1247
1248	// e.g. llvm.aarch64.sve.add_u pg, op1, op2
1249	static SVEIntrinsicInfo defaultUndefOp() {
1250	return SVEIntrinsicInfo ()
1251	.setGoverningPredicateOperandIdx(`0`)
1252	.setInactiveLanesAreNotDefined();
1253	}
1254
1255	// e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1256	// llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1257	static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1258	return SVEIntrinsicInfo ()
1259	.setGoverningPredicateOperandIdx(GPIndex)
1260	.setInactiveLanesAreUnused();
1261	}
1262
1263	// e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1264	// llvm.aarch64.sve.ld1 pg, ptr
1265	static SVEIntrinsicInfo defaultZeroingOp() {
1266	return SVEIntrinsicInfo ()
1267	.setGoverningPredicateOperandIdx(`0`)
1268	.setInactiveLanesAreUnused()
1269	.setResultIsZeroInitialized();
1270	}
1271
1272	// All properties relate to predication and thus having a general predicate
1273	// is the minimum requirement to say there is intrinsic info to act on.
1274	explicit operator bool() const { return hasGoverningPredicate(); }
1275
1276	//
1277	// Properties relating to the governing predicate.
1278	//
1279
1280	bool hasGoverningPredicate() const {
1281	return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1282	}
1283
1284	unsigned getGoverningPredicateOperandIdx() const {
1285	assert(hasGoverningPredicate() && "Propery not set!");
1286	return GoverningPredicateIdx;
1287	}
1288
1289	SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
1290	assert(!hasGoverningPredicate() && "Cannot set property twice!");
1291	GoverningPredicateIdx = Index;
1292	return *this;
1293	}
1294
1295	//
1296	// Properties relating to operations the intrinsic could be transformed into.
1297	// NOTE: This does not mean such a transformation is always possible, but the
1298	// knowledge makes it possible to reuse existing optimisations without needing
1299	// to embed specific handling for each intrinsic. For example, instruction
1300	// simplification can be used to optimise an intrinsic's active lanes.
1301	//
1302
1303	bool hasMatchingUndefIntrinsic() const {
1304	return UndefIntrinsic != Intrinsic::not_intrinsic;
1305	}
1306
1307	Intrinsic::ID getMatchingUndefIntrinsic() const {
1308	assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1309	return UndefIntrinsic;
1310	}
1311
1312	SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
1313	assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1314	UndefIntrinsic = IID;
1315	return *this;
1316	}
1317
1318	bool hasMatchingIROpode() const { return IROpcode != `0`; }
1319
1320	unsigned getMatchingIROpode() const {
1321	assert(hasMatchingIROpode() && "Propery not set!");
1322	return IROpcode;
1323	}
1324
1325	SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) {
1326	assert(!hasMatchingIROpode() && "Cannot set property twice!");
1327	IROpcode = Opcode;
1328	return *this;
1329	}
1330
1331	//
1332	// Properties relating to the result of inactive lanes.
1333	//
1334
1335	bool inactiveLanesTakenFromOperand() const {
1336	return ResultLanes == InactiveLanesTakenFromOperand;
1337	}
1338
1339	unsigned getOperandIdxInactiveLanesTakenFrom() const {
1340	assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1341	return OperandIdxForInactiveLanes;
1342	}
1343
1344	SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
1345	assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1346	ResultLanes = InactiveLanesTakenFromOperand;
1347	OperandIdxForInactiveLanes = Index;
1348	return *this;
1349	}
1350
1351	bool inactiveLanesAreNotDefined() const {
1352	return ResultLanes == InactiveLanesAreNotDefined;
1353	}
1354
1355	SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
1356	assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1357	ResultLanes = InactiveLanesAreNotDefined;
1358	return *this;
1359	}
1360
1361	bool inactiveLanesAreUnused() const {
1362	return ResultLanes == InactiveLanesAreUnused;
1363	}
1364
1365	SVEIntrinsicInfo &setInactiveLanesAreUnused() {
1366	assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1367	ResultLanes = InactiveLanesAreUnused;
1368	return *this;
1369	}
1370
1371	// NOTE: Whilst not limited to only inactive lanes, the common use case is:
1372	// inactiveLanesAreZeroed =
1373	// resultIsZeroInitialized() && inactiveLanesAreUnused()
1374	bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1375
1376	SVEIntrinsicInfo &setResultIsZeroInitialized() {
1377	ResultIsZeroInitialized = true;
1378	return *this;
1379	}
1380
1381	//
1382	// The first operand of unary merging operations is typically only used to
1383	// set the result for inactive lanes. Knowing this allows us to deadcode the
1384	// operand when we can prove there are no inactive lanes.
1385	//
1386
1387	bool hasOperandWithNoActiveLanes() const {
1388	return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1389	}
1390
1391	unsigned getOperandIdxWithNoActiveLanes() const {
1392	assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1393	return OperandIdxWithNoActiveLanes;
1394	}
1395
1396	SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
1397	assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1398	OperandIdxWithNoActiveLanes = Index;
1399	return *this;
1400	}
1401
1402	private:
1403	unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1404
1405	Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1406	unsigned IROpcode = `0`;
1407
1408	enum PredicationStyle {
1409	Uninitialized,
1410	InactiveLanesTakenFromOperand,
1411	InactiveLanesAreNotDefined,
1412	InactiveLanesAreUnused
1413	} ResultLanes = Uninitialized;
1414
1415	bool ResultIsZeroInitialized = false;
1416	unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1417	unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1418	};
1419
1420	static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
1421	// Some SVE intrinsics do not use scalable vector types, but since they are
1422	// not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1423	if (!isa<ScalableVectorType>(Val: II.getType()) &&
1424	all_of(Range: II.args(), P: [&](const Value *V) {
1425	return !isa<ScalableVectorType>(Val: V->getType());
1426	}))
1427	return SVEIntrinsicInfo ();
1428
1429	Intrinsic::ID IID = II.getIntrinsicID();
1430	switch (IID) {
1431	default:
1432	break;
1433	case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1434	case Intrinsic::aarch64_sve_fcvt_f16f32:
1435	case Intrinsic::aarch64_sve_fcvt_f16f64:
1436	case Intrinsic::aarch64_sve_fcvt_f32f16:
1437	case Intrinsic::aarch64_sve_fcvt_f32f64:
1438	case Intrinsic::aarch64_sve_fcvt_f64f16:
1439	case Intrinsic::aarch64_sve_fcvt_f64f32:
1440	case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1441	case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1442	case Intrinsic::aarch64_sve_fcvtx_f32f64:
1443	case Intrinsic::aarch64_sve_fcvtzs:
1444	case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1445	case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1446	case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1447	case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1448	case Intrinsic::aarch64_sve_fcvtzu:
1449	case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1450	case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1451	case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1452	case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1453	case Intrinsic::aarch64_sve_scvtf:
1454	case Intrinsic::aarch64_sve_scvtf_f16i32:
1455	case Intrinsic::aarch64_sve_scvtf_f16i64:
1456	case Intrinsic::aarch64_sve_scvtf_f32i64:
1457	case Intrinsic::aarch64_sve_scvtf_f64i32:
1458	case Intrinsic::aarch64_sve_ucvtf:
1459	case Intrinsic::aarch64_sve_ucvtf_f16i32:
1460	case Intrinsic::aarch64_sve_ucvtf_f16i64:
1461	case Intrinsic::aarch64_sve_ucvtf_f32i64:
1462	case Intrinsic::aarch64_sve_ucvtf_f64i32:
1463	return SVEIntrinsicInfo::defaultMergingUnaryOp();
1464
1465	case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1466	case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1467	case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1468	case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1469	return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
1470
1471	case Intrinsic::aarch64_sve_fabd:
1472	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fabd_u);
1473	case Intrinsic::aarch64_sve_fadd:
1474	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fadd_u)
1475	.setMatchingIROpcode(Instruction::FAdd);
1476	case Intrinsic::aarch64_sve_fdiv:
1477	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fdiv_u)
1478	.setMatchingIROpcode(Instruction::FDiv);
1479	case Intrinsic::aarch64_sve_fmax:
1480	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmax_u);
1481	case Intrinsic::aarch64_sve_fmaxnm:
1482	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmaxnm_u);
1483	case Intrinsic::aarch64_sve_fmin:
1484	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmin_u);
1485	case Intrinsic::aarch64_sve_fminnm:
1486	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fminnm_u);
1487	case Intrinsic::aarch64_sve_fmla:
1488	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmla_u);
1489	case Intrinsic::aarch64_sve_fmls:
1490	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmls_u);
1491	case Intrinsic::aarch64_sve_fmul:
1492	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmul_u)
1493	.setMatchingIROpcode(Instruction::FMul);
1494	case Intrinsic::aarch64_sve_fmulx:
1495	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fmulx_u);
1496	case Intrinsic::aarch64_sve_fnmla:
1497	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmla_u);
1498	case Intrinsic::aarch64_sve_fnmls:
1499	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fnmls_u);
1500	case Intrinsic::aarch64_sve_fsub:
1501	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_fsub_u)
1502	.setMatchingIROpcode(Instruction::FSub);
1503	case Intrinsic::aarch64_sve_add:
1504	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_add_u)
1505	.setMatchingIROpcode(Instruction::Add);
1506	case Intrinsic::aarch64_sve_mla:
1507	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mla_u);
1508	case Intrinsic::aarch64_sve_mls:
1509	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mls_u);
1510	case Intrinsic::aarch64_sve_mul:
1511	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_mul_u)
1512	.setMatchingIROpcode(Instruction::Mul);
1513	case Intrinsic::aarch64_sve_sabd:
1514	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sabd_u);
1515	case Intrinsic::aarch64_sve_sdiv:
1516	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sdiv_u)
1517	.setMatchingIROpcode(Instruction::SDiv);
1518	case Intrinsic::aarch64_sve_smax:
1519	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smax_u);
1520	case Intrinsic::aarch64_sve_smin:
1521	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smin_u);
1522	case Intrinsic::aarch64_sve_smulh:
1523	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_smulh_u);
1524	case Intrinsic::aarch64_sve_sub:
1525	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sub_u)
1526	.setMatchingIROpcode(Instruction::Sub);
1527	case Intrinsic::aarch64_sve_uabd:
1528	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uabd_u);
1529	case Intrinsic::aarch64_sve_udiv:
1530	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_udiv_u)
1531	.setMatchingIROpcode(Instruction::UDiv);
1532	case Intrinsic::aarch64_sve_umax:
1533	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umax_u);
1534	case Intrinsic::aarch64_sve_umin:
1535	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umin_u);
1536	case Intrinsic::aarch64_sve_umulh:
1537	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_umulh_u);
1538	case Intrinsic::aarch64_sve_asr:
1539	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_asr_u)
1540	.setMatchingIROpcode(Instruction::AShr);
1541	case Intrinsic::aarch64_sve_lsl:
1542	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsl_u)
1543	.setMatchingIROpcode(Instruction::Shl);
1544	case Intrinsic::aarch64_sve_lsr:
1545	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_lsr_u)
1546	.setMatchingIROpcode(Instruction::LShr);
1547	case Intrinsic::aarch64_sve_and:
1548	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_and_u)
1549	.setMatchingIROpcode(Instruction::And);
1550	case Intrinsic::aarch64_sve_bic:
1551	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_bic_u);
1552	case Intrinsic::aarch64_sve_eor:
1553	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_eor_u)
1554	.setMatchingIROpcode(Instruction::Xor);
1555	case Intrinsic::aarch64_sve_orr:
1556	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_orr_u)
1557	.setMatchingIROpcode(Instruction::Or);
1558	case Intrinsic::aarch64_sve_shsub:
1559	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_shsub_u);
1560	case Intrinsic::aarch64_sve_shsubr:
1561	return SVEIntrinsicInfo::defaultMergingOp();
1562	case Intrinsic::aarch64_sve_sqrshl:
1563	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqrshl_u);
1564	case Intrinsic::aarch64_sve_sqshl:
1565	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqshl_u);
1566	case Intrinsic::aarch64_sve_sqsub:
1567	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_sqsub_u);
1568	case Intrinsic::aarch64_sve_srshl:
1569	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_srshl_u);
1570	case Intrinsic::aarch64_sve_uhsub:
1571	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uhsub_u);
1572	case Intrinsic::aarch64_sve_uhsubr:
1573	return SVEIntrinsicInfo::defaultMergingOp();
1574	case Intrinsic::aarch64_sve_uqrshl:
1575	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqrshl_u);
1576	case Intrinsic::aarch64_sve_uqshl:
1577	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqshl_u);
1578	case Intrinsic::aarch64_sve_uqsub:
1579	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_uqsub_u);
1580	case Intrinsic::aarch64_sve_urshl:
1581	return SVEIntrinsicInfo::defaultMergingOp(IID: Intrinsic::aarch64_sve_urshl_u);
1582
1583	case Intrinsic::aarch64_sve_add_u:
1584	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1585	Instruction::Add);
1586	case Intrinsic::aarch64_sve_and_u:
1587	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1588	Instruction::And);
1589	case Intrinsic::aarch64_sve_asr_u:
1590	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1591	Instruction::AShr);
1592	case Intrinsic::aarch64_sve_eor_u:
1593	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1594	Instruction::Xor);
1595	case Intrinsic::aarch64_sve_fadd_u:
1596	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1597	Instruction::FAdd);
1598	case Intrinsic::aarch64_sve_fdiv_u:
1599	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1600	Instruction::FDiv);
1601	case Intrinsic::aarch64_sve_fmul_u:
1602	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1603	Instruction::FMul);
1604	case Intrinsic::aarch64_sve_fsub_u:
1605	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1606	Instruction::FSub);
1607	case Intrinsic::aarch64_sve_lsl_u:
1608	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1609	Instruction::Shl);
1610	case Intrinsic::aarch64_sve_lsr_u:
1611	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1612	Instruction::LShr);
1613	case Intrinsic::aarch64_sve_mul_u:
1614	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1615	Instruction::Mul);
1616	case Intrinsic::aarch64_sve_orr_u:
1617	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1618	Instruction::Or);
1619	case Intrinsic::aarch64_sve_sdiv_u:
1620	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1621	Instruction::SDiv);
1622	case Intrinsic::aarch64_sve_sub_u:
1623	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1624	Instruction::Sub);
1625	case Intrinsic::aarch64_sve_udiv_u:
1626	return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode(
1627	Instruction::UDiv);
1628
1629	case Intrinsic::aarch64_sve_addqv:
1630	case Intrinsic::aarch64_sve_and_z:
1631	case Intrinsic::aarch64_sve_bic_z:
1632	case Intrinsic::aarch64_sve_brka_z:
1633	case Intrinsic::aarch64_sve_brkb_z:
1634	case Intrinsic::aarch64_sve_brkn_z:
1635	case Intrinsic::aarch64_sve_brkpa_z:
1636	case Intrinsic::aarch64_sve_brkpb_z:
1637	case Intrinsic::aarch64_sve_cntp:
1638	case Intrinsic::aarch64_sve_compact:
1639	case Intrinsic::aarch64_sve_eor_z:
1640	case Intrinsic::aarch64_sve_eorv:
1641	case Intrinsic::aarch64_sve_eorqv:
1642	case Intrinsic::aarch64_sve_nand_z:
1643	case Intrinsic::aarch64_sve_nor_z:
1644	case Intrinsic::aarch64_sve_orn_z:
1645	case Intrinsic::aarch64_sve_orr_z:
1646	case Intrinsic::aarch64_sve_orv:
1647	case Intrinsic::aarch64_sve_orqv:
1648	case Intrinsic::aarch64_sve_pnext:
1649	case Intrinsic::aarch64_sve_rdffr_z:
1650	case Intrinsic::aarch64_sve_saddv:
1651	case Intrinsic::aarch64_sve_uaddv:
1652	case Intrinsic::aarch64_sve_umaxv:
1653	case Intrinsic::aarch64_sve_umaxqv:
1654	case Intrinsic::aarch64_sve_cmpeq:
1655	case Intrinsic::aarch64_sve_cmpeq_wide:
1656	case Intrinsic::aarch64_sve_cmpge:
1657	case Intrinsic::aarch64_sve_cmpge_wide:
1658	case Intrinsic::aarch64_sve_cmpgt:
1659	case Intrinsic::aarch64_sve_cmpgt_wide:
1660	case Intrinsic::aarch64_sve_cmphi:
1661	case Intrinsic::aarch64_sve_cmphi_wide:
1662	case Intrinsic::aarch64_sve_cmphs:
1663	case Intrinsic::aarch64_sve_cmphs_wide:
1664	case Intrinsic::aarch64_sve_cmple_wide:
1665	case Intrinsic::aarch64_sve_cmplo_wide:
1666	case Intrinsic::aarch64_sve_cmpls_wide:
1667	case Intrinsic::aarch64_sve_cmplt_wide:
1668	case Intrinsic::aarch64_sve_cmpne:
1669	case Intrinsic::aarch64_sve_cmpne_wide:
1670	case Intrinsic::aarch64_sve_facge:
1671	case Intrinsic::aarch64_sve_facgt:
1672	case Intrinsic::aarch64_sve_fcmpeq:
1673	case Intrinsic::aarch64_sve_fcmpge:
1674	case Intrinsic::aarch64_sve_fcmpgt:
1675	case Intrinsic::aarch64_sve_fcmpne:
1676	case Intrinsic::aarch64_sve_fcmpuo:
1677	case Intrinsic::aarch64_sve_ld1:
1678	case Intrinsic::aarch64_sve_ld1_gather:
1679	case Intrinsic::aarch64_sve_ld1_gather_index:
1680	case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1681	case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1682	case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1683	case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1684	case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1685	case Intrinsic::aarch64_sve_ld1q_gather_index:
1686	case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1687	case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1688	case Intrinsic::aarch64_sve_ld1ro:
1689	case Intrinsic::aarch64_sve_ld1rq:
1690	case Intrinsic::aarch64_sve_ld1udq:
1691	case Intrinsic::aarch64_sve_ld1uwq:
1692	case Intrinsic::aarch64_sve_ld2_sret:
1693	case Intrinsic::aarch64_sve_ld2q_sret:
1694	case Intrinsic::aarch64_sve_ld3_sret:
1695	case Intrinsic::aarch64_sve_ld3q_sret:
1696	case Intrinsic::aarch64_sve_ld4_sret:
1697	case Intrinsic::aarch64_sve_ld4q_sret:
1698	case Intrinsic::aarch64_sve_ldff1:
1699	case Intrinsic::aarch64_sve_ldff1_gather:
1700	case Intrinsic::aarch64_sve_ldff1_gather_index:
1701	case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1702	case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1703	case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1704	case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1705	case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1706	case Intrinsic::aarch64_sve_ldnf1:
1707	case Intrinsic::aarch64_sve_ldnt1:
1708	case Intrinsic::aarch64_sve_ldnt1_gather:
1709	case Intrinsic::aarch64_sve_ldnt1_gather_index:
1710	case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1711	case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1712	return SVEIntrinsicInfo::defaultZeroingOp();
1713
1714	case Intrinsic::aarch64_sve_prf:
1715	case Intrinsic::aarch64_sve_prfb_gather_index:
1716	case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1717	case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1718	case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1719	case Intrinsic::aarch64_sve_prfd_gather_index:
1720	case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1721	case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1722	case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1723	case Intrinsic::aarch64_sve_prfh_gather_index:
1724	case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1725	case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1726	case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1727	case Intrinsic::aarch64_sve_prfw_gather_index:
1728	case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1729	case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1730	case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1731	return SVEIntrinsicInfo::defaultVoidOp(GPIndex: `0`);
1732
1733	case Intrinsic::aarch64_sve_st1_scatter:
1734	case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1735	case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1736	case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1737	case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1738	case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1739	case Intrinsic::aarch64_sve_st1dq:
1740	case Intrinsic::aarch64_sve_st1q_scatter_index:
1741	case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1742	case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1743	case Intrinsic::aarch64_sve_st1wq:
1744	case Intrinsic::aarch64_sve_stnt1:
1745	case Intrinsic::aarch64_sve_stnt1_scatter:
1746	case Intrinsic::aarch64_sve_stnt1_scatter_index:
1747	case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1748	case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1749	return SVEIntrinsicInfo::defaultVoidOp(GPIndex: `1`);
1750	case Intrinsic::aarch64_sve_st2:
1751	case Intrinsic::aarch64_sve_st2q:
1752	return SVEIntrinsicInfo::defaultVoidOp(GPIndex: `2`);
1753	case Intrinsic::aarch64_sve_st3:
1754	case Intrinsic::aarch64_sve_st3q:
1755	return SVEIntrinsicInfo::defaultVoidOp(GPIndex: `3`);
1756	case Intrinsic::aarch64_sve_st4:
1757	case Intrinsic::aarch64_sve_st4q:
1758	return SVEIntrinsicInfo::defaultVoidOp(GPIndex: `4`);
1759	}
1760
1761	return SVEIntrinsicInfo ();
1762	}
1763
1764	static bool isAllActivePredicate(Value *Pred) {
1765	Value *UncastedPred;
1766
1767	// Look through predicate casts that only remove lanes.
1768	if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1769	Op0: m_Value(V&: UncastedPred)))) {
1770	auto *OrigPredTy = cast<ScalableVectorType>(Val: Pred->getType());
1771	Pred = UncastedPred;
1772
1773	if (match(V: Pred, P: m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1774	Op0: m_Value(V&: UncastedPred))))
1775	// If the predicate has the same or less lanes than the uncasted predicate
1776	// then we know the casting has no effect.
1777	if (OrigPredTy->getMinNumElements() <=
1778	cast<ScalableVectorType>(Val: UncastedPred->getType())
1779	->getMinNumElements())
1780	Pred = UncastedPred;
1781	}
1782
1783	auto *C = dyn_cast<Constant>(Val: Pred);
1784	return C && C->isAllOnesValue();
1785	}
1786
1787	// Simplify `V` by only considering the operations that affect active lanes.
1788	// This function should only return existing Values or newly created Constants.
1789	static Value stripInactiveLanes(Value V, const Value *Pg) {
1790	auto *Dup = dyn_cast<IntrinsicInst>(Val: V);
1791	if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1792	Dup->getOperand(i_nocapture: `1`) == Pg && isa<Constant>(Val: Dup->getOperand(i_nocapture: `2`)))
1793	return ConstantVector::getSplat(
1794	EC: cast<VectorType>(Val: V->getType())->getElementCount(),
1795	Elt: cast<Constant>(Val: Dup->getOperand(i_nocapture: `2`)));
1796
1797	return V;
1798	}
1799
1800	static std::optional<Instruction *>
1801	simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II,
1802	const SVEIntrinsicInfo &IInfo) {
1803	const unsigned Opc = IInfo.getMatchingIROpode();
1804	assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1805
1806	Value *Pg = II.getOperand(i_nocapture: `0`);
1807	Value *Op1 = II.getOperand(i_nocapture: `1`);
1808	Value *Op2 = II.getOperand(i_nocapture: `2`);
1809	const DataLayout &DL = II.getDataLayout();
1810
1811	// Canonicalise constants to the RHS.
1812	if (Instruction::isCommutative(Opcode: Opc) && IInfo.inactiveLanesAreNotDefined() &&
1813	isa<Constant>(Val: Op1) && !isa<Constant>(Val: Op2)) {
1814	IC.replaceOperand(I&: II, OpNum: `1`, V: Op2);
1815	IC.replaceOperand(I&: II, OpNum: `2`, V: Op1);
1816	return &II;
1817	}
1818
1819	// Only active lanes matter when simplifying the operation.
1820	Op1 = stripInactiveLanes(V: Op1, Pg);
1821	Op2 = stripInactiveLanes(V: Op2, Pg);
1822
1823	Value *SimpleII;
1824	if (auto FII = dyn_cast<FPMathOperator>(Val: &II))
1825	SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, FMF: FII->getFastMathFlags(), Q: DL);
1826	else
1827	SimpleII = simplifyBinOp(Opcode: Opc, LHS: Op1, RHS: Op2, Q: DL);
1828
1829	// An SVE intrinsic's result is always defined. However, this is not the case
1830	// for its equivalent IR instruction (e.g. when shifting by an amount more
1831	// than the data's bitwidth). Simplifications to an undefined result must be
1832	// ignored to preserve the intrinsic's expected behaviour.
1833	if (!SimpleII \|\| isa<UndefValue>(Val: SimpleII))
1834	return std::nullopt;
1835
1836	if (IInfo.inactiveLanesAreNotDefined())
1837	return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1838
1839	Value *Inactive = II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom());
1840
1841	// The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1842	if (SimpleII == Inactive)
1843	return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1844
1845	// Inactive lanes must be preserved.
1846	SimpleII = IC.Builder.CreateSelect(C: Pg, True: SimpleII, False: Inactive);
1847	return IC.replaceInstUsesWith(I&: II, V: SimpleII);
1848	}
1849
1850	// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1851	// to operations with less strict inactive lane requirements.
1852	static std::optional<Instruction *>
1853	simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II,
1854	const SVEIntrinsicInfo &IInfo) {
1855	if (!IInfo.hasGoverningPredicate())
1856	return std::nullopt;
1857
1858	auto *OpPredicate = II.getOperand(i_nocapture: IInfo.getGoverningPredicateOperandIdx());
1859
1860	// If there are no active lanes.
1861	if (match(V: OpPredicate, P: m_ZeroInt())) {
1862	if (IInfo.inactiveLanesTakenFromOperand())
1863	return IC.replaceInstUsesWith(
1864	I&: II, V: II.getOperand(i_nocapture: IInfo.getOperandIdxInactiveLanesTakenFrom()));
1865
1866	if (IInfo.inactiveLanesAreUnused()) {
1867	if (IInfo.resultIsZeroInitialized())
1868	IC.replaceInstUsesWith(I&: II, V: Constant::getNullValue(Ty: II.getType()));
1869
1870	return IC.eraseInstFromFunction(I&: II);
1871	}
1872	}
1873
1874	// If there are no inactive lanes.
1875	if (isAllActivePredicate(Pred: OpPredicate)) {
1876	if (IInfo.hasOperandWithNoActiveLanes()) {
1877	unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1878	if (!isa<UndefValue>(Val: II.getOperand(i_nocapture: OpIdx)))
1879	return IC.replaceOperand(I&: II, OpNum: OpIdx, V: UndefValue::get(T: II.getType()));
1880	}
1881
1882	if (IInfo.hasMatchingUndefIntrinsic()) {
1883	auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1884	M: II.getModule(), id: IInfo.getMatchingUndefIntrinsic(), Tys: {II.getType()});
1885	II.setCalledFunction(NewDecl);
1886	return &II;
1887	}
1888	}
1889
1890	// Operation specific simplifications.
1891	if (IInfo.hasMatchingIROpode() &&
1892	Instruction::isBinaryOp(Opcode: IInfo.getMatchingIROpode()))
1893	return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1894
1895	return std::nullopt;
1896	}
1897
1898	// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
1899	// => (binop (pred) (from_svbool _) (from_svbool _))
1900	//
1901	// The above transformation eliminates a `to_svbool` in the predicate
1902	// operand of bitwise operation `binop` by narrowing the vector width of
1903	// the operation. For example, it would convert a `<vscale x 16 x i1>
1904	// and` into a `<vscale x 4 x i1> and`. This is profitable because
1905	// to_svbool must zero the new lanes during widening, whereas
1906	// from_svbool is free.
1907	static std::optional<Instruction *>
1908	tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
1909	auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: `0`));
1910	if (!BinOp)
1911	return std::nullopt;
1912
1913	auto IntrinsicID = BinOp->getIntrinsicID();
1914	switch (IntrinsicID) {
1915	case Intrinsic::aarch64_sve_and_z:
1916	case Intrinsic::aarch64_sve_bic_z:
1917	case Intrinsic::aarch64_sve_eor_z:
1918	case Intrinsic::aarch64_sve_nand_z:
1919	case Intrinsic::aarch64_sve_nor_z:
1920	case Intrinsic::aarch64_sve_orn_z:
1921	case Intrinsic::aarch64_sve_orr_z:
1922	break;
1923	default:
1924	return std::nullopt;
1925	}
1926
1927	auto BinOpPred = BinOp->getOperand(i_nocapture: `0`);
1928	auto BinOpOp1 = BinOp->getOperand(i_nocapture: `1`);
1929	auto BinOpOp2 = BinOp->getOperand(i_nocapture: `2`);
1930
1931	auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred);
1932	if (!PredIntr \|\|
1933	PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
1934	return std::nullopt;
1935
1936	auto PredOp = PredIntr->getOperand(i_nocapture: `0`);
1937	auto PredOpTy = cast<VectorType>(Val: PredOp->getType());
1938	if (PredOpTy != II.getType())
1939	return std::nullopt;
1940
1941	SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
1942	auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
1943	ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp1});
1944	NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1945	if (BinOpOp1 == BinOpOp2)
1946	NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1);
1947	else
1948	NarrowedBinOpArgs.push_back(Elt: IC.Builder.CreateIntrinsic(
1949	ID: Intrinsic::aarch64_sve_convert_from_svbool, Types: {PredOpTy}, Args: {BinOpOp2}));
1950
1951	auto NarrowedBinOp =
1952	IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs);
1953	return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp);
1954	}
1955
1956	static std::optional<Instruction *>
1957	instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
1958	// If the reinterpret instruction operand is a PHI Node
1959	if (isa<PHINode>(Val: II.getArgOperand(i: `0`)))
1960	return processPhiNode(IC, II);
1961
1962	if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
1963	return BinOpCombine;
1964
1965	// Ignore converts to/from svcount_t.
1966	if (isa<TargetExtType>(Val: II.getArgOperand(i: `0`)->getType()) \|\|
1967	isa<TargetExtType>(Val: II.getType()))
1968	return std::nullopt;
1969
1970	SmallVector<Instruction *, `32`> CandidatesForRemoval;
1971	Value Cursor = II.getOperand(i_nocapture: `0`), EarliestReplacement = nullptr;
1972
1973	const auto *IVTy = cast<VectorType>(Val: II.getType());
1974
1975	// Walk the chain of conversions.
1976	while (Cursor) {
1977	// If the type of the cursor has fewer lanes than the final result, zeroing
1978	// must take place, which breaks the equivalence chain.
1979	const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType());
1980	if (CursorVTy->getElementCount().getKnownMinValue() <
1981	IVTy->getElementCount().getKnownMinValue())
1982	break;
1983
1984	// If the cursor has the same type as I, it is a viable replacement.
1985	if (Cursor->getType() == IVTy)
1986	EarliestReplacement = Cursor;
1987
1988	auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor);
1989
1990	// If this is not an SVE conversion intrinsic, this is the end of the chain.
1991	if (!IntrinsicCursor \|\| !(IntrinsicCursor->getIntrinsicID() ==
1992	Intrinsic::aarch64_sve_convert_to_svbool \|\|
1993	IntrinsicCursor->getIntrinsicID() ==
1994	Intrinsic::aarch64_sve_convert_from_svbool))
1995	break;
1996
1997	CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor);
1998	Cursor = IntrinsicCursor->getOperand(i_nocapture: `0`);
1999	}
2000
2001	// If no viable replacement in the conversion chain was found, there is
2002	// nothing to do.
2003	if (!EarliestReplacement)
2004	return std::nullopt;
2005
2006	return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement);
2007	}
2008
2009	static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2010	IntrinsicInst &II) {
2011	// svsel(ptrue, x, y) => x
2012	auto *OpPredicate = II.getOperand(i_nocapture: `0`);
2013	if (isAllActivePredicate(Pred: OpPredicate))
2014	return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: `1`));
2015
2016	auto Select =
2017	IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: `1`), False: II.getOperand(i_nocapture: `2`));
2018	return IC.replaceInstUsesWith(I&: II, V: Select);
2019	}
2020
2021	static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2022	IntrinsicInst &II) {
2023	Value *Pg = II.getOperand(i_nocapture: `1`);
2024
2025	// sve.dup(V, all_active, X) ==> splat(X)
2026	if (isAllActivePredicate(Pred: Pg)) {
2027	auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2028	Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
2029	V: II.getArgOperand(i: `2`));
2030	return IC.replaceInstUsesWith(I&: II, V: Splat);
2031	}
2032
2033	if (!match(V: Pg, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
2034	Op0: m_SpecificInt(V: AArch64SVEPredPattern::vl1))))
2035	return std::nullopt;
2036
2037	// sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2038	Value *Insert = IC.Builder.CreateInsertElement(
2039	Vec: II.getArgOperand(i: `0`), NewElt: II.getArgOperand(i: `2`), Idx: uint64_t(`0`));
2040	return IC.replaceInstUsesWith(I&: II, V: Insert);
2041	}
2042
2043	static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2044	IntrinsicInst &II) {
2045	// Replace DupX with a regular IR splat.
2046	auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2047	Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(),
2048	V: II.getArgOperand(i: `0`));
2049	Splat->takeName(V: &II);
2050	return IC.replaceInstUsesWith(I&: II, V: Splat);
2051	}
2052
2053	static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2054	IntrinsicInst &II) {
2055	LLVMContext &Ctx = II.getContext();
2056
2057	if (!isAllActivePredicate(Pred: II.getArgOperand(i: `0`)))
2058	return std::nullopt;
2059
2060	// Check that we have a compare of zero..
2061	auto *SplatValue =
2062	dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: `2`)));
2063	if (!SplatValue \|\| !SplatValue->isZero())
2064	return std::nullopt;
2065
2066	// ..against a dupq
2067	auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: `1`));
2068	if (!DupQLane \|\|
2069	DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2070	return std::nullopt;
2071
2072	// Where the dupq is a lane 0 replicate of a vector insert
2073	auto *DupQLaneIdx = dyn_cast<ConstantInt>(Val: DupQLane->getArgOperand(i: `1`));
2074	if (!DupQLaneIdx \|\| !DupQLaneIdx->isZero())
2075	return std::nullopt;
2076
2077	auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: `0`));
2078	if (!VecIns \|\| VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2079	return std::nullopt;
2080
2081	// Where the vector insert is a fixed constant vector insert into undef at
2082	// index zero
2083	if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: `0`)))
2084	return std::nullopt;
2085
2086	if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: `2`))->isZero())
2087	return std::nullopt;
2088
2089	auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: `1`));
2090	if (!ConstVec)
2091	return std::nullopt;
2092
2093	auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType());
2094	auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType());
2095	if (!VecTy \|\| !OutTy \|\| VecTy->getNumElements() != OutTy->getMinNumElements())
2096	return std::nullopt;
2097
2098	unsigned NumElts = VecTy->getNumElements();
2099	unsigned PredicateBits = `0`;
2100
2101	// Expand intrinsic operands to a 16-bit byte level predicate
2102	for (unsigned I = `0`; I < NumElts; ++I) {
2103	auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I));
2104	if (!Arg)
2105	return std::nullopt;
2106	if (!Arg->isZero())
2107	PredicateBits \|= `1` << (I * (`16` / NumElts));
2108	}
2109
2110	// If all bits are zero bail early with an empty predicate
2111	if (PredicateBits == `0`) {
2112	auto *PFalse = Constant::getNullValue(Ty: II.getType());
2113	PFalse->takeName(V: &II);
2114	return IC.replaceInstUsesWith(I&: II, V: PFalse);
2115	}
2116
2117	// Calculate largest predicate type used (where byte predicate is largest)
2118	unsigned Mask = `8`;
2119	for (unsigned I = `0`; I < `16`; ++I)
2120	if ((PredicateBits & (`1` << I)) != `0`)
2121	Mask \|= (I % `8`);
2122
2123	unsigned PredSize = Mask & -Mask;
2124	auto *PredType = ScalableVectorType::get(
2125	ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * `8`));
2126
2127	// Ensure all relevant bits are set
2128	for (unsigned I = `0`; I < `16`; I += PredSize)
2129	if ((PredicateBits & (`1` << I)) == `0`)
2130	return std::nullopt;
2131
2132	auto *PTruePat =
2133	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2134	auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2135	Types: {PredType}, Args: {PTruePat});
2136	auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
2137	ID: Intrinsic::aarch64_sve_convert_to_svbool, Types: {PredType}, Args: {PTrue});
2138	auto *ConvertFromSVBool =
2139	IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_convert_from_svbool,
2140	Types: {II.getType()}, Args: {ConvertToSVBool});
2141
2142	ConvertFromSVBool->takeName(V: &II);
2143	return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool);
2144	}
2145
2146	static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2147	IntrinsicInst &II) {
2148	Value *Pg = II.getArgOperand(i: `0`);
2149	Value *Vec = II.getArgOperand(i: `1`);
2150	auto IntrinsicID = II.getIntrinsicID();
2151	bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2152
2153	// lastX(splat(X)) --> X
2154	if (auto *SplatVal = getSplatValue(V: Vec))
2155	return IC.replaceInstUsesWith(I&: II, V: SplatVal);
2156
2157	// If x and/or y is a splat value then:
2158	// lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2159	Value LHS, RHS;
2160	if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) {
2161	if (isSplatValue(V: LHS) \|\| isSplatValue(V: RHS)) {
2162	auto *OldBinOp = cast<BinaryOperator>(Val: Vec);
2163	auto OpC = OldBinOp->getOpcode();
2164	auto *NewLHS =
2165	IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS});
2166	auto *NewRHS =
2167	IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS});
2168	auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
2169	Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator());
2170	return IC.replaceInstUsesWith(I&: II, V: NewBinOp);
2171	}
2172	}
2173
2174	auto *C = dyn_cast<Constant>(Val: Pg);
2175	if (IsAfter && C && C->isNullValue()) {
2176	// The intrinsic is extracting lane 0 so use an extract instead.
2177	auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
2178	auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: `0`));
2179	Extract->insertBefore(InsertPos: II.getIterator());
2180	Extract->takeName(V: &II);
2181	return IC.replaceInstUsesWith(I&: II, V: Extract);
2182	}
2183
2184	auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg);
2185	if (!IntrPG)
2186	return std::nullopt;
2187
2188	if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2189	return std::nullopt;
2190
2191	const auto PTruePattern =
2192	cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: `0`))->getZExtValue();
2193
2194	// Can the intrinsic's predicate be converted to a known constant index?
2195	unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern);
2196	if (!MinNumElts)
2197	return std::nullopt;
2198
2199	unsigned Idx = MinNumElts - `1`;
2200	// Increment the index if extracting the element after the last active
2201	// predicate element.
2202	if (IsAfter)
2203	++Idx;
2204
2205	// Ignore extracts whose index is larger than the known minimum vector
2206	// length. NOTE: This is an artificial constraint where we prefer to
2207	// maintain what the user asked for until an alternative is proven faster.
2208	auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType());
2209	if (Idx >= PgVTy->getMinNumElements())
2210	return std::nullopt;
2211
2212	// The intrinsic is extracting a fixed lane so use an extract instead.
2213	auto *IdxTy = Type::getInt64Ty(C&: II.getContext());
2214	auto *Extract = ExtractElementInst::Create(Vec, Idx: ConstantInt::get(Ty: IdxTy, V: Idx));
2215	Extract->insertBefore(InsertPos: II.getIterator());
2216	Extract->takeName(V: &II);
2217	return IC.replaceInstUsesWith(I&: II, V: Extract);
2218	}
2219
2220	static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2221	IntrinsicInst &II) {
2222	// The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2223	// integer variant across a variety of micro-architectures. Replace scalar
2224	// integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2225	// bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2226	// depending on the micro-architecture, but has been observed as generally
2227	// being faster, particularly when the CLAST[AB] op is a loop-carried
2228	// dependency.
2229	Value *Pg = II.getArgOperand(i: `0`);
2230	Value *Fallback = II.getArgOperand(i: `1`);
2231	Value *Vec = II.getArgOperand(i: `2`);
2232	Type *Ty = II.getType();
2233
2234	if (!Ty->isIntegerTy())
2235	return std::nullopt;
2236
2237	Type *FPTy;
2238	switch (cast<IntegerType>(Val: Ty)->getBitWidth()) {
2239	default:
2240	return std::nullopt;
2241	case `16`:
2242	FPTy = IC.Builder.getHalfTy();
2243	break;
2244	case `32`:
2245	FPTy = IC.Builder.getFloatTy();
2246	break;
2247	case `64`:
2248	FPTy = IC.Builder.getDoubleTy();
2249	break;
2250	}
2251
2252	Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy);
2253	auto *FPVTy = VectorType::get(
2254	ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount());
2255	Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy);
2256	auto *FPII = IC.Builder.CreateIntrinsic(
2257	ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec});
2258	Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType());
2259	return IC.replaceInstUsesWith(I&: II, V: FPIItoInt);
2260	}
2261
2262	static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2263	IntrinsicInst &II) {
2264	LLVMContext &Ctx = II.getContext();
2265	// Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2266	// can work with RDFFR_PP for ptest elimination.
2267	auto *AllPat =
2268	ConstantInt::get(Ty: Type::getInt32Ty(C&: Ctx), V: AArch64SVEPredPattern::all);
2269	auto *PTrue = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue,
2270	Types: {II.getType()}, Args: {AllPat});
2271	auto *RDFFR =
2272	IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_rdffr_z, Args: {PTrue});
2273	RDFFR->takeName(V: &II);
2274	return IC.replaceInstUsesWith(I&: II, V: RDFFR);
2275	}
2276
2277	static std::optional<Instruction *>
2278	instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
2279	const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: `0`))->getZExtValue();
2280
2281	if (Pattern == AArch64SVEPredPattern::all) {
2282	Value *Cnt = IC.Builder.CreateElementCount(
2283	Ty: II.getType(), EC: ElementCount::getScalable(MinVal: NumElts));
2284	Cnt->takeName(V: &II);
2285	return IC.replaceInstUsesWith(I&: II, V: Cnt);
2286	}
2287
2288	unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2289
2290	return MinNumElts && NumElts >= MinNumElts
2291	? std::optional<Instruction *>(IC.replaceInstUsesWith(
2292	I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
2293	: std::nullopt;
2294	}
2295
2296	static std::optional<Instruction *>
2297	instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II,
2298	const AArch64Subtarget *ST) {
2299	if (!ST->isStreaming())
2300	return std::nullopt;
2301
2302	// In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2303	// with SVEPredPattern::all
2304	Value *Cnt =
2305	IC.Builder.CreateElementCount(Ty: II.getType(), EC: ElementCount::getScalable(MinVal: `2`));
2306	Cnt->takeName(V: &II);
2307	return IC.replaceInstUsesWith(I&: II, V: Cnt);
2308	}
2309
2310	static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2311	IntrinsicInst &II) {
2312	Value *PgVal = II.getArgOperand(i: `0`);
2313	Value *OpVal = II.getArgOperand(i: `1`);
2314
2315	// PTEST_<FIRST\|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2316	// Later optimizations prefer this form.
2317	if (PgVal == OpVal &&
2318	(II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first \|\|
2319	II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2320	Value *Ops[] = {PgVal, OpVal};
2321	Type *Tys[] = {PgVal->getType()};
2322
2323	auto *PTest =
2324	IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops);
2325	PTest->takeName(V: &II);
2326
2327	return IC.replaceInstUsesWith(I&: II, V: PTest);
2328	}
2329
2330	IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
2331	IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);
2332
2333	if (!Pg \|\| !Op)
2334	return std::nullopt;
2335
2336	Intrinsic::ID OpIID = Op->getIntrinsicID();
2337
2338	if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2339	OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2340	Pg->getArgOperand(i: `0`)->getType() == Op->getArgOperand(i: `0`)->getType()) {
2341	Value *Ops[] = {Pg->getArgOperand(i: `0`), Op->getArgOperand(i: `0`)};
2342	Type *Tys[] = {Pg->getArgOperand(i: `0`)->getType()};
2343
2344	auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
2345
2346	PTest->takeName(V: &II);
2347	return IC.replaceInstUsesWith(I&: II, V: PTest);
2348	}
2349
2350	// Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2351	// Later optimizations may rewrite sequence to use the flag-setting variant
2352	// of instruction X to remove PTEST.
2353	if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2354	((OpIID == Intrinsic::aarch64_sve_brka_z) \|\|
2355	(OpIID == Intrinsic::aarch64_sve_brkb_z) \|\|
2356	(OpIID == Intrinsic::aarch64_sve_brkpa_z) \|\|
2357	(OpIID == Intrinsic::aarch64_sve_brkpb_z) \|\|
2358	(OpIID == Intrinsic::aarch64_sve_rdffr_z) \|\|
2359	(OpIID == Intrinsic::aarch64_sve_and_z) \|\|
2360	(OpIID == Intrinsic::aarch64_sve_bic_z) \|\|
2361	(OpIID == Intrinsic::aarch64_sve_eor_z) \|\|
2362	(OpIID == Intrinsic::aarch64_sve_nand_z) \|\|
2363	(OpIID == Intrinsic::aarch64_sve_nor_z) \|\|
2364	(OpIID == Intrinsic::aarch64_sve_orn_z) \|\|
2365	(OpIID == Intrinsic::aarch64_sve_orr_z))) {
2366	Value *Ops[] = {Pg->getArgOperand(i: `0`), Pg};
2367	Type *Tys[] = {Pg->getType()};
2368
2369	auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
2370	PTest->takeName(V: &II);
2371
2372	return IC.replaceInstUsesWith(I&: II, V: PTest);
2373	}
2374
2375	return std::nullopt;
2376	}
2377
2378	template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2379	static std::optional<Instruction *>
2380	instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
2381	bool MergeIntoAddendOp) {
2382	Value *P = II.getOperand(i_nocapture: `0`);
2383	Value MulOp0, MulOp1, AddendOp, Mul;
2384	if (MergeIntoAddendOp) {
2385	AddendOp = II.getOperand(i_nocapture: `1`);
2386	Mul = II.getOperand(i_nocapture: `2`);
2387	} else {
2388	AddendOp = II.getOperand(i_nocapture: `2`);
2389	Mul = II.getOperand(i_nocapture: `1`);
2390	}
2391
2392	if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
2393	m_Value(V&: MulOp1))))
2394	return std::nullopt;
2395
2396	if (!Mul->hasOneUse())
2397	return std::nullopt;
2398
2399	Instruction FMFSource = nullptr*;
2400	if (II.getType()->isFPOrFPVectorTy()) {
2401	llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2402	// Stop the combine when the flags on the inputs differ in case dropping
2403	// flags would lead to us missing out on more beneficial optimizations.
2404	if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
2405	return std::nullopt;
2406	if (!FAddFlags.allowContract())
2407	return std::nullopt;
2408	FMFSource = &II;
2409	}
2410
2411	CallInst *Res;
2412	if (MergeIntoAddendOp)
2413	Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
2414	Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2415	else
2416	Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
2417	Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2418
2419	return IC.replaceInstUsesWith(I&: II, V: Res);
2420	}
2421
2422	static std::optional<Instruction *>
2423	instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2424	Value *Pred = II.getOperand(i_nocapture: `0`);
2425	Value *PtrOp = II.getOperand(i_nocapture: `1`);
2426	Type *VecTy = II.getType();
2427
2428	if (isAllActivePredicate(Pred)) {
2429	LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
2430	Load->copyMetadata(SrcInst: II);
2431	return IC.replaceInstUsesWith(I&: II, V: Load);
2432	}
2433
2434	CallInst *MaskedLoad =
2435	IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
2436	Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
2437	MaskedLoad->copyMetadata(SrcInst: II);
2438	return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2439	}
2440
2441	static std::optional<Instruction *>
2442	instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
2443	Value *VecOp = II.getOperand(i_nocapture: `0`);
2444	Value *Pred = II.getOperand(i_nocapture: `1`);
2445	Value *PtrOp = II.getOperand(i_nocapture: `2`);
2446
2447	if (isAllActivePredicate(Pred)) {
2448	StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
2449	Store->copyMetadata(SrcInst: II);
2450	return IC.eraseInstFromFunction(I&: II);
2451	}
2452
2453	CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2454	Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
2455	MaskedStore->copyMetadata(SrcInst: II);
2456	return IC.eraseInstFromFunction(I&: II);
2457	}
2458
2459	static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
2460	switch (Intrinsic) {
2461	case Intrinsic::aarch64_sve_fmul_u:
2462	return Instruction::BinaryOps::FMul;
2463	case Intrinsic::aarch64_sve_fadd_u:
2464	return Instruction::BinaryOps::FAdd;
2465	case Intrinsic::aarch64_sve_fsub_u:
2466	return Instruction::BinaryOps::FSub;
2467	default:
2468	return Instruction::BinaryOpsEnd;
2469	}
2470	}
2471
2472	static std::optional<Instruction *>
2473	instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
2474	// Bail due to missing support for ISD::STRICT_ scalable vector operations.
2475	if (II.isStrictFP())
2476	return std::nullopt;
2477
2478	auto *OpPredicate = II.getOperand(i_nocapture: `0`);
2479	auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
2480	if (BinOpCode == Instruction::BinaryOpsEnd \|\|
2481	!isAllActivePredicate(Pred: OpPredicate))
2482	return std::nullopt;
2483	auto BinOp = IC.Builder.CreateBinOpFMF(
2484	Opc: BinOpCode, LHS: II.getOperand(i_nocapture: `1`), RHS: II.getOperand(i_nocapture: `2`), FMFSource: II.getFastMathFlags());
2485	return IC.replaceInstUsesWith(I&: II, V: BinOp);
2486	}
2487
2488	static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2489	IntrinsicInst &II) {
2490	if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2491	Intrinsic::aarch64_sve_mla>(
2492	IC, II, MergeIntoAddendOp: true))
2493	return MLA;
2494	if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2495	Intrinsic::aarch64_sve_mad>(
2496	IC, II, MergeIntoAddendOp: false))
2497	return MAD;
2498	return std::nullopt;
2499	}
2500
2501	static std::optional<Instruction *>
2502	instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
2503	if (auto FMLA =
2504	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2505	Intrinsic::aarch64_sve_fmla>(IC, II,
2506	MergeIntoAddendOp: true))
2507	return FMLA;
2508	if (auto FMAD =
2509	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2510	Intrinsic::aarch64_sve_fmad>(IC, II,
2511	MergeIntoAddendOp: false))
2512	return FMAD;
2513	if (auto FMLA =
2514	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2515	Intrinsic::aarch64_sve_fmla>(IC, II,
2516	MergeIntoAddendOp: true))
2517	return FMLA;
2518	return std::nullopt;
2519	}
2520
2521	static std::optional<Instruction *>
2522	instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
2523	if (auto FMLA =
2524	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2525	Intrinsic::aarch64_sve_fmla>(IC, II,
2526	MergeIntoAddendOp: true))
2527	return FMLA;
2528	if (auto FMAD =
2529	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2530	Intrinsic::aarch64_sve_fmad>(IC, II,
2531	MergeIntoAddendOp: false))
2532	return FMAD;
2533	if (auto FMLA_U =
2534	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2535	Intrinsic::aarch64_sve_fmla_u>(
2536	IC, II, MergeIntoAddendOp: true))
2537	return FMLA_U;
2538	return instCombineSVEVectorBinOp(IC, II);
2539	}
2540
2541	static std::optional<Instruction *>
2542	instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
2543	if (auto FMLS =
2544	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2545	Intrinsic::aarch64_sve_fmls>(IC, II,
2546	MergeIntoAddendOp: true))
2547	return FMLS;
2548	if (auto FMSB =
2549	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2550	Intrinsic::aarch64_sve_fnmsb>(
2551	IC, II, MergeIntoAddendOp: false))
2552	return FMSB;
2553	if (auto FMLS =
2554	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2555	Intrinsic::aarch64_sve_fmls>(IC, II,
2556	MergeIntoAddendOp: true))
2557	return FMLS;
2558	return std::nullopt;
2559	}
2560
2561	static std::optional<Instruction *>
2562	instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
2563	if (auto FMLS =
2564	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2565	Intrinsic::aarch64_sve_fmls>(IC, II,
2566	MergeIntoAddendOp: true))
2567	return FMLS;
2568	if (auto FMSB =
2569	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2570	Intrinsic::aarch64_sve_fnmsb>(
2571	IC, II, MergeIntoAddendOp: false))
2572	return FMSB;
2573	if (auto FMLS_U =
2574	instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2575	Intrinsic::aarch64_sve_fmls_u>(
2576	IC, II, MergeIntoAddendOp: true))
2577	return FMLS_U;
2578	return instCombineSVEVectorBinOp(IC, II);
2579	}
2580
2581	static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2582	IntrinsicInst &II) {
2583	if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2584	Intrinsic::aarch64_sve_mls>(
2585	IC, II, MergeIntoAddendOp: true))
2586	return MLS;
2587	return std::nullopt;
2588	}
2589
2590	static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2591	IntrinsicInst &II) {
2592	Value *UnpackArg = II.getArgOperand(i: `0`);
2593	auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
2594	bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi \|\|
2595	II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2596
2597	// Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2598	// Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2599	if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
2600	ScalarArg =
2601	IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
2602	Value *NewVal =
2603	IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
2604	NewVal->takeName(V: &II);
2605	return IC.replaceInstUsesWith(I&: II, V: NewVal);
2606	}
2607
2608	return std::nullopt;
2609	}
2610	static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2611	IntrinsicInst &II) {
2612	auto *OpVal = II.getOperand(i_nocapture: `0`);
2613	auto *OpIndices = II.getOperand(i_nocapture: `1`);
2614	VectorType *VTy = cast<VectorType>(Val: II.getType());
2615
2616	// Check whether OpIndices is a constant splat value < minimal element count
2617	// of result.
2618	auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
2619	if (!SplatValue \|\|
2620	SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
2621	return std::nullopt;
2622
2623	// Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2624	// splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2625	auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
2626	auto *VectorSplat =
2627	IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
2628
2629	VectorSplat->takeName(V: &II);
2630	return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
2631	}
2632
2633	static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2634	IntrinsicInst &II) {
2635	Value A, B;
2636	Type *RetTy = II.getType();
2637	constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2638	constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2639
2640	// uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2641	// uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2642	if ((match(V: II.getArgOperand(i: `0`),
2643	P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
2644	match(V: II.getArgOperand(i: `1`),
2645	P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) \|\|
2646	(match(V: II.getArgOperand(i: `0`), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
2647	match(V: II.getArgOperand(i: `1`), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
2648	auto *TyA = cast<ScalableVectorType>(Val: A->getType());
2649	if (TyA == B->getType() &&
2650	RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
2651	auto *SubVec = IC.Builder.CreateInsertVector(
2652	DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: uint64_t(`0`));
2653	auto *ConcatVec = IC.Builder.CreateInsertVector(DstType: RetTy, SrcVec: SubVec, SubVec: B,
2654	Idx: TyA->getMinNumElements());
2655	ConcatVec->takeName(V: &II);
2656	return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
2657	}
2658	}
2659
2660	return std::nullopt;
2661	}
2662
2663	static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2664	IntrinsicInst &II) {
2665	// zip1(uzp1(A, B), uzp2(A, B)) --> A
2666	// zip2(uzp1(A, B), uzp2(A, B)) --> B
2667	Value A, B;
2668	if (match(V: II.getArgOperand(i: `0`),
2669	P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
2670	match(V: II.getArgOperand(i: `1`), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
2671	Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
2672	return IC.replaceInstUsesWith(
2673	I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2674
2675	return std::nullopt;
2676	}
2677
2678	static std::optional<Instruction *>
2679	instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
2680	Value *Mask = II.getOperand(i_nocapture: `0`);
2681	Value *BasePtr = II.getOperand(i_nocapture: `1`);
2682	Value *Index = II.getOperand(i_nocapture: `2`);
2683	Type *Ty = II.getType();
2684	Value *PassThru = ConstantAggregateZero::get(Ty);
2685
2686	// Contiguous gather => masked load.
2687	// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2688	// => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2689	Value *IndexBase;
2690	if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2691	Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: `1`)))) {
2692	Align Alignment =
2693	BasePtr->getPointerAlignment(DL: II.getDataLayout());
2694
2695	Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2696	Ptr: BasePtr, IdxList: IndexBase);
2697	CallInst *MaskedLoad =
2698	IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2699	MaskedLoad->takeName(V: &II);
2700	return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
2701	}
2702
2703	return std::nullopt;
2704	}
2705
2706	static std::optional<Instruction *>
2707	instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
2708	Value *Val = II.getOperand(i_nocapture: `0`);
2709	Value *Mask = II.getOperand(i_nocapture: `1`);
2710	Value *BasePtr = II.getOperand(i_nocapture: `2`);
2711	Value *Index = II.getOperand(i_nocapture: `3`);
2712	Type *Ty = Val->getType();
2713
2714	// Contiguous scatter => masked store.
2715	// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2716	// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2717	Value *IndexBase;
2718	if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
2719	Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: `1`)))) {
2720	Align Alignment =
2721	BasePtr->getPointerAlignment(DL: II.getDataLayout());
2722
2723	Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
2724	Ptr: BasePtr, IdxList: IndexBase);
2725	(void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2726
2727	return IC.eraseInstFromFunction(I&: II);
2728	}
2729
2730	return std::nullopt;
2731	}
2732
2733	static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2734	IntrinsicInst &II) {
2735	Type *Int32Ty = IC.Builder.getInt32Ty();
2736	Value *Pred = II.getOperand(i_nocapture: `0`);
2737	Value *Vec = II.getOperand(i_nocapture: `1`);
2738	Value *DivVec = II.getOperand(i_nocapture: `2`);
2739
2740	Value *SplatValue = getSplatValue(V: DivVec);
2741	ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
2742	if (!SplatConstantInt)
2743	return std::nullopt;
2744
2745	APInt Divisor = SplatConstantInt->getValue();
2746	const int64_t DivisorValue = Divisor.getSExtValue();
2747	if (DivisorValue == -`1`)
2748	return std::nullopt;
2749	if (DivisorValue == `1`)
2750	IC.replaceInstUsesWith(I&: II, V: Vec);
2751
2752	if (Divisor.isPowerOf2()) {
2753	Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2754	auto ASRD = IC.Builder.CreateIntrinsic(
2755	ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2756	return IC.replaceInstUsesWith(I&: II, V: ASRD);
2757	}
2758	if (Divisor.isNegatedPowerOf2()) {
2759	Divisor.negate();
2760	Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
2761	auto ASRD = IC.Builder.CreateIntrinsic(
2762	ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
2763	auto NEG = IC.Builder.CreateIntrinsic(
2764	ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
2765	return IC.replaceInstUsesWith(I&: II, V: NEG);
2766	}
2767
2768	return std::nullopt;
2769	}
2770
2771	bool SimplifyValuePattern(SmallVector<Value > &Vec, bool* AllowPoison) {
2772	size_t VecSize = Vec.size();
2773	if (VecSize == `1`)
2774	return true;
2775	if (!isPowerOf2_64(Value: VecSize))
2776	return false;
2777	size_t HalfVecSize = VecSize / `2`;
2778
2779	for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2780	RHS != Vec.end(); LHS++, RHS++) {
2781	if (LHS != nullptr* && RHS != nullptr*) {
2782	if (LHS == RHS)
2783	continue;
2784	else
2785	return false;
2786	}
2787	if (!AllowPoison)
2788	return false;
2789	if (LHS == nullptr* && RHS != nullptr*)
2790	LHS = RHS;
2791	}
2792
2793	Vec.resize(N: HalfVecSize);
2794	SimplifyValuePattern(Vec, AllowPoison);
2795	return true;
2796	}
2797
2798	// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2799	// to dupqlane(f64(C)) where C is A concatenated with B
2800	static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2801	IntrinsicInst &II) {
2802	Value CurrentInsertElt = nullptr, Default = nullptr;
2803	if (!match(V: II.getOperand(i_nocapture: `0`),
2804	P: m_Intrinsic<Intrinsic::vector_insert>(
2805	Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) \|\|
2806	!isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
2807	return std::nullopt;
2808	auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());
2809
2810	// Insert the scalars into a container ordered by InsertElement index
2811	SmallVector<Value > Elts(IIScalableTy->getMinNumElements(), nullptr*);
2812	while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
2813	auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: `2`));
2814	Elts [Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: `1`);
2815	CurrentInsertElt = InsertElt->getOperand(i_nocapture: `0`);
2816	}
2817
2818	bool AllowPoison =
2819	isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
2820	if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
2821	return std::nullopt;
2822
2823	// Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2824	Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
2825	for (size_t I = `0`; I < Elts.size(); I++) {
2826	if (Elts [I] == nullptr)
2827	continue;
2828	InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts [I],
2829	Idx: IC.Builder.getInt64(C: I));
2830	}
2831	if (InsertEltChain == nullptr)
2832	return std::nullopt;
2833
2834	// Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2835	// value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2836	// be bitcast to a type wide enough to fit the sequence, be splatted, and then
2837	// be narrowed back to the original type.
2838	unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2839	unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2840	IIScalableTy->getMinNumElements() /
2841	PatternWidth;
2842
2843	IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
2844	auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
2845	auto *WideShuffleMaskTy =
2846	ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);
2847
2848	auto InsertSubvector = IC.Builder.CreateInsertVector(
2849	DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain,
2850	Idx: uint64_t(`0`));
2851	auto WideBitcast =
2852	IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
2853	auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
2854	auto WideShuffle = IC.Builder.CreateShuffleVector(
2855	V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
2856	auto NarrowBitcast =
2857	IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());
2858
2859	return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
2860	}
2861
2862	static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2863	IntrinsicInst &II) {
2864	Value *A = II.getArgOperand(i: `0`);
2865	Value *B = II.getArgOperand(i: `1`);
2866	if (A == B)
2867	return IC.replaceInstUsesWith(I&: II, V: A);
2868
2869	return std::nullopt;
2870	}
2871
2872	static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2873	IntrinsicInst &II) {
2874	Value *Pred = II.getOperand(i_nocapture: `0`);
2875	Value *Vec = II.getOperand(i_nocapture: `1`);
2876	Value *Shift = II.getOperand(i_nocapture: `2`);
2877
2878	// Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2879	Value AbsPred, MergedValue;
2880	if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2881	Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
2882	!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2883	Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))
2884
2885	return std::nullopt;
2886
2887	// Transform is valid if any of the following are true:
2888	// The ABS merge value is an undef or non-negative*
2889	// The ABS predicate is all active*
2890	// The ABS predicate and the SRSHL predicates are the same*
2891	if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
2892	AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
2893	return std::nullopt;
2894
2895	// Only valid when the shift amount is non-negative, otherwise the rounding
2896	// behaviour of SRSHL cannot be ignored.
2897	if (!match(V: Shift, P: m_NonNegative()))
2898	return std::nullopt;
2899
2900	auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
2901	Types: {II.getType()}, Args: {Pred, Vec, Shift});
2902
2903	return IC.replaceInstUsesWith(I&: II, V: LSL);
2904	}
2905
2906	static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
2907	IntrinsicInst &II) {
2908	Value *Vec = II.getOperand(i_nocapture: `0`);
2909
2910	if (getSplatValue(V: Vec) == II.getOperand(i_nocapture: `1`))
2911	return IC.replaceInstUsesWith(I&: II, V: Vec);
2912
2913	return std::nullopt;
2914	}
2915
2916	static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
2917	IntrinsicInst &II) {
2918	// If this barrier is post-dominated by identical one we can remove it
2919	auto *NI = II.getNextNode();
2920	unsigned LookaheadThreshold = DMBLookaheadThreshold;
2921	auto CanSkipOver = [](Instruction *I) {
2922	return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
2923	};
2924	while (LookaheadThreshold-- && CanSkipOver (NI)) {
2925	auto *NIBB = NI->getParent();
2926	NI = NI->getNextNode();
2927	if (!NI) {
2928	if (auto *SuccBB = NIBB->getUniqueSuccessor())
2929	NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
2930	else
2931	break;
2932	}
2933	}
2934	auto *NextII = dyn_cast_or_null<IntrinsicInst>(Val: NI);
2935	if (NextII && II.isIdenticalTo(I: NextII))
2936	return IC.eraseInstFromFunction(I&: II);
2937
2938	return std::nullopt;
2939	}
2940
2941	static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
2942	IntrinsicInst &II) {
2943	return IC.replaceInstUsesWith(
2944	I&: II,
2945	V: IC.Builder.CreateIntrinsic(ID: Intrinsic::get_active_lane_mask,
2946	Types: {II.getType(), II.getOperand(i_nocapture: `0`)->getType()},
2947	Args: {II.getOperand(i_nocapture: `0`), II.getOperand(i_nocapture: `1`)}));
2948	}
2949
2950	static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
2951	IntrinsicInst &II) {
2952	if (match(V: II.getOperand(i_nocapture: `0`), P: m_ConstantInt<AArch64SVEPredPattern::all>()))
2953	return IC.replaceInstUsesWith(I&: II, V: Constant::getAllOnesValue(Ty: II.getType()));
2954	return std::nullopt;
2955	}
2956
2957	static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
2958	IntrinsicInst &II,
2959	unsigned NumBits) {
2960	Value *Passthru = II.getOperand(i_nocapture: `0`);
2961	Value *Pg = II.getOperand(i_nocapture: `1`);
2962	Value *Op = II.getOperand(i_nocapture: `2`);
2963
2964	// Convert UXT[BHW] to AND.
2965	if (isa<UndefValue>(Val: Passthru) \|\| isAllActivePredicate(Pred: Pg)) {
2966	auto *Ty = cast<VectorType>(Val: II.getType());
2967	auto MaskValue = APInt::getLowBitsSet(numBits: Ty->getScalarSizeInBits(), loBitsSet: NumBits);
2968	auto *Mask = ConstantInt::get(Ty, V: MaskValue);
2969	auto *And = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_and_u, Types: {Ty},
2970	Args: {Pg, Op, Mask});
2971	return IC.replaceInstUsesWith(I&: II, V: And);
2972	}
2973
2974	return std::nullopt;
2975	}
2976
2977	static std::optional<Instruction *>
2978	instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) {
2979	SMEAttrs FnSMEAttrs(*II.getFunction());
2980	bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
2981	if (IsStreaming \|\| !FnSMEAttrs.hasStreamingCompatibleInterface())
2982	return IC.replaceInstUsesWith(
2983	I&: II, V: ConstantInt::getBool(Ty: II.getType(), V: IsStreaming));
2984	return std::nullopt;
2985	}
2986
2987	std::optional<Instruction *>
2988	AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2989	IntrinsicInst &II) const {
2990	const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II);
2991	if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
2992	return I;
2993
2994	Intrinsic::ID IID = II.getIntrinsicID();
2995	switch (IID) {
2996	default:
2997	break;
2998	case Intrinsic::aarch64_dmb:
2999	return instCombineDMB(IC, II);
3000	case Intrinsic::aarch64_neon_fmaxnm:
3001	case Intrinsic::aarch64_neon_fminnm:
3002	return instCombineMaxMinNM(IC, II);
3003	case Intrinsic::aarch64_sve_convert_from_svbool:
3004	return instCombineConvertFromSVBool(IC, II);
3005	case Intrinsic::aarch64_sve_dup:
3006	return instCombineSVEDup(IC, II);
3007	case Intrinsic::aarch64_sve_dup_x:
3008	return instCombineSVEDupX(IC, II);
3009	case Intrinsic::aarch64_sve_cmpne:
3010	case Intrinsic::aarch64_sve_cmpne_wide:
3011	return instCombineSVECmpNE(IC, II);
3012	case Intrinsic::aarch64_sve_rdffr:
3013	return instCombineRDFFR(IC, II);
3014	case Intrinsic::aarch64_sve_lasta:
3015	case Intrinsic::aarch64_sve_lastb:
3016	return instCombineSVELast(IC, II);
3017	case Intrinsic::aarch64_sve_clasta_n:
3018	case Intrinsic::aarch64_sve_clastb_n:
3019	return instCombineSVECondLast(IC, II);
3020	case Intrinsic::aarch64_sve_cntd:
3021	return instCombineSVECntElts(IC, II, NumElts: `2`);
3022	case Intrinsic::aarch64_sve_cntw:
3023	return instCombineSVECntElts(IC, II, NumElts: `4`);
3024	case Intrinsic::aarch64_sve_cnth:
3025	return instCombineSVECntElts(IC, II, NumElts: `8`);
3026	case Intrinsic::aarch64_sve_cntb:
3027	return instCombineSVECntElts(IC, II, NumElts: `16`);
3028	case Intrinsic::aarch64_sme_cntsd:
3029	return instCombineSMECntsd(IC, II, ST);
3030	case Intrinsic::aarch64_sve_ptest_any:
3031	case Intrinsic::aarch64_sve_ptest_first:
3032	case Intrinsic::aarch64_sve_ptest_last:
3033	return instCombineSVEPTest(IC, II);
3034	case Intrinsic::aarch64_sve_fadd:
3035	return instCombineSVEVectorFAdd(IC, II);
3036	case Intrinsic::aarch64_sve_fadd_u:
3037	return instCombineSVEVectorFAddU(IC, II);
3038	case Intrinsic::aarch64_sve_fmul_u:
3039	return instCombineSVEVectorBinOp(IC, II);
3040	case Intrinsic::aarch64_sve_fsub:
3041	return instCombineSVEVectorFSub(IC, II);
3042	case Intrinsic::aarch64_sve_fsub_u:
3043	return instCombineSVEVectorFSubU(IC, II);
3044	case Intrinsic::aarch64_sve_add:
3045	return instCombineSVEVectorAdd(IC, II);
3046	case Intrinsic::aarch64_sve_add_u:
3047	return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3048	Intrinsic::aarch64_sve_mla_u>(
3049	IC, II, MergeIntoAddendOp: true);
3050	case Intrinsic::aarch64_sve_sub:
3051	return instCombineSVEVectorSub(IC, II);
3052	case Intrinsic::aarch64_sve_sub_u:
3053	return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3054	Intrinsic::aarch64_sve_mls_u>(
3055	IC, II, MergeIntoAddendOp: true);
3056	case Intrinsic::aarch64_sve_tbl:
3057	return instCombineSVETBL(IC, II);
3058	case Intrinsic::aarch64_sve_uunpkhi:
3059	case Intrinsic::aarch64_sve_uunpklo:
3060	case Intrinsic::aarch64_sve_sunpkhi:
3061	case Intrinsic::aarch64_sve_sunpklo:
3062	return instCombineSVEUnpack(IC, II);
3063	case Intrinsic::aarch64_sve_uzp1:
3064	return instCombineSVEUzp1(IC, II);
3065	case Intrinsic::aarch64_sve_zip1:
3066	case Intrinsic::aarch64_sve_zip2:
3067	return instCombineSVEZip(IC, II);
3068	case Intrinsic::aarch64_sve_ld1_gather_index:
3069	return instCombineLD1GatherIndex(IC, II);
3070	case Intrinsic::aarch64_sve_st1_scatter_index:
3071	return instCombineST1ScatterIndex(IC, II);
3072	case Intrinsic::aarch64_sve_ld1:
3073	return instCombineSVELD1(IC, II, DL);
3074	case Intrinsic::aarch64_sve_st1:
3075	return instCombineSVEST1(IC, II, DL);
3076	case Intrinsic::aarch64_sve_sdiv:
3077	return instCombineSVESDIV(IC, II);
3078	case Intrinsic::aarch64_sve_sel:
3079	return instCombineSVESel(IC, II);
3080	case Intrinsic::aarch64_sve_srshl:
3081	return instCombineSVESrshl(IC, II);
3082	case Intrinsic::aarch64_sve_dupq_lane:
3083	return instCombineSVEDupqLane(IC, II);
3084	case Intrinsic::aarch64_sve_insr:
3085	return instCombineSVEInsr(IC, II);
3086	case Intrinsic::aarch64_sve_whilelo:
3087	return instCombineWhilelo(IC, II);
3088	case Intrinsic::aarch64_sve_ptrue:
3089	return instCombinePTrue(IC, II);
3090	case Intrinsic::aarch64_sve_uxtb:
3091	return instCombineSVEUxt(IC, II, NumBits: `8`);
3092	case Intrinsic::aarch64_sve_uxth:
3093	return instCombineSVEUxt(IC, II, NumBits: `16`);
3094	case Intrinsic::aarch64_sve_uxtw:
3095	return instCombineSVEUxt(IC, II, NumBits: `32`);
3096	case Intrinsic::aarch64_sme_in_streaming_mode:
3097	return instCombineInStreamingMode(IC, II);
3098	}
3099
3100	return std::nullopt;
3101	}
3102
3103	std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3104	InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3105	APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3106	std::function<void(Instruction , unsigned*, APInt, APInt &)>
3107	SimplifyAndSetOp) const {
3108	switch (II.getIntrinsicID()) {
3109	default:
3110	break;
3111	case Intrinsic::aarch64_neon_fcvtxn:
3112	case Intrinsic::aarch64_neon_rshrn:
3113	case Intrinsic::aarch64_neon_sqrshrn:
3114	case Intrinsic::aarch64_neon_sqrshrun:
3115	case Intrinsic::aarch64_neon_sqshrn:
3116	case Intrinsic::aarch64_neon_sqshrun:
3117	case Intrinsic::aarch64_neon_sqxtn:
3118	case Intrinsic::aarch64_neon_sqxtun:
3119	case Intrinsic::aarch64_neon_uqrshrn:
3120	case Intrinsic::aarch64_neon_uqshrn:
3121	case Intrinsic::aarch64_neon_uqxtn:
3122	SimplifyAndSetOp (&II, `0`, OrigDemandedElts, UndefElts);
3123	break;
3124	}
3125
3126	return std::nullopt;
3127	}
3128
3129	bool AArch64TTIImpl::enableScalableVectorization() const {
3130	return ST->isSVEAvailable() \|\| (ST->isSVEorStreamingSVEAvailable() &&
3131	EnableScalableAutovecInStreamingMode);
3132	}
3133
3134	TypeSize
3135	AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
3136	switch (K) {
3137	case TargetTransformInfo::RGK_Scalar:
3138	return TypeSize::getFixed(ExactSize: `64`);
3139	case TargetTransformInfo::RGK_FixedWidthVector:
3140	if (ST->useSVEForFixedLengthVectors() &&
3141	(ST->isSVEAvailable() \|\| EnableFixedwidthAutovecInStreamingMode))
3142	return TypeSize::getFixed(
3143	ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: `128u`));
3144	else if (ST->isNeonAvailable())
3145	return TypeSize::getFixed(ExactSize: `128`);
3146	else
3147	return TypeSize::getFixed(ExactSize: `0`);
3148	case TargetTransformInfo::RGK_ScalableVector:
3149	if (ST->isSVEAvailable() \|\| (ST->isSVEorStreamingSVEAvailable() &&
3150	EnableScalableAutovecInStreamingMode))
3151	return TypeSize::getScalable(MinimumSize: `128`);
3152	else
3153	return TypeSize::getScalable(MinimumSize: `0`);
3154	}
3155	llvm_unreachable("Unsupported register kind");
3156	}
3157
3158	bool AArch64TTIImpl::isSingleExtWideningInstruction(
3159	unsigned Opcode, Type DstTy, ArrayRef<const* Value *> Args,
3160	Type SrcOverrideTy) const* {
3161	// A helper that returns a vector type from the given type. The number of
3162	// elements in type Ty determines the vector width.
3163	auto toVectorTy = [&](Type *ArgTy) {
3164	return VectorType::get(ElementType: ArgTy->getScalarType(),
3165	EC: cast<VectorType>(Val: DstTy)->getElementCount());
3166	};
3167
3168	// Exit early if DstTy is not a vector type whose elements are one of [i16,
3169	// i32, i64]. SVE doesn't generally have the same set of instructions to
3170	// perform an extend with the add/sub/mul. There are SMULLB style
3171	// instructions, but they operate on top/bottom, requiring some sort of lane
3172	// interleaving to be used with zext/sext.
3173	unsigned DstEltSize = DstTy->getScalarSizeInBits();
3174	if (!useNeonVector(Ty: DstTy) \|\| Args.size() != `2` \|\|
3175	(DstEltSize != `16` && DstEltSize != `32` && DstEltSize != `64`))
3176	return false;
3177
3178	Type *SrcTy = SrcOverrideTy;
3179	switch (Opcode) {
3180	case Instruction::Add: // UADDW(2), SADDW(2).
3181	case Instruction::Sub: { // USUBW(2), SSUBW(2).
3182	// The second operand needs to be an extend
3183	if (isa<SExtInst>(Val: Args [`1`]) \|\| isa<ZExtInst>(Val: Args [`1`])) {
3184	if (!SrcTy)
3185	SrcTy =
3186	toVectorTy (cast<Instruction>(Val: Args [`1`])->getOperand(i: `0`)->getType());
3187	break;
3188	}
3189
3190	if (Opcode == Instruction::Sub)
3191	return false;
3192
3193	// UADDW(2), SADDW(2) can be commutted.
3194	if (isa<SExtInst>(Val: Args [`0`]) \|\| isa<ZExtInst>(Val: Args [`0`])) {
3195	if (!SrcTy)
3196	SrcTy =
3197	toVectorTy (cast<Instruction>(Val: Args [`0`])->getOperand(i: `0`)->getType());
3198	break;
3199	}
3200	return false;
3201	}
3202	default:
3203	return false;
3204	}
3205
3206	// Legalize the destination type and ensure it can be used in a widening
3207	// operation.
3208	auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
3209	if (!DstTyL.second.isVector() \|\| DstEltSize != DstTy->getScalarSizeInBits())
3210	return false;
3211
3212	// Legalize the source type and ensure it can be used in a widening
3213	// operation.
3214	assert(SrcTy && "Expected some SrcTy");
3215	auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
3216	unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3217	if (!SrcTyL.second.isVector() \|\| SrcElTySize != SrcTy->getScalarSizeInBits())
3218	return false;
3219
3220	// Get the total number of vector elements in the legalized types.
3221	InstructionCost NumDstEls =
3222	DstTyL.first * DstTyL.second.getVectorMinNumElements();
3223	InstructionCost NumSrcEls =
3224	SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3225
3226	// Return true if the legalized types have the same number of vector elements
3227	// and the destination element type size is twice that of the source type.
3228	return NumDstEls == NumSrcEls && `2` * SrcElTySize == DstEltSize;
3229	}
3230
3231	Type AArch64TTIImpl::isBinExtWideningInstruction(unsigned* Opcode, Type *DstTy,
3232	ArrayRef<const Value *> Args,
3233	Type SrcOverrideTy) const* {
3234	if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3235	Opcode != Instruction::Mul)
3236	return nullptr;
3237
3238	// Exit early if DstTy is not a vector type whose elements are one of [i16,
3239	// i32, i64]. SVE doesn't generally have the same set of instructions to
3240	// perform an extend with the add/sub/mul. There are SMULLB style
3241	// instructions, but they operate on top/bottom, requiring some sort of lane
3242	// interleaving to be used with zext/sext.
3243	unsigned DstEltSize = DstTy->getScalarSizeInBits();
3244	if (!useNeonVector(Ty: DstTy) \|\| Args.size() != `2` \|\|
3245	(DstEltSize != `16` && DstEltSize != `32` && DstEltSize != `64`))
3246	return nullptr;
3247
3248	auto getScalarSizeWithOverride = [&](const Value *V) {
3249	if (SrcOverrideTy)
3250	return SrcOverrideTy->getScalarSizeInBits();
3251	return cast<Instruction>(Val: V)
3252	->getOperand(i: `0`)
3253	->getType()
3254	->getScalarSizeInBits();
3255	};
3256
3257	unsigned MaxEltSize = `0`;
3258	if ((isa<SExtInst>(Val: Args [`0`]) && isa<SExtInst>(Val: Args [`1`])) \|\|
3259	(isa<ZExtInst>(Val: Args [`0`]) && isa<ZExtInst>(Val: Args [`1`]))) {
3260	unsigned EltSize0 = getScalarSizeWithOverride (Args [`0`]);
3261	unsigned EltSize1 = getScalarSizeWithOverride (Args [`1`]);
3262	MaxEltSize = std::max(a: EltSize0, b: EltSize1);
3263	} else if (isa<SExtInst, ZExtInst>(Val: Args [`0`]) &&
3264	isa<SExtInst, ZExtInst>(Val: Args [`1`])) {
3265	unsigned EltSize0 = getScalarSizeWithOverride (Args [`0`]);
3266	unsigned EltSize1 = getScalarSizeWithOverride (Args [`1`]);
3267	// mul(sext, zext) will become smull(sext, zext) if the extends are large
3268	// enough.
3269	if (EltSize0 >= DstEltSize / `2` \|\| EltSize1 >= DstEltSize / `2`)
3270	return nullptr;
3271	MaxEltSize = DstEltSize / `2`;
3272	} else if (Opcode == Instruction::Mul &&
3273	(isa<ZExtInst>(Val: Args [`0`]) \|\| isa<ZExtInst>(Val: Args [`1`]))) {
3274	// If one of the operands is a Zext and the other has enough zero bits
3275	// to be treated as unsigned, we can still generate a umull, meaning the
3276	// zext is free.
3277	KnownBits Known =
3278	computeKnownBits(V: isa<ZExtInst>(Val: Args [`0`]) ? Args [`1`] : Args [`0`], DL);
3279	if (Args [`0`]->getType()->getScalarSizeInBits() -
3280	Known.Zero.countLeadingOnes() >
3281	DstTy->getScalarSizeInBits() / `2`)
3282	return nullptr;
3283
3284	MaxEltSize =
3285	getScalarSizeWithOverride (isa<ZExtInst>(Val: Args [`0`]) ? Args [`0`] : Args [`1`]);
3286	} else
3287	return nullptr;
3288
3289	if (MaxEltSize * `2` > DstEltSize)
3290	return nullptr;
3291
3292	Type ExtTy = DstTy->getWithNewBitWidth(NewBitWidth: MaxEltSize `2`);
3293	if (ExtTy->getPrimitiveSizeInBits() <= `64`)
3294	return nullptr;
3295	return ExtTy;
3296	}
3297
3298	// s/urhadd instructions implement the following pattern, making the
3299	// extends free:
3300	// %x = add ((zext i8 -> i16), 1)
3301	// %y = (zext i8 -> i16)
3302	// trunc i16 (lshr (add %x, %y), 1) -> i8
3303	//
3304	bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction ExtUser, Type Dst,
3305	Type Src) const* {
3306	// The source should be a legal vector type.
3307	if (!Src->isVectorTy() \|\| !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) \|\|
3308	(Src->isScalableTy() && !ST->hasSVE2()))
3309	return false;
3310
3311	if (ExtUser->getOpcode() != Instruction::Add \|\| !ExtUser->hasOneUse())
3312	return false;
3313
3314	// Look for trunc/shl/add before trying to match the pattern.
3315	const Instruction *Add = ExtUser;
3316	auto *AddUser =
3317	dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3318	if (AddUser && AddUser->getOpcode() == Instruction::Add)
3319	Add = AddUser;
3320
3321	auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
3322	if (!Shr \|\| Shr->getOpcode() != Instruction::LShr)
3323	return false;
3324
3325	auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
3326	if (!Trunc \|\| Trunc->getOpcode() != Instruction::Trunc \|\|
3327	Src->getScalarSizeInBits() !=
3328	cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
3329	return false;
3330
3331	// Try to match the whole pattern. Ext could be either the first or second
3332	// m_ZExtOrSExt matched.
3333	Instruction Ex1, Ex2;
3334	if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
3335	R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: `1`))))))
3336	return false;
3337
3338	// Ensure both extends are of the same type
3339	if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
3340	Ex1->getOpcode() == Ex2->getOpcode())
3341	return true;
3342
3343	return false;
3344	}
3345
3346	InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
3347	Type *Src,
3348	TTI::CastContextHint CCH,
3349	TTI::TargetCostKind CostKind,
3350	const Instruction I) const* {
3351	int ISD = TLI->InstructionOpcodeToISD(Opcode);
3352	assert(ISD && "Invalid opcode");
3353	// If the cast is observable, and it is used by a widening instruction (e.g.,
3354	// uaddl, saddw, etc.), it may be free.
3355	if (I && I->hasOneUser()) {
3356	auto SingleUser = cast<Instruction>(Val: I->user_begin());
3357	SmallVector<const Value *, `4`> Operands(SingleUser->operand_values());
3358	if (Type *ExtTy = isBinExtWideningInstruction(
3359	Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3360	SrcOverrideTy: Src != I->getOperand(i: `0`)->getType() ? Src : nullptr)) {
3361	// The cost from Src->Src2 needs to be added if required, the cost from*
3362	// Src2->ExtTy is free.*
3363	if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * `2`) {
3364	Type *DoubleSrcTy =
3365	Src->getWithNewBitWidth(NewBitWidth: Src->getScalarSizeInBits() * `2`);
3366	return getCastInstrCost(Opcode, Dst: DoubleSrcTy, Src,
3367	CCH: TTI::CastContextHint::None, CostKind);
3368	}
3369
3370	return `0`;
3371	}
3372
3373	if (isSingleExtWideningInstruction(
3374	Opcode: SingleUser->getOpcode(), DstTy: Dst, Args: Operands,
3375	SrcOverrideTy: Src != I->getOperand(i: `0`)->getType() ? Src : nullptr)) {
3376	// For adds only count the second operand as free if both operands are
3377	// extends but not the same operation. (i.e both operands are not free in
3378	// add(sext, zext)).
3379	if (SingleUser->getOpcode() == Instruction::Add) {
3380	if (I == SingleUser->getOperand(i: `1`) \|\|
3381	(isa<CastInst>(Val: SingleUser->getOperand(i: `1`)) &&
3382	cast<CastInst>(Val: SingleUser->getOperand(i: `1`))->getOpcode() == Opcode))
3383	return `0`;
3384	} else {
3385	// Others are free so long as isSingleExtWideningInstruction
3386	// returned true.
3387	return `0`;
3388	}
3389	}
3390
3391	// The cast will be free for the s/urhadd instructions
3392	if ((isa<ZExtInst>(Val: I) \|\| isa<SExtInst>(Val: I)) &&
3393	isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
3394	return `0`;
3395	}
3396
3397	EVT SrcTy = TLI->getValueType(DL, Ty: Src);
3398	EVT DstTy = TLI->getValueType(DL, Ty: Dst);
3399
3400	if (!SrcTy.isSimple() \|\| !DstTy.isSimple())
3401	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3402
3403	// For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3404	// we use fcvtx under SVE2. Give them invalid costs.
3405	if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3406	ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3407	DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3408	return InstructionCost::getInvalid();
3409
3410	static const TypeConversionCostTblEntry BF16Tbl[] = {
3411	{.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: `1`}, // bfcvt
3412	{.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: `1`}, // bfcvt
3413	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: `1`}, // bfcvtn
3414	{.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: `2`}, // bfcvtn+bfcvtn2
3415	{.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: `2`}, // bfcvtn+fcvtn
3416	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: `3`}, // fcvtn+fcvtl2+bfcvtn
3417	{.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: `6`}, // 2 fcvtn+fcvtn2+bfcvtn*
3418	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: `1`}, // bfcvt
3419	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: `1`}, // bfcvt
3420	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: `3`}, // bfcvt+bfcvt+uzp1
3421	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: `2`}, // fcvtx+bfcvt
3422	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: `5`}, // 2fcvtx+2bfcvt+uzp1
3423	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: `11`}, // 4fcvt+4bfcvt+3uzp*
3424	};
3425
3426	if (ST->hasBF16())
3427	if (const auto *Entry = ConvertCostTableLookup(
3428	Table: BF16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3429	return Entry->Cost;
3430
3431	// Symbolic constants for the SVE sitofp/uitofp entries in the table below
3432	// The cost of unpacking twice is artificially increased for now in order
3433	// to avoid regressions against NEON, which will use tbl instructions directly
3434	// instead of multiple layers of [s\|u]unpk[lo\|hi].
3435	// We use the unpacks in cases where the destination type is illegal and
3436	// requires splitting of the input, even if the input type itself is legal.
3437	const unsigned int SVE_EXT_COST = `1`;
3438	const unsigned int SVE_FCVT_COST = `1`;
3439	const unsigned int SVE_UNPACK_ONCE = `4`;
3440	const unsigned int SVE_UNPACK_TWICE = `16`;
3441
3442	static const TypeConversionCostTblEntry ConversionTbl[] = {
3443	{.ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: `1`}, // xtn
3444	{.ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: `1`}, // xtn
3445	{.ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: `1`}, // xtn
3446	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: `1`}, // xtn
3447	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: `3`}, // 2 xtn + 1 uzp1
3448	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: `1`}, // xtn
3449	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: `2`}, // 1 uzp1 + 1 xtn
3450	{.ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: `1`}, // 1 uzp1
3451	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: `1`}, // 1 xtn
3452	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: `2`}, // 1 uzp1 + 1 xtn
3453	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: `4`}, // 3 x uzp1 + xtn
3454	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: `1`}, // 1 uzp1
3455	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: `3`}, // 3 x uzp1
3456	{.ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: `2`}, // 2 x uzp1
3457	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: `1`}, // uzp1
3458	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: `3`}, // (2 + 1) x uzp1
3459	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: `7`}, // (4 + 2 + 1) x uzp1
3460	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: `2`}, // 2 x uzp1
3461	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: `6`}, // (4 + 2) x uzp1
3462	{.ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: `4`}, // 4 x uzp1
3463
3464	// Truncations on nxvmiN
3465	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i8, .Cost: `2`},
3466	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: `2`},
3467	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: `2`},
3468	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: `2`},
3469	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i8, .Cost: `2`},
3470	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: `2`},
3471	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: `2`},
3472	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: `5`},
3473	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i8, .Cost: `2`},
3474	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: `2`},
3475	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: `5`},
3476	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: `11`},
3477	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: `2`},
3478	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i16, .Cost: `0`},
3479	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i32, .Cost: `0`},
3480	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i8, .Src: MVT::nxv2i64, .Cost: `0`},
3481	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: `0`},
3482	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i64, .Cost: `0`},
3483	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: `0`},
3484	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i16, .Cost: `0`},
3485	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i32, .Cost: `0`},
3486	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i8, .Src: MVT::nxv4i64, .Cost: `1`},
3487	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: `0`},
3488	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i64, .Cost: `1`},
3489	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: `1`},
3490	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i16, .Cost: `0`},
3491	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i32, .Cost: `1`},
3492	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i8, .Src: MVT::nxv8i64, .Cost: `3`},
3493	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: `1`},
3494	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i64, .Cost: `3`},
3495	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i16, .Cost: `1`},
3496	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i32, .Cost: `3`},
3497	{.ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i8, .Src: MVT::nxv16i64, .Cost: `7`},
3498
3499	// The number of shll instructions for the extension.
3500	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: `3`},
3501	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: `3`},
3502	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: `2`},
3503	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: `2`},
3504	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: `3`},
3505	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: `3`},
3506	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: `2`},
3507	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: `2`},
3508	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: `7`},
3509	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: `7`},
3510	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: `6`},
3511	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: `6`},
3512	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: `2`},
3513	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: `2`},
3514	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: `6`},
3515	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: `6`},
3516
3517	// FP Ext and trunc
3518	{.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f32, .Cost: `1`}, // fcvt
3519	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f32, .Cost: `1`}, // fcvtl
3520	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f32, .Cost: `2`}, // fcvtl+fcvtl2
3521	// FP16
3522	{.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::f16, .Cost: `1`}, // fcvt
3523	{.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::f16, .Cost: `1`}, // fcvt
3524	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4f16, .Cost: `1`}, // fcvtl
3525	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8f16, .Cost: `2`}, // fcvtl+fcvtl2
3526	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2f16, .Cost: `2`}, // fcvtl+fcvtl
3527	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4f16, .Cost: `3`}, // fcvtl+fcvtl2+fcvtl
3528	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8f16, .Cost: `6`}, // 2 fcvtl+fcvtl2+fcvtl*
3529	// BF16 (uses shift)
3530	{.ISD: ISD::FP_EXTEND, .Dst: MVT::f32, .Src: MVT::bf16, .Cost: `1`}, // shl
3531	{.ISD: ISD::FP_EXTEND, .Dst: MVT::f64, .Src: MVT::bf16, .Cost: `2`}, // shl+fcvt
3532	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f32, .Src: MVT::v4bf16, .Cost: `1`}, // shll
3533	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f32, .Src: MVT::v8bf16, .Cost: `2`}, // shll+shll2
3534	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v2f64, .Src: MVT::v2bf16, .Cost: `2`}, // shll+fcvtl
3535	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v4f64, .Src: MVT::v4bf16, .Cost: `3`}, // shll+fcvtl+fcvtl2
3536	{.ISD: ISD::FP_EXTEND, .Dst: MVT::v8f64, .Src: MVT::v8bf16, .Cost: `6`}, // 2 shll+fcvtl+fcvtl2*
3537	// FP Ext and trunc
3538	{.ISD: ISD::FP_ROUND, .Dst: MVT::f32, .Src: MVT::f64, .Cost: `1`}, // fcvt
3539	{.ISD: ISD::FP_ROUND, .Dst: MVT::v2f32, .Src: MVT::v2f64, .Cost: `1`}, // fcvtn
3540	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4f32, .Src: MVT::v4f64, .Cost: `2`}, // fcvtn+fcvtn2
3541	// FP16
3542	{.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f32, .Cost: `1`}, // fcvt
3543	{.ISD: ISD::FP_ROUND, .Dst: MVT::f16, .Src: MVT::f64, .Cost: `1`}, // fcvt
3544	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f32, .Cost: `1`}, // fcvtn
3545	{.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f32, .Cost: `2`}, // fcvtn+fcvtn2
3546	{.ISD: ISD::FP_ROUND, .Dst: MVT::v2f16, .Src: MVT::v2f64, .Cost: `2`}, // fcvtn+fcvtn
3547	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4f16, .Src: MVT::v4f64, .Cost: `3`}, // fcvtn+fcvtn2+fcvtn
3548	{.ISD: ISD::FP_ROUND, .Dst: MVT::v8f16, .Src: MVT::v8f64, .Cost: `6`}, // 2 fcvtn+fcvtn2+fcvtn*
3549	// BF16 (more complex, with +bf16 is handled above)
3550	{.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f32, .Cost: `8`}, // Expansion is ~8 insns
3551	{.ISD: ISD::FP_ROUND, .Dst: MVT::bf16, .Src: MVT::f64, .Cost: `9`}, // fcvtn + above
3552	{.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f32, .Cost: `8`},
3553	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f32, .Cost: `8`},
3554	{.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f32, .Cost: `15`},
3555	{.ISD: ISD::FP_ROUND, .Dst: MVT::v2bf16, .Src: MVT::v2f64, .Cost: `9`},
3556	{.ISD: ISD::FP_ROUND, .Dst: MVT::v4bf16, .Src: MVT::v4f64, .Cost: `10`},
3557	{.ISD: ISD::FP_ROUND, .Dst: MVT::v8bf16, .Src: MVT::v8f64, .Cost: `19`},
3558
3559	// LowerVectorINT_TO_FP:
3560	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: `1`},
3561	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: `1`},
3562	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: `1`},
3563	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: `1`},
3564	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: `1`},
3565	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: `1`},
3566
3567	// SVE: to nxv2f16
3568	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3569	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3570	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3571	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3572	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3573	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i8,
3574	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3575	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3576	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3577	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3578
3579	// SVE: to nxv4f16
3580	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3581	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3582	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3583	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3584	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i8,
3585	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3586	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3587	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3588
3589	// SVE: to nxv8f16
3590	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3591	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3592	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3593	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i8,
3594	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3595	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f16, .Src: MVT::nxv8i16, .Cost: SVE_FCVT_COST},
3596
3597	// SVE: to nxv16f16
3598	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3599	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3600	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f16, .Src: MVT::nxv16i8,
3601	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3602
3603	// Complex: to v2f32
3604	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: `3`},
3605	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: `3`},
3606	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: `3`},
3607	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: `3`},
3608
3609	// SVE: to nxv2f32
3610	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3611	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3612	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3613	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3614	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3615	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i8,
3616	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3617	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3618	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3619	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3620
3621	// Complex: to v4f32
3622	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: `4`},
3623	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: `2`},
3624	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: `3`},
3625	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: `2`},
3626
3627	// SVE: to nxv4f32
3628	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3629	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3630	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3631	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3632	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i8,
3633	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3634	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i16, .Cost: SVE_FCVT_COST},
3635	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f32, .Src: MVT::nxv4i32, .Cost: SVE_FCVT_COST},
3636
3637	// Complex: to v8f32
3638	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: `10`},
3639	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: `4`},
3640	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: `10`},
3641	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: `4`},
3642
3643	// SVE: to nxv8f32
3644	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3645	.Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3646	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3647	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3648	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i8,
3649	.Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3650	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f32, .Src: MVT::nxv8i16,
3651	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3652
3653	// SVE: to nxv16f32
3654	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3655	.Cost: SVE_UNPACK_TWICE + `4` * SVE_FCVT_COST},
3656	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv16f32, .Src: MVT::nxv16i8,
3657	.Cost: SVE_UNPACK_TWICE + `4` * SVE_FCVT_COST},
3658
3659	// Complex: to v16f32
3660	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: `21`},
3661	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: `21`},
3662
3663	// Complex: to v2f64
3664	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: `4`},
3665	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: `4`},
3666	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: `2`},
3667	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: `4`},
3668	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: `4`},
3669	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: `2`},
3670
3671	// SVE: to nxv2f64
3672	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3673	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3674	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3675	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3676	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3677	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i8,
3678	.Cost: SVE_EXT_COST + SVE_FCVT_COST},
3679	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i16, .Cost: SVE_FCVT_COST},
3680	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i32, .Cost: SVE_FCVT_COST},
3681	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv2f64, .Src: MVT::nxv2i64, .Cost: SVE_FCVT_COST},
3682
3683	// Complex: to v4f64
3684	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: `4`},
3685	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: `4`},
3686
3687	// SVE: to nxv4f64
3688	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3689	.Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3690	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3691	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3692	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3693	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3694	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i8,
3695	.Cost: SVE_EXT_COST + SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3696	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i16,
3697	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3698	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv4f64, .Src: MVT::nxv4i32,
3699	.Cost: SVE_UNPACK_ONCE + `2` * SVE_FCVT_COST},
3700
3701	// SVE: to nxv8f64
3702	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3703	.Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + `4` * SVE_FCVT_COST},
3704	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3705	.Cost: SVE_UNPACK_TWICE + `4` * SVE_FCVT_COST},
3706	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i8,
3707	.Cost: SVE_EXT_COST + SVE_UNPACK_TWICE + `4` * SVE_FCVT_COST},
3708	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::nxv8f64, .Src: MVT::nxv8i16,
3709	.Cost: SVE_UNPACK_TWICE + `4` * SVE_FCVT_COST},
3710
3711	// LowerVectorFP_TO_INT
3712	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: `1`},
3713	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: `1`},
3714	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: `1`},
3715	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: `1`},
3716	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: `1`},
3717	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: `1`},
3718
3719	// Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3720	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: `2`},
3721	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: `1`},
3722	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: `1`},
3723	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: `2`},
3724	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: `1`},
3725	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: `1`},
3726
3727	// Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3728	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: `2`},
3729	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: `2`},
3730	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: `2`},
3731	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: `2`},
3732
3733	// Complex, from nxv2f32.
3734	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: `1`},
3735	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: `1`},
3736	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: `1`},
3737	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: `1`},
3738	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: `1`},
3739	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: `1`},
3740	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: `1`},
3741	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: `1`},
3742
3743	// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3744	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: `2`},
3745	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: `2`},
3746	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: `2`},
3747	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: `2`},
3748	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: `2`},
3749	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: `2`},
3750
3751	// Complex, from nxv2f64.
3752	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: `1`},
3753	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: `1`},
3754	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: `1`},
3755	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: `1`},
3756	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: `1`},
3757	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: `1`},
3758	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: `1`},
3759	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: `1`},
3760	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: `1`},
3761	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i1, .Src: MVT::nxv2f64, .Cost: `1`},
3762
3763	// Complex, from nxv4f32.
3764	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: `4`},
3765	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: `1`},
3766	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: `1`},
3767	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: `1`},
3768	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: `1`},
3769	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: `4`},
3770	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: `1`},
3771	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: `1`},
3772	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: `1`},
3773	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i1, .Src: MVT::nxv4f32, .Cost: `1`},
3774
3775	// Complex, from nxv8f64. Illegal -> illegal conversions not required.
3776	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: `7`},
3777	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: `7`},
3778	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: `7`},
3779	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: `7`},
3780
3781	// Complex, from nxv4f64. Illegal -> illegal conversions not required.
3782	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: `3`},
3783	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: `3`},
3784	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: `3`},
3785	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: `3`},
3786	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: `3`},
3787	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: `3`},
3788
3789	// Complex, from nxv8f32. Illegal -> illegal conversions not required.
3790	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: `3`},
3791	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: `3`},
3792	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: `3`},
3793	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: `3`},
3794
3795	// Complex, from nxv8f16.
3796	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: `10`},
3797	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: `4`},
3798	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: `1`},
3799	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: `1`},
3800	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: `1`},
3801	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: `10`},
3802	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: `4`},
3803	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: `1`},
3804	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: `1`},
3805	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i1, .Src: MVT::nxv8f16, .Cost: `1`},
3806
3807	// Complex, from nxv4f16.
3808	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: `4`},
3809	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: `1`},
3810	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: `1`},
3811	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: `1`},
3812	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: `4`},
3813	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: `1`},
3814	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: `1`},
3815	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: `1`},
3816
3817	// Complex, from nxv2f16.
3818	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: `1`},
3819	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: `1`},
3820	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: `1`},
3821	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: `1`},
3822	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: `1`},
3823	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: `1`},
3824	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: `1`},
3825	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: `1`},
3826
3827	// Truncate from nxvmf32 to nxvmf16.
3828	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: `1`},
3829	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: `1`},
3830	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: `3`},
3831
3832	// Truncate from nxvmf32 to nxvmbf16.
3833	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f32, .Cost: `8`},
3834	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f32, .Cost: `8`},
3835	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f32, .Cost: `17`},
3836
3837	// Truncate from nxvmf64 to nxvmf16.
3838	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: `1`},
3839	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: `3`},
3840	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: `7`},
3841
3842	// Truncate from nxvmf64 to nxvmbf16.
3843	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2bf16, .Src: MVT::nxv2f64, .Cost: `9`},
3844	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4bf16, .Src: MVT::nxv4f64, .Cost: `19`},
3845	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8bf16, .Src: MVT::nxv8f64, .Cost: `39`},
3846
3847	// Truncate from nxvmf64 to nxvmf32.
3848	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: `1`},
3849	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: `3`},
3850	{.ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: `6`},
3851
3852	// Extend from nxvmf16 to nxvmf32.
3853	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: `1`},
3854	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: `1`},
3855	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: `2`},
3856
3857	// Extend from nxvmbf16 to nxvmf32.
3858	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2bf16, .Cost: `1`}, // lsl
3859	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4bf16, .Cost: `1`}, // lsl
3860	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8bf16, .Cost: `4`}, // unpck+unpck+lsl+lsl
3861
3862	// Extend from nxvmf16 to nxvmf64.
3863	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: `1`},
3864	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: `2`},
3865	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: `4`},
3866
3867	// Extend from nxvmbf16 to nxvmf64.
3868	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2bf16, .Cost: `2`}, // lsl+fcvt
3869	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4bf16, .Cost: `6`}, // 2unpck+2lsl+2fcvt*
3870	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8bf16, .Cost: `14`}, // 6unpck+4lsl+4fcvt*
3871
3872	// Extend from nxvmf32 to nxvmf64.
3873	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: `1`},
3874	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: `2`},
3875	{.ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: `6`},
3876
3877	// Bitcasts from float to integer
3878	{.ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: `0`},
3879	{.ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: `0`},
3880	{.ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: `0`},
3881
3882	// Bitcasts from integer to float
3883	{.ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: `0`},
3884	{.ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: `0`},
3885	{.ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: `0`},
3886
3887	// Add cost for extending to illegal -too wide- scalable vectors.
3888	// zero/sign extend are implemented by multiple unpack operations,
3889	// where each operation has a cost of 1.
3890	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: `2`},
3891	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: `6`},
3892	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: `14`},
3893	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: `2`},
3894	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: `6`},
3895	{.ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: `2`},
3896
3897	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: `2`},
3898	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: `6`},
3899	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: `14`},
3900	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: `2`},
3901	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: `6`},
3902	{.ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: `2`},
3903	};
3904
3905	// We have to estimate a cost of fixed length operation upon
3906	// SVE registers(operations) with the number of registers required
3907	// for a fixed type to be represented upon SVE registers.
3908	EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
3909	if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3910	SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3911	ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
3912	std::pair<InstructionCost, MVT> LT =
3913	getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
3914	unsigned NumElements =
3915	AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3916	return LT.first *
3917	getCastInstrCost(
3918	Opcode,
3919	Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
3920	Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
3921	CostKind, I);
3922	}
3923
3924	if (const auto *Entry = ConvertCostTableLookup(
3925	Table: ConversionTbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3926	return Entry->Cost;
3927
3928	static const TypeConversionCostTblEntry FP16Tbl[] = {
3929	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: `1`}, // fcvtzs
3930	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: `1`},
3931	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: `1`}, // fcvtzs
3932	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: `1`},
3933	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: `2`}, // fcvtl+fcvtzs
3934	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: `2`},
3935	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: `2`}, // fcvtzs+xtn
3936	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: `2`},
3937	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: `1`}, // fcvtzs
3938	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: `1`},
3939	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: `4`}, // 2fcvtl+2fcvtzs
3940	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: `4`},
3941	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: `3`}, // 2fcvtzs+xtn*
3942	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: `3`},
3943	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: `2`}, // 2fcvtzs*
3944	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: `2`},
3945	{.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: `8`}, // 4fcvtl+4fcvtzs
3946	{.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: `8`},
3947	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: `2`}, // ushll + ucvtf
3948	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: `2`}, // sshll + scvtf
3949	{.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: `4`}, // 2 ushl(2) + 2 * ucvtf*
3950	{.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: `4`}, // 2 sshl(2) + 2 * scvtf*
3951	};
3952
3953	if (ST->hasFullFP16())
3954	if (const auto *Entry = ConvertCostTableLookup(
3955	Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
3956	return Entry->Cost;
3957
3958	// INT_TO_FP of i64->f32 will scalarize, which is required to avoid
3959	// double-rounding issues.
3960	if ((ISD == ISD::SINT_TO_FP \|\| ISD == ISD::UINT_TO_FP) &&
3961	DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > `32` &&
3962	isa<FixedVectorType>(Val: Dst) && isa<FixedVectorType>(Val: Src))
3963	return cast<FixedVectorType>(Val: Dst)->getNumElements() *
3964	getCastInstrCost(Opcode, Dst: Dst->getScalarType(),
3965	Src: Src->getScalarType(), CCH, CostKind) +
3966	BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Src), Insert: false,
3967	Extract: true, CostKind) +
3968	BaseT::getScalarizationOverhead(InTy: cast<FixedVectorType>(Val: Dst), Insert: true,
3969	Extract: false, CostKind);
3970
3971	if ((ISD == ISD::ZERO_EXTEND \|\| ISD == ISD::SIGN_EXTEND) &&
3972	CCH == TTI::CastContextHint::Masked &&
3973	ST->isSVEorStreamingSVEAvailable() &&
3974	TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
3975	TargetLowering::TypePromoteInteger &&
3976	TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
3977	TargetLowering::TypeSplitVector) {
3978	// The standard behaviour in the backend for these cases is to split the
3979	// extend up into two parts:
3980	// 1. Perform an extending load or masked load up to the legal type.
3981	// 2. Extend the loaded data to the final type.
3982	std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
3983	Type *LegalTy = EVT (SrcLT.second).getTypeForEVT(Context&: Src->getContext());
3984	InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
3985	Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
3986	InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
3987	Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
3988	return Part1 + Part2;
3989	}
3990
3991	// The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
3992	// but we also want to include the TTI::CastContextHint::Masked case too.
3993	if ((ISD == ISD::ZERO_EXTEND \|\| ISD == ISD::SIGN_EXTEND) &&
3994	CCH == TTI::CastContextHint::Masked &&
3995	ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
3996	CCH = TTI::CastContextHint::Normal;
3997
3998	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3999	}
4000
4001	InstructionCost
4002	AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
4003	VectorType VecTy, unsigned* Index,
4004	TTI::TargetCostKind CostKind) const {
4005
4006	// Make sure we were given a valid extend opcode.
4007	assert((Opcode == Instruction::SExt \|\| Opcode == Instruction::ZExt) &&
4008	"Invalid opcode");
4009
4010	// We are extending an element we extract from a vector, so the source type
4011	// of the extend is the element type of the vector.
4012	auto *Src = VecTy->getElementType();
4013
4014	// Sign- and zero-extends are for integer types only.
4015	assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4016
4017	// Get the cost for the extract. We compute the cost (if any) for the extend
4018	// below.
4019	InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
4020	CostKind, Index, Op0: nullptr, Op1: nullptr);
4021
4022	// Legalize the types.
4023	auto VecLT = getTypeLegalizationCost(Ty: VecTy);
4024	auto DstVT = TLI->getValueType(DL, Ty: Dst);
4025	auto SrcVT = TLI->getValueType(DL, Ty: Src);
4026
4027	// If the resulting type is still a vector and the destination type is legal,
4028	// we may get the extension for free. If not, get the default cost for the
4029	// extend.
4030	if (!VecLT.second.isVector() \|\| !TLI->isTypeLegal(VT: DstVT))
4031	return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4032	CostKind);
4033
4034	// The destination type should be larger than the element type. If not, get
4035	// the default cost for the extend.
4036	if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4037	return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4038	CostKind);
4039
4040	switch (Opcode) {
4041	default:
4042	llvm_unreachable("Opcode should be either SExt or ZExt");
4043
4044	// For sign-extends, we only need a smov, which performs the extension
4045	// automatically.
4046	case Instruction::SExt:
4047	return Cost;
4048
4049	// For zero-extends, the extend is performed automatically by a umov unless
4050	// the destination type is i64 and the element type is i8 or i16.
4051	case Instruction::ZExt:
4052	if (DstVT.getSizeInBits() != `64u` \|\| SrcVT.getSizeInBits() == `32u`)
4053	return Cost;
4054	}
4055
4056	// If we are unable to perform the extend for free, get the default cost.
4057	return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
4058	CostKind);
4059	}
4060
4061	InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
4062	TTI::TargetCostKind CostKind,
4063	const Instruction I) const* {
4064	if (CostKind != TTI::TCK_RecipThroughput)
4065	return Opcode == Instruction::PHI ? `0` : `1`;
4066	assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4067	// Branches are assumed to be predicted.
4068	return `0`;
4069	}
4070
4071	InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4072	unsigned Opcode, Type Val, TTI::TargetCostKind CostKind, unsigned* Index,
4073	const Instruction I, Value Scalar,
4074	ArrayRef<std::tuple<Value , User , int>> ScalarUserAndIdx,
4075	TTI::VectorInstrContext VIC) const {
4076	assert(Val->isVectorTy() && "This must be a vector type");
4077
4078	if (Index != -`1U`) {
4079	// Legalize the type.
4080	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
4081
4082	// This type is legalized to a scalar type.
4083	if (!LT.second.isVector())
4084	return `0`;
4085
4086	// The type may be split. For fixed-width vectors we can normalize the
4087	// index to the new type.
4088	if (LT.second.isFixedLengthVector()) {
4089	unsigned Width = LT.second.getVectorNumElements();
4090	Index = Index % Width;
4091	}
4092
4093	// The element at index zero is already inside the vector.
4094	// - For a insert-element or extract-element
4095	// instruction that extracts integers, an explicit FPR -> GPR move is
4096	// needed. So it has non-zero cost.
4097	if (Index == `0` && !Val->getScalarType()->isIntegerTy())
4098	return `0`;
4099
4100	// This is recognising a LD1 single-element structure to one lane of one
4101	// register instruction. I.e., if this is an `insertelement` instruction,
4102	// and its second operand is a load, then we will generate a LD1, which
4103	// are expensive instructions on some uArchs.
4104	if (VIC == TTI::VectorInstrContext::Load) {
4105	if (ST->hasFastLD1Single())
4106	return `0`;
4107	return CostKind == TTI::TCK_CodeSize
4108	? `0`
4109	: ST->getVectorInsertExtractBaseCost() + `1`;
4110	}
4111
4112	// i1 inserts and extract will include an extra cset or cmp of the vector
4113	// value. Increase the cost by 1 to account.
4114	if (Val->getScalarSizeInBits() == `1`)
4115	return CostKind == TTI::TCK_CodeSize
4116	? `2`
4117	: ST->getVectorInsertExtractBaseCost() + `1`;
4118
4119	// FIXME:
4120	// If the extract-element and insert-element instructions could be
4121	// simplified away (e.g., could be combined into users by looking at use-def
4122	// context), they have no cost. This is not done in the first place for
4123	// compile-time considerations.
4124	}
4125
4126	// In case of Neon, if there exists extractelement from lane != 0 such that
4127	// 1. extractelement does not necessitate a move from vector_reg -> GPR.
4128	// 2. extractelement result feeds into fmul.
4129	// 3. Other operand of fmul is an extractelement from lane 0 or lane
4130	// equivalent to 0.
4131	// then the extractelement can be merged with fmul in the backend and it
4132	// incurs no cost.
4133	// e.g.
4134	// define double @foo(<2 x double> %a) {
4135	// %1 = extractelement <2 x double> %a, i32 0
4136	// %2 = extractelement <2 x double> %a, i32 1
4137	// %res = fmul double %1, %2
4138	// ret double %res
4139	// }
4140	// %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4141	auto ExtractCanFuseWithFmul = [&]() {
4142	// We bail out if the extract is from lane 0.
4143	if (Index == `0`)
4144	return false;
4145
4146	// Check if the scalar element type of the vector operand of ExtractElement
4147	// instruction is one of the allowed types.
4148	auto IsAllowedScalarTy = [&](const Type *T) {
4149	return T->isFloatTy() \|\| T->isDoubleTy() \|\|
4150	(T->isHalfTy() && ST->hasFullFP16());
4151	};
4152
4153	// Check if the extractelement user is scalar fmul.
4154	auto IsUserFMulScalarTy = [](const Value *EEUser) {
4155	// Check if the user is scalar fmul.
4156	const auto *BO = dyn_cast<BinaryOperator>(Val: EEUser);
4157	return BO && BO->getOpcode() == BinaryOperator::FMul &&
4158	!BO->getType()->isVectorTy();
4159	};
4160
4161	// Check if the extract index is from lane 0 or lane equivalent to 0 for a
4162	// certain scalar type and a certain vector register width.
4163	auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4164	auto RegWidth =
4165	getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
4166	.getFixedValue();
4167	return Idx == `0` \|\| (RegWidth != `0` && (Idx * EltSz) % RegWidth == `0`);
4168	};
4169
4170	// Check if the type constraints on input vector type and result scalar type
4171	// of extractelement instruction are satisfied.
4172	if (!isa<FixedVectorType>(Val) \|\| !IsAllowedScalarTy(Val->getScalarType()))
4173	return false;
4174
4175	if (Scalar) {
4176	DenseMap<User , unsigned*> UserToExtractIdx;
4177	for (auto *U : Scalar->users()) {
4178	if (!IsUserFMulScalarTy(U))
4179	return false;
4180	// Recording entry for the user is important. Index value is not
4181	// important.
4182	UserToExtractIdx [U];
4183	}
4184	if (UserToExtractIdx.empty())
4185	return false;
4186	for (auto &[S, U, L] : ScalarUserAndIdx) {
4187	for (auto *U : S->users()) {
4188	if (UserToExtractIdx.contains(Val: U)) {
4189	auto *FMul = cast<BinaryOperator>(Val: U);
4190	auto *Op0 = FMul->getOperand(i_nocapture: `0`);
4191	auto *Op1 = FMul->getOperand(i_nocapture: `1`);
4192	if ((Op0 == S && Op1 == S) \|\| Op0 != S \|\| Op1 != S) {
4193	UserToExtractIdx [U] = L;
4194	break;
4195	}
4196	}
4197	}
4198	}
4199	for (auto &[U, L] : UserToExtractIdx) {
4200	if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4201	!IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4202	return false;
4203	}
4204	} else {
4205	const auto *EE = cast<ExtractElementInst>(Val: I);
4206
4207	const auto *IdxOp = dyn_cast<ConstantInt>(Val: EE->getIndexOperand());
4208	if (!IdxOp)
4209	return false;
4210
4211	return !EE->users().empty() && all_of(Range: EE->users(), P: [&](const User *U) {
4212	if (!IsUserFMulScalarTy(U))
4213	return false;
4214
4215	// Check if the other operand of extractelement is also extractelement
4216	// from lane equivalent to 0.
4217	const auto *BO = cast<BinaryOperator>(Val: U);
4218	const auto *OtherEE = dyn_cast<ExtractElementInst>(
4219	Val: BO->getOperand(i_nocapture: `0`) == EE ? BO->getOperand(i_nocapture: `1`) : BO->getOperand(i_nocapture: `0`));
4220	if (OtherEE) {
4221	const auto *IdxOp = dyn_cast<ConstantInt>(Val: OtherEE->getIndexOperand());
4222	if (!IdxOp)
4223	return false;
4224	return IsExtractLaneEquivalentToZero(
4225	cast<ConstantInt>(Val: OtherEE->getIndexOperand())
4226	->getValue()
4227	.getZExtValue(),
4228	OtherEE->getType()->getScalarSizeInBits());
4229	}
4230	return true;
4231	});
4232	}
4233	return true;
4234	};
4235
4236	if (Opcode == Instruction::ExtractElement && (I \|\| Scalar) &&
4237	ExtractCanFuseWithFmul ())
4238	return `0`;
4239
4240	// All other insert/extracts cost this much.
4241	return CostKind == TTI::TCK_CodeSize ? `1`
4242	: ST->getVectorInsertExtractBaseCost();
4243	}
4244
4245	InstructionCost AArch64TTIImpl::getVectorInstrCost(
4246	unsigned Opcode, Type Val, TTI::TargetCostKind CostKind, unsigned* Index,
4247	const Value Op0, const* Value Op1, TTI::VectorInstrContext VIC) const* {
4248	// Treat insert at lane 0 into a poison vector as having zero cost. This
4249	// ensures vector broadcasts via an insert + shuffle (and will be lowered to a
4250	// single dup) are treated as cheap.
4251	if (Opcode == Instruction::InsertElement && Index == `0` && Op0 &&
4252	isa<PoisonValue>(Val: Op0))
4253	return `0`;
4254	return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr,
4255	Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
4256	}
4257
4258	InstructionCost AArch64TTIImpl::getVectorInstrCost(
4259	unsigned Opcode, Type Val, TTI::TargetCostKind CostKind, unsigned* Index,
4260	Value Scalar, ArrayRef<std::tuple<Value , User , int*>> ScalarUserAndIdx,
4261	TTI::VectorInstrContext VIC) const {
4262	return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, I: nullptr, Scalar,
4263	ScalarUserAndIdx, VIC);
4264	}
4265
4266	InstructionCost
4267	AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val,
4268	TTI::TargetCostKind CostKind, unsigned Index,
4269	TTI::VectorInstrContext VIC) const {
4270	return getVectorInstrCostHelper(Opcode: I.getOpcode(), Val, CostKind, Index, I: &I,
4271	Scalar: nullptr, ScalarUserAndIdx: {}, VIC);
4272	}
4273
4274	InstructionCost
4275	AArch64TTIImpl::getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
4276	TTI::TargetCostKind CostKind,
4277	unsigned Index) const {
4278	if (isa<FixedVectorType>(Val))
4279	return BaseT::getIndexedVectorInstrCostFromEnd(Opcode, Val, CostKind,
4280	Index);
4281
4282	// This typically requires both while and lastb instructions in order
4283	// to extract the last element. If this is in a loop the while
4284	// instruction can at least be hoisted out, although it will consume a
4285	// predicate register. The cost should be more expensive than the base
4286	// extract cost, which is 2 for most CPUs.
4287	return CostKind == TTI::TCK_CodeSize
4288	? `2`
4289	: ST->getVectorInsertExtractBaseCost() + `1`;
4290	}
4291
4292	InstructionCost AArch64TTIImpl::getScalarizationOverhead(
4293	VectorType Ty, const* APInt &DemandedElts, bool Insert, bool Extract,
4294	TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4295	TTI::VectorInstrContext VIC) const {
4296	if (isa<ScalableVectorType>(Val: Ty))
4297	return InstructionCost::getInvalid();
4298	if (Ty->getElementType()->isFloatingPointTy())
4299	return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
4300	CostKind);
4301	unsigned VecInstCost =
4302	CostKind == TTI::TCK_CodeSize ? `1` : ST->getVectorInsertExtractBaseCost();
4303	return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4304	}
4305
4306	std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4307	Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4308	TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4309	std::function<InstructionCost(Type )> InstCost) const* {
4310	if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4311	return std::nullopt;
4312	if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4313	return std::nullopt;
4314	// If we have +sve-b16b16 the operation can be promoted to SVE.
4315	if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4316	return std::nullopt;
4317
4318	Type *PromotedTy = Ty->getWithNewType(EltTy: Type::getFloatTy(C&: Ty->getContext()));
4319	InstructionCost Cost = getCastInstrCost(Opcode: Instruction::FPExt, Dst: PromotedTy, Src: Ty,
4320	CCH: TTI::CastContextHint::None, CostKind);
4321	if (!Op1Info.isConstant() && !Op2Info.isConstant())
4322	Cost *= `2`;
4323	Cost += InstCost (PromotedTy);
4324	if (IncludeTrunc)
4325	Cost += getCastInstrCost(Opcode: Instruction::FPTrunc, Dst: Ty, Src: PromotedTy,
4326	CCH: TTI::CastContextHint::None, CostKind);
4327	return Cost;
4328	}
4329
4330	InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
4331	unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4332	TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
4333	ArrayRef<const Value > Args, const* Instruction CxtI) const* {
4334
4335	// The code-generator is currently not able to handle scalable vectors
4336	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4337	// it. This change will be removed when code-generation for these types is
4338	// sufficiently reliable.
4339	if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
4340	if (VTy->getElementCount() == ElementCount::getScalable(MinVal: `1`))
4341	return InstructionCost::getInvalid();
4342
4343	// TODO: Handle more cost kinds.
4344	if (CostKind != TTI::TCK_RecipThroughput)
4345	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4346	Opd2Info: Op2Info, Args, CxtI);
4347
4348	// Legalize the type.
4349	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4350	int ISD = TLI->InstructionOpcodeToISD(Opcode);
4351
4352	// Increase the cost for half and bfloat types if not architecturally
4353	// supported.
4354	if (ISD == ISD::FADD \|\| ISD == ISD::FSUB \|\| ISD == ISD::FMUL \|\|
4355	ISD == ISD::FDIV \|\| ISD == ISD::FREM)
4356	if (auto PromotedCost = getFP16BF16PromoteCost(
4357	Ty, CostKind, Op1Info, Op2Info, /IncludeTrunc=/true,
4358	// There is not native support for fdiv/frem even with +sve-b16b16.
4359	/CanUseSVE=/ISD != ISD::FDIV && ISD != ISD::FREM,
4360	InstCost: [&](Type *PromotedTy) {
4361	return getArithmeticInstrCost(Opcode, Ty: PromotedTy, CostKind,
4362	Op1Info, Op2Info);
4363	}))
4364	return *PromotedCost;
4365
4366	// If the operation is a widening instruction (smull or umull) and both
4367	// operands are extends the cost can be cheaper by considering that the
4368	// operation will operate on the narrowest type size possible (double the
4369	// largest input size) and a further extend.
4370	if (Type *ExtTy = isBinExtWideningInstruction(Opcode, DstTy: Ty, Args)) {
4371	if (ExtTy != Ty)
4372	return getArithmeticInstrCost(Opcode, Ty: ExtTy, CostKind) +
4373	getCastInstrCost(Opcode: Instruction::ZExt, Dst: Ty, Src: ExtTy,
4374	CCH: TTI::CastContextHint::None, CostKind);
4375	return LT.first;
4376	}
4377
4378	switch (ISD) {
4379	default:
4380	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4381	Opd2Info: Op2Info);
4382	case ISD::SREM:
4383	case ISD::SDIV:
4384	/*
4385	Notes for sdiv/srem specific costs:
4386	1. This only considers the cases where the divisor is constant, uniform and
4387	(pow-of-2/non-pow-of-2). Other cases are not important since they either
4388	result in some form of (ldr + adrp), corresponding to constant vectors, or
4389	scalarization of the division operation.
4390	2. Constant divisors, either negative in whole or partially, don't result in
4391	significantly different codegen as compared to positive constant divisors.
4392	So, we don't consider negative divisors separately.
4393	3. If the codegen is significantly different with SVE, it has been indicated
4394	using comments at appropriate places.
4395
4396	sdiv specific cases:
4397	-----------------------------------------------------------------------
4398	codegen \| pow-of-2 \| Type
4399	-----------------------------------------------------------------------
4400	add + cmp + csel + asr \| Y \| i64
4401	add + cmp + csel + asr \| Y \| i32
4402	-----------------------------------------------------------------------
4403
4404	srem specific cases:
4405	-----------------------------------------------------------------------
4406	codegen \| pow-of-2 \| Type
4407	-----------------------------------------------------------------------
4408	negs + and + and + csneg \| Y \| i64
4409	negs + and + and + csneg \| Y \| i32
4410	-----------------------------------------------------------------------
4411
4412	other sdiv/srem cases:
4413	-------------------------------------------------------------------------
4414	common codegen \| + srem \| + sdiv \| pow-of-2 \| Type
4415	-------------------------------------------------------------------------
4416	smulh + asr + add + add \| - \| - \| N \| i64
4417	smull + lsr + add + add \| - \| - \| N \| i32
4418	usra \| and + sub \| sshr \| Y \| <2 x i64>
4419	2 (scalar code) \| - \| - \| N \| <2 x i64>*
4420	usra \| bic + sub \| sshr + neg \| Y \| <4 x i32>
4421	smull2 + smull + uzp2 \| mls \| - \| N \| <4 x i32>
4422	+ sshr + usra \| \| \| \|
4423	-------------------------------------------------------------------------
4424	*/
4425	if (Op2Info.isConstant() && Op2Info.isUniform()) {
4426	InstructionCost AddCost =
4427	getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
4428	Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4429	InstructionCost AsrCost =
4430	getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
4431	Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4432	InstructionCost MulCost =
4433	getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
4434	Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4435	// add/cmp/csel/csneg should have similar cost while asr/negs/and should
4436	// have similar cost.
4437	auto VT = TLI->getValueType(DL, Ty);
4438	if (VT.isScalarInteger() && VT.getSizeInBits() <= `64`) {
4439	if (Op2Info.isPowerOf2() \|\| Op2Info.isNegatedPowerOf2()) {
4440	// Neg can be folded into the asr instruction.
4441	return ISD == ISD::SDIV ? (`3` * AddCost + AsrCost)
4442	: (`3` * AsrCost + AddCost);
4443	} else {
4444	return MulCost + AsrCost + `2` * AddCost;
4445	}
4446	} else if (VT.isVector()) {
4447	InstructionCost UsraCost = `2` * AsrCost;
4448	if (Op2Info.isPowerOf2() \|\| Op2Info.isNegatedPowerOf2()) {
4449	// Division with scalable types corresponds to native 'asrd'
4450	// instruction when SVE is available.
4451	// e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4452
4453	// One more for the negation in SDIV
4454	InstructionCost Cost =
4455	(Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : `0`;
4456	if (Ty->isScalableTy() && ST->hasSVE())
4457	Cost += `2` * AsrCost;
4458	else {
4459	Cost +=
4460	UsraCost +
4461	(ISD == ISD::SDIV
4462	? (LT.second.getScalarType() == MVT::i64 ? `1` : `2`) * AsrCost
4463	: `2` * AddCost);
4464	}
4465	return Cost;
4466	} else if (LT.second == MVT::v2i64) {
4467	return VT.getVectorNumElements() *
4468	getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind,
4469	Op1Info: Op1Info.getNoProps(),
4470	Op2Info: Op2Info.getNoProps());
4471	} else {
4472	// When SVE is available, we get:
4473	// smulh + lsr + add/sub + asr + add/sub.
4474	if (Ty->isScalableTy() && ST->hasSVE())
4475	return MulCost /smulh cost/ + `2` * AddCost + `2` * AsrCost;
4476	return `2` * MulCost + AddCost /uzp2 cost/ + AsrCost + UsraCost;
4477	}
4478	}
4479	}
4480	if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4481	LT.second.isFixedLengthVector()) {
4482	// FIXME: When the constant vector is non-uniform, this may result in
4483	// loading the vector from constant pool or in some cases, may also result
4484	// in scalarization. For now, we are approximating this with the
4485	// scalarization cost.
4486	auto ExtractCost = `2` * getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
4487	CostKind, Index: -`1`, Op0: nullptr, Op1: nullptr);
4488	auto InsertCost = getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty,
4489	CostKind, Index: -`1`, Op0: nullptr, Op1: nullptr);
4490	unsigned NElts = cast<FixedVectorType>(Val: Ty)->getNumElements();
4491	return ExtractCost + InsertCost +
4492	NElts * getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(),
4493	CostKind, Op1Info: Op1Info.getNoProps(),
4494	Op2Info: Op2Info.getNoProps());
4495	}
4496	[[fallthrough]];
4497	case ISD::UDIV:
4498	case ISD::UREM: {
4499	auto VT = TLI->getValueType(DL, Ty);
4500	if (Op2Info.isConstant()) {
4501	// If the operand is a power of 2 we can use the shift or and cost.
4502	if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4503	return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind,
4504	Op1Info: Op1Info.getNoProps(),
4505	Op2Info: Op2Info.getNoProps());
4506	if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4507	return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind,
4508	Op1Info: Op1Info.getNoProps(),
4509	Op2Info: Op2Info.getNoProps());
4510
4511	if (ISD == ISD::UDIV \|\| ISD == ISD::UREM) {
4512	// Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4513	// The MULHU will be expanded to UMULL for the types not listed below,
4514	// and will become a pair of UMULL+MULL2 for 128bit vectors.
4515	bool HasMULH = VT == MVT::i64 \|\| LT.second == MVT::nxv2i64 \|\|
4516	LT.second == MVT::nxv4i32 \|\| LT.second == MVT::nxv8i16 \|\|
4517	LT.second == MVT::nxv16i8;
4518	bool Is128bit = LT.second.is128BitVector();
4519
4520	InstructionCost MulCost =
4521	getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind,
4522	Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4523	InstructionCost AddCost =
4524	getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind,
4525	Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4526	InstructionCost ShrCost =
4527	getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
4528	Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
4529	InstructionCost DivCost = MulCost * (Is128bit ? `2` : `1`) + // UMULL/UMULH
4530	(HasMULH ? `0` : ShrCost) + // UMULL shift
4531	AddCost * `2` + ShrCost;
4532	return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : `0`);
4533	}
4534	}
4535
4536	// div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4537	// emitted by the backend even when those functions are not declared in the
4538	// module.
4539	if (!VT.isVector() && VT.getSizeInBits() > `64`)
4540	return getCallInstrCost(/Function/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
4541
4542	InstructionCost Cost = BaseT::getArithmeticInstrCost(
4543	Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
4544	if (Ty->isVectorTy() && (ISD == ISD::SDIV \|\| ISD == ISD::UDIV)) {
4545	if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
4546	// SDIV/UDIV operations are lowered using SVE, then we can have less
4547	// costs.
4548	if (VT.isSimple() && isa<FixedVectorType>(Val: Ty) &&
4549	Ty->getPrimitiveSizeInBits().getFixedValue() < `128`) {
4550	static const CostTblEntry DivTbl[]{
4551	{.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: `5`}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: `8`},
4552	{.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: `8`}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: `5`},
4553	{.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: `5`}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: `1`},
4554	{.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: `5`}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: `8`},
4555	{.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: `8`}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: `5`},
4556	{.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: `5`}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: `1`}};
4557
4558	const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
4559	if (nullptr != Entry)
4560	return Entry->Cost;
4561	}
4562	// For 8/16-bit elements, the cost is higher because the type
4563	// requires promotion and possibly splitting:
4564	if (LT.second.getScalarType() == MVT::i8)
4565	Cost *= `8`;
4566	else if (LT.second.getScalarType() == MVT::i16)
4567	Cost *= `4`;
4568	return Cost;
4569	} else {
4570	// If one of the operands is a uniform constant then the cost for each
4571	// element is Cost for insertion, extraction and division.
4572	// Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4573	// operation with scalar type
4574	if ((Op1Info.isConstant() && Op1Info.isUniform()) \|\|
4575	(Op2Info.isConstant() && Op2Info.isUniform())) {
4576	if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
4577	InstructionCost DivCost = BaseT::getArithmeticInstrCost(
4578	Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
4579	return (`4` + DivCost) * VTy->getNumElements();
4580	}
4581	}
4582	// On AArch64, without SVE, vector divisions are expanded
4583	// into scalar divisions of each pair of elements.
4584	Cost += getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind,
4585	Index: -`1`, Op0: nullptr, Op1: nullptr);
4586	Cost += getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -`1`,
4587	Op0: nullptr, Op1: nullptr);
4588	}
4589
4590	// TODO: if one of the arguments is scalar, then it's not necessary to
4591	// double the cost of handling the vector elements.
4592	Cost += Cost;
4593	}
4594	return Cost;
4595	}
4596	case ISD::MUL:
4597	// When SVE is available, then we can lower the v2i64 operation using
4598	// the SVE mul instruction, which has a lower cost.
4599	if (LT.second == MVT::v2i64 && ST->hasSVE())
4600	return LT.first;
4601
4602	// When SVE is not available, there is no MUL.2d instruction,
4603	// which means mul <2 x i64> is expensive as elements are extracted
4604	// from the vectors and the muls scalarized.
4605	// As getScalarizationOverhead is a bit too pessimistic, we
4606	// estimate the cost for a i64 vector directly here, which is:
4607	// - four 2-cost i64 extracts,
4608	// - two 2-cost i64 inserts, and
4609	// - two 1-cost muls.
4610	// So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
4611	// LT.first = 2 the cost is 28.
4612	if (LT.second != MVT::v2i64)
4613	return LT.first;
4614	return cast<VectorType>(Val: Ty)->getElementCount().getKnownMinValue() *
4615	(getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind) +
4616	getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, CostKind, Index: -`1`,
4617	Op0: nullptr, Op1: nullptr) *
4618	`2` +
4619	getVectorInstrCost(Opcode: Instruction::InsertElement, Val: Ty, CostKind, Index: -`1`,
4620	Op0: nullptr, Op1: nullptr));
4621	case ISD::ADD:
4622	case ISD::XOR:
4623	case ISD::OR:
4624	case ISD::AND:
4625	case ISD::SRL:
4626	case ISD::SRA:
4627	case ISD::SHL:
4628	// These nodes are marked as 'custom' for combining purposes only.
4629	// We know that they are legal. See LowerAdd in ISelLowering.
4630	return LT.first;
4631
4632	case ISD::FNEG:
4633	// Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
4634	if ((Ty->isFloatTy() \|\| Ty->isDoubleTy() \|\|
4635	(Ty->isHalfTy() && ST->hasFullFP16())) &&
4636	CxtI &&
4637	((CxtI->hasOneUse() &&
4638	match(V: *CxtI->user_begin(), P: m_FMul(L: m_Value(), R: m_Value()))) \|\|
4639	match(V: CxtI->getOperand(i: `0`), P: m_FMul(L: m_Value(), R: m_Value()))))
4640	return `0`;
4641	[[fallthrough]];
4642	case ISD::FADD:
4643	case ISD::FSUB:
4644	if (!Ty->getScalarType()->isFP128Ty())
4645	return LT.first;
4646	[[fallthrough]];
4647	case ISD::FMUL:
4648	case ISD::FDIV:
4649	// These nodes are marked as 'custom' just to lower them to SVE.
4650	// We know said lowering will incur no additional cost.
4651	if (!Ty->getScalarType()->isFP128Ty())
4652	return `2` * LT.first;
4653
4654	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4655	Opd2Info: Op2Info);
4656	case ISD::FREM:
4657	// Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4658	// those functions are not declared in the module.
4659	if (!Ty->isVectorTy())
4660	return getCallInstrCost(/Function/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
4661	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
4662	Opd2Info: Op2Info);
4663	}
4664	}
4665
4666	InstructionCost
4667	AArch64TTIImpl::getAddressComputationCost(Type PtrTy, ScalarEvolution SE,
4668	const SCEV *Ptr,
4669	TTI::TargetCostKind CostKind) const {
4670	// Address computations in vectorized code with non-consecutive addresses will
4671	// likely result in more instructions compared to scalar code where the
4672	// computation can more often be merged into the index mode. The resulting
4673	// extra micro-ops can significantly decrease throughput.
4674	unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4675	int MaxMergeDistance = `64`;
4676
4677	if (PtrTy->isVectorTy() && SE &&
4678	!BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + `1`))
4679	return NumVectorInstToHideOverhead;
4680
4681	// In many cases the address computation is not merged into the instruction
4682	// addressing mode.
4683	return `1`;
4684	}
4685
4686	/// Check whether Opcode1 has less throughput according to the scheduling
4687	/// model than Opcode2.
4688	bool AArch64TTIImpl::hasKnownLowerThroughputFromSchedulingModel(
4689	unsigned Opcode1, unsigned Opcode2) const {
4690	const MCSchedModel &Sched = ST->getSchedModel();
4691	const TargetInstrInfo *TII = ST->getInstrInfo();
4692	if (!Sched.hasInstrSchedModel())
4693	return false;
4694
4695	const MCSchedClassDesc *SCD1 =
4696	Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode1).getSchedClass());
4697	const MCSchedClassDesc *SCD2 =
4698	Sched.getSchedClassDesc(SchedClassIdx: TII->get(Opcode: Opcode2).getSchedClass());
4699	// We cannot handle variant scheduling classes without an MI. If we need to
4700	// support them for any of the instructions we query the information of we
4701	// might need to add a way to resolve them without a MI or not use the
4702	// scheduling info.
4703	assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4704	"Cannot handle variant scheduling classes without an MI");
4705	if (!SCD1->isValid() \|\| !SCD2->isValid())
4706	return false;
4707
4708	return MCSchedModel::getReciprocalThroughput(STI: ST, SCDesc: SCD1) >
4709	MCSchedModel::getReciprocalThroughput(STI: ST, SCDesc: SCD2);
4710	}
4711
4712	InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
4713	unsigned Opcode, Type ValTy, Type CondTy, CmpInst::Predicate VecPred,
4714	TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
4715	TTI::OperandValueInfo Op2Info, const Instruction I) const* {
4716	// We don't lower some vector selects well that are wider than the register
4717	// width. TODO: Improve this with different cost kinds.
4718	if (isa<FixedVectorType>(Val: ValTy) && Opcode == Instruction::Select) {
4719	// We would need this many instructions to hide the scalarization happening.
4720	const int AmortizationCost = `20`;
4721
4722	// If VecPred is not set, check if we can get a predicate from the context
4723	// instruction, if its type matches the requested ValTy.
4724	if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4725	CmpPredicate CurrentPred;
4726	if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
4727	R: m_Value())))
4728	VecPred = CurrentPred;
4729	}
4730	// Check if we have a compare/select chain that can be lowered using
4731	// a (F)CMxx & BFI pair.
4732	if (CmpInst::isIntPredicate(P: VecPred) \|\| VecPred == CmpInst::FCMP_OLE \|\|
4733	VecPred == CmpInst::FCMP_OLT \|\| VecPred == CmpInst::FCMP_OGT \|\|
4734	VecPred == CmpInst::FCMP_OGE \|\| VecPred == CmpInst::FCMP_OEQ \|\|
4735	VecPred == CmpInst::FCMP_UNE) {
4736	static const auto ValidMinMaxTys = {
4737	MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4738	MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4739	static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4740
4741	auto LT = getTypeLegalizationCost(Ty: ValTy);
4742	if (any_of(Range: ValidMinMaxTys, P: equal_to(Arg&: LT.second)) \|\|
4743	(ST->hasFullFP16() &&
4744	any_of(Range: ValidFP16MinMaxTys, P: equal_to(Arg&: LT.second))))
4745	return LT.first;
4746	}
4747
4748	static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4749	{.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: `2`},
4750	{.ISD: Instruction::Select, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: `2`},
4751	{.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: `2`},
4752	{.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: `2`},
4753	{.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: `2`},
4754	{.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: `16`},
4755	{.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: `8`},
4756	{.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: `16`},
4757	{.ISD: Instruction::Select, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: `4` * AmortizationCost},
4758	{.ISD: Instruction::Select, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: `8` * AmortizationCost},
4759	{.ISD: Instruction::Select, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: `16` * AmortizationCost}};
4760
4761	EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
4762	EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
4763	if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4764	if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD: Opcode,
4765	Dst: SelCondTy.getSimpleVT(),
4766	Src: SelValTy.getSimpleVT()))
4767	return Entry->Cost;
4768	}
4769	}
4770
4771	if (Opcode == Instruction::FCmp) {
4772	if (auto PromotedCost = getFP16BF16PromoteCost(
4773	Ty: ValTy, CostKind, Op1Info, Op2Info, /IncludeTrunc=/false,
4774	// TODO: Consider costing SVE FCMPs.
4775	/CanUseSVE=/false, InstCost: [&](Type *PromotedTy) {
4776	InstructionCost Cost =
4777	getCmpSelInstrCost(Opcode, ValTy: PromotedTy, CondTy, VecPred,
4778	CostKind, Op1Info, Op2Info);
4779	if (isa<VectorType>(Val: PromotedTy))
4780	Cost += getCastInstrCost(
4781	Opcode: Instruction::Trunc,
4782	Dst: VectorType::getInteger(VTy: cast<VectorType>(Val: ValTy)),
4783	Src: VectorType::getInteger(VTy: cast<VectorType>(Val: PromotedTy)),
4784	CCH: TTI::CastContextHint::None, CostKind);
4785	return Cost;
4786	}))
4787	return *PromotedCost;
4788
4789	auto LT = getTypeLegalizationCost(Ty: ValTy);
4790	// Model unknown fp compares as a libcall.
4791	if (LT.second.getScalarType() != MVT::f64 &&
4792	LT.second.getScalarType() != MVT::f32 &&
4793	LT.second.getScalarType() != MVT::f16)
4794	return LT.first * getCallInstrCost(/Function/ F: nullptr, RetTy: ValTy,
4795	Tys: {ValTy, ValTy}, CostKind);
4796
4797	// Some comparison operators require expanding to multiple compares + or.
4798	unsigned Factor = `1`;
4799	if (!CondTy->isVectorTy() &&
4800	(VecPred == FCmpInst::FCMP_ONE \|\| VecPred == FCmpInst::FCMP_UEQ))
4801	Factor = `2`; // fcmp with 2 selects
4802	else if (isa<FixedVectorType>(Val: ValTy) &&
4803	(VecPred == FCmpInst::FCMP_ONE \|\| VecPred == FCmpInst::FCMP_UEQ \|\|
4804	VecPred == FCmpInst::FCMP_ORD \|\| VecPred == FCmpInst::FCMP_UNO))
4805	Factor = `3`; // fcmxx+fcmyy+or
4806	else if (isa<ScalableVectorType>(Val: ValTy) &&
4807	(VecPred == FCmpInst::FCMP_ONE \|\| VecPred == FCmpInst::FCMP_UEQ))
4808	Factor = `3`; // fcmxx+fcmyy+or
4809
4810	if (isa<ScalableVectorType>(Val: ValTy) &&
4811	CostKind == TTI::TCK_RecipThroughput &&
4812	hasKnownLowerThroughputFromSchedulingModel(Opcode1: AArch64::FCMEQ_PPzZZ_S,
4813	Opcode2: AArch64::FCMEQv4f32))
4814	Factor *= `2`;
4815
4816	return Factor * (CostKind == TTI::TCK_Latency ? `2` : LT.first);
4817	}
4818
4819	// Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4820	// icmp(and, 0) as free, as we can make use of ands, but only if the
4821	// comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4822	// providing it will not cause performance regressions.
4823	if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4824	Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(Pred: VecPred) &&
4825	TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
4826	match(V: I->getOperand(i: `0`), P: m_And(L: m_Value(), R: m_Value()))) {
4827	if (match(V: I->getOperand(i: `1`), P: m_Zero()))
4828	return `0`;
4829
4830	// x >= 1 / x < 1 -> x > 0 / x <= 0
4831	if (match(V: I->getOperand(i: `1`), P: m_One()) &&
4832	(VecPred == CmpInst::ICMP_SLT \|\| VecPred == CmpInst::ICMP_SGE))
4833	return `0`;
4834
4835	// x <= -1 / x > -1 -> x > 0 / x <= 0
4836	if (match(V: I->getOperand(i: `1`), P: m_AllOnes()) &&
4837	(VecPred == CmpInst::ICMP_SLE \|\| VecPred == CmpInst::ICMP_SGT))
4838	return `0`;
4839	}
4840
4841	// The base case handles scalable vectors fine for now, since it treats the
4842	// cost as 1 legalization cost.*
4843	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4844	Op1Info, Op2Info, I);
4845	}
4846
4847	AArch64TTIImpl::TTI::MemCmpExpansionOptions
4848	AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
4849	TTI::MemCmpExpansionOptions Options;
4850	if (ST->requiresStrictAlign()) {
4851	// TODO: Add cost modeling for strict align. Misaligned loads expand to
4852	// a bunch of instructions when strict align is enabled.
4853	return Options;
4854	}
4855	Options.AllowOverlappingLoads = true;
4856	Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4857	Options.NumLoadsPerBlock = Options.MaxNumLoads;
4858	// TODO: Though vector loads usually perform well on AArch64, in some targets
4859	// they may wake up the FP unit, which raises the power consumption. Perhaps
4860	// they could be used with no holds barred (-O3).
4861	Options.LoadSizes = {`8`, `4`, `2`, `1`};
4862	Options.AllowedTailExpansions = {`3`, `5`, `6`};
4863	return Options;
4864	}
4865
4866	bool AArch64TTIImpl::prefersVectorizedAddressing() const {
4867	return ST->hasSVE();
4868	}
4869
4870	InstructionCost
4871	AArch64TTIImpl::getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
4872	TTI::TargetCostKind CostKind) const {
4873	switch (MICA.getID()) {
4874	case Intrinsic::masked_scatter:
4875	case Intrinsic::masked_gather:
4876	return getGatherScatterOpCost(MICA, CostKind);
4877	case Intrinsic::masked_load:
4878	case Intrinsic::masked_store:
4879	return getMaskedMemoryOpCost(MICA, CostKind);
4880	}
4881	return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4882	}
4883
4884	InstructionCost
4885	AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
4886	TTI::TargetCostKind CostKind) const {
4887	Type *Src = MICA.getDataType();
4888
4889	if (useNeonVector(Ty: Src))
4890	return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4891	auto LT = getTypeLegalizationCost(Ty: Src);
4892	if (!LT.first.isValid())
4893	return InstructionCost::getInvalid();
4894
4895	// Return an invalid cost for element types that we are unable to lower.
4896	auto *VT = cast<VectorType>(Val: Src);
4897	if (VT->getElementType()->isIntegerTy(Bitwidth: `1`))
4898	return InstructionCost::getInvalid();
4899
4900	// The code-generator is currently not able to handle scalable vectors
4901	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4902	// it. This change will be removed when code-generation for these types is
4903	// sufficiently reliable.
4904	if (VT->getElementCount() == ElementCount::getScalable(MinVal: `1`))
4905	return InstructionCost::getInvalid();
4906
4907	return LT.first;
4908	}
4909
4910	// This function returns gather/scatter overhead either from
4911	// user-provided value or specialized values per-target from \p ST.
4912	static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
4913	const AArch64Subtarget *ST) {
4914	assert((Opcode == Instruction::Load \|\| Opcode == Instruction::Store) &&
4915	"Should be called on only load or stores.");
4916	switch (Opcode) {
4917	case Instruction::Load:
4918	if (SVEGatherOverhead.getNumOccurrences() > `0`)
4919	return SVEGatherOverhead;
4920	return ST->getGatherOverhead();
4921	break;
4922	case Instruction::Store:
4923	if (SVEScatterOverhead.getNumOccurrences() > `0`)
4924	return SVEScatterOverhead;
4925	return ST->getScatterOverhead();
4926	break;
4927	default:
4928	llvm_unreachable("Shouldn't have reached here");
4929	}
4930	}
4931
4932	InstructionCost
4933	AArch64TTIImpl::getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
4934	TTI::TargetCostKind CostKind) const {
4935
4936	unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather \|\|
4937	MICA.getID() == Intrinsic::vp_gather)
4938	? Instruction::Load
4939	: Instruction::Store;
4940
4941	Type *DataTy = MICA.getDataType();
4942	Align Alignment = MICA.getAlignment();
4943	const Instruction *I = MICA.getInst();
4944
4945	if (useNeonVector(Ty: DataTy) \|\| !isLegalMaskedGatherScatter(DataType: DataTy))
4946	return BaseT::getMemIntrinsicInstrCost(MICA, CostKind);
4947	auto *VT = cast<VectorType>(Val: DataTy);
4948	auto LT = getTypeLegalizationCost(Ty: DataTy);
4949	if (!LT.first.isValid())
4950	return InstructionCost::getInvalid();
4951
4952	// Return an invalid cost for element types that we are unable to lower.
4953	if (!LT.second.isVector() \|\|
4954	!isElementTypeLegalForScalableVector(Ty: VT->getElementType()) \|\|
4955	VT->getElementType()->isIntegerTy(Bitwidth: `1`))
4956	return InstructionCost::getInvalid();
4957
4958	// The code-generator is currently not able to handle scalable vectors
4959	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4960	// it. This change will be removed when code-generation for these types is
4961	// sufficiently reliable.
4962	if (VT->getElementCount() == ElementCount::getScalable(MinVal: `1`))
4963	return InstructionCost::getInvalid();
4964
4965	ElementCount LegalVF = LT.second.getVectorElementCount();
4966	InstructionCost MemOpCost =
4967	getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: `0`, CostKind,
4968	OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
4969	// Add on an overhead cost for using gathers/scatters.
4970	MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
4971	return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
4972	}
4973
4974	bool AArch64TTIImpl::useNeonVector(const Type Ty) const* {
4975	return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
4976	}
4977
4978	InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
4979	Align Alignment,
4980	unsigned AddressSpace,
4981	TTI::TargetCostKind CostKind,
4982	TTI::OperandValueInfo OpInfo,
4983	const Instruction I) const* {
4984	EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
4985	// Type legalization can't handle structs
4986	if (VT == MVT::Other)
4987	return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
4988	CostKind);
4989
4990	auto LT = getTypeLegalizationCost(Ty);
4991	if (!LT.first.isValid())
4992	return InstructionCost::getInvalid();
4993
4994	// The code-generator is currently not able to handle scalable vectors
4995	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4996	// it. This change will be removed when code-generation for these types is
4997	// sufficiently reliable.
4998	// We also only support full register predicate loads and stores.
4999	if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5000	if (VTy->getElementCount() == ElementCount::getScalable(MinVal: `1`) \|\|
5001	(VTy->getElementType()->isIntegerTy(Bitwidth: `1`) &&
5002	!VTy->getElementCount().isKnownMultipleOf(
5003	RHS: ElementCount::getScalable(MinVal: `16`))))
5004	return InstructionCost::getInvalid();
5005
5006	// TODO: consider latency as well for TCK_SizeAndLatency.
5007	if (CostKind == TTI::TCK_CodeSize \|\| CostKind == TTI::TCK_SizeAndLatency)
5008	return LT.first;
5009
5010	if (CostKind != TTI::TCK_RecipThroughput)
5011	return `1`;
5012
5013	if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5014	LT.second.is128BitVector() && Alignment < Align (`16`)) {
5015	// Unaligned stores are extremely inefficient. We don't split all
5016	// unaligned 128-bit stores because the negative impact that has shown in
5017	// practice on inlined block copy code.
5018	// We make such stores expensive so that we will only vectorize if there
5019	// are 6 other instructions getting vectorized.
5020	const int AmortizationCost = `6`;
5021
5022	return LT.first * `2` * AmortizationCost;
5023	}
5024
5025	// Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5026	if (Ty->isPtrOrPtrVectorTy())
5027	return LT.first;
5028
5029	if (useNeonVector(Ty)) {
5030	// Check truncating stores and extending loads.
5031	if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5032	// v4i8 types are lowered to scalar a load/store and sshll/xtn.
5033	if (VT == MVT::v4i8)
5034	return `2`;
5035	// Otherwise we need to scalarize.
5036	return cast<FixedVectorType>(Val: Ty)->getNumElements() * `2`;
5037	}
5038	EVT EltVT = VT.getVectorElementType();
5039	unsigned EltSize = EltVT.getScalarSizeInBits();
5040	if (!isPowerOf2_32(Value: EltSize) \|\| EltSize < `8` \|\| EltSize > `64` \|\|
5041	VT.getVectorNumElements() >= (`128` / EltSize) \|\| Alignment != Align (`1`))
5042	return LT.first;
5043	// FIXME: v3i8 lowering currently is very inefficient, due to automatic
5044	// widening to v4i8, which produces suboptimal results.
5045	if (VT.getVectorNumElements() == `3` && EltVT == MVT::i8)
5046	return LT.first;
5047
5048	// Check non-power-of-2 loads/stores for legal vector element types with
5049	// NEON. Non-power-of-2 memory ops will get broken down to a set of
5050	// operations on smaller power-of-2 ops, including ld1/st1.
5051	LLVMContext &C = Ty->getContext();
5052	InstructionCost Cost(`0`);
5053	SmallVector<EVT> TypeWorklist;
5054	TypeWorklist.push_back(Elt: VT);
5055	while (!TypeWorklist.empty()) {
5056	EVT CurrVT = TypeWorklist.pop_back_val();
5057	unsigned CurrNumElements = CurrVT.getVectorNumElements();
5058	if (isPowerOf2_32(Value: CurrNumElements)) {
5059	Cost += `1`;
5060	continue;
5061	}
5062
5063	unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / `2`;
5064	TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
5065	TypeWorklist.push_back(
5066	Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
5067	}
5068	return Cost;
5069	}
5070
5071	return LT.first;
5072	}
5073
5074	InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
5075	unsigned Opcode, Type VecTy, unsigned* Factor, ArrayRef<unsigned> Indices,
5076	Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5077	bool UseMaskForCond, bool UseMaskForGaps) const {
5078	assert(Factor >= `2` && "Invalid interleave factor");
5079	auto *VecVTy = cast<VectorType>(Val: VecTy);
5080
5081	if (VecTy->isScalableTy() && !ST->hasSVE())
5082	return InstructionCost::getInvalid();
5083
5084	// Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5085	// only have lowering for power-of-2 factors.
5086	// TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5087	// InterleavedAccessPass for ld3/st3
5088	if (VecTy->isScalableTy() && !isPowerOf2_32(Value: Factor))
5089	return InstructionCost::getInvalid();
5090
5091	// Vectorization for masked interleaved accesses is only enabled for scalable
5092	// VF.
5093	if (!VecTy->isScalableTy() && (UseMaskForCond \|\| UseMaskForGaps))
5094	return InstructionCost::getInvalid();
5095
5096	if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5097	unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5098	auto *SubVecTy =
5099	VectorType::get(ElementType: VecVTy->getElementType(),
5100	EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));
5101
5102	// ldN/stN only support legal vector types of size 64 or 128 in bits.
5103	// Accesses having vector types that are a multiple of 128 bits can be
5104	// matched to more than one ldN/stN instruction.
5105	bool UseScalable;
5106	if (MinElts % Factor == `0` &&
5107	TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
5108	return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
5109	}
5110
5111	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5112	Alignment, AddressSpace, CostKind,
5113	UseMaskForCond, UseMaskForGaps);
5114	}
5115
5116	InstructionCost
5117	AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type > Tys) const* {
5118	InstructionCost Cost = `0`;
5119	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5120	for (auto *I : Tys) {
5121	if (!I->isVectorTy())
5122	continue;
5123	if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
5124	`128`)
5125	Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align (`128`), AddressSpace: `0`, CostKind) +
5126	getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align (`128`), AddressSpace: `0`, CostKind);
5127	}
5128	return Cost;
5129	}
5130
5131	unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
5132	return ST->getMaxInterleaveFactor();
5133	}
5134
5135	// For Falkor, we want to avoid having too many strided loads in a loop since
5136	// that can exhaust the HW prefetcher resources. We adjust the unroller
5137	// MaxCount preference below to attempt to ensure unrolling doesn't create too
5138	// many strided loads.
5139	static void
5140	getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5141	TargetTransformInfo::UnrollingPreferences &UP) {
5142	enum { MaxStridedLoads = `7` };
5143	auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5144	int StridedLoads = `0`;
5145	// FIXME? We could make this more precise by looking at the CFG and
5146	// e.g. not counting loads in each side of an if-then-else diamond.
5147	for (const auto BB : L->blocks()) {
5148	for (auto &I : *BB) {
5149	LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
5150	if (!LMemI)
5151	continue;
5152
5153	Value *PtrValue = LMemI->getPointerOperand();
5154	if (L->isLoopInvariant(V: PtrValue))
5155	continue;
5156
5157	const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
5158	const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
5159	if (!LSCEVAddRec \|\| !LSCEVAddRec->isAffine())
5160	continue;
5161
5162	// FIXME? We could take pairing of unrolled load copies into account
5163	// by looking at the AddRec, but we would probably have to limit this
5164	// to loops with no stores or other memory optimization barriers.
5165	++StridedLoads;
5166	// We've seen enough strided loads that seeing more won't make a
5167	// difference.
5168	if (StridedLoads > MaxStridedLoads / `2`)
5169	return StridedLoads;
5170	}
5171	}
5172	return StridedLoads;
5173	};
5174
5175	int StridedLoads = countStridedLoads (L, SE);
5176	LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5177	<< " strided loads\n");
5178	// Pick the largest power of 2 unroll count that won't result in too many
5179	// strided loads.
5180	if (StridedLoads) {
5181	UP.MaxCount = `1` << Log2_32(Value: MaxStridedLoads / StridedLoads);
5182	LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5183	<< UP.MaxCount << `'\n'`);
5184	}
5185	}
5186
5187	// This function returns true if the loop:
5188	// 1. Has a valid cost, and
5189	// 2. Has a cost within the supplied budget.
5190	// Otherwise it returns false.
5191	static bool isLoopSizeWithinBudget(Loop L, const* AArch64TTIImpl &TTI,
5192	InstructionCost Budget,
5193	unsigned *FinalSize) {
5194	// Estimate the size of the loop.
5195	InstructionCost LoopCost = `0`;
5196
5197	for (auto *BB : L->getBlocks()) {
5198	for (auto &I : *BB) {
5199	SmallVector<const Value *, `4`> Operands(I.operand_values());
5200	InstructionCost Cost =
5201	TTI.getInstructionCost(U: &I, Operands, CostKind: TTI::TCK_CodeSize);
5202	// This can happen with intrinsics that don't currently have a cost model
5203	// or for some operations that require SVE.
5204	if (!Cost.isValid())
5205	return false;
5206
5207	LoopCost += Cost;
5208	if (LoopCost > Budget)
5209	return false;
5210	}
5211	}
5212
5213	if (FinalSize)
5214	*FinalSize = LoopCost.getValue();
5215	return true;
5216	}
5217
5218	static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
5219	const AArch64TTIImpl &TTI) {
5220	// Only consider loops with unknown trip counts for which we can determine
5221	// a symbolic expression. Multi-exit loops with small known trip counts will
5222	// likely be unrolled anyway.
5223	const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5224	if (isa<SCEVConstant>(Val: BTC) \|\| isa<SCEVCouldNotCompute>(Val: BTC))
5225	return false;
5226
5227	// It might not be worth unrolling loops with low max trip counts. Restrict
5228	// this to max trip counts > 32 for now.
5229	unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5230	if (MaxTC > `0` && MaxTC <= `32`)
5231	return false;
5232
5233	// Make sure the loop size is <= 5.
5234	if (!isLoopSizeWithinBudget(L, TTI, Budget: `5`, FinalSize: nullptr))
5235	return false;
5236
5237	// Small search loops with multiple exits can be highly beneficial to unroll.
5238	// We only care about loops with exactly two exiting blocks, although each
5239	// block could jump to the same exit block.
5240	ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5241	if (Blocks.size() != `2`)
5242	return false;
5243
5244	if (any_of(Range&: Blocks, P: [](BasicBlock *BB) {
5245	return !isa<UncondBrInst, CondBrInst>(Val: BB->getTerminator());
5246	}))
5247	return false;
5248
5249	return true;
5250	}
5251
5252	/// For Apple CPUs, we want to runtime-unroll loops to make better use if the
5253	/// OOO engine's wide instruction window and various predictors.
5254	static void
5255	getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
5256	TargetTransformInfo::UnrollingPreferences &UP,
5257	const AArch64TTIImpl &TTI) {
5258	// Limit loops with structure that is highly likely to benefit from runtime
5259	// unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5260	// likely with complex control flow). Note that the heuristics here may be
5261	// overly conservative and we err on the side of avoiding runtime unrolling
5262	// rather than unroll excessively. They are all subject to further refinement.
5263	if (!L->isInnermost() \|\| L->getNumBlocks() > `8`)
5264	return;
5265
5266	// Loops with multiple exits are handled by common code.
5267	if (!L->getExitBlock())
5268	return;
5269
5270	// Check if the loop contains any reductions that could be parallelized when
5271	// unrolling. If so, enable partial unrolling, if the trip count is know to be
5272	// a multiple of 2.
5273	bool HasParellelizableReductions =
5274	L->getNumBlocks() == `1` &&
5275	any_of(Range: L->getHeader()->phis(),
5276	P: [&SE, L](PHINode &Phi) {
5277	return canParallelizeReductionWhenUnrolling(Phi, L, SE: &SE);
5278	}) &&
5279	isLoopSizeWithinBudget(L, TTI, Budget: `12`, FinalSize: nullptr);
5280	if (HasParellelizableReductions &&
5281	SE.getSmallConstantTripMultiple(L, ExitingBlock: L->getExitingBlock()) % `2` == `0`) {
5282	UP.Partial = true;
5283	UP.MaxCount = `4`;
5284	UP.AddAdditionalAccumulators = true;
5285	}
5286
5287	const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5288	if (isa<SCEVConstant>(Val: BTC) \|\| isa<SCEVCouldNotCompute>(Val: BTC) \|\|
5289	(SE.getSmallConstantMaxTripCount(L) > `0` &&
5290	SE.getSmallConstantMaxTripCount(L) <= `32`))
5291	return;
5292
5293	if (findStringMetadataForLoop(TheLoop: L, Name: "llvm.loop.isvectorized"))
5294	return;
5295
5296	if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
5297	return;
5298
5299	// Limit to loops with trip counts that are cheap to expand.
5300	UP.SCEVExpansionBudget = `1`;
5301
5302	if (HasParellelizableReductions) {
5303	UP.Runtime = true;
5304	UP.DefaultUnrollRuntimeCount = `4`;
5305	UP.AddAdditionalAccumulators = true;
5306	}
5307
5308	// Try to unroll small loops, of few-blocks with low budget, if they have
5309	// load/store dependencies, to expose more parallel memory access streams,
5310	// or if they do little work inside a block (i.e. load -> X -> store pattern).
5311	BasicBlock *Header = L->getHeader();
5312	BasicBlock *Latch = L->getLoopLatch();
5313	if (Header == Latch) {
5314	// Estimate the size of the loop.
5315	unsigned Size;
5316	unsigned Width = `10`;
5317	if (!isLoopSizeWithinBudget(L, TTI, Budget: Width, FinalSize: &Size))
5318	return;
5319
5320	// Try to find an unroll count that maximizes the use of the instruction
5321	// window, i.e. trying to fetch as many instructions per cycle as possible.
5322	unsigned MaxInstsPerLine = `16`;
5323	unsigned UC = `1`;
5324	unsigned BestUC = `1`;
5325	unsigned SizeWithBestUC = BestUC * Size;
5326	while (UC <= `8`) {
5327	unsigned SizeWithUC = UC * Size;
5328	if (SizeWithUC > `48`)
5329	break;
5330	if ((SizeWithUC % MaxInstsPerLine) == `0` \|\|
5331	(SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5332	BestUC = UC;
5333	SizeWithBestUC = BestUC * Size;
5334	}
5335	UC++;
5336	}
5337
5338	if (BestUC == `1`)
5339	return;
5340
5341	SmallPtrSet<Value *, `8`> LoadedValuesPlus;
5342	SmallVector<StoreInst *> Stores;
5343	for (auto *BB : L->blocks()) {
5344	for (auto &I : *BB) {
5345	Value *Ptr = getLoadStorePointerOperand(V: &I);
5346	if (!Ptr)
5347	continue;
5348	const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
5349	if (SE.isLoopInvariant(S: PtrSCEV, L))
5350	continue;
5351	if (isa<LoadInst>(Val: &I)) {
5352	LoadedValuesPlus.insert(Ptr: &I);
5353	// Include in-loop 1st users of loaded values.
5354	for (auto *U : I.users())
5355	if (L->contains(Inst: cast<Instruction>(Val: U)))
5356	LoadedValuesPlus.insert(Ptr: U);
5357	} else
5358	Stores.push_back(Elt: cast<StoreInst>(Val: &I));
5359	}
5360	}
5361
5362	if (none_of(Range&: Stores, P: [&LoadedValuesPlus](StoreInst *SI) {
5363	return LoadedValuesPlus.contains(Ptr: SI->getOperand(i_nocapture: `0`));
5364	}))
5365	return;
5366
5367	UP.Runtime = true;
5368	UP.DefaultUnrollRuntimeCount = BestUC;
5369	return;
5370	}
5371
5372	// Try to runtime-unroll loops with early-continues depending on loop-varying
5373	// loads; this helps with branch-prediction for the early-continues.
5374	auto *Term = dyn_cast<CondBrInst>(Val: Header->getTerminator());
5375	SmallVector<BasicBlock *> Preds(predecessors(BB: Latch));
5376	if (!Term \|\| Preds.size() == `1` \|\| !llvm::is_contained(Range&: Preds, Element: Header) \|\|
5377	none_of(Range&: Preds, P: [L](BasicBlock Pred) { return* L->contains(BB: Pred); }))
5378	return;
5379
5380	std::function<bool(Instruction , unsigned*)> DependsOnLoopLoad =
5381	[&](Instruction I, unsigned* Depth) -> bool {
5382	if (isa<PHINode>(Val: I) \|\| L->isLoopInvariant(V: I) \|\| Depth > `8`)
5383	return false;
5384
5385	if (isa<LoadInst>(Val: I))
5386	return true;
5387
5388	return any_of(Range: I->operands(), P: [&](Value *V) {
5389	auto *I = dyn_cast<Instruction>(Val: V);
5390	return I && DependsOnLoopLoad (I, Depth + `1`);
5391	});
5392	};
5393	CmpPredicate Pred;
5394	Instruction *I;
5395	if (match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Instruction(I), R: m_Value()), T: m_Value(),
5396	F: m_Value())) &&
5397	DependsOnLoopLoad (I, `0`)) {
5398	UP.Runtime = true;
5399	}
5400	}
5401
5402	void AArch64TTIImpl::getUnrollingPreferences(
5403	Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
5404	OptimizationRemarkEmitter ORE) const* {
5405	// Enable partial unrolling and runtime unrolling.
5406	BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5407
5408	UP.UpperBound = true;
5409
5410	// For inner loop, it is more likely to be a hot one, and the runtime check
5411	// can be promoted out from LICM pass, so the overhead is less, let's try
5412	// a larger threshold to unroll more loops.
5413	if (L->getLoopDepth() > `1`)
5414	UP.PartialThreshold *= `2`;
5415
5416	// Disable partial & runtime unrolling on -Os.
5417	UP.PartialOptSizeThreshold = `0`;
5418
5419	// Scan the loop: don't unroll loops with calls as this could prevent
5420	// inlining. Don't unroll auto-vectorized loops either, though do allow
5421	// unrolling of the scalar remainder.
5422	bool IsVectorized = getBooleanLoopAttribute(TheLoop: L, Name: "llvm.loop.isvectorized");
5423	InstructionCost Cost = `0`;
5424	for (auto *BB : L->getBlocks()) {
5425	for (auto &I : *BB) {
5426	// Both auto-vectorized loops and the scalar remainder have the
5427	// isvectorized attribute, so differentiate between them by the presence
5428	// of vector instructions.
5429	if (IsVectorized && I.getType()->isVectorTy())
5430	return;
5431	if (isa<CallBase>(Val: I)) {
5432	if (isa<CallInst>(Val: I) \|\| isa<InvokeInst>(Val: I))
5433	if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction())
5434	if (!isLoweredToCall(F))
5435	continue;
5436	return;
5437	}
5438
5439	SmallVector<const Value *, `4`> Operands(I.operand_values());
5440	Cost += getInstructionCost(U: &I, Operands,
5441	CostKind: TargetTransformInfo::TCK_SizeAndLatency);
5442	}
5443	}
5444
5445	// Apply subtarget-specific unrolling preferences.
5446	if (ST->isAppleMLike())
5447	getAppleRuntimeUnrollPreferences(L, SE, UP, TTI: *this);
5448	else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5449	EnableFalkorHWPFUnrollFix)
5450	getFalkorUnrollingPreferences(L, SE, UP);
5451
5452	// If this is a small, multi-exit loop similar to something like std::find,
5453	// then there is typically a performance improvement achieved by unrolling.
5454	if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, TTI: *this)) {
5455	UP.RuntimeUnrollMultiExit = true;
5456	UP.Runtime = true;
5457	// Limit unroll count.
5458	UP.DefaultUnrollRuntimeCount = `4`;
5459	// Allow slightly more costly trip-count expansion to catch search loops
5460	// with pointer inductions.
5461	UP.SCEVExpansionBudget = `5`;
5462	return;
5463	}
5464
5465	// Enable runtime unrolling for in-order models
5466	// If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
5467	// checking for that case, we can ensure that the default behaviour is
5468	// unchanged
5469	if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5470	!ST->getSchedModel().isOutOfOrder()) {
5471	UP.Runtime = true;
5472	UP.Partial = true;
5473	UP.UnrollRemainder = true;
5474	UP.DefaultUnrollRuntimeCount = `4`;
5475
5476	UP.UnrollAndJam = true;
5477	UP.UnrollAndJamInnerLoopThreshold = `60`;
5478	}
5479
5480	// Force unrolling small loops can be very useful because of the branch
5481	// taken cost of the backedge.
5482	if (Cost < Aarch64ForceUnrollThreshold)
5483	UP.Force = true;
5484	}
5485
5486	void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
5487	TTI::PeelingPreferences &PP) const {
5488	BaseT::getPeelingPreferences(L, SE, PP);
5489	}
5490
5491	Value AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst Inst,
5492	Type *ExpectedType,
5493	bool CanCreate) const {
5494	switch (Inst->getIntrinsicID()) {
5495	default:
5496	return nullptr;
5497	case Intrinsic::aarch64_neon_st2:
5498	case Intrinsic::aarch64_neon_st3:
5499	case Intrinsic::aarch64_neon_st4: {
5500	// Create a struct type
5501	StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
5502	if (!CanCreate \|\| !ST)
5503	return nullptr;
5504	unsigned NumElts = Inst->arg_size() - `1`;
5505	if (ST->getNumElements() != NumElts)
5506	return nullptr;
5507	for (unsigned i = `0`, e = NumElts; i != e; ++i) {
5508	if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
5509	return nullptr;
5510	}
5511	Value *Res = PoisonValue::get(T: ExpectedType);
5512	IRBuilder<> Builder(Inst);
5513	for (unsigned i = `0`, e = NumElts; i != e; ++i) {
5514	Value *L = Inst->getArgOperand(i);
5515	Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
5516	}
5517	return Res;
5518	}
5519	case Intrinsic::aarch64_neon_ld2:
5520	case Intrinsic::aarch64_neon_ld3:
5521	case Intrinsic::aarch64_neon_ld4:
5522	if (Inst->getType() == ExpectedType)
5523	return Inst;
5524	return nullptr;
5525	}
5526	}
5527
5528	bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
5529	MemIntrinsicInfo &Info) const {
5530	switch (Inst->getIntrinsicID()) {
5531	default:
5532	break;
5533	case Intrinsic::aarch64_neon_ld2:
5534	case Intrinsic::aarch64_neon_ld3:
5535	case Intrinsic::aarch64_neon_ld4:
5536	Info.ReadMem = true;
5537	Info.WriteMem = false;
5538	Info.PtrVal = Inst->getArgOperand(i: `0`);
5539	break;
5540	case Intrinsic::aarch64_neon_st2:
5541	case Intrinsic::aarch64_neon_st3:
5542	case Intrinsic::aarch64_neon_st4:
5543	Info.ReadMem = false;
5544	Info.WriteMem = true;
5545	Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - `1`);
5546	break;
5547	}
5548
5549	switch (Inst->getIntrinsicID()) {
5550	default:
5551	return false;
5552	case Intrinsic::aarch64_neon_ld2:
5553	case Intrinsic::aarch64_neon_st2:
5554	Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
5555	break;
5556	case Intrinsic::aarch64_neon_ld3:
5557	case Intrinsic::aarch64_neon_st3:
5558	Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
5559	break;
5560	case Intrinsic::aarch64_neon_ld4:
5561	case Intrinsic::aarch64_neon_st4:
5562	Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
5563	break;
5564	}
5565	return true;
5566	}
5567
5568	/// See if \p I should be considered for address type promotion. We check if \p
5569	/// I is a sext with right type and used in memory accesses. If it used in a
5570	/// "complex" getelementptr, we allow it to be promoted without finding other
5571	/// sext instructions that sign extended the same initial value. A getelementptr
5572	/// is considered as "complex" if it has more than 2 operands.
5573	bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
5574	const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5575	bool Considerable = false;
5576	AllowPromotionWithoutCommonHeader = false;
5577	if (!isa<SExtInst>(Val: &I))
5578	return false;
5579	Type *ConsideredSExtType =
5580	Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
5581	if (I.getType() != ConsideredSExtType)
5582	return false;
5583	// See if the sext is the one with the right type and used in at least one
5584	// GetElementPtrInst.
5585	for (const User *U : I.users()) {
5586	if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
5587	Considerable = true;
5588	// A getelementptr is considered as "complex" if it has more than 2
5589	// operands. We will promote a SExt used in such complex GEP as we
5590	// expect some computation to be merged if they are done on 64 bits.
5591	if (GEPInst->getNumOperands() > `2`) {
5592	AllowPromotionWithoutCommonHeader = true;
5593	break;
5594	}
5595	}
5596	}
5597	return Considerable;
5598	}
5599
5600	bool AArch64TTIImpl::isLegalToVectorizeReduction(
5601	const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
5602	if (!VF.isScalable())
5603	return true;
5604
5605	Type *Ty = RdxDesc.getRecurrenceType();
5606	if (Ty->isBFloatTy() \|\| !isElementTypeLegalForScalableVector(Ty))
5607	return false;
5608
5609	switch (RdxDesc.getRecurrenceKind()) {
5610	case RecurKind::Sub:
5611	case RecurKind::AddChainWithSubs:
5612	case RecurKind::Add:
5613	case RecurKind::FAdd:
5614	case RecurKind::And:
5615	case RecurKind::Or:
5616	case RecurKind::Xor:
5617	case RecurKind::SMin:
5618	case RecurKind::SMax:
5619	case RecurKind::UMin:
5620	case RecurKind::UMax:
5621	case RecurKind::FMin:
5622	case RecurKind::FMax:
5623	case RecurKind::FMulAdd:
5624	case RecurKind::AnyOf:
5625	case RecurKind::FindLast:
5626	return true;
5627	default:
5628	return false;
5629	}
5630	}
5631
5632	InstructionCost
5633	AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
5634	FastMathFlags FMF,
5635	TTI::TargetCostKind CostKind) const {
5636	// The code-generator is currently not able to handle scalable vectors
5637	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5638	// it. This change will be removed when code-generation for these types is
5639	// sufficiently reliable.
5640	if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
5641	if (VTy->getElementCount() == ElementCount::getScalable(MinVal: `1`))
5642	return InstructionCost::getInvalid();
5643
5644	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5645
5646	if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5647	return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5648
5649	InstructionCost LegalizationCost = `0`;
5650	if (LT.first > `1`) {
5651	Type *LegalVTy = EVT (LT.second).getTypeForEVT(Context&: Ty->getContext());
5652	IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5653	LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - `1`);
5654	}
5655
5656	return LegalizationCost + /Cost of horizontal reduction/ `2`;
5657	}
5658
5659	InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
5660	unsigned Opcode, VectorType ValTy, TTI::TargetCostKind CostKind) const* {
5661	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5662	InstructionCost LegalizationCost = `0`;
5663	if (LT.first > `1`) {
5664	Type *LegalVTy = EVT (LT.second).getTypeForEVT(Context&: ValTy->getContext());
5665	LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
5666	LegalizationCost *= LT.first - `1`;
5667	}
5668
5669	int ISD = TLI->InstructionOpcodeToISD(Opcode);
5670	assert(ISD && "Invalid opcode");
5671	// Add the final reduction cost for the legal horizontal reduction
5672	switch (ISD) {
5673	case ISD::ADD:
5674	case ISD::AND:
5675	case ISD::OR:
5676	case ISD::XOR:
5677	case ISD::FADD:
5678	return LegalizationCost + `2`;
5679	default:
5680	return InstructionCost::getInvalid();
5681	}
5682	}
5683
5684	InstructionCost
5685	AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
5686	std::optional<FastMathFlags> FMF,
5687	TTI::TargetCostKind CostKind) const {
5688	// The code-generator is currently not able to handle scalable vectors
5689	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5690	// it. This change will be removed when code-generation for these types is
5691	// sufficiently reliable.
5692	if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
5693	if (VTy->getElementCount() == ElementCount::getScalable(MinVal: `1`))
5694	return InstructionCost::getInvalid();
5695
5696	if (TTI::requiresOrderedReduction(FMF)) {
5697	if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
5698	InstructionCost BaseCost =
5699	BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5700	// Add on extra cost to reflect the extra overhead on some CPUs. We still
5701	// end up vectorizing for more computationally intensive loops.
5702	return BaseCost + FixedVTy->getNumElements();
5703	}
5704
5705	if (Opcode != Instruction::FAdd)
5706	return InstructionCost::getInvalid();
5707
5708	auto *VTy = cast<ScalableVectorType>(Val: ValTy);
5709	InstructionCost Cost =
5710	getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind);
5711	Cost *= getMaxNumElements(VF: VTy->getElementCount());
5712	return Cost;
5713	}
5714
5715	if (isa<ScalableVectorType>(Val: ValTy))
5716	return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
5717
5718	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
5719	MVT MTy = LT.second;
5720	int ISD = TLI->InstructionOpcodeToISD(Opcode);
5721	assert(ISD && "Invalid opcode");
5722
5723	// Horizontal adds can use the 'addv' instruction. We model the cost of these
5724	// instructions as twice a normal vector add, plus 1 for each legalization
5725	// step (LT.first). This is the only arithmetic vector reduction operation for
5726	// which we have an instruction.
5727	// OR, XOR and AND costs should match the codegen from:
5728	// OR: llvm/test/CodeGen/AArch64/reduce-or.ll
5729	// XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
5730	// AND: llvm/test/CodeGen/AArch64/reduce-and.ll
5731	static const CostTblEntry CostTblNoPairwise[]{
5732	{.ISD: ISD::ADD, .Type: MVT::v8i8, .Cost: `2`},
5733	{.ISD: ISD::ADD, .Type: MVT::v16i8, .Cost: `2`},
5734	{.ISD: ISD::ADD, .Type: MVT::v4i16, .Cost: `2`},
5735	{.ISD: ISD::ADD, .Type: MVT::v8i16, .Cost: `2`},
5736	{.ISD: ISD::ADD, .Type: MVT::v2i32, .Cost: `2`},
5737	{.ISD: ISD::ADD, .Type: MVT::v4i32, .Cost: `2`},
5738	{.ISD: ISD::ADD, .Type: MVT::v2i64, .Cost: `2`},
5739	{.ISD: ISD::OR, .Type: MVT::v8i8, .Cost: `5`}, // fmov + orr_lsr + orr_lsr + lsr + orr
5740	{.ISD: ISD::OR, .Type: MVT::v16i8, .Cost: `7`}, // ext + orr + same as v8i8
5741	{.ISD: ISD::OR, .Type: MVT::v4i16, .Cost: `4`}, // fmov + orr_lsr + lsr + orr
5742	{.ISD: ISD::OR, .Type: MVT::v8i16, .Cost: `6`}, // ext + orr + same as v4i16
5743	{.ISD: ISD::OR, .Type: MVT::v2i32, .Cost: `3`}, // fmov + lsr + orr
5744	{.ISD: ISD::OR, .Type: MVT::v4i32, .Cost: `5`}, // ext + orr + same as v2i32
5745	{.ISD: ISD::OR, .Type: MVT::v2i64, .Cost: `3`}, // ext + orr + fmov
5746	{.ISD: ISD::XOR, .Type: MVT::v8i8, .Cost: `5`}, // Same as above for or...
5747	{.ISD: ISD::XOR, .Type: MVT::v16i8, .Cost: `7`},
5748	{.ISD: ISD::XOR, .Type: MVT::v4i16, .Cost: `4`},
5749	{.ISD: ISD::XOR, .Type: MVT::v8i16, .Cost: `6`},
5750	{.ISD: ISD::XOR, .Type: MVT::v2i32, .Cost: `3`},
5751	{.ISD: ISD::XOR, .Type: MVT::v4i32, .Cost: `5`},
5752	{.ISD: ISD::XOR, .Type: MVT::v2i64, .Cost: `3`},
5753	{.ISD: ISD::AND, .Type: MVT::v8i8, .Cost: `5`}, // Same as above for or...
5754	{.ISD: ISD::AND, .Type: MVT::v16i8, .Cost: `7`},
5755	{.ISD: ISD::AND, .Type: MVT::v4i16, .Cost: `4`},
5756	{.ISD: ISD::AND, .Type: MVT::v8i16, .Cost: `6`},
5757	{.ISD: ISD::AND, .Type: MVT::v2i32, .Cost: `3`},
5758	{.ISD: ISD::AND, .Type: MVT::v4i32, .Cost: `5`},
5759	{.ISD: ISD::AND, .Type: MVT::v2i64, .Cost: `3`},
5760	};
5761	switch (ISD) {
5762	default:
5763	break;
5764	case ISD::FADD:
5765	if (Type *EltTy = ValTy->getScalarType();
5766	// FIXME: For half types without fullfp16 support, this could extend and
5767	// use a fp32 faddp reduction but current codegen unrolls.
5768	MTy.isVector() && (EltTy->isFloatTy() \|\| EltTy->isDoubleTy() \|\|
5769	(EltTy->isHalfTy() && ST->hasFullFP16()))) {
5770	const unsigned NElts = MTy.getVectorNumElements();
5771	if (ValTy->getElementCount().getFixedValue() >= `2` && NElts >= `2` &&
5772	isPowerOf2_32(Value: NElts))
5773	// Reduction corresponding to series of fadd instructions is lowered to
5774	// series of faddp instructions. faddp has latency/throughput that
5775	// matches fadd instruction and hence, every faddp instruction can be
5776	// considered to have a relative cost = 1 with
5777	// CostKind = TCK_RecipThroughput.
5778	// An faddp will pairwise add vector elements, so the size of input
5779	// vector reduces by half every time, requiring
5780	// #(faddp instructions) = log2_32(NElts).
5781	return (LT.first - `1`) + /No of faddp instructions/ Log2_32(Value: NElts);
5782	}
5783	break;
5784	case ISD::ADD:
5785	if (const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy))
5786	return (LT.first - `1`) + Entry->Cost;
5787	break;
5788	case ISD::XOR:
5789	case ISD::AND:
5790	case ISD::OR:
5791	const auto *Entry = CostTableLookup(Table: CostTblNoPairwise, ISD, Ty: MTy);
5792	if (!Entry)
5793	break;
5794	auto *ValVTy = cast<FixedVectorType>(Val: ValTy);
5795	if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
5796	isPowerOf2_32(Value: ValVTy->getNumElements())) {
5797	InstructionCost ExtraCost = `0`;
5798	if (LT.first != `1`) {
5799	// Type needs to be split, so there is an extra cost of LT.first - 1
5800	// arithmetic ops.
5801	auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(),
5802	NumElts: MTy.getVectorNumElements());
5803	ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
5804	ExtraCost *= LT.first - `1`;
5805	}
5806	// All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
5807	auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: `1`) ? `2` : Entry->Cost;
5808	return Cost + ExtraCost;
5809	}
5810	break;
5811	}
5812	return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
5813	}
5814
5815	InstructionCost AArch64TTIImpl::getExtendedReductionCost(
5816	unsigned Opcode, bool IsUnsigned, Type ResTy, VectorType VecTy,
5817	std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
5818	EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5819	EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5820
5821	if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
5822	VecVT.getSizeInBits() >= `64`) {
5823	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5824
5825	// The legal cases are:
5826	// UADDLV 8/16/32->32
5827	// UADDLP 32->64
5828	unsigned RevVTSize = ResVT.getSizeInBits();
5829	if (((LT.second == MVT::v8i8 \|\| LT.second == MVT::v16i8) &&
5830	RevVTSize <= `32`) \|\|
5831	((LT.second == MVT::v4i16 \|\| LT.second == MVT::v8i16) &&
5832	RevVTSize <= `32`) \|\|
5833	((LT.second == MVT::v2i32 \|\| LT.second == MVT::v4i32) &&
5834	RevVTSize <= `64`))
5835	return (LT.first - `1`) * `2` + `2`;
5836	}
5837
5838	return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty: VecTy, FMF,
5839	CostKind);
5840	}
5841
5842	InstructionCost
5843	AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
5844	Type ResTy, VectorType VecTy,
5845	TTI::TargetCostKind CostKind) const {
5846	EVT VecVT = TLI->getValueType(DL, Ty: VecTy);
5847	EVT ResVT = TLI->getValueType(DL, Ty: ResTy);
5848
5849	if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
5850	RedOpcode == Instruction::Add) {
5851	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: VecTy);
5852
5853	// The legal cases with dotprod are
5854	// UDOT 8->32
5855	// Which requires an additional uaddv to sum the i32 values.
5856	if ((LT.second == MVT::v8i8 \|\| LT.second == MVT::v16i8) &&
5857	ResVT == MVT::i32)
5858	return LT.first + `2`;
5859	}
5860
5861	return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, Ty: VecTy,
5862	CostKind);
5863	}
5864
5865	InstructionCost
5866	AArch64TTIImpl::getSpliceCost(VectorType Tp, int* Index,
5867	TTI::TargetCostKind CostKind) const {
5868	static const CostTblEntry ShuffleTbl[] = {
5869	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv16i8, .Cost: `1` },
5870	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv8i16, .Cost: `1` },
5871	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv4i32, .Cost: `1` },
5872	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv2i64, .Cost: `1` },
5873	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv2f16, .Cost: `1` },
5874	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv4f16, .Cost: `1` },
5875	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv8f16, .Cost: `1` },
5876	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv2bf16, .Cost: `1` },
5877	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv4bf16, .Cost: `1` },
5878	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv8bf16, .Cost: `1` },
5879	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv2f32, .Cost: `1` },
5880	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv4f32, .Cost: `1` },
5881	{ .ISD: TTI::SK_Splice, .Type: MVT::nxv2f64, .Cost: `1` },
5882	};
5883
5884	// The code-generator is currently not able to handle scalable vectors
5885	// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5886	// it. This change will be removed when code-generation for these types is
5887	// sufficiently reliable.
5888	if (Tp->getElementCount() == ElementCount::getScalable(MinVal: `1`))
5889	return InstructionCost::getInvalid();
5890
5891	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp);
5892	Type *LegalVTy = EVT (LT.second).getTypeForEVT(Context&: Tp->getContext());
5893	EVT PromotedVT = LT.second.getScalarType() == MVT::i1
5894	? TLI->getPromotedVTForPredicate(VT: EVT (LT.second))
5895	: LT.second;
5896	Type *PromotedVTy = EVT (PromotedVT).getTypeForEVT(Context&: Tp->getContext());
5897	InstructionCost LegalizationCost = `0`;
5898	if (Index < `0`) {
5899	LegalizationCost =
5900	getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy,
5901	VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) +
5902	getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy,
5903	VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
5904	}
5905
5906	// Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
5907	// Cost performed on a promoted type.
5908	if (LT.second.getScalarType() == MVT::i1) {
5909	LegalizationCost +=
5910	getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy,
5911	CCH: TTI::CastContextHint::None, CostKind) +
5912	getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy,
5913	CCH: TTI::CastContextHint::None, CostKind);
5914	}
5915	const auto *Entry =
5916	CostTableLookup(Table: ShuffleTbl, ISD: TTI::SK_Splice, Ty: PromotedVT.getSimpleVT());
5917	assert(Entry && "Illegal Type for Splice");
5918	LegalizationCost += Entry->Cost;
5919	return LegalizationCost * LT.first;
5920	}
5921
5922	InstructionCost AArch64TTIImpl::getPartialReductionCost(
5923	unsigned Opcode, Type InputTypeA, Type InputTypeB, Type *AccumType,
5924	ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
5925	TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
5926	TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
5927	InstructionCost Invalid = InstructionCost::getInvalid();
5928
5929	if (CostKind != TTI::TCK_RecipThroughput)
5930	return Invalid;
5931
5932	if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
5933	(!ST->isNeonAvailable() \|\| !ST->hasDotProd()))
5934	return Invalid;
5935
5936	if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
5937	Opcode != Instruction::FAdd) \|\|
5938	OpAExtend == TTI::PR_None)
5939	return Invalid;
5940
5941	// Floating-point partial reductions are invalid if `reassoc` and `contract`
5942	// are not allowed.
5943	if (AccumType->isFloatingPointTy()) {
5944	assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
5945	if (!FMF ->allowReassoc() \|\| !FMF ->allowContract())
5946	return Invalid;
5947	} else {
5948	assert(!FMF &&
5949	"FastMathFlags only apply to floating-point partial reductions");
5950	}
5951
5952	assert((BinOp \|\| (OpBExtend == TTI::PR_None && !InputTypeB)) &&
5953	(!BinOp \|\| (OpBExtend != TTI::PR_None && InputTypeB)) &&
5954	"Unexpected values for OpBExtend or InputTypeB");
5955
5956	// We only support multiply binary operations for now, and for muls we
5957	// require the types being extended to be the same.
5958	if (BinOp && ((BinOp != Instruction::Mul && BinOp != Instruction::FMul) \|\|
5959	InputTypeA != InputTypeB))
5960	return Invalid;
5961
5962	bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
5963	if (IsUSDot && !ST->hasMatMulInt8())
5964	return Invalid;
5965
5966	unsigned Ratio =
5967	AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
5968	if (VF.getKnownMinValue() <= Ratio)
5969	return Invalid;
5970
5971	VectorType *InputVectorType = VectorType::get(ElementType: InputTypeA, EC: VF);
5972	VectorType *AccumVectorType =
5973	VectorType::get(ElementType: AccumType, EC: VF.divideCoefficientBy(RHS: Ratio));
5974	// We don't yet support all kinds of legalization.
5975	auto TC = TLI->getTypeConversion(Context&: AccumVectorType->getContext(),
5976	VT: EVT::getEVT(Ty: AccumVectorType));
5977	switch (TC.first) {
5978	default:
5979	return Invalid;
5980	case TargetLowering::TypeLegal:
5981	case TargetLowering::TypePromoteInteger:
5982	case TargetLowering::TypeSplitVector:
5983	// The legalised type (e.g. after splitting) must be legal too.
5984	if (TLI->getTypeAction(Context&: AccumVectorType->getContext(), VT: TC.second) !=
5985	TargetLowering::TypeLegal)
5986	return Invalid;
5987	break;
5988	}
5989
5990	std::pair<InstructionCost, MVT> AccumLT =
5991	getTypeLegalizationCost(Ty: AccumVectorType);
5992	std::pair<InstructionCost, MVT> InputLT =
5993	getTypeLegalizationCost(Ty: InputVectorType);
5994
5995	InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
5996
5997	// The sub/negation cannot be folded into the operands of
5998	// ISD::PARTIAL_REDUCE_MLA, so make the cost more expensive.*
5999	if (Opcode == Instruction::Sub)
6000	Cost += `8`;
6001
6002	// Prefer using full types by costing half-full input types as more expensive.
6003	if (TypeSize::isKnownLT(LHS: InputVectorType->getPrimitiveSizeInBits(),
6004	RHS: TypeSize::getScalable(MinimumSize: `128`)))
6005	// FIXME: This can be removed after the cost of the extends are folded into
6006	// the dot-product expression in VPlan, after landing:
6007	// https://github.com/llvm/llvm-project/pull/147302
6008	Cost *= `2`;
6009
6010	if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6011	// i16 -> i64 is natively supported for udot/sdot
6012	if (AccumLT.second.getScalarType() == MVT::i64 &&
6013	InputLT.second.getScalarType() == MVT::i16)
6014	return Cost;
6015	// i16 -> i32 is natively supported with SVE2p1
6016	if (AccumLT.second.getScalarType() == MVT::i32 &&
6017	InputLT.second.getScalarType() == MVT::i16 &&
6018	(ST->hasSVE2p1() \|\| ST->hasSME2()))
6019	return Cost;
6020	// i8 -> i64 is supported with an extra level of extends
6021	if (AccumLT.second.getScalarType() == MVT::i64 &&
6022	InputLT.second.getScalarType() == MVT::i8)
6023	// FIXME: This cost should probably be a little higher, e.g. Cost + 2
6024	// because it requires two extra extends on the inputs. But if we'd change
6025	// that now, a regular reduction would be cheaper because the costs of
6026	// the extends in the IR are still counted. This can be fixed
6027	// after https://github.com/llvm/llvm-project/pull/147302 has landed.
6028	return Cost;
6029	}
6030
6031	// i8 -> i32 is natively supported for udot/sdot/usdot, both for NEON and SVE.
6032	if (ST->isSVEorStreamingSVEAvailable() \|\|
6033	(AccumLT.second.isFixedLengthVector() && ST->isNeonAvailable() &&
6034	ST->hasDotProd())) {
6035	if (AccumLT.second.getScalarType() == MVT::i32 &&
6036	InputLT.second.getScalarType() == MVT::i8)
6037	return Cost;
6038	}
6039
6040	// f16 -> f32 is natively supported for fdot
6041	if (Opcode == Instruction::FAdd && (ST->hasSME2() \|\| ST->hasSVE2p1())) {
6042	if (AccumLT.second.getScalarType() == MVT::f32 &&
6043	InputLT.second.getScalarType() == MVT::f16 &&
6044	AccumLT.second.getVectorMinNumElements() == `4` &&
6045	InputLT.second.getVectorMinNumElements() == `8`)
6046	return Cost;
6047	// Floating-point types aren't promoted, so expanding the partial reduction
6048	// is more expensive.
6049	return Cost + `20`;
6050	}
6051
6052	// Add additional cost for the extends that would need to be inserted.
6053	return Cost + `2`;
6054	}
6055
6056	InstructionCost
6057	AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
6058	VectorType SrcTy, ArrayRef<int*> Mask,
6059	TTI::TargetCostKind CostKind, int Index,
6060	VectorType SubTp, ArrayRef<const* Value *> Args,
6061	const Instruction CxtI) const* {
6062	assert((Mask.empty() \|\| DstTy->isScalableTy() \|\|
6063	Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6064	"Expected the Mask to match the return size if given");
6065	assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6066	"Expected the same scalar types");
6067	std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcTy);
6068
6069	// If we have a Mask, and the LT is being legalized somehow, split the Mask
6070	// into smaller vectors and sum the cost of each shuffle.
6071	if (!Mask.empty() && isa<FixedVectorType>(Val: SrcTy) && LT.second.isVector() &&
6072	LT.second.getScalarSizeInBits() * Mask.size() > `128` &&
6073	SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6074	Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6075	// Check for LD3/LD4 instructions, which are represented in llvm IR as
6076	// deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6077	// but we model it with a cost of LT.first so that LD3/LD4 have a higher
6078	// cost than just the load.
6079	if (Args.size() >= `1` && isa<LoadInst>(Val: Args [`0`]) &&
6080	(ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: `3`) \|\|
6081	ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: `4`)))
6082	return std::max<InstructionCost>(a: `1`, b: LT.first / `4`);
6083
6084	// Check for ST3/ST4 instructions, which are represented in llvm IR as
6085	// store(interleaving-shuffle). The shuffle cost could potentially be free,
6086	// but we model it with a cost of LT.first so that ST3/ST4 have a higher
6087	// cost than just the store.
6088	if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) &&
6089	(ShuffleVectorInst::isInterleaveMask(
6090	Mask, Factor: `4`, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * `2`) \|\|
6091	ShuffleVectorInst::isInterleaveMask(
6092	Mask, Factor: `3`, NumInputElts: SrcTy->getElementCount().getKnownMinValue() * `2`)))
6093	return LT.first;
6094
6095	unsigned TpNumElts = Mask.size();
6096	unsigned LTNumElts = LT.second.getVectorNumElements();
6097	unsigned NumVecs = (TpNumElts + LTNumElts - `1`) / LTNumElts;
6098	VectorType *NTp = VectorType::get(ElementType: SrcTy->getScalarType(),
6099	EC: LT.second.getVectorElementCount());
6100	InstructionCost Cost;
6101	std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6102	PreviousCosts;
6103	for (unsigned N = `0`; N < NumVecs; N++) {
6104	SmallVector<int> NMask;
6105	// Split the existing mask into chunks of size LTNumElts. Track the source
6106	// sub-vectors to ensure the result has at most 2 inputs.
6107	unsigned Source1 = -`1U`, Source2 = -`1U`;
6108	unsigned NumSources = `0`;
6109	for (unsigned E = `0`; E < LTNumElts; E++) {
6110	int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask [N * LTNumElts + E]
6111	: PoisonMaskElem;
6112	if (MaskElt < `0`) {
6113	NMask.push_back(Elt: PoisonMaskElem);
6114	continue;
6115	}
6116
6117	// Calculate which source from the input this comes from and whether it
6118	// is new to us.
6119	unsigned Source = MaskElt / LTNumElts;
6120	if (NumSources == `0`) {
6121	Source1 = Source;
6122	NumSources = `1`;
6123	} else if (NumSources == `1` && Source != Source1) {
6124	Source2 = Source;
6125	NumSources = `2`;
6126	} else if (NumSources >= `2` && Source != Source1 && Source != Source2) {
6127	NumSources++;
6128	}
6129
6130	// Add to the new mask. For the NumSources>2 case these are not correct,
6131	// but are only used for the modular lane number.
6132	if (Source == Source1)
6133	NMask.push_back(Elt: MaskElt % LTNumElts);
6134	else if (Source == Source2)
6135	NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts);
6136	else
6137	NMask.push_back(Elt: MaskElt % LTNumElts);
6138	}
6139	// Check if we have already generated this sub-shuffle, which means we
6140	// will have already generated the output. For example a <16 x i32> splat
6141	// will be the same sub-splat 4 times, which only needs to be generated
6142	// once and reused.
6143	auto Result =
6144	PreviousCosts.insert(x: {std::make_tuple(args&: Source1, args&: Source2, args&: NMask), `0`});
6145	// Check if it was already in the map (already costed).
6146	if (!Result.second)
6147	continue;
6148	// If the sub-mask has at most 2 input sub-vectors then re-cost it using
6149	// getShuffleCost. If not then cost it using the worst case as the number
6150	// of element moves into a new vector.
6151	InstructionCost NCost =
6152	NumSources <= `2`
6153	? getShuffleCost(Kind: NumSources <= `1` ? TTI::SK_PermuteSingleSrc
6154	: TTI::SK_PermuteTwoSrc,
6155	DstTy: NTp, SrcTy: NTp, Mask: NMask, CostKind, Index: `0`, SubTp: nullptr, Args,
6156	CxtI)
6157	: LTNumElts;
6158	Result.first ->second = NCost;
6159	Cost += NCost;
6160	}
6161	return Cost;
6162	}
6163
6164	Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTy&: SubTp);
6165	bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6166	// A subvector extract can be implemented with a NEON/SVE ext (or trivial
6167	// extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6168	// This currently only handles low or high extracts to prevent SLP vectorizer
6169	// regressions.
6170	// Note that SVE's ext instruction is destructive, but it can be fused with
6171	// a movprfx to act like a constructive instruction.
6172	if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6173	if (LT.second.getFixedSizeInBits() >= `128` &&
6174	cast<FixedVectorType>(Val: SubTp)->getNumElements() ==
6175	LT.second.getVectorNumElements() / `2`) {
6176	if (Index == `0`)
6177	return `0`;
6178	if (Index == (int)LT.second.getVectorNumElements() / `2`)
6179	return `1`;
6180	}
6181	Kind = TTI::SK_PermuteSingleSrc;
6182	}
6183	// FIXME: This was added to keep the costs equal when adding DstTys. Update
6184	// the code to handle length-changing shuffles.
6185	if (Kind == TTI::SK_InsertSubvector) {
6186	LT = getTypeLegalizationCost(Ty: DstTy);
6187	SrcTy = DstTy;
6188	}
6189
6190	// Check for identity masks, which we can treat as free for both fixed and
6191	// scalable vector paths.
6192	if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6193	(Kind == TTI::SK_PermuteTwoSrc \|\| Kind == TTI::SK_PermuteSingleSrc) &&
6194	all_of(Range: enumerate(First&: Mask), P: [](const auto &M) {
6195	return M.value() < `0` \|\| M.value() == (int)M.index();
6196	}))
6197	return `0`;
6198
6199	// Segmented shuffle matching.
6200	if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(Val: SrcTy) &&
6201	!Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6202	SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6203	RHS: AArch64::SVEBitsPerBlock)) {
6204
6205	FixedVectorType *VTy = cast<FixedVectorType>(Val: SrcTy);
6206	unsigned Segments =
6207	VTy->getPrimitiveSizeInBits() / AArch64::SVEBitsPerBlock;
6208	unsigned SegmentElts = VTy->getNumElements() / Segments;
6209
6210	// dupq zd.t, zn.t[idx]
6211	if ((ST->hasSVE2p1() \|\| ST->hasSME2p1()) &&
6212	ST->isSVEorStreamingSVEAvailable() &&
6213	isDUPQMask(Mask, Segments, SegmentSize: SegmentElts))
6214	return LT.first;
6215
6216	// mov zd.q, vn
6217	if (ST->isSVEorStreamingSVEAvailable() &&
6218	isDUPFirstSegmentMask(Mask, Segments, SegmentSize: SegmentElts))
6219	return LT.first;
6220	}
6221
6222	// Check for broadcast loads, which are supported by the LD1R instruction.
6223	// In terms of code-size, the shuffle vector is free when a load + dup get
6224	// folded into a LD1R. That's what we check and return here. For performance
6225	// and reciprocal throughput, a LD1R is not completely free. In this case, we
6226	// return the cost for the broadcast below (i.e. 1 for most/all types), so
6227	// that we model the load + dup sequence slightly higher because LD1R is a
6228	// high latency instruction.
6229	if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6230	bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args [`0`]);
6231	if (IsLoad && LT.second.isVector() &&
6232	isLegalBroadcastLoad(ElementTy: SrcTy->getElementType(),
6233	NumElements: LT.second.getVectorElementCount()))
6234	return `0`;
6235	}
6236
6237	// If we have 4 elements for the shuffle and a Mask, get the cost straight
6238	// from the perfect shuffle tables.
6239	if (Mask.size() == `4` &&
6240	SrcTy->getElementCount() == ElementCount::getFixed(MinVal: `4`) &&
6241	(SrcTy->getScalarSizeInBits() == `16` \|\|
6242	SrcTy->getScalarSizeInBits() == `32`) &&
6243	all_of(Range&: Mask, P: [](int E) { return E < `8`; }))
6244	return getPerfectShuffleCost(M: Mask);
6245
6246	// Check for other shuffles that are not SK_ kinds but we have native
6247	// instructions for, for example ZIP and UZP.
6248	unsigned Unused;
6249	if (LT.second.isFixedLengthVector() &&
6250	LT.second.getVectorNumElements() == Mask.size() &&
6251	(Kind == TTI::SK_PermuteTwoSrc \|\| Kind == TTI::SK_PermuteSingleSrc \|\|
6252	// Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6253	// mean that we can end up with shuffles that satisfy isTRNMask, but end
6254	// up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6255	Kind == TTI::SK_InsertSubvector) &&
6256	(isZIPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused, OperandOrderOut&: Unused) \|\|
6257	isTRNMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused, OperandOrderOut&: Unused) \|\|
6258	isUZPMask(M: Mask, NumElts: LT.second.getVectorNumElements(), WhichResultOut&: Unused) \|\|
6259	isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
6260	NumElts: LT.second.getVectorNumElements(), BlockSize: `16`) \|\|
6261	isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
6262	NumElts: LT.second.getVectorNumElements(), BlockSize: `32`) \|\|
6263	isREVMask(M: Mask, EltSize: LT.second.getScalarSizeInBits(),
6264	NumElts: LT.second.getVectorNumElements(), BlockSize: `64`) \|\|
6265	// Check for non-zero lane splats
6266	all_of(Range: drop_begin(RangeOrContainer&: Mask),
6267	P: [&Mask](int M) { return M < `0` \|\| M == Mask [`0`]; })))
6268	return `1`;
6269
6270	if (Kind == TTI::SK_Broadcast \|\| Kind == TTI::SK_Transpose \|\|
6271	Kind == TTI::SK_Select \|\| Kind == TTI::SK_PermuteSingleSrc \|\|
6272	Kind == TTI::SK_Reverse \|\| Kind == TTI::SK_Splice) {
6273	static const CostTblEntry ShuffleTbl[] = {
6274	// Broadcast shuffle kinds can be performed with 'dup'.
6275	{.ISD: TTI::SK_Broadcast, .Type: MVT::v8i8, .Cost: `1`},
6276	{.ISD: TTI::SK_Broadcast, .Type: MVT::v16i8, .Cost: `1`},
6277	{.ISD: TTI::SK_Broadcast, .Type: MVT::v4i16, .Cost: `1`},
6278	{.ISD: TTI::SK_Broadcast, .Type: MVT::v8i16, .Cost: `1`},
6279	{.ISD: TTI::SK_Broadcast, .Type: MVT::v2i32, .Cost: `1`},
6280	{.ISD: TTI::SK_Broadcast, .Type: MVT::v4i32, .Cost: `1`},
6281	{.ISD: TTI::SK_Broadcast, .Type: MVT::v2i64, .Cost: `1`},
6282	{.ISD: TTI::SK_Broadcast, .Type: MVT::v4f16, .Cost: `1`},
6283	{.ISD: TTI::SK_Broadcast, .Type: MVT::v8f16, .Cost: `1`},
6284	{.ISD: TTI::SK_Broadcast, .Type: MVT::v4bf16, .Cost: `1`},
6285	{.ISD: TTI::SK_Broadcast, .Type: MVT::v8bf16, .Cost: `1`},
6286	{.ISD: TTI::SK_Broadcast, .Type: MVT::v2f32, .Cost: `1`},
6287	{.ISD: TTI::SK_Broadcast, .Type: MVT::v4f32, .Cost: `1`},
6288	{.ISD: TTI::SK_Broadcast, .Type: MVT::v2f64, .Cost: `1`},
6289	// Transpose shuffle kinds can be performed with 'trn1/trn2' and
6290	// 'zip1/zip2' instructions.
6291	{.ISD: TTI::SK_Transpose, .Type: MVT::v8i8, .Cost: `1`},
6292	{.ISD: TTI::SK_Transpose, .Type: MVT::v16i8, .Cost: `1`},
6293	{.ISD: TTI::SK_Transpose, .Type: MVT::v4i16, .Cost: `1`},
6294	{.ISD: TTI::SK_Transpose, .Type: MVT::v8i16, .Cost: `1`},
6295	{.ISD: TTI::SK_Transpose, .Type: MVT::v2i32, .Cost: `1`},
6296	{.ISD: TTI::SK_Transpose, .Type: MVT::v4i32, .Cost: `1`},
6297	{.ISD: TTI::SK_Transpose, .Type: MVT::v2i64, .Cost: `1`},
6298	{.ISD: TTI::SK_Transpose, .Type: MVT::v4f16, .Cost: `1`},
6299	{.ISD: TTI::SK_Transpose, .Type: MVT::v8f16, .Cost: `1`},
6300	{.ISD: TTI::SK_Transpose, .Type: MVT::v4bf16, .Cost: `1`},
6301	{.ISD: TTI::SK_Transpose, .Type: MVT::v8bf16, .Cost: `1`},
6302	{.ISD: TTI::SK_Transpose, .Type: MVT::v2f32, .Cost: `1`},
6303	{.ISD: TTI::SK_Transpose, .Type: MVT::v4f32, .Cost: `1`},
6304	{.ISD: TTI::SK_Transpose, .Type: MVT::v2f64, .Cost: `1`},
6305	// Select shuffle kinds.
6306	// TODO: handle vXi8/vXi16.
6307	{.ISD: TTI::SK_Select, .Type: MVT::v2i32, .Cost: `1`}, // mov.
6308	{.ISD: TTI::SK_Select, .Type: MVT::v4i32, .Cost: `2`}, // rev+trn (or similar).
6309	{.ISD: TTI::SK_Select, .Type: MVT::v2i64, .Cost: `1`}, // mov.
6310	{.ISD: TTI::SK_Select, .Type: MVT::v2f32, .Cost: `1`}, // mov.
6311	{.ISD: TTI::SK_Select, .Type: MVT::v4f32, .Cost: `2`}, // rev+trn (or similar).
6312	{.ISD: TTI::SK_Select, .Type: MVT::v2f64, .Cost: `1`}, // mov.
6313	// PermuteSingleSrc shuffle kinds.
6314	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i32, .Cost: `1`}, // mov.
6315	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i32, .Cost: `3`}, // perfectshuffle worst case.
6316	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2i64, .Cost: `1`}, // mov.
6317	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f32, .Cost: `1`}, // mov.
6318	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f32, .Cost: `3`}, // perfectshuffle worst case.
6319	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v2f64, .Cost: `1`}, // mov.
6320	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4i16, .Cost: `3`}, // perfectshuffle worst case.
6321	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4f16, .Cost: `3`}, // perfectshuffle worst case.
6322	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v4bf16, .Cost: `3`}, // same
6323	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i16, .Cost: `8`}, // constpool + load + tbl
6324	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8f16, .Cost: `8`}, // constpool + load + tbl
6325	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8bf16, .Cost: `8`}, // constpool + load + tbl
6326	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v8i8, .Cost: `8`}, // constpool + load + tbl
6327	{.ISD: TTI::SK_PermuteSingleSrc, .Type: MVT::v16i8, .Cost: `8`}, // constpool + load + tbl
6328	// Reverse can be lowered with `rev`.
6329	{.ISD: TTI::SK_Reverse, .Type: MVT::v2i32, .Cost: `1`}, // REV64
6330	{.ISD: TTI::SK_Reverse, .Type: MVT::v4i32, .Cost: `2`}, // REV64; EXT
6331	{.ISD: TTI::SK_Reverse, .Type: MVT::v2i64, .Cost: `1`}, // EXT
6332	{.ISD: TTI::SK_Reverse, .Type: MVT::v2f32, .Cost: `1`}, // REV64
6333	{.ISD: TTI::SK_Reverse, .Type: MVT::v4f32, .Cost: `2`}, // REV64; EXT
6334	{.ISD: TTI::SK_Reverse, .Type: MVT::v2f64, .Cost: `1`}, // EXT
6335	{.ISD: TTI::SK_Reverse, .Type: MVT::v8f16, .Cost: `2`}, // REV64; EXT
6336	{.ISD: TTI::SK_Reverse, .Type: MVT::v8bf16, .Cost: `2`}, // REV64; EXT
6337	{.ISD: TTI::SK_Reverse, .Type: MVT::v8i16, .Cost: `2`}, // REV64; EXT
6338	{.ISD: TTI::SK_Reverse, .Type: MVT::v16i8, .Cost: `2`}, // REV64; EXT
6339	{.ISD: TTI::SK_Reverse, .Type: MVT::v4f16, .Cost: `1`}, // REV64
6340	{.ISD: TTI::SK_Reverse, .Type: MVT::v4bf16, .Cost: `1`}, // REV64
6341	{.ISD: TTI::SK_Reverse, .Type: MVT::v4i16, .Cost: `1`}, // REV64
6342	{.ISD: TTI::SK_Reverse, .Type: MVT::v8i8, .Cost: `1`}, // REV64
6343	// Splice can all be lowered as `ext`.
6344	{.ISD: TTI::SK_Splice, .Type: MVT::v2i32, .Cost: `1`},
6345	{.ISD: TTI::SK_Splice, .Type: MVT::v4i32, .Cost: `1`},
6346	{.ISD: TTI::SK_Splice, .Type: MVT::v2i64, .Cost: `1`},
6347	{.ISD: TTI::SK_Splice, .Type: MVT::v2f32, .Cost: `1`},
6348	{.ISD: TTI::SK_Splice, .Type: MVT::v4f32, .Cost: `1`},
6349	{.ISD: TTI::SK_Splice, .Type: MVT::v2f64, .Cost: `1`},
6350	{.ISD: TTI::SK_Splice, .Type: MVT::v8f16, .Cost: `1`},
6351	{.ISD: TTI::SK_Splice, .Type: MVT::v8bf16, .Cost: `1`},
6352	{.ISD: TTI::SK_Splice, .Type: MVT::v8i16, .Cost: `1`},
6353	{.ISD: TTI::SK_Splice, .Type: MVT::v16i8, .Cost: `1`},
6354	{.ISD: TTI::SK_Splice, .Type: MVT::v4f16, .Cost: `1`},
6355	{.ISD: TTI::SK_Splice, .Type: MVT::v4bf16, .Cost: `1`},
6356	{.ISD: TTI::SK_Splice, .Type: MVT::v4i16, .Cost: `1`},
6357	{.ISD: TTI::SK_Splice, .Type: MVT::v8i8, .Cost: `1`},
6358	// Broadcast shuffle kinds for scalable vectors
6359	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i8, .Cost: `1`},
6360	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i16, .Cost: `1`},
6361	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i32, .Cost: `1`},
6362	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i64, .Cost: `1`},
6363	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f16, .Cost: `1`},
6364	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f16, .Cost: `1`},
6365	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8f16, .Cost: `1`},
6366	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2bf16, .Cost: `1`},
6367	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4bf16, .Cost: `1`},
6368	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8bf16, .Cost: `1`},
6369	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f32, .Cost: `1`},
6370	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4f32, .Cost: `1`},
6371	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2f64, .Cost: `1`},
6372	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv16i1, .Cost: `1`},
6373	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv8i1, .Cost: `1`},
6374	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv4i1, .Cost: `1`},
6375	{.ISD: TTI::SK_Broadcast, .Type: MVT::nxv2i1, .Cost: `1`},
6376	// Handle the cases for vector.reverse with scalable vectors
6377	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i8, .Cost: `1`},
6378	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i16, .Cost: `1`},
6379	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i32, .Cost: `1`},
6380	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i64, .Cost: `1`},
6381	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f16, .Cost: `1`},
6382	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f16, .Cost: `1`},
6383	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv8f16, .Cost: `1`},
6384	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv2bf16, .Cost: `1`},
6385	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv4bf16, .Cost: `1`},
6386	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv8bf16, .Cost: `1`},
6387	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f32, .Cost: `1`},
6388	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv4f32, .Cost: `1`},
6389	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv2f64, .Cost: `1`},
6390	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv16i1, .Cost: `1`},
6391	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv8i1, .Cost: `1`},
6392	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv4i1, .Cost: `1`},
6393	{.ISD: TTI::SK_Reverse, .Type: MVT::nxv2i1, .Cost: `1`},
6394	};
6395	if (const auto *Entry = CostTableLookup(Table: ShuffleTbl, ISD: Kind, Ty: LT.second))
6396	return LT.first * Entry->Cost;
6397	}
6398
6399	if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: SrcTy))
6400	return getSpliceCost(Tp: SrcTy, Index, CostKind);
6401
6402	// Inserting a subvector can often be done with either a D, S or H register
6403	// move, so long as the inserted vector is "aligned".
6404	if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6405	LT.second.getSizeInBits() <= `128` && SubTp) {
6406	std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp);
6407	if (SubLT.second.isVector()) {
6408	int NumElts = LT.second.getVectorNumElements();
6409	int NumSubElts = SubLT.second.getVectorNumElements();
6410	if ((Index % NumSubElts) == `0` && (NumElts % NumSubElts) == `0`)
6411	return SubLT.first;
6412	}
6413	}
6414
6415	// Restore optimal kind.
6416	if (IsExtractSubvector)
6417	Kind = TTI::SK_ExtractSubvector;
6418	return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6419	Args, CxtI);
6420	}
6421
6422	static bool containsDecreasingPointers(Loop *TheLoop,
6423	PredicatedScalarEvolution *PSE,
6424	const DominatorTree &DT) {
6425	const auto &Strides = DenseMap<Value , const* SCEV *>();
6426	for (BasicBlock *BB : TheLoop->blocks()) {
6427	// Scan the instructions in the block and look for addresses that are
6428	// consecutive and decreasing.
6429	for (Instruction &I : *BB) {
6430	if (isa<LoadInst>(Val: &I) \|\| isa<StoreInst>(Val: &I)) {
6431	Value *Ptr = getLoadStorePointerOperand(V: &I);
6432	Type *AccessTy = getLoadStoreType(I: &I);
6433	if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, DT, StridesMap: Strides,
6434	/Assume=/true, /ShouldCheckWrap=/false)
6435	.value_or(u: `0`) < `0`)
6436	return true;
6437	}
6438	}
6439	}
6440	return false;
6441	}
6442
6443	bool AArch64TTIImpl::preferFixedOverScalableIfEqualCost(bool IsEpilogue) const {
6444	if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6445	return SVEPreferFixedOverScalableIfEqualCost;
6446	// For cases like post-LTO vectorization, when we eventually know the trip
6447	// count, epilogue with fixed-width vectorization can be deleted if the trip
6448	// count is less than the epilogue iterations. That's why we prefer
6449	// fixed-width vectorization in epilogue in case of equal costs.
6450	if (IsEpilogue)
6451	return true;
6452	return ST->useFixedOverScalableIfEqualCost();
6453	}
6454
6455	unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
6456	return ST->getEpilogueVectorizationMinVF();
6457	}
6458
6459	bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo TFI) const* {
6460	if (!ST->hasSVE())
6461	return false;
6462
6463	// We don't currently support vectorisation with interleaving for SVE - with
6464	// such loops we're better off not using tail-folding. This gives us a chance
6465	// to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6466	if (TFI->IAI->hasGroups())
6467	return false;
6468
6469	TailFoldingOpts Required = TailFoldingOpts::Disabled;
6470	if (TFI->LVL->getReductionVars().size())
6471	Required \|= TailFoldingOpts::Reductions;
6472	if (TFI->LVL->getFixedOrderRecurrences().size())
6473	Required \|= TailFoldingOpts::Recurrences;
6474
6475	// We call this to discover whether any load/store pointers in the loop have
6476	// negative strides. This will require extra work to reverse the loop
6477	// predicate, which may be expensive.
6478	if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(),
6479	PSE: TFI->LVL->getPredicatedScalarEvolution(),
6480	DT: *TFI->LVL->getDominatorTree()))
6481	Required \|= TailFoldingOpts::Reverse;
6482	if (Required == TailFoldingOpts::Disabled)
6483	Required \|= TailFoldingOpts::Simple;
6484
6485	if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(),
6486	Required))
6487	return false;
6488
6489	// Don't tail-fold for tight loops where we would be better off interleaving
6490	// with an unpredicated loop.
6491	unsigned NumInsns = `0`;
6492	for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6493	NumInsns += BB->sizeWithoutDebug();
6494	}
6495
6496	// We expect 4 of these to be a IV PHI, IV add, IV compare and branch.
6497	return NumInsns >= SVETailFoldInsnThreshold;
6498	}
6499
6500	InstructionCost
6501	AArch64TTIImpl::getScalingFactorCost(Type Ty, GlobalValue BaseGV,
6502	StackOffset BaseOffset, bool HasBaseReg,
6503	int64_t Scale, unsigned AddrSpace) const {
6504	// Scaling factors are not free at all.
6505	// Operands \| Rt Latency
6506	// -------------------------------------------
6507	// Rt, [Xn, Xm] \| 4
6508	// -------------------------------------------
6509	// Rt, [Xn, Xm, lsl #imm] \| Rn: 4 Rm: 5
6510	// Rt, [Xn, Wm, <extend> #imm] \|
6511	TargetLoweringBase::AddrMode AM;
6512	AM.BaseGV = BaseGV;
6513	AM.BaseOffs = BaseOffset.getFixed();
6514	AM.HasBaseReg = HasBaseReg;
6515	AM.Scale = Scale;
6516	AM.ScalableOffset = BaseOffset.getScalable();
6517	if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace))
6518	// Scale represents reg2 scale, thus account for 1 if*
6519	// it is not equal to 0 or 1.
6520	return AM.Scale != `0` && AM.Scale != `1`;
6521	return InstructionCost::getInvalid();
6522	}
6523
6524	bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(
6525	const Instruction I) const* {
6526	if (EnableOrLikeSelectOpt) {
6527	// For the binary operators (e.g. or) we need to be more careful than
6528	// selects, here we only transform them if they are already at a natural
6529	// break point in the code - the end of a block with an unconditional
6530	// terminator.
6531	if (I->getOpcode() == Instruction::Or &&
6532	isa<UncondBrInst>(Val: I->getNextNode()))
6533	return true;
6534
6535	if (I->getOpcode() == Instruction::Add \|\|
6536	I->getOpcode() == Instruction::Sub)
6537	return true;
6538	}
6539	return BaseT::shouldTreatInstructionLikeSelect(I);
6540	}
6541
6542	bool AArch64TTIImpl::isLSRCostLess(
6543	const TargetTransformInfo::LSRCost &C1,
6544	const TargetTransformInfo::LSRCost &C2) const {
6545	// AArch64 specific here is adding the number of instructions to the
6546	// comparison (though not as the first consideration, as some targets do)
6547	// along with changing the priority of the base additions.
6548	// TODO: Maybe a more nuanced tradeoff between instruction count
6549	// and number of registers? To be investigated at a later date.
6550	if (EnableLSRCostOpt)
6551	return std::tie(args: C1.NumRegs, args: C1.Insns, args: C1.NumBaseAdds, args: C1.AddRecCost,
6552	args: C1.NumIVMuls, args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) <
6553	std::tie(args: C2.NumRegs, args: C2.Insns, args: C2.NumBaseAdds, args: C2.AddRecCost,
6554	args: C2.NumIVMuls, args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost);
6555
6556	return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
6557	}
6558
6559	static bool isSplatShuffle(Value *V) {
6560	if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V))
6561	return all_equal(Range: Shuf->getShuffleMask());
6562	return false;
6563	}
6564
6565	/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6566	/// or upper half of the vector elements.
6567	static bool areExtractShuffleVectors(Value Op1, Value Op2,
6568	bool AllowSplat = false) {
6569	// Scalable types can't be extract shuffle vectors.
6570	if (Op1->getType()->isScalableTy() \|\| Op2->getType()->isScalableTy())
6571	return false;
6572
6573	auto areTypesHalfed = [](Value FullV, Value HalfV) {
6574	auto *FullTy = FullV->getType();
6575	auto *HalfTy = HalfV->getType();
6576	return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6577	`2` * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6578	};
6579
6580	auto extractHalf = [](Value FullV, Value HalfV) {
6581	auto *FullVT = cast<FixedVectorType>(Val: FullV->getType());
6582	auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType());
6583	return FullVT->getNumElements() == `2` * HalfVT->getNumElements();
6584	};
6585
6586	ArrayRef<int> M1, M2;
6587	Value S1Op1 = nullptr, S2Op1 = nullptr;
6588	if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask (M1))) \|\|
6589	!match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask (M2))))
6590	return false;
6591
6592	// If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6593	// it is not checked as an extract below.
6594	if (AllowSplat && isSplatShuffle(V: Op1))
6595	S1Op1 = nullptr;
6596	if (AllowSplat && isSplatShuffle(V: Op2))
6597	S2Op1 = nullptr;
6598
6599	// Check that the operands are half as wide as the result and we extract
6600	// half of the elements of the input vectors.
6601	if ((S1Op1 && (!areTypesHalfed (S1Op1, Op1) \|\| !extractHalf (S1Op1, Op1))) \|\|
6602	(S2Op1 && (!areTypesHalfed (S2Op1, Op2) \|\| !extractHalf (S2Op1, Op2))))
6603	return false;
6604
6605	// Check the mask extracts either the lower or upper half of vector
6606	// elements.
6607	int M1Start = `0`;
6608	int M2Start = `0`;
6609	int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * `2`;
6610	if ((S1Op1 &&
6611	!ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) \|\|
6612	(S2Op1 &&
6613	!ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start)))
6614	return false;
6615
6616	if ((M1Start != `0` && M1Start != (NumElements / `2`)) \|\|
6617	(M2Start != `0` && M2Start != (NumElements / `2`)))
6618	return false;
6619	if (S1Op1 && S2Op1 && M1Start != M2Start)
6620	return false;
6621
6622	return true;
6623	}
6624
6625	/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6626	/// of the vector elements.
6627	static bool areExtractExts(Value Ext1, Value Ext2) {
6628	auto areExtDoubled = [](Instruction *Ext) {
6629	return Ext->getType()->getScalarSizeInBits() ==
6630	`2` * Ext->getOperand(i: `0`)->getType()->getScalarSizeInBits();
6631	};
6632
6633	if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) \|\|
6634	!match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) \|\|
6635	!areExtDoubled (cast<Instruction>(Val: Ext1)) \|\|
6636	!areExtDoubled (cast<Instruction>(Val: Ext2)))
6637	return false;
6638
6639	return true;
6640	}
6641
6642	/// Check if Op could be used with vmull_high_p64 intrinsic.
6643	static bool isOperandOfVmullHighP64(Value *Op) {
6644	Value VectorOperand = nullptr*;
6645	ConstantInt ElementIndex = nullptr*;
6646	return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand),
6647	Idx: m_ConstantInt(CI&: ElementIndex))) &&
6648	ElementIndex->getValue() == `1` &&
6649	isa<FixedVectorType>(Val: VectorOperand->getType()) &&
6650	cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == `2`;
6651	}
6652
6653	/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6654	static bool areOperandsOfVmullHighP64(Value Op1, Value Op2) {
6655	return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2);
6656	}
6657
6658	static bool shouldSinkVectorOfPtrs(Value Ptrs, SmallVectorImpl<Use > &Ops) {
6659	// Restrict ourselves to the form CodeGenPrepare typically constructs.
6660	auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs);
6661	if (!GEP \|\| GEP->getNumOperands() != `2`)
6662	return false;
6663
6664	Value *Base = GEP->getOperand(i_nocapture: `0`);
6665	Value *Offsets = GEP->getOperand(i_nocapture: `1`);
6666
6667	// We only care about scalar_base+vector_offsets.
6668	if (Base->getType()->isVectorTy() \|\| !Offsets->getType()->isVectorTy())
6669	return false;
6670
6671	// Sink extends that would allow us to use 32-bit offset vectors.
6672	if (isa<SExtInst>(Val: Offsets) \|\| isa<ZExtInst>(Val: Offsets)) {
6673	auto *OffsetsInst = cast<Instruction>(Val: Offsets);
6674	if (OffsetsInst->getType()->getScalarSizeInBits() > `32` &&
6675	OffsetsInst->getOperand(i: `0`)->getType()->getScalarSizeInBits() <= `32`)
6676	Ops.push_back(Elt: &GEP->getOperandUse(i: `1`));
6677	}
6678
6679	// Sink the GEP.
6680	return true;
6681	}
6682
6683	/// We want to sink following cases:
6684	/// (add\|sub\|gep) A, ((mul\|shl) vscale, imm); (add\|sub\|gep) A, vscale;
6685	/// (add\|sub\|gep) A, ((mul\|shl) zext(vscale), imm);
6686	static bool shouldSinkVScale(Value Op, SmallVectorImpl<Use > &Ops) {
6687	if (match(V: Op, P: m_VScale()))
6688	return true;
6689	if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) \|\|
6690	match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) {
6691	Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: `0`));
6692	return true;
6693	}
6694	if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) \|\|
6695	match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) {
6696	Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: `0`);
6697	Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: `0`));
6698	Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: `0`));
6699	return true;
6700	}
6701	return false;
6702	}
6703
6704	static bool isFNeg(Value Op) { return* match(V: Op, P: m_FNeg(X: m_Value())); }
6705
6706	/// Check if sinking \p I's operands to I's basic block is profitable, because
6707	/// the operands can be folded into a target instruction, e.g.
6708	/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
6709	bool AArch64TTIImpl::isProfitableToSinkOperands(
6710	Instruction I, SmallVectorImpl<Use > &Ops) const {
6711	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) {
6712	switch (II->getIntrinsicID()) {
6713	case Intrinsic::aarch64_neon_smull:
6714	case Intrinsic::aarch64_neon_umull:
6715	if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: `0`), Op2: II->getOperand(i_nocapture: `1`),
6716	/AllowSplat=/true)) {
6717	Ops.push_back(Elt: &II->getOperandUse(i: `0`));
6718	Ops.push_back(Elt: &II->getOperandUse(i: `1`));
6719	return true;
6720	}
6721	[[fallthrough]];
6722
6723	case Intrinsic::fma:
6724	case Intrinsic::fmuladd:
6725	if (isa<VectorType>(Val: I->getType()) &&
6726	cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
6727	!ST->hasFullFP16())
6728	return false;
6729
6730	if (isFNeg(Op: II->getOperand(i_nocapture: `0`)))
6731	Ops.push_back(Elt: &II->getOperandUse(i: `0`));
6732	if (isFNeg(Op: II->getOperand(i_nocapture: `1`)))
6733	Ops.push_back(Elt: &II->getOperandUse(i: `1`));
6734
6735	[[fallthrough]];
6736	case Intrinsic::aarch64_neon_sqdmull:
6737	case Intrinsic::aarch64_neon_sqdmulh:
6738	case Intrinsic::aarch64_neon_sqrdmulh:
6739	// Sink splats for index lane variants
6740	if (isSplatShuffle(V: II->getOperand(i_nocapture: `0`)))
6741	Ops.push_back(Elt: &II->getOperandUse(i: `0`));
6742	if (isSplatShuffle(V: II->getOperand(i_nocapture: `1`)))
6743	Ops.push_back(Elt: &II->getOperandUse(i: `1`));
6744	return !Ops.empty();
6745	case Intrinsic::aarch64_neon_fmlal:
6746	case Intrinsic::aarch64_neon_fmlal2:
6747	case Intrinsic::aarch64_neon_fmlsl:
6748	case Intrinsic::aarch64_neon_fmlsl2:
6749	// Sink splats for index lane variants
6750	if (isSplatShuffle(V: II->getOperand(i_nocapture: `1`)))
6751	Ops.push_back(Elt: &II->getOperandUse(i: `1`));
6752	if (isSplatShuffle(V: II->getOperand(i_nocapture: `2`)))
6753	Ops.push_back(Elt: &II->getOperandUse(i: `2`));
6754	return !Ops.empty();
6755	case Intrinsic::aarch64_sve_ptest_first:
6756	case Intrinsic::aarch64_sve_ptest_last:
6757	if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: `0`)))
6758	if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
6759	Ops.push_back(Elt: &II->getOperandUse(i: `0`));
6760	return !Ops.empty();
6761	case Intrinsic::aarch64_sme_write_horiz:
6762	case Intrinsic::aarch64_sme_write_vert:
6763	case Intrinsic::aarch64_sme_writeq_horiz:
6764	case Intrinsic::aarch64_sme_writeq_vert: {
6765	auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: `1`));
6766	if (!Idx \|\| Idx->getOpcode() != Instruction::Add)
6767	return false;
6768	Ops.push_back(Elt: &II->getOperandUse(i: `1`));
6769	return true;
6770	}
6771	case Intrinsic::aarch64_sme_read_horiz:
6772	case Intrinsic::aarch64_sme_read_vert:
6773	case Intrinsic::aarch64_sme_readq_horiz:
6774	case Intrinsic::aarch64_sme_readq_vert:
6775	case Intrinsic::aarch64_sme_ld1b_vert:
6776	case Intrinsic::aarch64_sme_ld1h_vert:
6777	case Intrinsic::aarch64_sme_ld1w_vert:
6778	case Intrinsic::aarch64_sme_ld1d_vert:
6779	case Intrinsic::aarch64_sme_ld1q_vert:
6780	case Intrinsic::aarch64_sme_st1b_vert:
6781	case Intrinsic::aarch64_sme_st1h_vert:
6782	case Intrinsic::aarch64_sme_st1w_vert:
6783	case Intrinsic::aarch64_sme_st1d_vert:
6784	case Intrinsic::aarch64_sme_st1q_vert:
6785	case Intrinsic::aarch64_sme_ld1b_horiz:
6786	case Intrinsic::aarch64_sme_ld1h_horiz:
6787	case Intrinsic::aarch64_sme_ld1w_horiz:
6788	case Intrinsic::aarch64_sme_ld1d_horiz:
6789	case Intrinsic::aarch64_sme_ld1q_horiz:
6790	case Intrinsic::aarch64_sme_st1b_horiz:
6791	case Intrinsic::aarch64_sme_st1h_horiz:
6792	case Intrinsic::aarch64_sme_st1w_horiz:
6793	case Intrinsic::aarch64_sme_st1d_horiz:
6794	case Intrinsic::aarch64_sme_st1q_horiz: {
6795	auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: `3`));
6796	if (!Idx \|\| Idx->getOpcode() != Instruction::Add)
6797	return false;
6798	Ops.push_back(Elt: &II->getOperandUse(i: `3`));
6799	return true;
6800	}
6801	case Intrinsic::aarch64_neon_pmull:
6802	if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: `0`), Op2: II->getOperand(i_nocapture: `1`)))
6803	return false;
6804	Ops.push_back(Elt: &II->getOperandUse(i: `0`));
6805	Ops.push_back(Elt: &II->getOperandUse(i: `1`));
6806	return true;
6807	case Intrinsic::aarch64_neon_pmull64:
6808	if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: `0`),
6809	Op2: II->getArgOperand(i: `1`)))
6810	return false;
6811	Ops.push_back(Elt: &II->getArgOperandUse(i: `0`));
6812	Ops.push_back(Elt: &II->getArgOperandUse(i: `1`));
6813	return true;
6814	case Intrinsic::masked_gather:
6815	if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: `0`), Ops))
6816	return false;
6817	Ops.push_back(Elt: &II->getArgOperandUse(i: `0`));
6818	return true;
6819	case Intrinsic::masked_scatter:
6820	if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: `1`), Ops))
6821	return false;
6822	Ops.push_back(Elt: &II->getArgOperandUse(i: `1`));
6823	return true;
6824	default:
6825	return false;
6826	}
6827	}
6828
6829	auto ShouldSinkCondition = [](Value *Cond,
6830	SmallVectorImpl<Use > &Ops) -> bool* {
6831	if (!isa<IntrinsicInst>(Val: Cond))
6832	return false;
6833	auto *II = dyn_cast<IntrinsicInst>(Val: Cond);
6834	if (II->getIntrinsicID() != Intrinsic::vector_reduce_or \|\|
6835	!isa<ScalableVectorType>(Val: II->getOperand(i_nocapture: `0`)->getType()))
6836	return false;
6837	if (isa<CmpInst>(Val: II->getOperand(i_nocapture: `0`)))
6838	Ops.push_back(Elt: &II->getOperandUse(i: `0`));
6839	return true;
6840	};
6841
6842	switch (I->getOpcode()) {
6843	case Instruction::GetElementPtr:
6844	case Instruction::Add:
6845	case Instruction::Sub:
6846	// Sink vscales closer to uses for better isel
6847	for (unsigned Op = `0`; Op < I->getNumOperands(); ++Op) {
6848	if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) {
6849	Ops.push_back(Elt: &I->getOperandUse(i: Op));
6850	return true;
6851	}
6852	}
6853	break;
6854	case Instruction::Select: {
6855	if (!ShouldSinkCondition (I->getOperand(i: `0`), Ops))
6856	return false;
6857
6858	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
6859	return true;
6860	}
6861	case Instruction::UncondBr:
6862	return false;
6863	case Instruction::CondBr: {
6864	if (!ShouldSinkCondition (cast<CondBrInst>(Val: I)->getCondition(), Ops))
6865	return false;
6866
6867	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
6868	return true;
6869	}
6870	case Instruction::FMul:
6871	// fmul with contract flag can be combined with fadd into fma.
6872	// Sinking fneg into this block enables fmls pattern.
6873	if (cast<FPMathOperator>(Val: I)->hasAllowContract()) {
6874	if (isFNeg(Op: I->getOperand(i: `0`)))
6875	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
6876	if (isFNeg(Op: I->getOperand(i: `1`)))
6877	Ops.push_back(Elt: &I->getOperandUse(i: `1`));
6878	}
6879	break;
6880
6881	default:
6882	break;
6883	}
6884
6885	if (!I->getType()->isVectorTy())
6886	return !Ops.empty();
6887
6888	switch (I->getOpcode()) {
6889	case Instruction::Sub:
6890	case Instruction::Add: {
6891	if (!areExtractExts(Ext1: I->getOperand(i: `0`), Ext2: I->getOperand(i: `1`)))
6892	return false;
6893
6894	// If the exts' operands extract either the lower or upper elements, we
6895	// can sink them too.
6896	auto Ext1 = cast<Instruction>(Val: I->getOperand(i: `0`));
6897	auto Ext2 = cast<Instruction>(Val: I->getOperand(i: `1`));
6898	if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: `0`), Op2: Ext2->getOperand(i: `0`))) {
6899	Ops.push_back(Elt: &Ext1->getOperandUse(i: `0`));
6900	Ops.push_back(Elt: &Ext2->getOperandUse(i: `0`));
6901	}
6902
6903	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
6904	Ops.push_back(Elt: &I->getOperandUse(i: `1`));
6905
6906	return true;
6907	}
6908	case Instruction::Or: {
6909	// Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
6910	// bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
6911	if (ST->hasNEON()) {
6912	Instruction OtherAnd, IA, *IB;
6913	Value *MaskValue;
6914	// MainAnd refers to And instruction that has 'Not' as one of its operands
6915	if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)),
6916	R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))),
6917	R: m_Instruction(I&: IA)))))) {
6918	if (match(V: OtherAnd,
6919	P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) {
6920	Instruction *MainAnd = I->getOperand(i: `0`) == OtherAnd
6921	? cast<Instruction>(Val: I->getOperand(i: `1`))
6922	: cast<Instruction>(Val: I->getOperand(i: `0`));
6923
6924	// Both Ands should be in same basic block as Or
6925	if (I->getParent() != MainAnd->getParent() \|\|
6926	I->getParent() != OtherAnd->getParent())
6927	return false;
6928
6929	// Non-mask operands of both Ands should also be in same basic block
6930	if (I->getParent() != IA->getParent() \|\|
6931	I->getParent() != IB->getParent())
6932	return false;
6933
6934	Ops.push_back(
6935	Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: `0`) == IA ? `1` : `0`));
6936	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
6937	Ops.push_back(Elt: &I->getOperandUse(i: `1`));
6938
6939	return true;
6940	}
6941	}
6942	}
6943
6944	return false;
6945	}
6946	case Instruction::Mul: {
6947	auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
6948	auto *Ty = cast<VectorType>(Val: V->getType());
6949	// For SVE the lane-indexing is within 128-bits, so we can't fold splats.
6950	if (Ty->isScalableTy())
6951	return false;
6952
6953	// Indexed variants of Mul exist for i16 and i32 element types only.
6954	return Ty->getScalarSizeInBits() == `16` \|\| Ty->getScalarSizeInBits() == `32`;
6955	};
6956
6957	int NumZExts = `0`, NumSExts = `0`;
6958	for (auto &Op : I->operands()) {
6959	// Make sure we are not already sinking this operand
6960	if (any_of(Range&: Ops, P: [&](Use U) { return* U->get() == Op; }))
6961	continue;
6962
6963	if (match(V: &Op, P: m_ZExtOrSExt(Op: m_Value()))) {
6964	auto *Ext = cast<Instruction>(Val&: Op);
6965	auto *ExtOp = Ext->getOperand(i: `0`);
6966	if (isSplatShuffle(V: ExtOp) && ShouldSinkSplatForIndexedVariant (ExtOp))
6967	Ops.push_back(Elt: &Ext->getOperandUse(i: `0`));
6968	Ops.push_back(Elt: &Op);
6969
6970	if (isa<SExtInst>(Val: Ext)) {
6971	NumSExts++;
6972	} else {
6973	NumZExts++;
6974	// A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
6975	if (Ext->getOperand(i: `0`)->getType()->getScalarSizeInBits() * `2` <
6976	I->getType()->getScalarSizeInBits())
6977	NumSExts++;
6978	}
6979
6980	continue;
6981	}
6982
6983	ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op);
6984	if (!Shuffle)
6985	continue;
6986
6987	// If the Shuffle is a splat and the operand is a zext/sext, sinking the
6988	// operand and the s/zext can help create indexed s/umull. This is
6989	// especially useful to prevent i64 mul being scalarized.
6990	if (isSplatShuffle(V: Shuffle) &&
6991	match(V: Shuffle->getOperand(i_nocapture: `0`), P: m_ZExtOrSExt(Op: m_Value()))) {
6992	Ops.push_back(Elt: &Shuffle->getOperandUse(i: `0`));
6993	Ops.push_back(Elt: &Op);
6994	if (match(V: Shuffle->getOperand(i_nocapture: `0`), P: m_SExt(Op: m_Value())))
6995	NumSExts++;
6996	else
6997	NumZExts++;
6998	continue;
6999	}
7000
7001	Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: `0`);
7002	InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand);
7003	if (!Insert)
7004	continue;
7005
7006	Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: `1`));
7007	if (!OperandInstr)
7008	continue;
7009
7010	ConstantInt *ElementConstant =
7011	dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: `2`));
7012	// Check that the insertelement is inserting into element 0
7013	if (!ElementConstant \|\| !ElementConstant->isZero())
7014	continue;
7015
7016	unsigned Opcode = OperandInstr->getOpcode();
7017	if (Opcode == Instruction::SExt)
7018	NumSExts++;
7019	else if (Opcode == Instruction::ZExt)
7020	NumZExts++;
7021	else {
7022	// If we find that the top bits are known 0, then we can sink and allow
7023	// the backend to generate a umull.
7024	unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7025	APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / `2`);
7026	if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, SQ: DL))
7027	continue;
7028	NumZExts++;
7029	}
7030
7031	// And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7032	// the And, just to hoist it again back to the load.
7033	if (!match(V: OperandInstr, P: m_And(L: m_Load(Op: m_Value()), R: m_Value())))
7034	Ops.push_back(Elt: &Insert->getOperandUse(i: `1`));
7035	Ops.push_back(Elt: &Shuffle->getOperandUse(i: `0`));
7036	Ops.push_back(Elt: &Op);
7037	}
7038
7039	// It is profitable to sink if we found two of the same type of extends.
7040	if (!Ops.empty() && (NumSExts == `2` \|\| NumZExts == `2`))
7041	return true;
7042
7043	// Otherwise, see if we should sink splats for indexed variants.
7044	if (!ShouldSinkSplatForIndexedVariant (I))
7045	return false;
7046
7047	Ops.clear();
7048	if (isSplatShuffle(V: I->getOperand(i: `0`)))
7049	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
7050	if (isSplatShuffle(V: I->getOperand(i: `1`)))
7051	Ops.push_back(Elt: &I->getOperandUse(i: `1`));
7052
7053	return !Ops.empty();
7054	}
7055	case Instruction::FMul: {
7056	// For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7057	if (I->getType()->isScalableTy())
7058	return !Ops.empty();
7059
7060	if (cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() &&
7061	!ST->hasFullFP16())
7062	return !Ops.empty();
7063
7064	// Sink splats for index lane variants
7065	if (isSplatShuffle(V: I->getOperand(i: `0`)))
7066	Ops.push_back(Elt: &I->getOperandUse(i: `0`));
7067	if (isSplatShuffle(V: I->getOperand(i: `1`)))
7068	Ops.push_back(Elt: &I->getOperandUse(i: `1`));
7069	return !Ops.empty();
7070	}
7071	default:
7072	return false;
7073	}
7074	return false;
7075	}
7076

Browse the source code of llvm_projects/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp