//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64tti"

static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
                                               cl::init(true), cl::Hidden);

static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
                                           cl::Hidden);

static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
                                            cl::init(10), cl::Hidden);

static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
                                                  cl::init(15), cl::Hidden);

static cl::opt<unsigned>
    NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
                               cl::Hidden);

static cl::opt<unsigned> CallPenaltyChangeSM(
    "call-penalty-sm-change", cl::init(5), cl::Hidden,
    cl::desc(
        "Penalty of calling a function that requires a change to PSTATE.SM"));

static cl::opt<unsigned> InlineCallPenaltyChangeSM(
    "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
    cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));

static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
                                           cl::init(true), cl::Hidden);

static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
                                      cl::init(true), cl::Hidden);

// A complete guess as to a reasonable cost.
static cl::opt<unsigned>
    BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
                    cl::desc("The cost of a histcnt instruction"));

namespace {
class TailFoldingOption {
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits for those flags we're
  // disabling. The default flag is tracked in the variable NeedsDefault, since
  // at the time of setting the option we may not know what the default value
  // for the CPU is.
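  //
  // For example, "-sve-tail-folding=default+reductions+noreverse" keeps
  // NeedsDefault set so the CPU defaults are used as the starting point, then
  // force-enables tail-folding for reduction loops via EnableBits and
  // force-disables it for reverse-predicated loops via DisableBits.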
  TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
  TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
  TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;

  // This value needs to be initialised to true in case the user does not
  // explicitly set the -sve-tail-folding option.
  bool NeedsDefault = true;

  void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }

  void setNeedsDefault(bool V) { NeedsDefault = V; }

  void setEnableBit(TailFoldingOpts Bit) {
    EnableBits |= Bit;
    DisableBits &= ~Bit;
  }

  void setDisableBit(TailFoldingOpts Bit) {
    EnableBits &= ~Bit;
    DisableBits |= Bit;
  }

  TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
    TailFoldingOpts Bits = TailFoldingOpts::Disabled;

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits |= EnableBits;
    Bits &= ~DisableBits;

    return Bits;
  }

  void reportError(std::string Opt) {
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
    report_fatal_error("Unrecognised tail-folding option");
  }

public:

  void operator=(const std::string &Val) {
    // If the user explicitly sets -sve-tail-folding= then treat as an error.
    if (Val.empty()) {
      reportError("");
      return;
    }

    // Since the user is explicitly setting the option we don't automatically
    // need the default unless they require it.
    setNeedsDefault(false);

    SmallVector<StringRef, 4> TailFoldTypes;
    StringRef(Val).split(TailFoldTypes, '+', -1, false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }

    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
      else
        reportError(Val);
    }
  }

  bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
    return (getBits(DefaultBits) & Required) == Required;
  }
};
} // namespace

TailFoldingOption TailFoldingOptionLoc;

cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
    "sve-tail-folding",
    cl::desc(
        "Control the use of vectorisation using tail-folding for SVE where the"
        " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
        "\ndisabled      (Initial) No loop types will vectorize using "
        "tail-folding"
        "\ndefault       (Initial) Uses the default tail-folding settings for "
        "the target CPU"
        "\nall           (Initial) All legal loop types will vectorize using "
        "tail-folding"
        "\nsimple        (Initial) Use tail-folding for simple loops (not "
        "reductions or recurrences)"
        "\nreductions    Use tail-folding for loops containing reductions"
        "\nnoreductions  Inverse of above"
        "\nrecurrences   Use tail-folding for loops containing fixed order "
        "recurrences"
        "\nnorecurrences Inverse of above"
        "\nreverse       Use tail-folding for loops requiring reversed "
        "predicates"
        "\nnoreverse     Inverse of above"),
    cl::location(TailFoldingOptionLoc));

// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
    "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
    "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);

static bool isSMEABIRoutineCall(const CallInst &CI) {
  const auto *F = CI.getCalledFunction();
  return F && StringSwitch<bool>(F->getName())
                  .Case("__arm_sme_state", true)
                  .Case("__arm_tpidr2_save", true)
                  .Case("__arm_tpidr2_restore", true)
                  .Case("__arm_za_disable", true)
                  .Default(false);
}

/// Returns true if the function has explicit operations that can only be
/// lowered using incompatible instructions for the selected mode. This also
/// returns true if the function F may use or modify ZA state.
static bool hasPossibleIncompatibleOps(const Function *F) {
  for (const BasicBlock &BB : *F) {
    for (const Instruction &I : BB) {
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
      if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
          (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
           isSMEABIRoutineCall(cast<CallInst>(I))))
        return true;
    }
  }
  return false;
}

bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  // When inlining, we should consider the body of the function, not the
  // interface.
  if (CalleeAttrs.hasStreamingBody()) {
    CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
    CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
  }

  if (CalleeAttrs.isNewZA())
    return false;

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs) ||
      CallerAttrs.requiresPreservingZT0(CalleeAttrs)) {
    if (hasPossibleIncompatibleOps(Callee))
      return false;
  }

  const TargetMachine &TM = getTLI()->getTargetMachine();

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Inline a callee only if its target features are a subset of the caller's
  // target features.
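  // For example, a callee built with just +neon can be inlined into a caller
  // built with +neon,+sve, but a callee that requires +sve cannot be inlined
  // into a NEON-only caller.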
  return (CallerBits & CalleeBits) == CalleeBits;
}

bool AArch64TTIImpl::areTypesABICompatible(
    const Function *Caller, const Function *Callee,
    const ArrayRef<Type *> &Types) const {
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // We need to ensure that argument promotion does not attempt to promote
  // pointers to fixed-length vector types larger than 128 bits like
  // <8 x float> (and pointers to aggregate types which have such fixed-length
  // vector type members) into the values of the pointees. Such vector types
  // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
  // backend cannot lower such value arguments. The 128-bit fixed-length SVE
  // types can be safely treated as 128-bit NEON types and they cannot be
  // distinguished in IR.
  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
        auto FVTy = dyn_cast<FixedVectorType>(Ty);
        return FVTy &&
               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
      }))
    return false;

  return true;
}

unsigned
AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
                                     unsigned DefaultCallPenalty) const {
  // This function calculates a penalty for executing Call in F.
  //
  // There are two ways this function can be called:
  // (1) F:
  //       call from F -> G (the call here is Call)
  //
  //     For (1), Call.getCaller() == F, so it will always return a high cost
  //     if a streaming-mode change is required (thus promoting the need to
  //     inline the function)
  //
  // (2) F:
  //       call from F -> G (the call here is not Call)
  //     G:
  //       call from G -> H (the call here is Call)
  //
  //     For (2), if after inlining the body of G into F the call to H requires
  //     a streaming-mode change, and the call to G from F would also require a
  //     streaming-mode change, then there is benefit to do the streaming-mode
  //     change only once and avoid inlining of G into F.
  SMEAttrs FAttrs(*F);
  SMEAttrs CalleeAttrs(Call);
  if (FAttrs.requiresSMChange(CalleeAttrs)) {
    if (F == Call.getCaller()) // (1)
      return CallPenaltyChangeSM * DefaultCallPenalty;
    if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
      return InlineCallPenaltyChangeSM * DefaultCallPenalty;
  }

  return DefaultCallPenalty;
}

bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
    TargetTransformInfo::RegisterKind K) const {
  assert(K != TargetTransformInfo::RGK_Scalar);
  return (K == TargetTransformInfo::RGK_FixedWidthVector &&
          ST->isNeonAvailable());
}

/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
  // Check if the immediate can be encoded within an instruction.
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
    return 0;

  if (Val < 0)
    Val = ~Val;

  // Calculate how many moves we will need to materialize this constant.
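  // For example, a 64-bit immediate with three non-zero 16-bit chunks is
  // typically expanded to a MOVZ plus two MOVKs, giving a cost of 3.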
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Val, 64, Insn);
  return Insn.size();
}

/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  InstructionCost Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max<InstructionCost>(1, Cost);
}

InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                                  const APInt &Imm, Type *Ty,
                                                  TTI::TargetCostKind CostKind,
                                                  Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    int NumConstants = (BitSize + 63) / 64;
    InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // Most (all?) AArch64 intrinsics do not support folding immediates into the
  // selected instruction, so we compute the materialization cost for the
  // immediate directly.
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
    return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if (Idx == 1) {
      int NumConstants = (BitSize + 63) / 64;
      InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
      return (Cost <= NumConstants * TTI::TCC_Basic)
                 ? static_cast<int>(TTI::TCC_Free)
                 : Cost;
    }
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (TyWidth == 32 || TyWidth == 64)
    return TTI::PSK_FastHardware;
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
  return TTI::PSK_Software;
}

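// Returns true for scalable vector types whose known-minimum size is smaller
// than a full 128-bit SVE register, e.g. <vscale x 2 x i32> (64 known-minimum
// bits) is unpacked, whereas <vscale x 4 x i32> is packed.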
static bool isUnpackedVectorVT(EVT VecVT) {
  return VecVT.isScalableVector() &&
         VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
}

static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
  Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
  Type *EltTy = ICA.getArgTypes()[1];        // Type of bucket elements

  // Only allow (32b and 64b) integers or pointers for now...
  if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
      (EltTy->getScalarSizeInBits() != 32 &&
       EltTy->getScalarSizeInBits() != 64))
    return InstructionCost::getInvalid();

  // FIXME: Hacky check for legal vector types. We can promote smaller types
  // but we cannot legalize vectors via splitting for histcnt.
  // FIXME: We should be able to generate histcnt for fixed-length vectors
  // using ptrue with a specific VL.
  if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
    if ((VTy->getElementCount().getKnownMinValue() != 2 &&
         VTy->getElementCount().getKnownMinValue() != 4) ||
        VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
        !VTy->isScalableTy())
      return InstructionCost::getInvalid();

  return InstructionCost(BaseHistCntCost);
}

InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                      TTI::TargetCostKind CostKind) {
  // The code-generator is currently not able to handle scalable vectors
  // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
  // it. This change will be removed when code-generation for these types is
  // sufficiently reliable.
  auto *RetTy = ICA.getReturnType();
  if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
    if (VTy->getElementCount() == ElementCount::getScalable(1))
      return InstructionCost::getInvalid();

  switch (ICA.getID()) {
  case Intrinsic::experimental_vector_histogram_add:
    if (!ST->hasSVE2())
      return InstructionCost::getInvalid();
    return getHistogramCost(ICA);
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
                                        MVT::nxv2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // v2i64 types get converted to cmp+bif, hence the cost of 2.
    if (LT.second == MVT::v2i64)
      return LT.first * 2;
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
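    // For example, a saturating add on a legal type such as v8i8 costs 1,
    // whereas one that must be promoted to a wider element type pays for the
    // two shifts in, the saturating add, and the shift back out (4 in total).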
    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
    break;
  }
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,
                                     MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first;
    break;
  }
  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};
    auto LT = getTypeLegalizationCost(RetTy);
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
      return LT.first;
    break;
  }
  case Intrinsic::experimental_stepvector: {
    InstructionCost Cost = 1; // Cost of the `index' instruction
    auto LT = getTypeLegalizationCost(RetTy);
    // Legalisation of illegal vectors involves an `index' instruction plus
    // (LT.first - 1) vector adds.
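    // For example, an illegal <vscale x 8 x i64> step vector is split into
    // four <vscale x 2 x i64> parts, costing one `index' plus three adds.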
    if (LT.first > 1) {
      Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
      InstructionCost AddCost =
          getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
      Cost += AddCost * (LT.first - 1);
    }
    return Cost;
  }
  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    // If both the vector and subvector types are legal types and the index
    // is 0, then this should be a no-op or simple operation; return a
    // relatively low cost.

    // If arguments aren't actually supplied, then we cannot determine the
    // value of the index. We also want to skip predicate types.
    if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
        ICA.getReturnType()->getScalarType()->isIntegerTy(1))
      break;

    LLVMContext &C = RetTy->getContext();
    EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
    EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
                             : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
    // Skip this if either the vector or subvector types are unpacked
    // SVE types; they may get lowered to stack stores and loads.
    if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
      break;

    TargetLoweringBase::LegalizeKind SubVecLK =
        getTLI()->getTypeConversion(C, SubVecVT);
    TargetLoweringBase::LegalizeKind VecLK =
        getTLI()->getTypeConversion(C, VecVT);
    const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
    const ConstantInt *CIdx = cast<ConstantInt>(Idx);
    if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
        VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
      return TTI::TCC_Free;
    break;
  }
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},
    };
    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    const auto *Entry =
        CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
    if (Entry) {
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to; add +1 so that we match the actual lowering cost.
      if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
          TLI->getValueType(DL, RetTy, true) == MVT::i16)
        return LegalisationCost.first * Entry->Cost + 1;

      return LegalisationCost.first * Entry->Cost;
    }
    break;
  }
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {
      // 32-bit or 64-bit ctpop without NEON is 12 instructions.
      return getTypeLegalizationCost(RetTy).first * 12;
    }
    static const CostTblEntry CtpopCostTbl[] = {
        {ISD::CTPOP, MVT::v2i64, 4},
        {ISD::CTPOP, MVT::v4i32, 3},
        {ISD::CTPOP, MVT::v8i16, 2},
        {ISD::CTPOP, MVT::v16i8, 1},
        {ISD::CTPOP, MVT::i64, 4},
        {ISD::CTPOP, MVT::v2i32, 3},
        {ISD::CTPOP, MVT::v4i16, 2},
        {ISD::CTPOP, MVT::v8i8, 1},
        {ISD::CTPOP, MVT::i32, 5},
    };
    auto LT = getTypeLegalizationCost(RetTy);
    MVT MTy = LT.second;
    if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
      // Extra cost of +1 when illegal vector types are legalized by promoting
      // the integer type.
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
      return LT.first * Entry->Cost + ExtraCost;
    }
    break;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
        {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
        {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
        {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
    };
    EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
    if (MTy.isSimple())
      if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
                                              MTy.getSimpleVT()))
        return Entry->Cost;
    break;
  }
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {
    if (ICA.getArgTypes().empty())
      break;
    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
    auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
    EVT MTy = TLI->getValueType(DL, RetTy);
    // Check for the legal types, which are where the size of the input and the
    // output are the same, or we are using cvt f64->i32 or f32->i64.
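    // For example, llvm.fptosi.sat.i32.f64 maps onto a single fcvtzs from a
    // double to a 32-bit register, which already saturates, so it only costs
    // the legalization factor.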
    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&
        (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))
      return LT.first;
    // Similarly for fp16 sizes
    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
          (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
      return LT.first;

    // Otherwise we use a legal convert followed by a min+max
    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
        LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
      Type *LegalTy =
          Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
      if (LT.second.isVector())
        LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
      InstructionCost Cost = 1;
      IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs1, CostKind);
      IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
                                     LegalTy, {LegalTy, LegalTy});
      Cost += getIntrinsicInstrCost(Attrs2, CostKind);
      return LT.first * Cost;
    }
    break;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    if (ICA.getArgs().empty())
      break;

    // TODO: Add handling for fshl where third argument is not a constant.
    const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
    if (!OpInfoZ.isConstant())
      break;

    const auto LegalisationCost = getTypeLegalizationCost(RetTy);
    if (OpInfoZ.isUniform()) {
      // FIXME: The costs could be lower if the codegen is better.
      static const CostTblEntry FshlTbl[] = {
          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
      // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
      // to avoid having to duplicate the costs.
      const auto *Entry =
          CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
      if (Entry)
        return LegalisationCost.first * Entry->Cost;
    }

    auto TyL = getTypeLegalizationCost(RetTy);
    if (!RetTy->isIntegerTy())
      break;

    // Estimate cost manually, as types like i8 and i16 will get promoted to
    // i32 and CostTableLookup will ignore the extra conversion cost.
    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
    else if (HigherCost)
      ExtraCost = 1;
    else
      break;
    return TyL.first + ExtraCost;
  }
  case Intrinsic::get_active_lane_mask: {
    auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
    if (RetTy) {
      EVT RetVT = getTLI()->getValueType(DL, RetTy);
      EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
      if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
          !getTLI()->isTypeLegal(RetVT)) {
        // We don't have enough context at this point to determine if the mask
        // is going to be kept live after the block, which will force the vXi1
        // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
        // For now, we just assume the vectorizer created this intrinsic and
        // the result will be the input for a PHI. In this case the cost will
        // be extremely high for fixed-width vectors.
        // NOTE: getScalarizationOverhead returns a cost that's far too
        // pessimistic for the actual generated codegen. In reality there are
        // two instructions generated per lane.
        return RetTy->getNumElements() * 2;
      }
    }
    break;
  }
  default:
    break;
  }
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

/// Remove redundant reinterpret casts in the presence of control flow.
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
                                                   IntrinsicInst &II) {
  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = II.getType();

  auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return std::nullopt;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return std::nullopt;
  }

  // Create the new Phi
  IC.Builder.SetInsertPoint(PN);
  PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Cleanup Phi Node and reinterprets
  return IC.replaceInstUsesWith(II, NPN);
}

// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
  if (!BinOp)
    return std::nullopt;

  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:
    break;
  default:
    return std::nullopt;
  }

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return std::nullopt;

  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());
  if (PredOpTy != II.getType())
    return std::nullopt;

  SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
  auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  else
    NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
        Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));

  auto NarrowedBinOp =
      IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
  return IC.replaceInstUsesWith(II, NarrowedBinOp);
}

static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(II.getArgOperand(0)))
    return processPhiNode(IC, II);

  if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
    return BinOpCombine;

  // Ignore converts to/from svcount_t.
  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))
    return std::nullopt;

  SmallVector<Instruction *, 32> CandidatesForRemoval;
  Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;

  const auto *IVTy = cast<VectorType>(II.getType());

  // Walk the chain of conversions.
  while (Cursor) {
    // If the type of the cursor has fewer lanes than the final result, zeroing
    // must take place, which breaks the equivalence chain.
    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())
      break;

    // If the cursor has the same type as I, it is a viable replacement.
    if (Cursor->getType() == IVTy)
      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    // If this is not an SVE conversion intrinsic, this is the end of the chain.
    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))
      break;

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);
  }

  // If no viable replacement in the conversion chain was found, there is
  // nothing to do.
  if (!EarliestReplacement)
    return std::nullopt;

  return IC.replaceInstUsesWith(II, EarliestReplacement);
}

static bool isAllActivePredicate(Value *Pred) {
  // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
  Value *UncastedPred;
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
                          m_Value(UncastedPred)))))
    // If the predicate has the same or fewer lanes than the uncasted
    // predicate then we know the casting has no effect.
    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
      Pred = UncastedPred;

  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
}

// Erase a unary operation whose predicate has all lanes inactive.
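// (This is intended for intrinsics that have no effect when every lane is
// inactive, e.g. predicated stores or prefetches, so the whole call can
// simply be erased.)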
static std::optional<Instruction *>
instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
                                 int PredPos) {
  if (match(II.getOperand(PredPos), m_ZeroInt())) {
    return IC.eraseInstFromFunction(II);
  }
  return std::nullopt;
}

// Simplify a unary operation whose predicate has all lanes inactive by
// replacing the instruction with a zeroed value.
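// For example, a masked SVE load whose governing predicate is constant false
// cannot read any lanes, so its result is just a zero vector (or a struct of
// zero vectors for multi-vector returns).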
static std::optional<Instruction *>
instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
  if (match(II.getOperand(0), m_ZeroInt())) {
    Constant *Node;
    Type *RetTy = II.getType();
    if (RetTy->isStructTy()) {
      auto StructT = cast<StructType>(RetTy);
      auto VecT = StructT->getElementType(0);
      SmallVector<llvm::Constant *, 4> ZerVec;
      for (unsigned i = 0; i < StructT->getNumElements(); i++) {
        ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
                                                  : ConstantInt::get(VecT, 0));
      }
      Node = ConstantStruct::get(StructT, ZerVec);
    } else if (RetTy->isFPOrFPVectorTy())
      Node = ConstantFP::get(RetTy, 0.0);
    else
      Node = ConstantInt::get(II.getType(), 0);

    IC.replaceInstUsesWith(II, Node);
    return IC.eraseInstFromFunction(II);
  }
  return std::nullopt;
}

static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  // svsel(ptrue, x, y) => x
  auto *OpPredicate = II.getOperand(0);
  if (isAllActivePredicate(OpPredicate))
    return IC.replaceInstUsesWith(II, II.getOperand(1));

  auto Select =
      IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
  return IC.replaceInstUsesWith(II, Select);
}

static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                      IntrinsicInst &II) {
  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!Pg)
    return std::nullopt;

  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return std::nullopt;

  // The intrinsic is inserting into lane zero so use an insert instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Insert = InsertElementInst::Create(
      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
  Insert->insertBefore(&II);
  Insert->takeName(&II);

  return IC.replaceInstUsesWith(II, Insert);
}

static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  // Replace DupX with a regular IR splat.
  auto *RetTy = cast<ScalableVectorType>(II.getType());
  Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
                                              II.getArgOperand(0));
  Splat->takeName(&II);
  return IC.replaceInstUsesWith(II, Splat);
}

static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
                                                        IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();

  // Check that the predicate is all active
  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;

  // Check that we have a compare of zero..
  auto *SplatValue =
      dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  // ..against a dupq
  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  // Where the dupq is a lane 0 replicate of a vector insert
  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  // Where the vector insert is a fixed constant vector insert into undef at
  // index zero
  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;

  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  // Expand intrinsic operands to a 16-bit byte level predicate
  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;
    if (!Arg->isZero())
      PredicateBits |= 1 << (I * (16 / NumElts));
  }

  // If all bits are zero bail early with an empty predicate
  if (PredicateBits == 0) {
    auto *PFalse = Constant::getNullValue(II.getType());
    PFalse->takeName(&II);
    return IC.replaceInstUsesWith(II, PFalse);
  }

  // Calculate largest predicate type used (where byte predicate is largest)
  unsigned Mask = 8;
  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)
      Mask |= (I % 8);

  unsigned PredSize = Mask & -Mask;
  auto *PredType = ScalableVectorType::get(
      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));

  // Ensure all relevant bits are set
  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

  auto *PTruePat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {PredType}, {PTruePat});
  auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
                                 {II.getType()}, {ConvertToSVBool});

  ConvertFromSVBool->takeName(&II);
  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}

static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                       IntrinsicInst &II) {
  Value *Pg = II.getArgOperand(0);
  Value *Vec = II.getArgOperand(1);
  auto IntrinsicID = II.getIntrinsicID();
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

  // lastX(splat(X)) --> X
  if (auto *SplatVal = getSplatValue(Vec))
    return IC.replaceInstUsesWith(II, SplatVal);

  // If x and/or y is a splat value then:
  // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
  Value *LHS, *RHS;
  if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
    if (isSplatValue(LHS) || isSplatValue(RHS)) {
      auto *OldBinOp = cast<BinaryOperator>(Vec);
      auto OpC = OldBinOp->getOpcode();
      auto *NewLHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
      auto *NewRHS =
          IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
      auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
          OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
      return IC.replaceInstUsesWith(II, NewBinOp);
    }
  }

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {
    // The intrinsic is extracting lane 0 so use an extract instead.
    auto *IdxTy = Type::getInt64Ty(II.getContext());
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
    Extract->insertBefore(&II);
    Extract->takeName(&II);
    return IC.replaceInstUsesWith(II, Extract);
  }

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

  // Can the intrinsic's predicate be converted to a known constant index?
  unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
  if (!MinNumElts)
    return std::nullopt;

  unsigned Idx = MinNumElts - 1;
  // Increment the index if extracting the element after the last active
  // predicate element.
  if (IsAfter)
    ++Idx;

  // Ignore extracts whose index is larger than the known minimum vector
  // length. NOTE: This is an artificial constraint where we prefer to
  // maintain what the user asked for until an alternative is proven faster.
  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  // The intrinsic is extracting a fixed lane so use an extract instead.
  auto *IdxTy = Type::getInt64Ty(II.getContext());
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
  Extract->insertBefore(&II);
  Extract->takeName(&II);
  return IC.replaceInstUsesWith(II, Extract);
}

static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
                                                           IntrinsicInst &II) {
  // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
  // integer variant across a variety of micro-architectures. Replace scalar
  // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
  // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
  // depending on the micro-architecture, but has been observed as generally
  // being faster, particularly when the CLAST[AB] op is a loop-carried
  // dependency.
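  //
  // For example, a scalar i32 clastb(pg, fallback, vec) is rewritten as:
  // bitcast the fallback to float, bitcast vec to <vscale x 4 x float>, run
  // the float clastb, then bitcast the result back to i32.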
  Value *Pg = II.getArgOperand(0);
  Value *Fallback = II.getArgOperand(1);
  Value *Vec = II.getArgOperand(2);
  Type *Ty = II.getType();

  if (!Ty->isIntegerTy())
    return std::nullopt;

  Type *FPTy;
  switch (cast<IntegerType>(Ty)->getBitWidth()) {
  default:
    return std::nullopt;
  case 16:
    FPTy = IC.Builder.getHalfTy();
    break;
  case 32:
    FPTy = IC.Builder.getFloatTy();
    break;
  case 64:
    FPTy = IC.Builder.getDoubleTy();
    break;
  }

  Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
  auto *FPVTy = VectorType::get(
      FPTy, cast<VectorType>(Vec->getType())->getElementCount());
  Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
  auto *FPII = IC.Builder.CreateIntrinsic(
      II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
  Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
  return IC.replaceInstUsesWith(II, FPIItoInt);
}

static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
                                                     IntrinsicInst &II) {
  LLVMContext &Ctx = II.getContext();
  // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
  // can work with RDFFR_PP for ptest elimination.
  auto *AllPat =
      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
  auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
                                           {II.getType()}, {AllPat});
  auto *RDFFR =
      IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
  RDFFR->takeName(&II);
  return IC.replaceInstUsesWith(II, RDFFR);
}

static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
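  // For the 'all' pattern this folds to a multiple of vscale, e.g. cntd(all)
  // becomes vscale * 2. Fixed patterns fold to a constant when they cannot
  // exceed the minimum vector length, e.g. cntd(vl2) becomes 2.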
1345 const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue();
1346
1347 if (Pattern == AArch64SVEPredPattern::all) {
1348 Constant *StepVal = ConstantInt::get(Ty: II.getType(), V: NumElts);
1349 auto *VScale = IC.Builder.CreateVScale(Scaling: StepVal);
1350 VScale->takeName(V: &II);
1351 return IC.replaceInstUsesWith(I&: II, V: VScale);
1352 }
1353
1354 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1355
1356 return MinNumElts && NumElts >= MinNumElts
1357 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1358 I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts)))
1359 : std::nullopt;
1360}
1361
1362static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1363 IntrinsicInst &II) {
1364 Value *PgVal = II.getArgOperand(i: 0);
1365 Value *OpVal = II.getArgOperand(i: 1);
1366
1367 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1368 // Later optimizations prefer this form.
1369 if (PgVal == OpVal &&
1370 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1371 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1372 Value *Ops[] = {PgVal, OpVal};
1373 Type *Tys[] = {PgVal->getType()};
1374
1375 auto *PTest =
1376 IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptest_any, Types: Tys, Args: Ops);
1377 PTest->takeName(V: &II);
1378
1379 return IC.replaceInstUsesWith(I&: II, V: PTest);
1380 }
1381
1382 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal);
1383 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal);
1384
1385 if (!Pg || !Op)
1386 return std::nullopt;
1387
1388 Intrinsic::ID OpIID = Op->getIntrinsicID();
1389
1390 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1391 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1392 Pg->getArgOperand(i: 0)->getType() == Op->getArgOperand(i: 0)->getType()) {
1393 Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)};
1394 Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()};
1395
1396 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
1397
1398 PTest->takeName(V: &II);
1399 return IC.replaceInstUsesWith(I&: II, V: PTest);
1400 }
1401
1402 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
1403 // Later optimizations may rewrite the sequence to use the flag-setting variant
1404 // of instruction X to remove the PTEST.
1405 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1406 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1407 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1408 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1409 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1410 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1411 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1412 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1413 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1414 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1415 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1416 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1417 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1418 Value *Ops[] = {Pg->getArgOperand(i: 0), Pg};
1419 Type *Tys[] = {Pg->getType()};
1420
1421 auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops);
1422 PTest->takeName(V: &II);
1423
1424 return IC.replaceInstUsesWith(I&: II, V: PTest);
1425 }
1426
1427 return std::nullopt;
1428}
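// Illustrative: ptest_any(to_svbool(p), to_svbool(x)), where p and x share the
// same unpacked predicate type, is rewritten above as ptest_any(p, x) on that
// narrower type, dropping the round-trip through svbool.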
1429
1430template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1431static std::optional<Instruction *>
1432instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1433 bool MergeIntoAddendOp) {
1434 Value *P = II.getOperand(i_nocapture: 0);
1435 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1436 if (MergeIntoAddendOp) {
1437 AddendOp = II.getOperand(i_nocapture: 1);
1438 Mul = II.getOperand(i_nocapture: 2);
1439 } else {
1440 AddendOp = II.getOperand(i_nocapture: 2);
1441 Mul = II.getOperand(i_nocapture: 1);
1442 }
1443
1444 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0),
1445 m_Value(V&: MulOp1))))
1446 return std::nullopt;
1447
1448 if (!Mul->hasOneUse())
1449 return std::nullopt;
1450
1451 Instruction *FMFSource = nullptr;
1452 if (II.getType()->isFPOrFPVectorTy()) {
1453 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1454 // Stop the combine when the flags on the inputs differ, in case dropping
1455 // flags would cause us to miss more beneficial optimizations.
1456 if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags())
1457 return std::nullopt;
1458 if (!FAddFlags.allowContract())
1459 return std::nullopt;
1460 FMFSource = &II;
1461 }
1462
1463 CallInst *Res;
1464 if (MergeIntoAddendOp)
1465 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
1466 Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1467 else
1468 Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()},
1469 Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1470
1471 return IC.replaceInstUsesWith(I&: II, V: Res);
1472}
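// Illustrative use of the template above: a predicated fadd whose addend feeds
// from a single-use fmul under the same predicate, e.g.
//   fadd(p, a, fmul(p, b, c))  ->  fmla(p, a, b, c)
// provided the fast-math flags of both calls match and allow contraction.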
1473
1474static std::optional<Instruction *>
1475instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1476 Value *Pred = II.getOperand(i_nocapture: 0);
1477 Value *PtrOp = II.getOperand(i_nocapture: 1);
1478 Type *VecTy = II.getType();
1479
1480 // Replace with a zero constant when all lanes are inactive.
1481 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1482 return II_NA;
1483
1484 if (isAllActivePredicate(Pred)) {
1485 LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp);
1486 Load->copyMetadata(SrcInst: II);
1487 return IC.replaceInstUsesWith(I&: II, V: Load);
1488 }
1489
1490 CallInst *MaskedLoad =
1491 IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL),
1492 Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy));
1493 MaskedLoad->copyMetadata(SrcInst: II);
1494 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
1495}
1496
1497static std::optional<Instruction *>
1498instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1499 Value *VecOp = II.getOperand(i_nocapture: 0);
1500 Value *Pred = II.getOperand(i_nocapture: 1);
1501 Value *PtrOp = II.getOperand(i_nocapture: 2);
1502
1503 if (isAllActivePredicate(Pred)) {
1504 StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp);
1505 Store->copyMetadata(SrcInst: II);
1506 return IC.eraseInstFromFunction(I&: II);
1507 }
1508
1509 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1510 Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred);
1511 MaskedStore->copyMetadata(SrcInst: II);
1512 return IC.eraseInstFromFunction(I&: II);
1513}
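// Illustrative net effect of the two combines above: sve.ld1/sve.st1 with a
// ptrue-all predicate become plain vector load/store instructions; any other
// predicate is lowered to llvm.masked.load/llvm.masked.store, with a zero
// pass-through for loads.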
1514
1515static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1516 switch (Intrinsic) {
1517 case Intrinsic::aarch64_sve_fmul_u:
1518 return Instruction::BinaryOps::FMul;
1519 case Intrinsic::aarch64_sve_fadd_u:
1520 return Instruction::BinaryOps::FAdd;
1521 case Intrinsic::aarch64_sve_fsub_u:
1522 return Instruction::BinaryOps::FSub;
1523 default:
1524 return Instruction::BinaryOpsEnd;
1525 }
1526}
1527
1528static std::optional<Instruction *>
1529instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1530 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1531 if (II.isStrictFP())
1532 return std::nullopt;
1533
1534 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1535 auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID());
1536 if (BinOpCode == Instruction::BinaryOpsEnd ||
1537 !match(V: OpPredicate, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1538 Op0: m_ConstantInt<AArch64SVEPredPattern::all>())))
1539 return std::nullopt;
1540 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1541 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1542 auto BinOp =
1543 IC.Builder.CreateBinOp(Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2));
1544 return IC.replaceInstUsesWith(I&: II, V: BinOp);
1545}
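// Illustrative: sve.fadd_u(ptrue(all), a, b) is rewritten above to the plain IR
// instruction "fadd a, b", carrying over the intrinsic's fast-math flags.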
1546
1547 // Canonicalise operations that take an all-active predicate (e.g. sve.add ->
1548 // sve.add_u).
1549static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1550 Intrinsic::ID IID) {
1551 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1552 if (!match(V: OpPredicate, P: m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1553 Op0: m_ConstantInt<AArch64SVEPredPattern::all>())))
1554 return std::nullopt;
1555
1556 auto *Mod = II.getModule();
1557 auto *NewDecl = Intrinsic::getDeclaration(M: Mod, id: IID, Tys: {II.getType()});
1558 II.setCalledFunction(NewDecl);
1559
1560 return &II;
1561}
1562
1563 // Simplify operations where the predicate has all-inactive lanes, or try to
1564 // replace them with the _u form when all lanes are active.
1565static std::optional<Instruction *>
1566instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1567 Intrinsic::ID IID) {
1568 if (match(V: II.getOperand(i_nocapture: 0), P: m_ZeroInt())) {
1569 // llvm_ir, pred(0), op1, op2 - the spec says to return op1 when all lanes
1570 // are inactive for the merging forms (sv[func]_m).
1571 return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1));
1572 }
1573 return instCombineSVEAllActive(II, IID);
1574}
1575
1576static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1577 IntrinsicInst &II) {
1578 if (auto II_U =
1579 instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_add_u))
1580 return II_U;
1581 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1582 Intrinsic::aarch64_sve_mla>(
1583 IC, II, MergeIntoAddendOp: true))
1584 return MLA;
1585 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1586 Intrinsic::aarch64_sve_mad>(
1587 IC, II, MergeIntoAddendOp: false))
1588 return MAD;
1589 return std::nullopt;
1590}
1591
1592static std::optional<Instruction *>
1593instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1594 if (auto II_U =
1595 instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fadd_u))
1596 return II_U;
1597 if (auto FMLA =
1598 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1599 Intrinsic::aarch64_sve_fmla>(IC, II,
1600 MergeIntoAddendOp: true))
1601 return FMLA;
1602 if (auto FMAD =
1603 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1604 Intrinsic::aarch64_sve_fmad>(IC, II,
1605 MergeIntoAddendOp: false))
1606 return FMAD;
1607 if (auto FMLA =
1608 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1609 Intrinsic::aarch64_sve_fmla>(IC, II,
1610 MergeIntoAddendOp: true))
1611 return FMLA;
1612 return std::nullopt;
1613}
1614
1615static std::optional<Instruction *>
1616instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1617 if (auto FMLA =
1618 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1619 Intrinsic::aarch64_sve_fmla>(IC, II,
1620 MergeIntoAddendOp: true))
1621 return FMLA;
1622 if (auto FMAD =
1623 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1624 Intrinsic::aarch64_sve_fmad>(IC, II,
1625 MergeIntoAddendOp: false))
1626 return FMAD;
1627 if (auto FMLA_U =
1628 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1629 Intrinsic::aarch64_sve_fmla_u>(
1630 IC, II, MergeIntoAddendOp: true))
1631 return FMLA_U;
1632 return instCombineSVEVectorBinOp(IC, II);
1633}
1634
1635static std::optional<Instruction *>
1636instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1637 if (auto II_U =
1638 instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fsub_u))
1639 return II_U;
1640 if (auto FMLS =
1641 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1642 Intrinsic::aarch64_sve_fmls>(IC, II,
1643 MergeIntoAddendOp: true))
1644 return FMLS;
1645 if (auto FMSB =
1646 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1647 Intrinsic::aarch64_sve_fnmsb>(
1648 IC, II, MergeIntoAddendOp: false))
1649 return FMSB;
1650 if (auto FMLS =
1651 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1652 Intrinsic::aarch64_sve_fmls>(IC, II,
1653 MergeIntoAddendOp: true))
1654 return FMLS;
1655 return std::nullopt;
1656}
1657
1658static std::optional<Instruction *>
1659instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1660 if (auto FMLS =
1661 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1662 Intrinsic::aarch64_sve_fmls>(IC, II,
1663 MergeIntoAddendOp: true))
1664 return FMLS;
1665 if (auto FMSB =
1666 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1667 Intrinsic::aarch64_sve_fnmsb>(
1668 IC, II, MergeIntoAddendOp: false))
1669 return FMSB;
1670 if (auto FMLS_U =
1671 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1672 Intrinsic::aarch64_sve_fmls_u>(
1673 IC, II, MergeIntoAddendOp: true))
1674 return FMLS_U;
1675 return instCombineSVEVectorBinOp(IC, II);
1676}
1677
1678static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1679 IntrinsicInst &II) {
1680 if (auto II_U =
1681 instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_sub_u))
1682 return II_U;
1683 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1684 Intrinsic::aarch64_sve_mls>(
1685 IC, II, MergeIntoAddendOp: true))
1686 return MLS;
1687 return std::nullopt;
1688}
1689
1690static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1691 IntrinsicInst &II,
1692 Intrinsic::ID IID) {
1693 auto *OpPredicate = II.getOperand(i_nocapture: 0);
1694 auto *OpMultiplicand = II.getOperand(i_nocapture: 1);
1695 auto *OpMultiplier = II.getOperand(i_nocapture: 2);
1696
1697 // Return true if a given instruction is a unit splat value, false otherwise.
1698 auto IsUnitSplat = [](auto *I) {
1699 auto *SplatValue = getSplatValue(I);
1700 if (!SplatValue)
1701 return false;
1702 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1703 };
1704
1705 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1706 // with a unit splat value, false otherwise.
1707 auto IsUnitDup = [](auto *I) {
1708 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1709 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1710 return false;
1711
1712 auto *SplatValue = IntrI->getOperand(2);
1713 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1714 };
1715
1716 if (IsUnitSplat(OpMultiplier)) {
1717 // [f]mul pg %n, (dupx 1) => %n
1718 OpMultiplicand->takeName(V: &II);
1719 return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand);
1720 } else if (IsUnitDup(OpMultiplier)) {
1721 // [f]mul pg %n, (dup pg 1) => %n
1722 auto *DupInst = cast<IntrinsicInst>(Val: OpMultiplier);
1723 auto *DupPg = DupInst->getOperand(i_nocapture: 1);
1724 // TODO: this is naive. The optimization is still valid if DupPg
1725 // 'encompasses' OpPredicate, not only if they're the same predicate.
1726 if (OpPredicate == DupPg) {
1727 OpMultiplicand->takeName(V: &II);
1728 return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand);
1729 }
1730 }
1731
1732 return instCombineSVEVectorBinOp(IC, II);
1733}
1734
1735static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1736 IntrinsicInst &II) {
1737 Value *UnpackArg = II.getArgOperand(i: 0);
1738 auto *RetTy = cast<ScalableVectorType>(Val: II.getType());
1739 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1740 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1741
1742 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1743 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1744 if (auto *ScalarArg = getSplatValue(V: UnpackArg)) {
1745 ScalarArg =
1746 IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned);
1747 Value *NewVal =
1748 IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg);
1749 NewVal->takeName(V: &II);
1750 return IC.replaceInstUsesWith(I&: II, V: NewVal);
1751 }
1752
1753 return std::nullopt;
1754}
1755static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1756 IntrinsicInst &II) {
1757 auto *OpVal = II.getOperand(i_nocapture: 0);
1758 auto *OpIndices = II.getOperand(i_nocapture: 1);
1759 VectorType *VTy = cast<VectorType>(Val: II.getType());
1760
1761 // Check whether OpIndices is a constant splat value less than the minimum
1762 // element count of the result.
1763 auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices));
1764 if (!SplatValue ||
1765 SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue()))
1766 return std::nullopt;
1767
1768 // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
1769 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1770 auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue);
1771 auto *VectorSplat =
1772 IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract);
1773
1774 VectorSplat->takeName(V: &II);
1775 return IC.replaceInstUsesWith(I&: II, V: VectorSplat);
1776}
1777
1778static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1779 IntrinsicInst &II) {
1780 Value *A, *B;
1781 Type *RetTy = II.getType();
1782 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1783 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1784
1785 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1786 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1787 if ((match(V: II.getArgOperand(i: 0),
1788 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A)))) &&
1789 match(V: II.getArgOperand(i: 1),
1790 P: m_Intrinsic<FromSVB>(Op0: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) ||
1791 (match(V: II.getArgOperand(i: 0), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: A))) &&
1792 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<ToSVB>(Op0: m_Value(V&: B))))) {
1793 auto *TyA = cast<ScalableVectorType>(Val: A->getType());
1794 if (TyA == B->getType() &&
1795 RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) {
1796 auto *SubVec = IC.Builder.CreateInsertVector(
1797 DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: IC.Builder.getInt64(C: 0));
1798 auto *ConcatVec = IC.Builder.CreateInsertVector(
1799 DstType: RetTy, SrcVec: SubVec, SubVec: B, Idx: IC.Builder.getInt64(C: TyA->getMinNumElements()));
1800 ConcatVec->takeName(V: &II);
1801 return IC.replaceInstUsesWith(I&: II, V: ConcatVec);
1802 }
1803 }
1804
1805 return std::nullopt;
1806}
1807
1808static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1809 IntrinsicInst &II) {
1810 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1811 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1812 Value *A, *B;
1813 if (match(V: II.getArgOperand(i: 0),
1814 P: m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(Op0: m_Value(V&: A), Op1: m_Value(V&: B))) &&
1815 match(V: II.getArgOperand(i: 1), P: m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1816 Op0: m_Specific(V: A), Op1: m_Specific(V: B))))
1817 return IC.replaceInstUsesWith(
1818 I&: II, V: (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1819
1820 return std::nullopt;
1821}
1822
1823static std::optional<Instruction *>
1824instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1825 Value *Mask = II.getOperand(i_nocapture: 0);
1826 Value *BasePtr = II.getOperand(i_nocapture: 1);
1827 Value *Index = II.getOperand(i_nocapture: 2);
1828 Type *Ty = II.getType();
1829 Value *PassThru = ConstantAggregateZero::get(Ty);
1830
1831 // Replace with a zero constant when all lanes are inactive.
1832 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1833 return II_NA;
1834
1835 // Contiguous gather => masked load.
1836 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1837 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1838 Value *IndexBase;
1839 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
1840 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
1841 Align Alignment =
1842 BasePtr->getPointerAlignment(DL: II.getDataLayout());
1843
1844 Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty);
1845 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
1846 Ptr: BasePtr, IdxList: IndexBase);
1847 Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy);
1848 CallInst *MaskedLoad =
1849 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1850 MaskedLoad->takeName(V: &II);
1851 return IC.replaceInstUsesWith(I&: II, V: MaskedLoad);
1852 }
1853
1854 return std::nullopt;
1855}
1856
1857static std::optional<Instruction *>
1858instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1859 Value *Val = II.getOperand(i_nocapture: 0);
1860 Value *Mask = II.getOperand(i_nocapture: 1);
1861 Value *BasePtr = II.getOperand(i_nocapture: 2);
1862 Value *Index = II.getOperand(i_nocapture: 3);
1863 Type *Ty = Val->getType();
1864
1865 // Contiguous scatter => masked store.
1866 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1867 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1868 Value *IndexBase;
1869 if (match(V: Index, P: m_Intrinsic<Intrinsic::aarch64_sve_index>(
1870 Op0: m_Value(V&: IndexBase), Op1: m_SpecificInt(V: 1)))) {
1871 Align Alignment =
1872 BasePtr->getPointerAlignment(DL: II.getDataLayout());
1873
1874 Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(),
1875 Ptr: BasePtr, IdxList: IndexBase);
1876 Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty);
1877 Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy);
1878
1879 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1880
1881 return IC.eraseInstFromFunction(I&: II);
1882 }
1883
1884 return std::nullopt;
1885}
1886
1887static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1888 IntrinsicInst &II) {
1889 Type *Int32Ty = IC.Builder.getInt32Ty();
1890 Value *Pred = II.getOperand(i_nocapture: 0);
1891 Value *Vec = II.getOperand(i_nocapture: 1);
1892 Value *DivVec = II.getOperand(i_nocapture: 2);
1893
1894 Value *SplatValue = getSplatValue(V: DivVec);
1895 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue);
1896 if (!SplatConstantInt)
1897 return std::nullopt;
1898 APInt Divisor = SplatConstantInt->getValue();
1899
1900 if (Divisor.isPowerOf2()) {
1901 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
1902 auto ASRD = IC.Builder.CreateIntrinsic(
1903 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
1904 return IC.replaceInstUsesWith(I&: II, V: ASRD);
1905 }
1906 if (Divisor.isNegatedPowerOf2()) {
1907 Divisor.negate();
1908 Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2());
1909 auto ASRD = IC.Builder.CreateIntrinsic(
1910 ID: Intrinsic::aarch64_sve_asrd, Types: {II.getType()}, Args: {Pred, Vec, DivisorLog2});
1911 auto NEG = IC.Builder.CreateIntrinsic(
1912 ID: Intrinsic::aarch64_sve_neg, Types: {ASRD->getType()}, Args: {ASRD, Pred, ASRD});
1913 return IC.replaceInstUsesWith(I&: II, V: NEG);
1914 }
1915
1916 return std::nullopt;
1917}
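// Illustrative folds for a splat divisor:
//   sdiv(p, x, splat(8))  ->  asrd(p, x, 3)
//   sdiv(p, x, splat(-8)) ->  neg(asrd(p, x, 3), p, asrd(p, x, 3))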
1918
1919bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1920 size_t VecSize = Vec.size();
1921 if (VecSize == 1)
1922 return true;
1923 if (!isPowerOf2_64(Value: VecSize))
1924 return false;
1925 size_t HalfVecSize = VecSize / 2;
1926
1927 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1928 RHS != Vec.end(); LHS++, RHS++) {
1929 if (*LHS != nullptr && *RHS != nullptr) {
1930 if (*LHS == *RHS)
1931 continue;
1932 else
1933 return false;
1934 }
1935 if (!AllowPoison)
1936 return false;
1937 if (*LHS == nullptr && *RHS != nullptr)
1938 *LHS = *RHS;
1939 }
1940
1941 Vec.resize(N: HalfVecSize);
1942 SimplifyValuePattern(Vec, AllowPoison);
1943 return true;
1944}
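// Illustrative: {a, b, a, b} simplifies to {a, b}; with AllowPoison,
// {a, nullptr, a, b} also simplifies to {a, b} because the missing (poison)
// lane is filled from its counterpart in the other half.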
1945
1946 // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1947 // to dupqlane(f64(C)), where C is A concatenated with B.
1948static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1949 IntrinsicInst &II) {
1950 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1951 if (!match(V: II.getOperand(i_nocapture: 0),
1952 P: m_Intrinsic<Intrinsic::vector_insert>(
1953 Op0: m_Value(V&: Default), Op1: m_Value(V&: CurrentInsertElt), Op2: m_Value())) ||
1954 !isa<FixedVectorType>(Val: CurrentInsertElt->getType()))
1955 return std::nullopt;
1956 auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType());
1957
1958 // Insert the scalars into a container ordered by InsertElement index
1959 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1960 while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) {
1961 auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2));
1962 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1);
1963 CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0);
1964 }
1965
1966 bool AllowPoison =
1967 isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default);
1968 if (!SimplifyValuePattern(Vec&: Elts, AllowPoison))
1969 return std::nullopt;
1970
1971 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1972 Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType());
1973 for (size_t I = 0; I < Elts.size(); I++) {
1974 if (Elts[I] == nullptr)
1975 continue;
1976 InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I],
1977 Idx: IC.Builder.getInt64(C: I));
1978 }
1979 if (InsertEltChain == nullptr)
1980 return std::nullopt;
1981
1982 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1983 // value or (f16 a, f16 b) as one i32 value. This requires the inserted
1984 // subvector to be bitcast to a type wide enough to fit the sequence, to be
1985 // splatted, and then to be narrowed back to the original type.
1986 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1987 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1988 IIScalableTy->getMinNumElements() /
1989 PatternWidth;
1990
1991 IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth);
1992 auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount);
1993 auto *WideShuffleMaskTy =
1994 ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount);
1995
1996 auto ZeroIdx = ConstantInt::get(Ty: IC.Builder.getInt64Ty(), V: APInt(64, 0));
1997 auto InsertSubvector = IC.Builder.CreateInsertVector(
1998 DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain, Idx: ZeroIdx);
1999 auto WideBitcast =
2000 IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy);
2001 auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy);
2002 auto WideShuffle = IC.Builder.CreateShuffleVector(
2003 V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask);
2004 auto NarrowBitcast =
2005 IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType());
2006
2007 return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast);
2008}
2009
2010static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2011 IntrinsicInst &II) {
2012 Value *A = II.getArgOperand(i: 0);
2013 Value *B = II.getArgOperand(i: 1);
2014 if (A == B)
2015 return IC.replaceInstUsesWith(I&: II, V: A);
2016
2017 return std::nullopt;
2018}
2019
2020static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2021 IntrinsicInst &II) {
2022 Value *Pred = II.getOperand(i_nocapture: 0);
2023 Value *Vec = II.getOperand(i_nocapture: 1);
2024 Value *Shift = II.getOperand(i_nocapture: 2);
2025
2026 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2027 Value *AbsPred, *MergedValue;
2028 if (!match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2029 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())) &&
2030 !match(V: Vec, P: m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2031 Op0: m_Value(V&: MergedValue), Op1: m_Value(V&: AbsPred), Op2: m_Value())))
2033 return std::nullopt;
2034
2035 // Transform is valid if any of the following are true:
2036 // * The ABS merge value is undef or non-negative
2037 // * The ABS predicate is all active
2038 // * The ABS predicate and the SRSHL predicates are the same
2039 if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) &&
2040 AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred))
2041 return std::nullopt;
2042
2043 // Only valid when the shift amount is non-negative, otherwise the rounding
2044 // behaviour of SRSHL cannot be ignored.
2045 if (!match(V: Shift, P: m_NonNegative()))
2046 return std::nullopt;
2047
2048 auto LSL = IC.Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_lsl,
2049 Types: {II.getType()}, Args: {Pred, Vec, Shift});
2050
2051 return IC.replaceInstUsesWith(I&: II, V: LSL);
2052}
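// Illustrative: srshl(p, abs(undef, p, x), splat(2)) is rewritten above to
// lsl(p, abs(undef, p, x), splat(2)); with a non-negative shift amount SRSHL
// never rounds, so the cheaper LSL form is equivalent.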
2053
2054std::optional<Instruction *>
2055AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2056 IntrinsicInst &II) const {
2057 Intrinsic::ID IID = II.getIntrinsicID();
2058 switch (IID) {
2059 default:
2060 break;
2061
2062 case Intrinsic::aarch64_sve_st1_scatter:
2063 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2064 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2065 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2066 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2067 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2068 case Intrinsic::aarch64_sve_st1dq:
2069 case Intrinsic::aarch64_sve_st1q_scatter_index:
2070 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2071 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2072 case Intrinsic::aarch64_sve_st1wq:
2073 case Intrinsic::aarch64_sve_stnt1:
2074 case Intrinsic::aarch64_sve_stnt1_scatter:
2075 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2076 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2077 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2078 return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 1);
2079 case Intrinsic::aarch64_sve_st2:
2080 case Intrinsic::aarch64_sve_st2q:
2081 return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 2);
2082 case Intrinsic::aarch64_sve_st3:
2083 case Intrinsic::aarch64_sve_st3q:
2084 return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 3);
2085 case Intrinsic::aarch64_sve_st4:
2086 case Intrinsic::aarch64_sve_st4q:
2087 return instCombineSVENoActiveUnaryErase(IC, II, PredPos: 4);
2088 case Intrinsic::aarch64_sve_ld1_gather:
2089 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2090 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2091 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2092 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2093 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2094 case Intrinsic::aarch64_sve_ld1q_gather_index:
2095 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2096 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2097 case Intrinsic::aarch64_sve_ld1ro:
2098 case Intrinsic::aarch64_sve_ld1rq:
2099 case Intrinsic::aarch64_sve_ld1udq:
2100 case Intrinsic::aarch64_sve_ld1uwq:
2101 case Intrinsic::aarch64_sve_ld2_sret:
2102 case Intrinsic::aarch64_sve_ld2q_sret:
2103 case Intrinsic::aarch64_sve_ld3_sret:
2104 case Intrinsic::aarch64_sve_ld3q_sret:
2105 case Intrinsic::aarch64_sve_ld4_sret:
2106 case Intrinsic::aarch64_sve_ld4q_sret:
2107 case Intrinsic::aarch64_sve_ldff1:
2108 case Intrinsic::aarch64_sve_ldff1_gather:
2109 case Intrinsic::aarch64_sve_ldff1_gather_index:
2110 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2111 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2112 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2113 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2114 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2115 case Intrinsic::aarch64_sve_ldnf1:
2116 case Intrinsic::aarch64_sve_ldnt1:
2117 case Intrinsic::aarch64_sve_ldnt1_gather:
2118 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2119 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2120 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2121 return instCombineSVENoActiveUnaryZero(IC, II);
2122 case Intrinsic::aarch64_neon_fmaxnm:
2123 case Intrinsic::aarch64_neon_fminnm:
2124 return instCombineMaxMinNM(IC, II);
2125 case Intrinsic::aarch64_sve_convert_from_svbool:
2126 return instCombineConvertFromSVBool(IC, II);
2127 case Intrinsic::aarch64_sve_dup:
2128 return instCombineSVEDup(IC, II);
2129 case Intrinsic::aarch64_sve_dup_x:
2130 return instCombineSVEDupX(IC, II);
2131 case Intrinsic::aarch64_sve_cmpne:
2132 case Intrinsic::aarch64_sve_cmpne_wide:
2133 return instCombineSVECmpNE(IC, II);
2134 case Intrinsic::aarch64_sve_rdffr:
2135 return instCombineRDFFR(IC, II);
2136 case Intrinsic::aarch64_sve_lasta:
2137 case Intrinsic::aarch64_sve_lastb:
2138 return instCombineSVELast(IC, II);
2139 case Intrinsic::aarch64_sve_clasta_n:
2140 case Intrinsic::aarch64_sve_clastb_n:
2141 return instCombineSVECondLast(IC, II);
2142 case Intrinsic::aarch64_sve_cntd:
2143 return instCombineSVECntElts(IC, II, NumElts: 2);
2144 case Intrinsic::aarch64_sve_cntw:
2145 return instCombineSVECntElts(IC, II, NumElts: 4);
2146 case Intrinsic::aarch64_sve_cnth:
2147 return instCombineSVECntElts(IC, II, NumElts: 8);
2148 case Intrinsic::aarch64_sve_cntb:
2149 return instCombineSVECntElts(IC, II, NumElts: 16);
2150 case Intrinsic::aarch64_sve_ptest_any:
2151 case Intrinsic::aarch64_sve_ptest_first:
2152 case Intrinsic::aarch64_sve_ptest_last:
2153 return instCombineSVEPTest(IC, II);
2154 case Intrinsic::aarch64_sve_fabd:
2155 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fabd_u);
2156 case Intrinsic::aarch64_sve_fadd:
2157 return instCombineSVEVectorFAdd(IC, II);
2158 case Intrinsic::aarch64_sve_fadd_u:
2159 return instCombineSVEVectorFAddU(IC, II);
2160 case Intrinsic::aarch64_sve_fdiv:
2161 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fdiv_u);
2162 case Intrinsic::aarch64_sve_fmax:
2163 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmax_u);
2164 case Intrinsic::aarch64_sve_fmaxnm:
2165 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmaxnm_u);
2166 case Intrinsic::aarch64_sve_fmin:
2167 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmin_u);
2168 case Intrinsic::aarch64_sve_fminnm:
2169 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fminnm_u);
2170 case Intrinsic::aarch64_sve_fmla:
2171 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmla_u);
2172 case Intrinsic::aarch64_sve_fmls:
2173 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmls_u);
2174 case Intrinsic::aarch64_sve_fmul:
2175 if (auto II_U =
2176 instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmul_u))
2177 return II_U;
2178 return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_fmul_u);
2179 case Intrinsic::aarch64_sve_fmul_u:
2180 return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_fmul_u);
2181 case Intrinsic::aarch64_sve_fmulx:
2182 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fmulx_u);
2183 case Intrinsic::aarch64_sve_fnmla:
2184 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fnmla_u);
2185 case Intrinsic::aarch64_sve_fnmls:
2186 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_fnmls_u);
2187 case Intrinsic::aarch64_sve_fsub:
2188 return instCombineSVEVectorFSub(IC, II);
2189 case Intrinsic::aarch64_sve_fsub_u:
2190 return instCombineSVEVectorFSubU(IC, II);
2191 case Intrinsic::aarch64_sve_add:
2192 return instCombineSVEVectorAdd(IC, II);
2193 case Intrinsic::aarch64_sve_add_u:
2194 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2195 Intrinsic::aarch64_sve_mla_u>(
2196 IC, II, MergeIntoAddendOp: true);
2197 case Intrinsic::aarch64_sve_mla:
2198 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_mla_u);
2199 case Intrinsic::aarch64_sve_mls:
2200 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_mls_u);
2201 case Intrinsic::aarch64_sve_mul:
2202 if (auto II_U =
2203 instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_mul_u))
2204 return II_U;
2205 return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_mul_u);
2206 case Intrinsic::aarch64_sve_mul_u:
2207 return instCombineSVEVectorMul(IC, II, IID: Intrinsic::aarch64_sve_mul_u);
2208 case Intrinsic::aarch64_sve_sabd:
2209 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_sabd_u);
2210 case Intrinsic::aarch64_sve_smax:
2211 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_smax_u);
2212 case Intrinsic::aarch64_sve_smin:
2213 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_smin_u);
2214 case Intrinsic::aarch64_sve_smulh:
2215 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_smulh_u);
2216 case Intrinsic::aarch64_sve_sub:
2217 return instCombineSVEVectorSub(IC, II);
2218 case Intrinsic::aarch64_sve_sub_u:
2219 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2220 Intrinsic::aarch64_sve_mls_u>(
2221 IC, II, MergeIntoAddendOp: true);
2222 case Intrinsic::aarch64_sve_uabd:
2223 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_uabd_u);
2224 case Intrinsic::aarch64_sve_umax:
2225 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_umax_u);
2226 case Intrinsic::aarch64_sve_umin:
2227 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_umin_u);
2228 case Intrinsic::aarch64_sve_umulh:
2229 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_umulh_u);
2230 case Intrinsic::aarch64_sve_asr:
2231 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_asr_u);
2232 case Intrinsic::aarch64_sve_lsl:
2233 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_lsl_u);
2234 case Intrinsic::aarch64_sve_lsr:
2235 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_lsr_u);
2236 case Intrinsic::aarch64_sve_and:
2237 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_and_u);
2238 case Intrinsic::aarch64_sve_bic:
2239 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_bic_u);
2240 case Intrinsic::aarch64_sve_eor:
2241 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_eor_u);
2242 case Intrinsic::aarch64_sve_orr:
2243 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_orr_u);
2244 case Intrinsic::aarch64_sve_sqsub:
2245 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_sqsub_u);
2246 case Intrinsic::aarch64_sve_uqsub:
2247 return instCombineSVEAllOrNoActive(IC, II, IID: Intrinsic::aarch64_sve_uqsub_u);
2248 case Intrinsic::aarch64_sve_tbl:
2249 return instCombineSVETBL(IC, II);
2250 case Intrinsic::aarch64_sve_uunpkhi:
2251 case Intrinsic::aarch64_sve_uunpklo:
2252 case Intrinsic::aarch64_sve_sunpkhi:
2253 case Intrinsic::aarch64_sve_sunpklo:
2254 return instCombineSVEUnpack(IC, II);
2255 case Intrinsic::aarch64_sve_uzp1:
2256 return instCombineSVEUzp1(IC, II);
2257 case Intrinsic::aarch64_sve_zip1:
2258 case Intrinsic::aarch64_sve_zip2:
2259 return instCombineSVEZip(IC, II);
2260 case Intrinsic::aarch64_sve_ld1_gather_index:
2261 return instCombineLD1GatherIndex(IC, II);
2262 case Intrinsic::aarch64_sve_st1_scatter_index:
2263 return instCombineST1ScatterIndex(IC, II);
2264 case Intrinsic::aarch64_sve_ld1:
2265 return instCombineSVELD1(IC, II, DL);
2266 case Intrinsic::aarch64_sve_st1:
2267 return instCombineSVEST1(IC, II, DL);
2268 case Intrinsic::aarch64_sve_sdiv:
2269 return instCombineSVESDIV(IC, II);
2270 case Intrinsic::aarch64_sve_sel:
2271 return instCombineSVESel(IC, II);
2272 case Intrinsic::aarch64_sve_srshl:
2273 return instCombineSVESrshl(IC, II);
2274 case Intrinsic::aarch64_sve_dupq_lane:
2275 return instCombineSVEDupqLane(IC, II);
2276 }
2277
2278 return std::nullopt;
2279}
2280
2281std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2282 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2283 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2284 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2285 SimplifyAndSetOp) const {
2286 switch (II.getIntrinsicID()) {
2287 default:
2288 break;
2289 case Intrinsic::aarch64_neon_fcvtxn:
2290 case Intrinsic::aarch64_neon_rshrn:
2291 case Intrinsic::aarch64_neon_sqrshrn:
2292 case Intrinsic::aarch64_neon_sqrshrun:
2293 case Intrinsic::aarch64_neon_sqshrn:
2294 case Intrinsic::aarch64_neon_sqshrun:
2295 case Intrinsic::aarch64_neon_sqxtn:
2296 case Intrinsic::aarch64_neon_sqxtun:
2297 case Intrinsic::aarch64_neon_uqrshrn:
2298 case Intrinsic::aarch64_neon_uqshrn:
2299 case Intrinsic::aarch64_neon_uqxtn:
2300 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2301 break;
2302 }
2303
2304 return std::nullopt;
2305}
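// Illustrative: for the narrowing intrinsics above (e.g. sqxtn), the demanded
// lanes of the result map one-to-one onto lanes of the source, so the result's
// demanded-elements mask is simply forwarded to operand 0.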
2306
2307bool AArch64TTIImpl::enableScalableVectorization() const {
2308 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2309 EnableScalableAutovecInStreamingMode);
2310}
2311
2312TypeSize
2313AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2314 switch (K) {
2315 case TargetTransformInfo::RGK_Scalar:
2316 return TypeSize::getFixed(ExactSize: 64);
2317 case TargetTransformInfo::RGK_FixedWidthVector:
2318 if (ST->useSVEForFixedLengthVectors() &&
2319 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2320 return TypeSize::getFixed(
2321 ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u));
2322 else if (ST->isNeonAvailable())
2323 return TypeSize::getFixed(ExactSize: 128);
2324 else
2325 return TypeSize::getFixed(ExactSize: 0);
2326 case TargetTransformInfo::RGK_ScalableVector:
2327 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2328 EnableScalableAutovecInStreamingMode))
2329 return TypeSize::getScalable(MinimumSize: 128);
2330 else
2331 return TypeSize::getScalable(MinimumSize: 0);
2332 }
2333 llvm_unreachable("Unsupported register kind");
2334}
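// Illustrative: when fixed-length SVE lowering is enabled with a 256-bit
// minimum vector size, RGK_FixedWidthVector reports 256 bits; on a NEON-only
// target it reports 128 bits, and RGK_ScalableVector reports a scalable
// 128-bit minimum whenever SVE (or streaming SVE with the relevant option) is
// available.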
2335
2336bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2337 ArrayRef<const Value *> Args,
2338 Type *SrcOverrideTy) {
2339 // A helper that returns a vector type whose element type is ArgTy's scalar
2340 // type and whose element count is taken from DstTy.
2341 auto toVectorTy = [&](Type *ArgTy) {
2342 return VectorType::get(ElementType: ArgTy->getScalarType(),
2343 EC: cast<VectorType>(Val: DstTy)->getElementCount());
2344 };
2345
2346 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2347 // i32, i64]. SVE doesn't generally have the same set of instructions to
2348 // perform an extend with the add/sub/mul. There are SMULLB style
2349 // instructions, but they operate on top/bottom, requiring some sort of lane
2350 // interleaving to be used with zext/sext.
2351 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2352 if (!useNeonVector(Ty: DstTy) || Args.size() != 2 ||
2353 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2354 return false;
2355
2356 // Determine if the operation has a widening variant. We consider both the
2357 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2358 // instructions.
2359 //
2360 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2361 // verify that their extending operands are eliminated during code
2362 // generation.
2363 Type *SrcTy = SrcOverrideTy;
2364 switch (Opcode) {
2365 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2366 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2367 // The second operand needs to be an extend.
2368 if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) {
2369 if (!SrcTy)
2370 SrcTy =
2371 toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType());
2372 } else
2373 return false;
2374 break;
2375 case Instruction::Mul: { // SMULL(2), UMULL(2)
2376 // Both operands need to be extends of the same type.
2377 if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) ||
2378 (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) {
2379 if (!SrcTy)
2380 SrcTy =
2381 toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType());
2382 } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) {
2383 // If one of the operands is a Zext and the other has enough zero bits to
2384 // be treated as unsigned, we can still generate a umull, meaning the zext
2385 // is free.
2386 KnownBits Known =
2387 computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL);
2388 if (Args[0]->getType()->getScalarSizeInBits() -
2389 Known.Zero.countLeadingOnes() >
2390 DstTy->getScalarSizeInBits() / 2)
2391 return false;
2392 if (!SrcTy)
2393 SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(),
2394 N: DstTy->getScalarSizeInBits() / 2));
2395 } else
2396 return false;
2397 break;
2398 }
2399 default:
2400 return false;
2401 }
2402
2403 // Legalize the destination type and ensure it can be used in a widening
2404 // operation.
2405 auto DstTyL = getTypeLegalizationCost(Ty: DstTy);
2406 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2407 return false;
2408
2409 // Legalize the source type and ensure it can be used in a widening
2410 // operation.
2411 assert(SrcTy && "Expected some SrcTy");
2412 auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy);
2413 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2414 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2415 return false;
2416
2417 // Get the total number of vector elements in the legalized types.
2418 InstructionCost NumDstEls =
2419 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2420 InstructionCost NumSrcEls =
2421 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2422
2423 // Return true if the legalized types have the same number of vector elements
2424 // and the destination element type size is twice that of the source type.
2425 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2426}
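// Illustrative: for "add <8 x i16> %a, (zext <8 x i8> %b to <8 x i16>)" the
// legalized source and destination have the same element count and the
// destination elements are twice as wide, so the extend can fold into a
// widening instruction such as uaddw and is treated as free by
// getCastInstrCost below.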
2427
2428// s/urhadd instructions implement the following pattern, making the
2429// extends free:
2430// %x = add ((zext i8 -> i16), 1)
2431// %y = (zext i8 -> i16)
2432// trunc i16 (lshr (add %x, %y), 1) -> i8
2433//
2434bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2435 Type *Src) {
2436 // The source should be a legal vector type.
2437 if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) ||
2438 (Src->isScalableTy() && !ST->hasSVE2()))
2439 return false;
2440
2441 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2442 return false;
2443
2444 // Look for trunc/lshr/add before trying to match the pattern.
2445 const Instruction *Add = ExtUser;
2446 auto *AddUser =
2447 dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
2448 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2449 Add = AddUser;
2450
2451 auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser());
2452 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2453 return false;
2454
2455 auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser());
2456 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2457 Src->getScalarSizeInBits() !=
2458 cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits())
2459 return false;
2460
2461 // Try to match the whole pattern. Ext could be either the first or second
2462 // m_ZExtOrSExt matched.
2463 Instruction *Ex1, *Ex2;
2464 if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1),
2465 R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1))))))
2466 return false;
2467
2468 // Ensure both extends are of the same kind (both sext or both zext).
2469 if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) &&
2470 Ex1->getOpcode() == Ex2->getOpcode())
2471 return true;
2472
2473 return false;
2474}
2475
2476InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2477 Type *Src,
2478 TTI::CastContextHint CCH,
2479 TTI::TargetCostKind CostKind,
2480 const Instruction *I) {
2481 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2482 assert(ISD && "Invalid opcode");
2483 // If the cast is observable, and it is used by a widening instruction (e.g.,
2484 // uaddl, saddw, etc.), it may be free.
2485 if (I && I->hasOneUser()) {
2486 auto *SingleUser = cast<Instruction>(Val: *I->user_begin());
2487 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2488 if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) {
2489 // For adds, only count the second operand as free if both operands are
2490 // extends but not the same operation (i.e. both operands are not free in
2491 // add(sext, zext)).
2492 if (SingleUser->getOpcode() == Instruction::Add) {
2493 if (I == SingleUser->getOperand(i: 1) ||
2494 (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) &&
2495 cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode))
2496 return 0;
2497 } else // Others are free so long as isWideningInstruction returned true.
2498 return 0;
2499 }
2500
2501 // The cast will be free for the s/urhadd instructions
2502 if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) &&
2503 isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src))
2504 return 0;
2505 }
2506
2507 // TODO: Allow non-throughput costs that aren't binary.
2508 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2509 if (CostKind != TTI::TCK_RecipThroughput)
2510 return Cost == 0 ? 0 : 1;
2511 return Cost;
2512 };
2513
2514 EVT SrcTy = TLI->getValueType(DL, Ty: Src);
2515 EVT DstTy = TLI->getValueType(DL, Ty: Dst);
2516
2517 if (!SrcTy.isSimple() || !DstTy.isSimple())
2518 return AdjustCost(
2519 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2520
2521 static const TypeConversionCostTblEntry
2522 ConversionTbl[] = {
2523 { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i8, .Src: MVT::v2i64, .Cost: 1}, // xtn
2524 { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i16, .Src: MVT::v2i64, .Cost: 1}, // xtn
2525 { .ISD: ISD::TRUNCATE, .Dst: MVT::v2i32, .Src: MVT::v2i64, .Cost: 1}, // xtn
2526 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i32, .Cost: 1}, // xtn
2527 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i8, .Src: MVT::v4i64, .Cost: 3}, // 2 xtn + 1 uzp1
2528 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i32, .Cost: 1}, // xtn
2529 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i16, .Src: MVT::v4i64, .Cost: 2}, // 1 uzp1 + 1 xtn
2530 { .ISD: ISD::TRUNCATE, .Dst: MVT::v4i32, .Src: MVT::v4i64, .Cost: 1}, // 1 uzp1
2531 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i16, .Cost: 1}, // 1 xtn
2532 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i32, .Cost: 2}, // 1 uzp1 + 1 xtn
2533 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i8, .Src: MVT::v8i64, .Cost: 4}, // 3 x uzp1 + xtn
2534 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i32, .Cost: 1}, // 1 uzp1
2535 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i16, .Src: MVT::v8i64, .Cost: 3}, // 3 x uzp1
2536 { .ISD: ISD::TRUNCATE, .Dst: MVT::v8i32, .Src: MVT::v8i64, .Cost: 2}, // 2 x uzp1
2537 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i16, .Cost: 1}, // uzp1
2538 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i32, .Cost: 3}, // (2 + 1) x uzp1
2539 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i8, .Src: MVT::v16i64, .Cost: 7}, // (4 + 2 + 1) x uzp1
2540 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i32, .Cost: 2}, // 2 x uzp1
2541 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i16, .Src: MVT::v16i64, .Cost: 6}, // (4 + 2) x uzp1
2542 { .ISD: ISD::TRUNCATE, .Dst: MVT::v16i32, .Src: MVT::v16i64, .Cost: 4}, // 4 x uzp1
2543
2544 // Truncations on nxvmiN
2545 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i16, .Cost: 1 },
2546 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i32, .Cost: 1 },
2547 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i1, .Src: MVT::nxv2i64, .Cost: 1 },
2548 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i16, .Cost: 1 },
2549 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i32, .Cost: 1 },
2550 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i1, .Src: MVT::nxv4i64, .Cost: 2 },
2551 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i16, .Cost: 1 },
2552 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i32, .Cost: 3 },
2553 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i1, .Src: MVT::nxv8i64, .Cost: 5 },
2554 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv16i1, .Src: MVT::nxv16i8, .Cost: 1 },
2555 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i16, .Src: MVT::nxv2i32, .Cost: 1 },
2556 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv2i32, .Src: MVT::nxv2i64, .Cost: 1 },
2557 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i16, .Src: MVT::nxv4i32, .Cost: 1 },
2558 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv4i32, .Src: MVT::nxv4i64, .Cost: 2 },
2559 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i16, .Src: MVT::nxv8i32, .Cost: 3 },
2560 { .ISD: ISD::TRUNCATE, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i64, .Cost: 6 },
2561
2562 // The number of shll instructions for the extension.
2563 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
2564 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i16, .Cost: 3 },
2565 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2 },
2566 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v4i64, .Src: MVT::v4i32, .Cost: 2 },
2567 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
2568 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i8, .Cost: 3 },
2569 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2 },
2570 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i32, .Src: MVT::v8i16, .Cost: 2 },
2571 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
2572 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i8, .Cost: 7 },
2573 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
2574 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v8i64, .Src: MVT::v8i16, .Cost: 6 },
2575 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2 },
2576 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i16, .Src: MVT::v16i8, .Cost: 2 },
2577 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
2578 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::v16i32, .Src: MVT::v16i8, .Cost: 6 },
2579
2580 // LowerVectorINT_TO_FP:
2581 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
2582 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
2583 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1 },
2584 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i32, .Cost: 1 },
2585 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i32, .Cost: 1 },
2586 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i64, .Cost: 1 },
2587
2588 // Complex: to v2f32
2589 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
2590 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3 },
2591 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: 2 },
2592 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i8, .Cost: 3 },
2593 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i16, .Cost: 3 },
2594 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f32, .Src: MVT::v2i64, .Cost: 2 },
2595
2596 // Complex: to v4f32
2597 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 4 },
2598 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
2599 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i8, .Cost: 3 },
2600 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f32, .Src: MVT::v4i16, .Cost: 2 },
2601
2602 // Complex: to v8f32
2603 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10 },
2604 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
2605 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i8, .Cost: 10 },
2606 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f32, .Src: MVT::v8i16, .Cost: 4 },
2607
2608 // Complex: to v16f32
2609 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21 },
2610 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f32, .Src: MVT::v16i8, .Cost: 21 },
2611
2612 // Complex: to v2f64
2613 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
2614 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4 },
2615 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
2616 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i8, .Cost: 4 },
2617 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i16, .Cost: 4 },
2618 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v2f64, .Src: MVT::v2i32, .Cost: 2 },
2619
2620 // Complex: to v4f64
2621 { .ISD: ISD::SINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4 },
2622 { .ISD: ISD::UINT_TO_FP, .Dst: MVT::v4f64, .Src: MVT::v4i32, .Cost: 4 },
2623
2624 // LowerVectorFP_TO_INT
2625 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1 },
2626 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
2627 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1 },
2628 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f32, .Cost: 1 },
2629 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f32, .Cost: 1 },
2630 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f64, .Cost: 1 },
2631
2632 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2633 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2 },
2634 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1 },
2635 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1 },
2636 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i64, .Src: MVT::v2f32, .Cost: 2 },
2637 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f32, .Cost: 1 },
2638 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f32, .Cost: 1 },
2639
2640 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2641 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
2642 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2 },
2643 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f32, .Cost: 2 },
2644 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f32, .Cost: 2 },
2645
2646 // Complex, from nxv2f32.
2647 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1 },
2648 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1 },
2649 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1 },
2650 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1 },
2651 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f32, .Cost: 1 },
2652 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 1 },
2653 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f32, .Cost: 1 },
2654 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f32, .Cost: 1 },
2655
2656 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2657 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
2658 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2 },
2659 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2 },
2660 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i32, .Src: MVT::v2f64, .Cost: 2 },
2661 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i16, .Src: MVT::v2f64, .Cost: 2 },
2662 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::v2i8, .Src: MVT::v2f64, .Cost: 2 },
2663
2664 // Complex, from nxv2f64.
2665 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1 },
2666 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1 },
2667 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1 },
2668 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1 },
2669 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f64, .Cost: 1 },
2670 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f64, .Cost: 1 },
2671 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f64, .Cost: 1 },
2672 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f64, .Cost: 1 },
2673
2674 // Complex, from nxv4f32.
2675 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4 },
2676 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1 },
2677 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1 },
2678 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1 },
2679 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f32, .Cost: 4 },
2680 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f32, .Cost: 1 },
2681 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f32, .Cost: 1 },
2682 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f32, .Cost: 1 },
2683
2684 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2685 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7 },
2686 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7 },
2687 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f64, .Cost: 7 },
2688 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f64, .Cost: 7 },
2689
2690 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2691 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3 },
2692 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3 },
2693 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3 },
2694 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f64, .Cost: 3 },
2695 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f64, .Cost: 3 },
2696 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f64, .Cost: 3 },
2697
2698 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2699 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3 },
2700 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3 },
2701 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f32, .Cost: 3 },
2702 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f32, .Cost: 3 },
2703
2704 // Complex, from nxv8f16.
2705 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10 },
2706 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4 },
2707 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1 },
2708 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1 },
2709 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i64, .Src: MVT::nxv8f16, .Cost: 10 },
2710 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i32, .Src: MVT::nxv8f16, .Cost: 4 },
2711 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i16, .Src: MVT::nxv8f16, .Cost: 1 },
2712 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv8i8, .Src: MVT::nxv8f16, .Cost: 1 },
2713
2714 // Complex, from nxv4f16.
2715 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4 },
2716 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1 },
2717 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1 },
2718 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1 },
2719 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i64, .Src: MVT::nxv4f16, .Cost: 4 },
2720 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i32, .Src: MVT::nxv4f16, .Cost: 1 },
2721 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 1 },
2722 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv4i8, .Src: MVT::nxv4f16, .Cost: 1 },
2723
2724 // Complex, from nxv2f16.
2725 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1 },
2726 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1 },
2727 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1 },
2728 { .ISD: ISD::FP_TO_SINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1 },
2729 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i64, .Src: MVT::nxv2f16, .Cost: 1 },
2730 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f16, .Cost: 1 },
2731 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 1 },
2732 { .ISD: ISD::FP_TO_UINT, .Dst: MVT::nxv2i8, .Src: MVT::nxv2f16, .Cost: 1 },
2733
2734 // Truncate from nxvmf32 to nxvmf16.
2735 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f32, .Cost: 1 },
2736 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f32, .Cost: 1 },
2737 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f32, .Cost: 3 },
2738
2739 // Truncate from nxvmf64 to nxvmf16.
2740 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f16, .Src: MVT::nxv2f64, .Cost: 1 },
2741 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f16, .Src: MVT::nxv4f64, .Cost: 3 },
2742 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f16, .Src: MVT::nxv8f64, .Cost: 7 },
2743
2744 // Truncate from nxvmf64 to nxvmf32.
2745 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f64, .Cost: 1 },
2746 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f64, .Cost: 3 },
2747 { .ISD: ISD::FP_ROUND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f64, .Cost: 6 },
2748
2749 // Extend from nxvmf16 to nxvmf32.
2750 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f32, .Src: MVT::nxv2f16, .Cost: 1},
2751 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f32, .Src: MVT::nxv4f16, .Cost: 1},
2752 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f32, .Src: MVT::nxv8f16, .Cost: 2},
2753
2754 // Extend from nxvmf16 to nxvmf64.
2755 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f16, .Cost: 1},
2756 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f16, .Cost: 2},
2757 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f16, .Cost: 4},
2758
2759 // Extend from nxvmf32 to nxvmf64.
2760 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv2f64, .Src: MVT::nxv2f32, .Cost: 1},
2761 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv4f64, .Src: MVT::nxv4f32, .Cost: 2},
2762 { .ISD: ISD::FP_EXTEND, .Dst: MVT::nxv8f64, .Src: MVT::nxv8f32, .Cost: 6},
2763
2764 // Bitcasts from integer to float
2765 { .ISD: ISD::BITCAST, .Dst: MVT::nxv2f16, .Src: MVT::nxv2i16, .Cost: 0 },
2766 { .ISD: ISD::BITCAST, .Dst: MVT::nxv4f16, .Src: MVT::nxv4i16, .Cost: 0 },
2767 { .ISD: ISD::BITCAST, .Dst: MVT::nxv2f32, .Src: MVT::nxv2i32, .Cost: 0 },
2768
2769 // Bitcasts from float to integer
2770 { .ISD: ISD::BITCAST, .Dst: MVT::nxv2i16, .Src: MVT::nxv2f16, .Cost: 0 },
2771 { .ISD: ISD::BITCAST, .Dst: MVT::nxv4i16, .Src: MVT::nxv4f16, .Cost: 0 },
2772 { .ISD: ISD::BITCAST, .Dst: MVT::nxv2i32, .Src: MVT::nxv2f32, .Cost: 0 },
2773
2774 // Add cost for extending to illegal (too wide) scalable vectors.
2775 // Zero/sign extends are implemented by multiple unpack operations,
2776 // where each operation has a cost of 1.
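// For example, extending nxv16i8 to nxv16i64 takes three levels of unpacks
// (2 + 4 + 8 = 14 operations), which matches the cost of 14 below.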
2777 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
2778 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
2779 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
2780 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
2781 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
2782 { .ISD: ISD::ZERO_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
2783
2784 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i16, .Src: MVT::nxv16i8, .Cost: 2},
2785 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i32, .Src: MVT::nxv16i8, .Cost: 6},
2786 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv16i64, .Src: MVT::nxv16i8, .Cost: 14},
2787 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i32, .Src: MVT::nxv8i16, .Cost: 2},
2788 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv8i64, .Src: MVT::nxv8i16, .Cost: 6},
2789 { .ISD: ISD::SIGN_EXTEND, .Dst: MVT::nxv4i64, .Src: MVT::nxv4i32, .Cost: 2},
2790 };
2791
2792 // We estimate the cost of a fixed-length operation on SVE registers as the
2793 // cost of the equivalent operation on scalable vectors, scaled by the number
2794 // of SVE registers required to represent the fixed-length type.
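// For example, a fixed-length v8i64 -> v8f64 convert is costed as the
// equivalent nxv2i64 -> nxv2f64 convert (a 128-bit SVE granule holds two
// 64-bit elements), multiplied by the number of parts the wider type
// legalizes to.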
2795 EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy;
2796 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2797 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2798 ST->useSVEForFixedLengthVectors(VT: WiderTy)) {
2799 std::pair<InstructionCost, MVT> LT =
2800 getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext()));
2801 unsigned NumElements = AArch64::SVEBitsPerBlock /
2802 LT.second.getScalarSizeInBits();
2803 return AdjustCost(
2804 LT.first *
2805 getCastInstrCost(
2806 Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements),
2807 Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH,
2808 CostKind, I));
2809 }
2810
2811 if (const auto *Entry = ConvertCostTableLookup(Table: ConversionTbl, ISD,
2812 Dst: DstTy.getSimpleVT(),
2813 Src: SrcTy.getSimpleVT()))
2814 return AdjustCost(Entry->Cost);
2815
2816 static const TypeConversionCostTblEntry FP16Tbl[] = {
2817 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
2818 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i8, .Src: MVT::v4f16, .Cost: 1},
2819 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1}, // fcvtzs
2820 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i16, .Src: MVT::v4f16, .Cost: 1},
2821 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2}, // fcvtl+fcvtzs
2822 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v4i32, .Src: MVT::v4f16, .Cost: 2},
2823 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2}, // fcvtzs+xtn
2824 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i8, .Src: MVT::v8f16, .Cost: 2},
2825 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1}, // fcvtzs
2826 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i16, .Src: MVT::v8f16, .Cost: 1},
2827 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4}, // 2*fcvtl+2*fcvtzs
2828 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v8i32, .Src: MVT::v8f16, .Cost: 4},
2829 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3}, // 2*fcvtzs+xtn
2830 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i8, .Src: MVT::v16f16, .Cost: 3},
2831 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2}, // 2*fcvtzs
2832 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i16, .Src: MVT::v16f16, .Cost: 2},
2833 {.ISD: ISD::FP_TO_SINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8}, // 4*fcvtl+4*fcvtzs
2834 {.ISD: ISD::FP_TO_UINT, .Dst: MVT::v16i32, .Src: MVT::v16f16, .Cost: 8},
2835 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // ushll + ucvtf
2836 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v8f16, .Src: MVT::v8i8, .Cost: 2}, // sshll + scvtf
2837 {.ISD: ISD::UINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * ushl(2) + 2 * ucvtf
2838 {.ISD: ISD::SINT_TO_FP, .Dst: MVT::v16f16, .Src: MVT::v16i8, .Cost: 4}, // 2 * sshl(2) + 2 * scvtf
2839 };
2840
2841 if (ST->hasFullFP16())
2842 if (const auto *Entry = ConvertCostTableLookup(
2843 Table: FP16Tbl, ISD, Dst: DstTy.getSimpleVT(), Src: SrcTy.getSimpleVT()))
2844 return AdjustCost(Entry->Cost);
2845
2846 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2847 CCH == TTI::CastContextHint::Masked &&
2848 ST->isSVEorStreamingSVEAvailable() &&
2849 TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) ==
2850 TargetLowering::TypePromoteInteger &&
2851 TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) ==
2852 TargetLowering::TypeSplitVector) {
2853 // The standard behaviour in the backend for these cases is to split the
2854 // extend up into two parts:
2855 // 1. Perform an extending load or masked load up to the legal type.
2856 // 2. Extend the loaded data to the final type.
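// For example (assuming nxv8i8 is promoted to nxv8i16), a masked zero-extend
// from nxv8i8 to nxv8i64 is costed as an extending masked load up to nxv8i16
// plus a separate nxv8i16 -> nxv8i64 extend.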
2857 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src);
2858 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext());
2859 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2860 Opcode, Dst: LegalTy, Src, CCH, CostKind, I);
2861 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2862 Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I);
2863 return Part1 + Part2;
2864 }
2865
2866 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2867 // but we also want to include the TTI::CastContextHint::Masked case.
2868 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2869 CCH == TTI::CastContextHint::Masked &&
2870 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(VT: DstTy))
2871 CCH = TTI::CastContextHint::Normal;
2872
2873 return AdjustCost(
2874 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2875}
2876
2877InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2878 Type *Dst,
2879 VectorType *VecTy,
2880 unsigned Index) {
2881
2882 // Make sure we were given a valid extend opcode.
2883 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2884 "Invalid opcode");
2885
2886 // We are extending an element we extract from a vector, so the source type
2887 // of the extend is the element type of the vector.
2888 auto *Src = VecTy->getElementType();
2889
2890 // Sign- and zero-extends are for integer types only.
2891 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2892
2893 // Get the cost for the extract. We compute the cost (if any) for the extend
2894 // below.
2895 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2896 InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
2897 CostKind, Index, Op0: nullptr, Op1: nullptr);
2898
2899 // Legalize the types.
2900 auto VecLT = getTypeLegalizationCost(Ty: VecTy);
2901 auto DstVT = TLI->getValueType(DL, Ty: Dst);
2902 auto SrcVT = TLI->getValueType(DL, Ty: Src);
2903
2904 // If the resulting type is still a vector and the destination type is legal,
2905 // we may get the extension for free. If not, get the default cost for the
2906 // extend.
2907 if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT))
2908 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
2909 CostKind);
2910
2911 // The destination type should be larger than the element type. If not, get
2912 // the default cost for the extend.
2913 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2914 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
2915 CostKind);
2916
2917 switch (Opcode) {
2918 default:
2919 llvm_unreachable("Opcode should be either SExt or ZExt");
2920
2921 // For sign-extends, we only need a smov, which performs the extension
2922 // automatically.
2923 case Instruction::SExt:
2924 return Cost;
2925
2926 // For zero-extends, the extend is performed automatically by a umov unless
2927 // the destination type is i64 and the element type is i8 or i16.
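// For example, zero-extending an extracted i8 or i16 element to i64 is not
// free, whereas zero-extending it to i32, or extending an i32 element to
// i64, is.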
2928 case Instruction::ZExt:
2929 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2930 return Cost;
2931 }
2932
2933 // If we are unable to perform the extend for free, get the default cost.
2934 return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None,
2935 CostKind);
2936}
2937
2938InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2939 TTI::TargetCostKind CostKind,
2940 const Instruction *I) {
2941 if (CostKind != TTI::TCK_RecipThroughput)
2942 return Opcode == Instruction::PHI ? 0 : 1;
2943 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2944 // Branches are assumed to be predicted.
2945 return 0;
2946}
2947
2948InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2949 Type *Val,
2950 unsigned Index,
2951 bool HasRealUse) {
2952 assert(Val->isVectorTy() && "This must be a vector type");
2953
2954 if (Index != -1U) {
2955 // Legalize the type.
2956 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);
2957
2958 // This type is legalized to a scalar type.
2959 if (!LT.second.isVector())
2960 return 0;
2961
2962 // The type may be split. For fixed-width vectors we can normalize the
2963 // index to the new type.
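// For example, v8i64 is split into multiple v2i64 parts, so an extract at
// index 5 is modelled as lane 5 % 2 == 1 of one of the parts.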
2964 if (LT.second.isFixedLengthVector()) {
2965 unsigned Width = LT.second.getVectorNumElements();
2966 Index = Index % Width;
2967 }
2968
2969 // The element at index zero is already inside the vector.
2970 // - For a physical (HasRealUse==true) insert-element or extract-element
2971 // instruction that extracts integers, an explicit FPR -> GPR move is
2972 // needed. So it has non-zero cost.
2973 // - For the remaining cases (a virtual instruction, or a floating-point
2974 //   element type), consider the instruction free.
2975 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2976 return 0;
2977
2978 // This recognises an LD1 (load one single-element structure to one lane of
2979 // one register) instruction. I.e., if this is an `insertelement` instruction
2980 // and its second operand is a load, then we will generate an LD1, which is
2981 // an expensive instruction.
2982 if (I && dyn_cast<LoadInst>(Val: I->getOperand(i: 1)))
2983 return ST->getVectorInsertExtractBaseCost() + 1;
2984
2985 // i1 inserts and extracts will include an extra cset or cmp of the vector
2986 // value. Increase the cost by 1 to account for this.
2987 if (Val->getScalarSizeInBits() == 1)
2988 return ST->getVectorInsertExtractBaseCost() + 1;
2989
2990 // FIXME:
2991 // If the extract-element and insert-element instructions could be
2992 // simplified away (e.g., could be combined into users by looking at use-def
2993 // context), they have no cost. This is not done in the first place for
2994 // compile-time considerations.
2995 }
2996
2997 // All other insert/extracts cost this much.
2998 return ST->getVectorInsertExtractBaseCost();
2999}
3000
3001InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
3002 TTI::TargetCostKind CostKind,
3003 unsigned Index, Value *Op0,
3004 Value *Op1) {
3005 bool HasRealUse =
3006 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0);
3007 return getVectorInstrCostHelper(I: nullptr, Val, Index, HasRealUse);
3008}
3009
3010InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
3011 Type *Val,
3012 TTI::TargetCostKind CostKind,
3013 unsigned Index) {
3014 return getVectorInstrCostHelper(I: &I, Val, Index, HasRealUse: true /* HasRealUse */);
3015}
3016
3017InstructionCost AArch64TTIImpl::getScalarizationOverhead(
3018 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3019 TTI::TargetCostKind CostKind) {
3020 if (isa<ScalableVectorType>(Val: Ty))
3021 return InstructionCost::getInvalid();
3022 if (Ty->getElementType()->isFloatingPointTy())
3023 return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract,
3024 CostKind);
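// For integer elements, each demanded lane costs the base insert/extract
// cost once per requested operation; e.g. a <4 x i32> with all lanes
// demanded and both Insert and Extract set costs 4 * 2 * the base cost.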
3025 return DemandedElts.popcount() * (Insert + Extract) *
3026 ST->getVectorInsertExtractBaseCost();
3027}
3028
3029InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
3030 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3031 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
3032 ArrayRef<const Value *> Args,
3033 const Instruction *CxtI) {
3034
3035 // The code-generator is currently not able to handle scalable vectors
3036 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3037 // it. This change will be removed when code-generation for these types is
3038 // sufficiently reliable.
3039 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
3040 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
3041 return InstructionCost::getInvalid();
3042
3043 // TODO: Handle more cost kinds.
3044 if (CostKind != TTI::TCK_RecipThroughput)
3045 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3046 Opd2Info: Op2Info, Args, CxtI);
3047
3048 // Legalize the type.
3049 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3050 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3051
3052 switch (ISD) {
3053 default:
3054 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3055 Opd2Info: Op2Info);
3056 case ISD::SDIV:
3057 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3058 // On AArch64, scalar signed division by a constant power of two is
3059 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3060 // The OperandValue properties may not be the same as those of the
3061 // previous operation; conservatively assume OP_None.
3062 InstructionCost Cost = getArithmeticInstrCost(
3063 Opcode: Instruction::Add, Ty, CostKind,
3064 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3065 Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind,
3066 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3067 Cost += getArithmeticInstrCost(
3068 Opcode: Instruction::Select, Ty, CostKind,
3069 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3070 Cost += getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind,
3071 Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3072 return Cost;
3073 }
3074 [[fallthrough]];
3075 case ISD::UDIV: {
3076 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3077 auto VT = TLI->getValueType(DL, Ty);
3078 if (TLI->isOperationLegalOrCustom(Op: ISD::MULHU, VT)) {
3079 // Vector signed division by a constant is expanded to the
3080 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3081 // to MULHU + SUB + SRL + ADD + SRL.
3082 InstructionCost MulCost = getArithmeticInstrCost(
3083 Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3084 InstructionCost AddCost = getArithmeticInstrCost(
3085 Opcode: Instruction::Add, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3086 InstructionCost ShrCost = getArithmeticInstrCost(
3087 Opcode: Instruction::AShr, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps());
3088 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3089 }
3090 }
3091
3092 InstructionCost Cost = BaseT::getArithmeticInstrCost(
3093 Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
3094 if (Ty->isVectorTy()) {
3095 if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) {
3096 // When SDIV/UDIV operations are lowered using SVE, the cost can be
3097 // lower.
3098 if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty)
3099 ->getPrimitiveSizeInBits()
3100 .getFixedValue() < 128) {
3101 EVT VT = TLI->getValueType(DL, Ty);
3102 static const CostTblEntry DivTbl[]{
3103 {.ISD: ISD::SDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v4i8, .Cost: 8},
3104 {.ISD: ISD::SDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::SDIV, .Type: MVT::v2i16, .Cost: 5},
3105 {.ISD: ISD::SDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::SDIV, .Type: MVT::v2i32, .Cost: 1},
3106 {.ISD: ISD::UDIV, .Type: MVT::v2i8, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v4i8, .Cost: 8},
3107 {.ISD: ISD::UDIV, .Type: MVT::v8i8, .Cost: 8}, {.ISD: ISD::UDIV, .Type: MVT::v2i16, .Cost: 5},
3108 {.ISD: ISD::UDIV, .Type: MVT::v4i16, .Cost: 5}, {.ISD: ISD::UDIV, .Type: MVT::v2i32, .Cost: 1}};
3109
3110 const auto *Entry = CostTableLookup(Table: DivTbl, ISD, Ty: VT.getSimpleVT());
3111 if (nullptr != Entry)
3112 return Entry->Cost;
3113 }
3114 // For 8/16-bit elements, the cost is higher because the type
3115 // requires promotion and possibly splitting:
3116 if (LT.second.getScalarType() == MVT::i8)
3117 Cost *= 8;
3118 else if (LT.second.getScalarType() == MVT::i16)
3119 Cost *= 4;
3120 return Cost;
3121 } else {
3122 // If one of the operands is a uniform constant, then the cost for each
3123 // element is the cost of insertion, extraction and the division itself:
3124 // insertion cost = 2, extraction cost = 2, and the division is costed
3125 // as the operation on the scalar type.
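// For example, a <4 x i32> division by a uniform constant is costed as
// 4 * (4 + scalar division cost).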
3126 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3127 (Op2Info.isConstant() && Op2Info.isUniform())) {
3128 if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) {
3129 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
3130 Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info);
3131 return (4 + DivCost) * VTy->getNumElements();
3132 }
3133 }
3134 // On AArch64, without SVE, vector divisions are expanded
3135 // into scalar divisions of each pair of elements.
3136 Cost += getArithmeticInstrCost(Opcode: Instruction::ExtractElement, Ty,
3137 CostKind, Op1Info, Op2Info);
3138 Cost += getArithmeticInstrCost(Opcode: Instruction::InsertElement, Ty, CostKind,
3139 Op1Info, Op2Info);
3140 }
3141
3142 // TODO: if one of the arguments is scalar, then it's not necessary to
3143 // double the cost of handling the vector elements.
3144 Cost += Cost;
3145 }
3146 return Cost;
3147 }
3148 case ISD::MUL:
3149 // When SVE is available, we can lower the v2i64 operation using
3150 // the SVE mul instruction, which has a lower cost.
3151 if (LT.second == MVT::v2i64 && ST->hasSVE())
3152 return LT.first;
3153
3154 // When SVE is not available, there is no MUL.2d instruction,
3155 // which means mul <2 x i64> is expensive as elements are extracted
3156 // from the vectors and the muls scalarized.
3157 // As getScalarizationOverhead is a bit too pessimistic, we
3158 // estimate the cost for a i64 vector directly here, which is:
3159 // - four 2-cost i64 extracts,
3160 // - two 2-cost i64 inserts, and
3161 // - two 1-cost muls.
3162 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3163 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3164 // need to scalarize, so the cost can be cheaper (smull or umull).
3166 if (LT.second != MVT::v2i64 || isWideningInstruction(DstTy: Ty, Opcode, Args))
3167 return LT.first;
3168 return LT.first * 14;
3169 case ISD::ADD:
3170 case ISD::XOR:
3171 case ISD::OR:
3172 case ISD::AND:
3173 case ISD::SRL:
3174 case ISD::SRA:
3175 case ISD::SHL:
3176 // These nodes are marked as 'custom' for combining purposes only.
3177 // We know that they are legal. See LowerAdd in ISelLowering.
3178 return LT.first;
3179
3180 case ISD::FNEG:
3181 case ISD::FADD:
3182 case ISD::FSUB:
3183 // Increase the cost for half and bfloat types if not architecturally
3184 // supported.
3185 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3186 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3187 return 2 * LT.first;
3188 if (!Ty->getScalarType()->isFP128Ty())
3189 return LT.first;
3190 [[fallthrough]];
3191 case ISD::FMUL:
3192 case ISD::FDIV:
3193 // These nodes are marked as 'custom' just to lower them to SVE.
3194 // We know said lowering will incur no additional cost.
3195 if (!Ty->getScalarType()->isFP128Ty())
3196 return 2 * LT.first;
3197
3198 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3199 Opd2Info: Op2Info);
3200 case ISD::FREM:
3201 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3202 // those functions are not declared in the module.
3203 if (!Ty->isVectorTy())
3204 return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind);
3205 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
3206 Opd2Info: Op2Info);
3207 }
3208}
3209
3210InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
3211 ScalarEvolution *SE,
3212 const SCEV *Ptr) {
3213 // Address computations in vectorized code with non-consecutive addresses will
3214 // likely result in more instructions compared to scalar code where the
3215 // computation can more often be merged into the index mode. The resulting
3216 // extra micro-ops can significantly decrease throughput.
3217 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3218 int MaxMergeDistance = 64;
3219
3220 if (Ty->isVectorTy() && SE &&
3221 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1))
3222 return NumVectorInstToHideOverhead;
3223
3224 // In many cases the address computation is not merged into the instruction
3225 // addressing mode.
3226 return 1;
3227}
3228
3229InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
3230 Type *CondTy,
3231 CmpInst::Predicate VecPred,
3232 TTI::TargetCostKind CostKind,
3233 const Instruction *I) {
3234 // TODO: Handle other cost kinds.
3235 if (CostKind != TTI::TCK_RecipThroughput)
3236 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3237 I);
3238
3239 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3240 // We don't lower vector selects that are wider than the register width
3241 // particularly well.
3242 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) {
3243 // We would need this many instructions to hide the scalarization happening.
3244 const int AmortizationCost = 20;
3245
3246 // If VecPred is not set, check if we can get a predicate from the context
3247 // instruction, if its type matches the requested ValTy.
3248 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3249 CmpInst::Predicate CurrentPred;
3250 if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(),
3251 R: m_Value())))
3252 VecPred = CurrentPred;
3253 }
3254 // Check if we have a compare/select chain that can be lowered using
3255 // a (F)CMxx & BFI pair.
3256 if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE ||
3257 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3258 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3259 VecPred == CmpInst::FCMP_UNE) {
3260 static const auto ValidMinMaxTys = {
3261 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3262 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3263 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3264
3265 auto LT = getTypeLegalizationCost(Ty: ValTy);
3266 if (any_of(Range: ValidMinMaxTys, P: [&LT](MVT M) { return M == LT.second; }) ||
3267 (ST->hasFullFP16() &&
3268 any_of(Range: ValidFP16MinMaxTys, P: [&LT](MVT M) { return M == LT.second; })))
3269 return LT.first;
3270 }
3271
3272 static const TypeConversionCostTblEntry
3273 VectorSelectTbl[] = {
3274 { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f32, .Cost: 2 },
3275 { .ISD: ISD::SELECT, .Dst: MVT::v2i1, .Src: MVT::v2f64, .Cost: 2 },
3276 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f32, .Cost: 2 },
3277 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4f16, .Cost: 2 },
3278 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8f16, .Cost: 2 },
3279 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i16, .Cost: 16 },
3280 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i32, .Cost: 8 },
3281 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i32, .Cost: 16 },
3282 { .ISD: ISD::SELECT, .Dst: MVT::v4i1, .Src: MVT::v4i64, .Cost: 4 * AmortizationCost },
3283 { .ISD: ISD::SELECT, .Dst: MVT::v8i1, .Src: MVT::v8i64, .Cost: 8 * AmortizationCost },
3284 { .ISD: ISD::SELECT, .Dst: MVT::v16i1, .Src: MVT::v16i64, .Cost: 16 * AmortizationCost }
3285 };
3286
3287 EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy);
3288 EVT SelValTy = TLI->getValueType(DL, Ty: ValTy);
3289 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3290 if (const auto *Entry = ConvertCostTableLookup(Table: VectorSelectTbl, ISD,
3291 Dst: SelCondTy.getSimpleVT(),
3292 Src: SelValTy.getSimpleVT()))
3293 return Entry->Cost;
3294 }
3295 }
3296
3297 if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) {
3298 auto LT = getTypeLegalizationCost(Ty: ValTy);
3299 // Without FP16 support, cost a v4f16 FCmp by converting to v4f32 and back.
3300 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3301 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3302 }
3303
3304 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3305 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3306 // be profitable.
3307 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3308 ICmpInst::isEquality(P: VecPred) &&
3309 TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) &&
3310 match(V: I->getOperand(i: 1), P: m_Zero()) &&
3311 match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value())))
3312 return 0;
3313
3314 // The base case handles scalable vectors fine for now, since it treats the
3315 // cost as 1 * legalization cost.
3316 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3317}
3318
3319AArch64TTIImpl::TTI::MemCmpExpansionOptions
3320AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3321 TTI::MemCmpExpansionOptions Options;
3322 if (ST->requiresStrictAlign()) {
3323 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3324 // a bunch of instructions when strict align is enabled.
3325 return Options;
3326 }
3327 Options.AllowOverlappingLoads = true;
3328 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3329 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3330 // TODO: Though vector loads usually perform well on AArch64, in some targets
3331 // they may wake up the FP unit, which raises the power consumption. Perhaps
3332 // they could be used with no holds barred (-O3).
3333 Options.LoadSizes = {8, 4, 2, 1};
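// Tail sizes of 3, 5 and 6 bytes can likely be handled with a pair of
// smaller loads (2+1, 4+1 and 4+2 bytes respectively) rather than a wider
// overlapping load.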
3334 Options.AllowedTailExpansions = {3, 5, 6};
3335 return Options;
3336}
3337
3338bool AArch64TTIImpl::prefersVectorizedAddressing() const {
3339 return ST->hasSVE();
3340}
3341
3342InstructionCost
3343AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
3344 Align Alignment, unsigned AddressSpace,
3345 TTI::TargetCostKind CostKind) {
3346 if (useNeonVector(Ty: Src))
3347 return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace,
3348 CostKind);
3349 auto LT = getTypeLegalizationCost(Ty: Src);
3350 if (!LT.first.isValid())
3351 return InstructionCost::getInvalid();
3352
3353 // Return an invalid cost for element types that we are unable to lower.
3354 auto *VT = cast<VectorType>(Val: Src);
3355 if (VT->getElementType()->isIntegerTy(Bitwidth: 1))
3356 return InstructionCost::getInvalid();
3357
3358 // The code-generator is currently not able to handle scalable vectors
3359 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3360 // it. This change will be removed when code-generation for these types is
3361 // sufficiently reliable.
3362 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
3363 return InstructionCost::getInvalid();
3364
3365 return LT.first;
3366}
3367
3368static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3369 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3370}
3371
3372InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
3373 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3374 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3375 if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy))
3376 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3377 Alignment, CostKind, I);
3378 auto *VT = cast<VectorType>(Val: DataTy);
3379 auto LT = getTypeLegalizationCost(Ty: DataTy);
3380 if (!LT.first.isValid())
3381 return InstructionCost::getInvalid();
3382
3383 // Return an invalid cost for element types that we are unable to lower.
3384 if (!LT.second.isVector() ||
3385 !isElementTypeLegalForScalableVector(Ty: VT->getElementType()) ||
3386 VT->getElementType()->isIntegerTy(Bitwidth: 1))
3387 return InstructionCost::getInvalid();
3388
3389 // The code-generator is currently not able to handle scalable vectors
3390 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3391 // it. This change will be removed when code-generation for these types is
3392 // sufficiently reliable.
3393 if (VT->getElementCount() == ElementCount::getScalable(MinVal: 1))
3394 return InstructionCost::getInvalid();
3395
3396 ElementCount LegalVF = LT.second.getVectorElementCount();
3397 InstructionCost MemOpCost =
3398 getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind,
3399 OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
3400 // Add on an overhead cost for using gathers/scatters.
3401 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3402 // point we may want a per-CPU overhead.
3403 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
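// The total is the per-element memory op cost (including the gather/scatter
// overhead), scaled by the number of parts after legalization and by the
// maximum number of elements the legalized vector can hold.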
3404 return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF);
3405}
3406
3407bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
3408 return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors();
3409}
3410
3411InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
3412 MaybeAlign Alignment,
3413 unsigned AddressSpace,
3414 TTI::TargetCostKind CostKind,
3415 TTI::OperandValueInfo OpInfo,
3416 const Instruction *I) {
3417 EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true);
3418 // Type legalization can't handle structs
3419 if (VT == MVT::Other)
3420 return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace,
3421 CostKind);
3422
3423 auto LT = getTypeLegalizationCost(Ty);
3424 if (!LT.first.isValid())
3425 return InstructionCost::getInvalid();
3426
3427 // The code-generator is currently not able to handle scalable vectors
3428 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3429 // it. This change will be removed when code-generation for these types is
3430 // sufficiently reliable.
3431 // We also only support full register predicate loads and stores.
3432 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
3433 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1) ||
3434 (VTy->getElementType()->isIntegerTy(Bitwidth: 1) &&
3435 !VTy->getElementCount().isKnownMultipleOf(
3436 RHS: ElementCount::getScalable(MinVal: 16))))
3437 return InstructionCost::getInvalid();
3438
3439 // TODO: consider latency as well for TCK_SizeAndLatency.
3440 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
3441 return LT.first;
3442
3443 if (CostKind != TTI::TCK_RecipThroughput)
3444 return 1;
3445
3446 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3447 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3448 // Unaligned stores are extremely inefficient. We don't split all
3449 // unaligned 128-bit stores because of the negative impact that has been
3450 // shown in practice on inlined block copy code.
3451 // We make such stores expensive so that we will only vectorize if there
3452 // are 6 other instructions getting vectorized.
3453 const int AmortizationCost = 6;
3454
3455 return LT.first * 2 * AmortizationCost;
3456 }
3457
3458 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3459 if (Ty->isPtrOrPtrVectorTy())
3460 return LT.first;
3461
3462 if (useNeonVector(Ty)) {
3463 // Check truncating stores and extending loads.
3464 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3465 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3466 if (VT == MVT::v4i8)
3467 return 2;
3468 // Otherwise we need to scalarize.
3469 return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2;
3470 }
3471 EVT EltVT = VT.getVectorElementType();
3472 unsigned EltSize = EltVT.getScalarSizeInBits();
3473 if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 ||
3474 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3475 *Alignment != Align(1))
3476 return LT.first;
3477 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3478 // widening to v4i8, which produces suboptimal results.
3479 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3480 return LT.first;
3481
3482 // Check non-power-of-2 loads/stores for legal vector element types with
3483 // NEON. Non-power-of-2 memory ops will get broken down into a set of
3484 // smaller power-of-2 operations, including ld1/st1.
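// For example, a v7i8 access is broken down into v4i8 + v2i8 + v1i8 pieces
// by the worklist below, giving a cost of 3.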
3485 LLVMContext &C = Ty->getContext();
3486 InstructionCost Cost(0);
3487 SmallVector<EVT> TypeWorklist;
3488 TypeWorklist.push_back(Elt: VT);
3489 while (!TypeWorklist.empty()) {
3490 EVT CurrVT = TypeWorklist.pop_back_val();
3491 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3492 if (isPowerOf2_32(Value: CurrNumElements)) {
3493 Cost += 1;
3494 continue;
3495 }
3496
3497 unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2;
3498 TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2));
3499 TypeWorklist.push_back(
3500 Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2));
3501 }
3502 return Cost;
3503 }
3504
3505 return LT.first;
3506}
3507
3508InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3509 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3510 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3511 bool UseMaskForCond, bool UseMaskForGaps) {
3512 assert(Factor >= 2 && "Invalid interleave factor");
3513 auto *VecVTy = cast<VectorType>(Val: VecTy);
3514
3515 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3516 return InstructionCost::getInvalid();
3517
3518 // Vectorization for masked interleaved accesses is only enabled for scalable
3519 // VF.
3520 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3521 return InstructionCost::getInvalid();
3522
3523 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3524 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3525 auto *SubVecTy =
3526 VectorType::get(ElementType: VecVTy->getElementType(),
3527 EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor));
3528
3529 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3530 // Accesses having vector types that are a multiple of 128 bits can be
3531 // matched to more than one ldN/stN instruction.
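// For example, a <16 x i32> access with Factor == 2 uses <8 x i32>
// subvectors (256 bits), which need two ld2/st2 operations of 128 bits
// each, giving a cost of 2 * 2 = 4.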
3532 bool UseScalable;
3533 if (MinElts % Factor == 0 &&
3534 TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
3535 return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
3536 }
3537
3538 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3539 Alignment, AddressSpace, CostKind,
3540 UseMaskForCond, UseMaskForGaps);
3541}
3542
3543InstructionCost
3544AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3545 InstructionCost Cost = 0;
3546 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3547 for (auto *I : Tys) {
3548 if (!I->isVectorTy())
3549 continue;
3550 if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() ==
3551 128)
3552 Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) +
3553 getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind);
3554 }
3555 return Cost;
3556}
3557
3558unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3559 return ST->getMaxInterleaveFactor();
3560}
3561
3562// For Falkor, we want to avoid having too many strided loads in a loop since
3563// that can exhaust the HW prefetcher resources. We adjust the unroller
3564// MaxCount preference below to attempt to ensure unrolling doesn't create too
3565// many strided loads.
3566static void
3567getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3568 TargetTransformInfo::UnrollingPreferences &UP) {
3569 enum { MaxStridedLoads = 7 };
3570 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3571 int StridedLoads = 0;
3572 // FIXME? We could make this more precise by looking at the CFG and
3573 // e.g. not counting loads in each side of an if-then-else diamond.
3574 for (const auto BB : L->blocks()) {
3575 for (auto &I : *BB) {
3576 LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I);
3577 if (!LMemI)
3578 continue;
3579
3580 Value *PtrValue = LMemI->getPointerOperand();
3581 if (L->isLoopInvariant(V: PtrValue))
3582 continue;
3583
3584 const SCEV *LSCEV = SE.getSCEV(V: PtrValue);
3585 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV);
3586 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3587 continue;
3588
3589 // FIXME? We could take pairing of unrolled load copies into account
3590 // by looking at the AddRec, but we would probably have to limit this
3591 // to loops with no stores or other memory optimization barriers.
3592 ++StridedLoads;
3593 // We've seen enough strided loads that seeing more won't make a
3594 // difference.
3595 if (StridedLoads > MaxStridedLoads / 2)
3596 return StridedLoads;
3597 }
3598 }
3599 return StridedLoads;
3600 };
3601
3602 int StridedLoads = countStridedLoads(L, SE);
3603 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3604 << " strided loads\n");
3605 // Pick the largest power of 2 unroll count that won't result in too many
3606 // strided loads.
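// For example, with 3 strided loads detected the unroll count is capped at
// 1 << Log2(7 / 3) == 2.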
3607 if (StridedLoads) {
3608 UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads);
3609 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3610 << UP.MaxCount << '\n');
3611 }
3612}
3613
3614void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3615 TTI::UnrollingPreferences &UP,
3616 OptimizationRemarkEmitter *ORE) {
3617 // Enable partial unrolling and runtime unrolling.
3618 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3619
3620 UP.UpperBound = true;
3621
3622 // An inner loop is more likely to be hot, and its runtime checks can be
3623 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3624 // threshold to unroll more loops.
3625 if (L->getLoopDepth() > 1)
3626 UP.PartialThreshold *= 2;
3627
3628 // Disable partial & runtime unrolling on -Os.
3629 UP.PartialOptSizeThreshold = 0;
3630
3631 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3632 EnableFalkorHWPFUnrollFix)
3633 getFalkorUnrollingPreferences(L, SE, UP);
3634
3635 // Scan the loop: don't unroll loops with calls as this could prevent
3636 // inlining. Don't unroll vector loops either, as they don't benefit much from
3637 // unrolling.
3638 for (auto *BB : L->getBlocks()) {
3639 for (auto &I : *BB) {
3640 // Don't unroll vectorized loops.
3641 if (I.getType()->isVectorTy())
3642 return;
3643
3644 if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) {
3645 if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
3646 if (!isLoweredToCall(F))
3647 continue;
3648 }
3649 return;
3650 }
3651 }
3652 }
3653
3654 // Enable runtime unrolling for in-order models.
3655 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
3656 // by checking for that case we can ensure that the default behaviour is
3657 // unchanged.
3658 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3659 !ST->getSchedModel().isOutOfOrder()) {
3660 UP.Runtime = true;
3661 UP.Partial = true;
3662 UP.UnrollRemainder = true;
3663 UP.DefaultUnrollRuntimeCount = 4;
3664
3665 UP.UnrollAndJam = true;
3666 UP.UnrollAndJamInnerLoopThreshold = 60;
3667 }
3668}
3669
3670void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3671 TTI::PeelingPreferences &PP) {
3672 BaseT::getPeelingPreferences(L, SE, PP);
3673}
3674
3675Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
3676 Type *ExpectedType) {
3677 switch (Inst->getIntrinsicID()) {
3678 default:
3679 return nullptr;
3680 case Intrinsic::aarch64_neon_st2:
3681 case Intrinsic::aarch64_neon_st3:
3682 case Intrinsic::aarch64_neon_st4: {
3683 // Create a struct type
3684 StructType *ST = dyn_cast<StructType>(Val: ExpectedType);
3685 if (!ST)
3686 return nullptr;
3687 unsigned NumElts = Inst->arg_size() - 1;
3688 if (ST->getNumElements() != NumElts)
3689 return nullptr;
3690 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3691 if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i))
3692 return nullptr;
3693 }
3694 Value *Res = PoisonValue::get(T: ExpectedType);
3695 IRBuilder<> Builder(Inst);
3696 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3697 Value *L = Inst->getArgOperand(i);
3698 Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i);
3699 }
3700 return Res;
3701 }
3702 case Intrinsic::aarch64_neon_ld2:
3703 case Intrinsic::aarch64_neon_ld3:
3704 case Intrinsic::aarch64_neon_ld4:
3705 if (Inst->getType() == ExpectedType)
3706 return Inst;
3707 return nullptr;
3708 }
3709}
3710
3711bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3712 MemIntrinsicInfo &Info) {
3713 switch (Inst->getIntrinsicID()) {
3714 default:
3715 break;
3716 case Intrinsic::aarch64_neon_ld2:
3717 case Intrinsic::aarch64_neon_ld3:
3718 case Intrinsic::aarch64_neon_ld4:
3719 Info.ReadMem = true;
3720 Info.WriteMem = false;
3721 Info.PtrVal = Inst->getArgOperand(i: 0);
3722 break;
3723 case Intrinsic::aarch64_neon_st2:
3724 case Intrinsic::aarch64_neon_st3:
3725 case Intrinsic::aarch64_neon_st4:
3726 Info.ReadMem = false;
3727 Info.WriteMem = true;
3728 Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1);
3729 break;
3730 }
3731
3732 switch (Inst->getIntrinsicID()) {
3733 default:
3734 return false;
3735 case Intrinsic::aarch64_neon_ld2:
3736 case Intrinsic::aarch64_neon_st2:
3737 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3738 break;
3739 case Intrinsic::aarch64_neon_ld3:
3740 case Intrinsic::aarch64_neon_st3:
3741 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3742 break;
3743 case Intrinsic::aarch64_neon_ld4:
3744 case Intrinsic::aarch64_neon_st4:
3745 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3746 break;
3747 }
3748 return true;
3749}
3750
3751/// See if \p I should be considered for address type promotion. We check if
3752/// \p I is a sext with the right type and used in memory accesses. If it is
3753/// used in a "complex" getelementptr, we allow it to be promoted without
3754/// finding other sext instructions that sign extended the same initial value.
3755/// A getelementptr is considered "complex" if it has more than 2 operands.
3756bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3757 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3758 bool Considerable = false;
3759 AllowPromotionWithoutCommonHeader = false;
3760 if (!isa<SExtInst>(Val: &I))
3761 return false;
3762 Type *ConsideredSExtType =
3763 Type::getInt64Ty(C&: I.getParent()->getParent()->getContext());
3764 if (I.getType() != ConsideredSExtType)
3765 return false;
3766 // See if the sext is the one with the right type and used in at least one
3767 // GetElementPtrInst.
3768 for (const User *U : I.users()) {
3769 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) {
3770 Considerable = true;
3771 // A getelementptr is considered "complex" if it has more than 2
3772 // operands. We will promote a SExt used in such a complex GEP, as we
3773 // expect some computation to be merged if it is done on 64 bits.
3774 if (GEPInst->getNumOperands() > 2) {
3775 AllowPromotionWithoutCommonHeader = true;
3776 break;
3777 }
3778 }
3779 }
3780 return Considerable;
3781}
3782
3783bool AArch64TTIImpl::isLegalToVectorizeReduction(
3784 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3785 if (!VF.isScalable())
3786 return true;
3787
3788 Type *Ty = RdxDesc.getRecurrenceType();
3789 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3790 return false;
3791
3792 switch (RdxDesc.getRecurrenceKind()) {
3793 case RecurKind::Add:
3794 case RecurKind::FAdd:
3795 case RecurKind::And:
3796 case RecurKind::Or:
3797 case RecurKind::Xor:
3798 case RecurKind::SMin:
3799 case RecurKind::SMax:
3800 case RecurKind::UMin:
3801 case RecurKind::UMax:
3802 case RecurKind::FMin:
3803 case RecurKind::FMax:
3804 case RecurKind::FMulAdd:
3805 case RecurKind::IAnyOf:
3806 case RecurKind::FAnyOf:
3807 return true;
3808 default:
3809 return false;
3810 }
3811}
3812
3813InstructionCost
3814AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
3815 FastMathFlags FMF,
3816 TTI::TargetCostKind CostKind) {
3817 // The code-generator is currently not able to handle scalable vectors
3818 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3819 // it. This change will be removed when code-generation for these types is
3820 // sufficiently reliable.
3821 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty))
3822 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
3823 return InstructionCost::getInvalid();
3824
3825 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3826
3827 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3828 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3829
3830 InstructionCost LegalizationCost = 0;
3831 if (LT.first > 1) {
3832 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext());
3833 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3834 LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1);
3835 }
3836
3837 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3838}
3839
3840InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3841 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3842 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);
3843 InstructionCost LegalizationCost = 0;
3844 if (LT.first > 1) {
3845 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext());
3846 LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind);
3847 LegalizationCost *= LT.first - 1;
3848 }
3849
3850 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3851 assert(ISD && "Invalid opcode");
3852 // Add the final reduction cost for the legal horizontal reduction
3853 switch (ISD) {
3854 case ISD::ADD:
3855 case ISD::AND:
3856 case ISD::OR:
3857 case ISD::XOR:
3858 case ISD::FADD:
3859 return LegalizationCost + 2;
3860 default:
3861 return InstructionCost::getInvalid();
3862 }
3863}
3864
3865InstructionCost
3866AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3867 std::optional<FastMathFlags> FMF,
3868 TTI::TargetCostKind CostKind) {
3869 // The code-generator is currently not able to handle scalable vectors
3870 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3871 // it. This change will be removed when code-generation for these types is
3872 // sufficiently reliable.
3873 if (auto *VTy = dyn_cast<ScalableVectorType>(Val: ValTy))
3874 if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1))
3875 return InstructionCost::getInvalid();
3876
3877 if (TTI::requiresOrderedReduction(FMF)) {
3878 if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) {
3879 InstructionCost BaseCost =
3880 BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);
3881 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3882 // end up vectorizing for more computationally intensive loops.
3883 return BaseCost + FixedVTy->getNumElements();
3884 }
3885
3886 if (Opcode != Instruction::FAdd)
3887 return InstructionCost::getInvalid();
3888
    auto *VTy = cast<ScalableVectorType>(ValTy);
    InstructionCost Cost =
        getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
    Cost *= getMaxNumElements(VTy->getElementCount());
    return Cost;
  }

  if (isa<ScalableVectorType>(ValTy))
    return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
  MVT MTy = LT.second;
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // Horizontal adds can use the 'addv' instruction. We model the cost of these
  // instructions as twice a normal vector add, plus 1 for each legalization
  // step (LT.first). This is the only arithmetic vector reduction operation for
  // which we have an instruction.
  // OR, XOR and AND costs should match the codegen from:
  // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
  // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
  // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
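  // For example (illustrative): a v8i32 add reduction legalizes to two v4i32
  // halves, giving (LT.first - 1) + 2 = 3 with the table entry below.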
  static const CostTblEntry CostTblNoPairwise[]{
      {ISD::ADD, MVT::v8i8, 2},
      {ISD::ADD, MVT::v16i8, 2},
      {ISD::ADD, MVT::v4i16, 2},
      {ISD::ADD, MVT::v8i16, 2},
      {ISD::ADD, MVT::v4i32, 2},
      {ISD::ADD, MVT::v2i64, 2},
      {ISD::OR, MVT::v8i8, 15},
      {ISD::OR, MVT::v16i8, 17},
      {ISD::OR, MVT::v4i16, 7},
      {ISD::OR, MVT::v8i16, 9},
      {ISD::OR, MVT::v2i32, 3},
      {ISD::OR, MVT::v4i32, 5},
      {ISD::OR, MVT::v2i64, 3},
      {ISD::XOR, MVT::v8i8, 15},
      {ISD::XOR, MVT::v16i8, 17},
      {ISD::XOR, MVT::v4i16, 7},
      {ISD::XOR, MVT::v8i16, 9},
      {ISD::XOR, MVT::v2i32, 3},
      {ISD::XOR, MVT::v4i32, 5},
      {ISD::XOR, MVT::v2i64, 3},
      {ISD::AND, MVT::v8i8, 15},
      {ISD::AND, MVT::v16i8, 17},
      {ISD::AND, MVT::v4i16, 7},
      {ISD::AND, MVT::v8i16, 9},
      {ISD::AND, MVT::v2i32, 3},
      {ISD::AND, MVT::v4i32, 5},
      {ISD::AND, MVT::v2i64, 3},
  };
  switch (ISD) {
  default:
    break;
  case ISD::ADD:
    if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
      return (LT.first - 1) + Entry->Cost;
    break;
  case ISD::XOR:
  case ISD::AND:
  case ISD::OR:
    const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
    if (!Entry)
      break;
    auto *ValVTy = cast<FixedVectorType>(ValTy);
    if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
        isPowerOf2_32(ValVTy->getNumElements())) {
      InstructionCost ExtraCost = 0;
      if (LT.first != 1) {
        // Type needs to be split, so there is an extra cost of LT.first - 1
        // arithmetic ops.
        auto *Ty = FixedVectorType::get(ValTy->getElementType(),
                                        MTy.getVectorNumElements());
        ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
        ExtraCost *= LT.first - 1;
      }
      // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
      auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
      return Cost + ExtraCost;
    }
    break;
  }
  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
}

InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
  static const CostTblEntry ShuffleTbl[] = {
      { TTI::SK_Splice, MVT::nxv16i8, 1 },
      { TTI::SK_Splice, MVT::nxv8i16, 1 },
      { TTI::SK_Splice, MVT::nxv4i32, 1 },
      { TTI::SK_Splice, MVT::nxv2i64, 1 },
      { TTI::SK_Splice, MVT::nxv2f16, 1 },
      { TTI::SK_Splice, MVT::nxv4f16, 1 },
      { TTI::SK_Splice, MVT::nxv8f16, 1 },
      { TTI::SK_Splice, MVT::nxv2bf16, 1 },
      { TTI::SK_Splice, MVT::nxv4bf16, 1 },
      { TTI::SK_Splice, MVT::nxv8bf16, 1 },
      { TTI::SK_Splice, MVT::nxv2f32, 1 },
      { TTI::SK_Splice, MVT::nxv4f32, 1 },
      { TTI::SK_Splice, MVT::nxv2f64, 1 },
  };

  // The code-generator is not yet able to handle scalable vectors of
  // <vscale x 1 x eltty>, so return an invalid cost to avoid selecting it.
  // This change will be removed when code-generation for these types is
  // sufficiently reliable.
  if (Tp->getElementCount() == ElementCount::getScalable(1))
    return InstructionCost::getInvalid();

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
  Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1
                       ? TLI->getPromotedVTForPredicate(EVT(LT.second))
                       : LT.second;
  Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
  InstructionCost LegalizationCost = 0;
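  // For a negative Index (the splice point is counted from the end of the
  // first vector), the lowering needs a predicate; this is modeled here as an
  // extra compare + select on the promoted type.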
  if (Index < 0) {
    LegalizationCost =
        getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind) +
        getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
                           CmpInst::BAD_ICMP_PREDICATE, CostKind);
  }

  // Predicated splices are promoted during lowering (see
  // AArch64ISelLowering.cpp), so the cost is computed on the promoted type.
  if (LT.second.getScalarType() == MVT::i1) {
    LegalizationCost +=
        getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
                         TTI::CastContextHint::None, CostKind) +
        getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
                         TTI::CastContextHint::None, CostKind);
  }
  const auto *Entry =
      CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
}

InstructionCost AArch64TTIImpl::getShuffleCost(
    TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
    TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
    ArrayRef<const Value *> Args, const Instruction *CxtI) {
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // If we have a Mask and the legalized type is split into multiple smaller
  // vectors, split the Mask accordingly and sum the cost of each sub-shuffle.
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {

    // Check for LD3/LD4 instructions, which are represented in llvm IR as
    // deinterleaving-shuffle(load). The shuffle cost could potentially be
    // free, but we model it with a small cost (max(1, LT.first / 4)) so that
    // LD3/LD4 still cost more than a plain load.
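    // e.g. (illustrative) a factor-3 deinterleave of one lane:
    //   %wide = load <24 x i32>, ptr %p
    //   %lane0 = shufflevector <24 x i32> %wide, <24 x i32> poison,
    //       <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>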
    if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
        (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 3) ||
         ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, 4)))
      return std::max<InstructionCost>(1, LT.first / 4);

    // Check for ST3/ST4 instructions, which are represented in llvm IR as
    // store(interleaving-shuffle). The shuffle cost could potentially be free,
    // but we model it with a cost of LT.first so that ST3/ST4 have a higher
    // cost than just the store.
    if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
        (ShuffleVectorInst::isInterleaveMask(
             Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
         ShuffleVectorInst::isInterleaveMask(
             Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
      return LT.first;

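    // Otherwise split the mask into LTNumElts-sized chunks, one per legal
    // vector produced by legalization. For example (illustrative), a 16-entry
    // mask on a type that legalizes to v8i16 is split into two 8-entry
    // sub-masks that are costed independently below.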
    unsigned TpNumElts = Mask.size();
    unsigned LTNumElts = LT.second.getVectorNumElements();
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
    VectorType *NTp =
        VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
    InstructionCost Cost;
    for (unsigned N = 0; N < NumVecs; N++) {
      SmallVector<int> NMask;
      // Split the existing mask into chunks of size LTNumElts. Track the
      // source sub-vectors to ensure the result has at most 2 inputs.
      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
                                                      : PoisonMaskElem;
        if (MaskElt < 0) {
          NMask.push_back(PoisonMaskElem);
          continue;
        }

        // Calculate which source from the input this comes from and whether
        // it is new to us.
        unsigned Source = MaskElt / LTNumElts;
        if (NumSources == 0) {
          Source1 = Source;
          NumSources = 1;
        } else if (NumSources == 1 && Source != Source1) {
          Source2 = Source;
          NumSources = 2;
        } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
          NumSources++;
        }

        // Add to the new mask. For the NumSources > 2 case these are not
        // correct, but they are only used to compute the modular lane number.
        if (Source == Source1)
          NMask.push_back(MaskElt % LTNumElts);
        else if (Source == Source2)
          NMask.push_back(MaskElt % LTNumElts + LTNumElts);
        else
          NMask.push_back(MaskElt % LTNumElts);
      }
      // If the sub-mask has at most 2 input sub-vectors then re-cost it using
      // getShuffleCost. If not then cost it using the worst case.
      if (NumSources <= 2)
        Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
                                               : TTI::SK_PermuteTwoSrc,
                               NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
      else if (any_of(enumerate(NMask), [&](const auto &ME) {
                 return ME.value() % LTNumElts == ME.index();
               }))
        Cost += LTNumElts - 1;
      else
        Cost += LTNumElts;
    }
    return Cost;
  }

  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
  // Treat extractsubvector as single op permutation.
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
  if (IsExtractSubvector && LT.second.isFixedLengthVector())
    Kind = TTI::SK_PermuteSingleSrc;

  // Check for broadcast loads, which are supported by the LD1R instruction.
  // In terms of code-size, the shuffle vector is free when a load + dup get
  // folded into an LD1R. That's what we check and return here. For performance
  // and reciprocal throughput, an LD1R is not completely free. In that case,
  // we return the cost for the broadcast below (i.e. 1 for most/all types), so
  // that the load + dup sequence is modeled slightly higher, because LD1R is a
  // high-latency instruction.
  if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
    bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
    if (IsLoad && LT.second.isVector() &&
        isLegalBroadcastLoad(Tp->getElementType(),
                             LT.second.getVectorElementCount()))
      return 0;
  }

  // If we have 4 elements for the shuffle and a Mask, get the cost straight
  // from the perfect shuffle tables.
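  // For example (illustrative), a v4i32 or v4i16 mask such as <1, 0, 3, 2>
  // gets its cost directly from the perfect-shuffle tables below.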
  if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
      (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
      all_of(Mask, [](int E) { return E < 8; }))
    return getPerfectShuffleCost(Mask);

  // Check for identity masks, which we can treat as free.
  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
      all_of(enumerate(Mask), [](const auto &M) {
        return M.value() < 0 || M.value() == (int)M.index();
      }))
    return 0;

  // Check for other shuffles that are not SK_ kinds but for which we have
  // native instructions, for example ZIP and UZP.
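  // For example (illustrative), for two v4i32 inputs a zip1 corresponds to the
  // mask <0, 4, 1, 5> and a uzp1 to <0, 2, 4, 6>; a non-zero-lane splat such
  // as <1, 1, 1, 1> maps to a single dup.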
  unsigned Unused;
  if (LT.second.isFixedLengthVector() &&
      LT.second.getVectorNumElements() == Mask.size() &&
      (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       // Check for non-zero lane splats
       all_of(drop_begin(Mask),
              [&Mask](int M) { return M < 0 || M == Mask[0]; })))
    return 1;

  if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
      Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
      Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
    static const CostTblEntry ShuffleTbl[] = {
        // Broadcast shuffle kinds can be performed with 'dup'.
        {TTI::SK_Broadcast, MVT::v8i8, 1},
        {TTI::SK_Broadcast, MVT::v16i8, 1},
        {TTI::SK_Broadcast, MVT::v4i16, 1},
        {TTI::SK_Broadcast, MVT::v8i16, 1},
        {TTI::SK_Broadcast, MVT::v2i32, 1},
        {TTI::SK_Broadcast, MVT::v4i32, 1},
        {TTI::SK_Broadcast, MVT::v2i64, 1},
        {TTI::SK_Broadcast, MVT::v4f16, 1},
        {TTI::SK_Broadcast, MVT::v8f16, 1},
        {TTI::SK_Broadcast, MVT::v2f32, 1},
        {TTI::SK_Broadcast, MVT::v4f32, 1},
        {TTI::SK_Broadcast, MVT::v2f64, 1},
        // Transpose shuffle kinds can be performed with 'trn1/trn2' and
        // 'zip1/zip2' instructions.
        {TTI::SK_Transpose, MVT::v8i8, 1},
        {TTI::SK_Transpose, MVT::v16i8, 1},
        {TTI::SK_Transpose, MVT::v4i16, 1},
        {TTI::SK_Transpose, MVT::v8i16, 1},
        {TTI::SK_Transpose, MVT::v2i32, 1},
        {TTI::SK_Transpose, MVT::v4i32, 1},
        {TTI::SK_Transpose, MVT::v2i64, 1},
        {TTI::SK_Transpose, MVT::v4f16, 1},
        {TTI::SK_Transpose, MVT::v8f16, 1},
        {TTI::SK_Transpose, MVT::v2f32, 1},
        {TTI::SK_Transpose, MVT::v4f32, 1},
        {TTI::SK_Transpose, MVT::v2f64, 1},
        // Select shuffle kinds.
        // TODO: handle vXi8/vXi16.
        {TTI::SK_Select, MVT::v2i32, 1}, // mov.
        {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2i64, 1}, // mov.
        {TTI::SK_Select, MVT::v2f32, 1}, // mov.
        {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
        {TTI::SK_Select, MVT::v2f64, 1}, // mov.
        // PermuteSingleSrc shuffle kinds.
        {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
        {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
        {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
        {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
        {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
        // Reverse can be lowered with `rev`.
        {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
        {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
        {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
        {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
        // Splices can all be lowered as `ext`.
        {TTI::SK_Splice, MVT::v2i32, 1},
        {TTI::SK_Splice, MVT::v4i32, 1},
        {TTI::SK_Splice, MVT::v2i64, 1},
        {TTI::SK_Splice, MVT::v2f32, 1},
        {TTI::SK_Splice, MVT::v4f32, 1},
        {TTI::SK_Splice, MVT::v2f64, 1},
        {TTI::SK_Splice, MVT::v8f16, 1},
        {TTI::SK_Splice, MVT::v8bf16, 1},
        {TTI::SK_Splice, MVT::v8i16, 1},
        {TTI::SK_Splice, MVT::v16i8, 1},
        {TTI::SK_Splice, MVT::v4bf16, 1},
        {TTI::SK_Splice, MVT::v4f16, 1},
        {TTI::SK_Splice, MVT::v4i16, 1},
        {TTI::SK_Splice, MVT::v8i8, 1},
        // Broadcast shuffle kinds for scalable vectors
        {TTI::SK_Broadcast, MVT::nxv16i8, 1},
        {TTI::SK_Broadcast, MVT::nxv8i16, 1},
        {TTI::SK_Broadcast, MVT::nxv4i32, 1},
        {TTI::SK_Broadcast, MVT::nxv2i64, 1},
        {TTI::SK_Broadcast, MVT::nxv2f16, 1},
        {TTI::SK_Broadcast, MVT::nxv4f16, 1},
        {TTI::SK_Broadcast, MVT::nxv8f16, 1},
        {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
        {TTI::SK_Broadcast, MVT::nxv2f32, 1},
        {TTI::SK_Broadcast, MVT::nxv4f32, 1},
        {TTI::SK_Broadcast, MVT::nxv2f64, 1},
        {TTI::SK_Broadcast, MVT::nxv16i1, 1},
        {TTI::SK_Broadcast, MVT::nxv8i1, 1},
        {TTI::SK_Broadcast, MVT::nxv4i1, 1},
        {TTI::SK_Broadcast, MVT::nxv2i1, 1},
        // Handle the cases for vector.reverse with scalable vectors
        {TTI::SK_Reverse, MVT::nxv16i8, 1},
        {TTI::SK_Reverse, MVT::nxv8i16, 1},
        {TTI::SK_Reverse, MVT::nxv4i32, 1},
        {TTI::SK_Reverse, MVT::nxv2i64, 1},
        {TTI::SK_Reverse, MVT::nxv2f16, 1},
        {TTI::SK_Reverse, MVT::nxv4f16, 1},
        {TTI::SK_Reverse, MVT::nxv8f16, 1},
        {TTI::SK_Reverse, MVT::nxv2bf16, 1},
        {TTI::SK_Reverse, MVT::nxv4bf16, 1},
        {TTI::SK_Reverse, MVT::nxv8bf16, 1},
        {TTI::SK_Reverse, MVT::nxv2f32, 1},
        {TTI::SK_Reverse, MVT::nxv4f32, 1},
        {TTI::SK_Reverse, MVT::nxv2f64, 1},
        {TTI::SK_Reverse, MVT::nxv16i1, 1},
        {TTI::SK_Reverse, MVT::nxv8i1, 1},
        {TTI::SK_Reverse, MVT::nxv4i1, 1},
        {TTI::SK_Reverse, MVT::nxv2i1, 1},
    };
    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
      return LT.first * Entry->Cost;
  }

  if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
    return getSpliceCost(Tp, Index);

  // Inserting a subvector can often be done with either a D, S or H register
  // move, so long as the inserted vector is "aligned".
  if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
      LT.second.getSizeInBits() <= 128 && SubTp) {
    std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }
  }

  // Restore optimal kind.
  if (IsExtractSubvector)
    Kind = TTI::SK_ExtractSubvector;
  return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
                               CxtI);
}

static bool containsDecreasingPointers(Loop *TheLoop,
                                       PredicatedScalarEvolution *PSE) {
  const auto &Strides = DenseMap<Value *, const SCEV *>();
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the instructions in the block and look for addresses that are
    // consecutive and decreasing.
    for (Instruction &I : *BB) {
      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
        Value *Ptr = getLoadStorePointerOperand(&I);
        Type *AccessTy = getLoadStoreType(&I);
        if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
                         /*ShouldCheckWrap=*/false)
                .value_or(0) < 0)
          return true;
      }
    }
  }
  return false;
}

bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
  if (!ST->hasSVE())
    return false;

  // We don't currently support vectorisation with interleaving for SVE - with
  // such loops we're better off not using tail-folding. This gives us a chance
  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
  if (TFI->IAI->hasGroups())
    return false;

  TailFoldingOpts Required = TailFoldingOpts::Disabled;
  if (TFI->LVL->getReductionVars().size())
    Required |= TailFoldingOpts::Reductions;
  if (TFI->LVL->getFixedOrderRecurrences().size())
    Required |= TailFoldingOpts::Recurrences;

  // We call this to discover whether any load/store pointers in the loop have
  // negative strides. If they do, extra work is required to reverse the loop
  // predicate, which may be expensive.
  if (containsDecreasingPointers(TFI->LVL->getLoop(),
                                 TFI->LVL->getPredicatedScalarEvolution()))
    Required |= TailFoldingOpts::Reverse;
  if (Required == TailFoldingOpts::Disabled)
    Required |= TailFoldingOpts::Simple;

  if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
                                      Required))
    return false;

  // Don't tail-fold for tight loops where we would be better off interleaving
  // with an unpredicated loop.
  unsigned NumInsns = 0;
  for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
    NumInsns += BB->sizeWithoutDebug();
  }

  // We expect 4 of these to be the IV PHI, IV add, IV compare and branch.
  return NumInsns >= SVETailFoldInsnThreshold;
}

InstructionCost
AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                     StackOffset BaseOffset, bool HasBaseReg,
                                     int64_t Scale, unsigned AddrSpace) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  TargetLoweringBase::AddrMode AM;
  AM.BaseGV = BaseGV;
  AM.BaseOffs = BaseOffset.getFixed();
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Scale;
  AM.ScalableOffset = BaseOffset.getScalable();
  if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
    // AM.Scale models a reg2 * scale operand, so charge a cost of 1 whenever
    // the scale is not 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}

bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
  // For the binary operators (e.g. or) we need to be more careful than with
  // selects; here we only transform them if they are already at a natural
  // break point in the code - the end of a block with an unconditional
  // terminator.
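  // e.g. (illustrative IR)
  //   %or = or i1 %a, %b
  //   br label %next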
  if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
      isa<BranchInst>(I->getNextNode()) &&
      cast<BranchInst>(I->getNextNode())->isUnconditional())
    return true;
  return BaseT::shouldTreatInstructionLikeSelect(I);
}

bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                                   const TargetTransformInfo::LSRCost &C2) {
  // What is AArch64-specific here is that we add the number of instructions to
  // the comparison (though not as the first criterion, as some targets do),
  // and we raise the priority of the base additions.
  // TODO: Maybe a more nuanced tradeoff between instruction count
  // and number of registers? To be investigated at a later date.
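  // The comparison below is lexicographic: register count first, then
  // instruction count, base adds, addrec cost, IV multiplies, scale,
  // immediate and setup cost.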
  if (EnableLSRCostOpt)
    return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
                    C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
                    C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);

  return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
