//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "Utils/AArch64SMEAttributes.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SipHash.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/TargetParser/Triple.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>
#include <vector>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

static cl::opt<bool>
    EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                             cl::desc("Enable AArch64 logical imm instruction "
                                      "optimization"),
                             cl::init(true));

// Temporary option added for the purpose of testing functionality added
// to DAGCombiner.cpp in D92230. It is expected that this can be removed
// in the future once both implementations are based on MGATHER rather
// than the GLD1 nodes added for the SVE gather load intrinsics.
static cl::opt<bool>
    EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
                                   cl::desc("Combine extends of AArch64 masked "
                                            "gather intrinsics"),
                                   cl::init(true));

static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
                                    cl::desc("Combine ext and trunc to TBL"),
                                    cl::init(true));

// The XOR, OR and CMP operations all use ALU ports, so data dependencies
// become the bottleneck after this transform on high-end CPUs. This maximum
// leaf-node limit therefore guards against cases where the cmp+ccmp chain
// would not be profitable.
static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
                                 cl::desc("Maximum of xors"));

// When this is enabled, we do not fall back to DAG ISel when encountering
// scalable vector types for any instruction, even if SVE is not yet
// supported for some instructions.
// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
cl::opt<bool> EnableSVEGISel(
    "aarch64-enable-gisel-sve", cl::Hidden,
    cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
    cl::init(false));

// TODO: This option should be removed once we switch to always using PTRADD in
// the SelectionDAG.
static cl::opt<bool> UseFEATCPACodegen(
    "aarch64-use-featcpa-codegen", cl::Hidden,
    cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
             "SelectionDAG for FEAT_CPA"),
    cl::init(false));

/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;

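// Registers used to pass the first eight integer and FP/vector arguments,
// respectively, under the AAPCS64 calling convention.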
static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
                                       AArch64::X3, AArch64::X4, AArch64::X5,
                                       AArch64::X6, AArch64::X7};
static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
                                       AArch64::Q3, AArch64::Q4, AArch64::Q5,
                                       AArch64::Q6, AArch64::Q7};

ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }

ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }

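/// Returns the packed SVE vector type with the same element type as VT,
/// e.g. i32 -> nxv4i32 and f16 -> nxv8f16.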
static inline EVT getPackedSVEVectorVT(EVT VT) {
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    return MVT::nxv16i8;
  case MVT::i16:
    return MVT::nxv8i16;
  case MVT::i32:
    return MVT::nxv4i32;
  case MVT::i64:
    return MVT::nxv2i64;
  case MVT::f16:
    return MVT::nxv8f16;
  case MVT::f32:
    return MVT::nxv4f32;
  case MVT::f64:
    return MVT::nxv2f64;
  case MVT::bf16:
    return MVT::nxv8bf16;
  }
}

// NOTE: Currently there's only a need to return integer vector types. If this
// changes then just add an extra "type" parameter.
static inline EVT getPackedSVEVectorVT(ElementCount EC) {
  switch (EC.getKnownMinValue()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 16:
    return MVT::nxv16i8;
  case 8:
    return MVT::nxv8i16;
  case 4:
    return MVT::nxv4i32;
  case 2:
    return MVT::nxv2i64;
  }
}

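/// Returns the integer vector type that a scalable i1 predicate vector is
/// promoted to, keeping the element count fixed, e.g. nxv4i1 -> nxv4i32.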
static inline EVT getPromotedVTForPredicate(EVT VT) {
  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
         "Expected scalable predicate vector type!");
  switch (VT.getVectorMinNumElements()) {
  default:
    llvm_unreachable("unexpected element count for vector");
  case 2:
    return MVT::nxv2i64;
  case 4:
    return MVT::nxv4i32;
  case 8:
    return MVT::nxv8i16;
  case 16:
    return MVT::nxv16i8;
  }
}

/// Returns true if VT's elements occupy the lowest bit positions of its
/// associated register class without any intervening space.
///
/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
/// same register class, but only nxv8f16 can be treated as a packed vector.
static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
         "Expected legal vector type!");
  return VT.isFixedLengthVector() ||
         VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
}

// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
// predicate and end with a passthru value matching the result type.
static bool isMergePassthruOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    return false;
  case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
  case AArch64ISD::BSWAP_MERGE_PASSTHRU:
  case AArch64ISD::REVH_MERGE_PASSTHRU:
  case AArch64ISD::REVW_MERGE_PASSTHRU:
  case AArch64ISD::REVD_MERGE_PASSTHRU:
  case AArch64ISD::CTLZ_MERGE_PASSTHRU:
  case AArch64ISD::CTPOP_MERGE_PASSTHRU:
  case AArch64ISD::DUP_MERGE_PASSTHRU:
  case AArch64ISD::ABS_MERGE_PASSTHRU:
  case AArch64ISD::NEG_MERGE_PASSTHRU:
  case AArch64ISD::FNEG_MERGE_PASSTHRU:
  case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
  case AArch64ISD::FCEIL_MERGE_PASSTHRU:
  case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
  case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
  case AArch64ISD::FRINT_MERGE_PASSTHRU:
  case AArch64ISD::FROUND_MERGE_PASSTHRU:
  case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
  case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
  case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
  case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
  case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
  case AArch64ISD::FCVTX_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
  case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
  case AArch64ISD::FSQRT_MERGE_PASSTHRU:
  case AArch64ISD::FRECPX_MERGE_PASSTHRU:
  case AArch64ISD::FABS_MERGE_PASSTHRU:
    return true;
  }
}

// Returns true if inactive lanes are known to be zeroed by construction.
static bool isZeroingInactiveLanes(SDValue Op) {
  switch (Op.getOpcode()) {
  default:
    return false;
  // We guarantee i1 splat_vectors to zero the other lanes
  case ISD::SPLAT_VECTOR:
  case ISD::GET_ACTIVE_LANE_MASK:
  case AArch64ISD::PTRUE:
  case AArch64ISD::SETCC_MERGE_ZERO:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    switch (Op.getConstantOperandVal(0)) {
    default:
      return false;
    case Intrinsic::aarch64_sve_ptrue:
    case Intrinsic::aarch64_sve_pnext:
    case Intrinsic::aarch64_sve_cmpeq:
    case Intrinsic::aarch64_sve_cmpne:
    case Intrinsic::aarch64_sve_cmpge:
    case Intrinsic::aarch64_sve_cmpgt:
    case Intrinsic::aarch64_sve_cmphs:
    case Intrinsic::aarch64_sve_cmphi:
    case Intrinsic::aarch64_sve_cmpeq_wide:
    case Intrinsic::aarch64_sve_cmpne_wide:
    case Intrinsic::aarch64_sve_cmpge_wide:
    case Intrinsic::aarch64_sve_cmpgt_wide:
    case Intrinsic::aarch64_sve_cmplt_wide:
    case Intrinsic::aarch64_sve_cmple_wide:
    case Intrinsic::aarch64_sve_cmphs_wide:
    case Intrinsic::aarch64_sve_cmphi_wide:
    case Intrinsic::aarch64_sve_cmplo_wide:
    case Intrinsic::aarch64_sve_cmpls_wide:
    case Intrinsic::aarch64_sve_fcmpeq:
    case Intrinsic::aarch64_sve_fcmpne:
    case Intrinsic::aarch64_sve_fcmpge:
    case Intrinsic::aarch64_sve_fcmpgt:
    case Intrinsic::aarch64_sve_fcmpuo:
    case Intrinsic::aarch64_sve_facgt:
    case Intrinsic::aarch64_sve_facge:
    case Intrinsic::aarch64_sve_whilege:
    case Intrinsic::aarch64_sve_whilegt:
    case Intrinsic::aarch64_sve_whilehi:
    case Intrinsic::aarch64_sve_whilehs:
    case Intrinsic::aarch64_sve_whilele:
    case Intrinsic::aarch64_sve_whilelo:
    case Intrinsic::aarch64_sve_whilels:
    case Intrinsic::aarch64_sve_whilelt:
    case Intrinsic::aarch64_sve_match:
    case Intrinsic::aarch64_sve_nmatch:
    case Intrinsic::aarch64_sve_whilege_x2:
    case Intrinsic::aarch64_sve_whilegt_x2:
    case Intrinsic::aarch64_sve_whilehi_x2:
    case Intrinsic::aarch64_sve_whilehs_x2:
    case Intrinsic::aarch64_sve_whilele_x2:
    case Intrinsic::aarch64_sve_whilelo_x2:
    case Intrinsic::aarch64_sve_whilels_x2:
    case Intrinsic::aarch64_sve_whilelt_x2:
      return true;
    }
  }
}

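// Splits a pointer-authentication discriminator into its constant and address
// components. Returns a (constant discriminator, address discriminator) pair;
// if no 16-bit constant component can be isolated, the constant returned is 0
// and the original discriminator is returned unchanged as the address part.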
static std::tuple<SDValue, SDValue>
extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
  SDLoc DL(Disc);
  SDValue AddrDisc;
  SDValue ConstDisc;

  // If this is a blend, remember the constant and address discriminators.
  // Otherwise, it's either a constant discriminator, or a non-blended
  // address discriminator.
  if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
      Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
    AddrDisc = Disc->getOperand(1);
    ConstDisc = Disc->getOperand(2);
  } else {
    ConstDisc = Disc;
  }

  // If the constant discriminator (either the blend RHS, or the entire
  // discriminator value) isn't a 16-bit constant, bail out, and let the
  // discriminator be computed separately.
  const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
  if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
    return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);

  // If there's no address discriminator, use NoRegister, which we'll later
  // replace with XZR, or directly use a Z variant of the inst. when available.
  if (!AddrDisc)
    AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);

  return std::make_tuple(
      DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
      AddrDisc);
}

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                             const AArch64Subtarget &STI)
    : TargetLowering(TM), Subtarget(&STI) {
  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasLS64()) {
    addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
    setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
    setOperationAction(ISD::STORE, MVT::i64x8, Custom);
  }

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);

    addDRType(MVT::v2f32);
    addDRType(MVT::v8i8);
    addDRType(MVT::v4i16);
    addDRType(MVT::v2i32);
    addDRType(MVT::v1i64);
    addDRType(MVT::v1f64);
    addDRType(MVT::v4f16);
    addDRType(MVT::v4bf16);

    addQRType(MVT::v4f32);
    addQRType(MVT::v2f64);
    addQRType(MVT::v16i8);
    addQRType(MVT::v8i16);
    addQRType(MVT::v4i32);
    addQRType(MVT::v2i64);
    addQRType(MVT::v8f16);
    addQRType(MVT::v8bf16);
  }

  if (Subtarget->isSVEorStreamingSVEAvailable()) {
    // Add legal sve predicate types
    addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
    addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);

    // Add legal sve data types
    addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);

    addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
    addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);

    if (Subtarget->useSVEForFixedLengthVectors()) {
      for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);

      for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
        if (useSVEForFixedLengthVectorVT(VT))
          addRegisterClass(VT, &AArch64::ZPRRegClass);
    }
  }

  if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
    addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
    setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
    setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);

    setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
  }

  // Compute derived properties from the register classes
  computeRegisterProperties(Subtarget->getRegisterInfo());

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::bf16, Custom);
  setOperationAction(ISD::SETCC, MVT::f16, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::bf16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f16, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::SELECT, MVT::f16, Custom);
    setOperationAction(ISD::SELECT, MVT::bf16, Custom);
  }
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::bf16, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
  setOperationAction(ISD::BRIND, MVT::Other, Custom);
  setOperationAction(ISD::SETCCCARRY, MVT::i64, Custom);

  setOperationAction(ISD::PtrAuthGlobalAddress, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
  setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand such
  // operations when there's a valid register class, so we need custom
  // operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, LibCall);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
  setOperationAction(ISD::FTAN, MVT::f128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
  setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
  // aren't handled.

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::bf16, Custom);
  }
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
    setOperationAction(ISD::STRICT_FP_ROUND, MVT::bf16, Custom);
  }
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);

  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  // Lowering Funnel Shifts to EXTR
  setOperationAction(ISD::FSHR, MVT::i32, Custom);
  setOperationAction(ISD::FSHR, MVT::i64, Custom);
  setOperationAction(ISD::FSHL, MVT::i32, Custom);
  setOperationAction(ISD::FSHL, MVT::i64, Custom);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
  }

  // AArch64 doesn't have i32 MULH{S|U}.
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  if (Subtarget->hasCSSC()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
    setOperationAction(ISD::CTPOP, MVT::i128, Expand);

    setOperationAction(ISD::PARITY, MVT::i128, Expand);

    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
    setOperationAction(ISD::CTTZ, MVT::i128, Expand);

    setOperationAction(ISD::ABS, MVT::i32, Legal);
    setOperationAction(ISD::ABS, MVT::i64, Legal);

    setOperationAction(ISD::SMAX, MVT::i32, Legal);
    setOperationAction(ISD::SMAX, MVT::i64, Legal);
    setOperationAction(ISD::UMAX, MVT::i32, Legal);
    setOperationAction(ISD::UMAX, MVT::i64, Legal);

    setOperationAction(ISD::SMIN, MVT::i32, Legal);
    setOperationAction(ISD::SMIN, MVT::i64, Legal);
    setOperationAction(ISD::UMIN, MVT::i32, Legal);
    setOperationAction(ISD::UMIN, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
    setOperationAction(ISD::CTPOP, MVT::i128, Custom);

    setOperationAction(ISD::PARITY, MVT::i64, Custom);
    setOperationAction(ISD::PARITY, MVT::i128, Custom);

    setOperationAction(ISD::ABS, MVT::i32, Custom);
    setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
  }
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::UADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::UADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::USUBO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  if (Subtarget->hasFullFP16()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Custom);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
  }

  for (auto Op : {ISD::FREM,          ISD::FPOW,         ISD::FPOWI,
                  ISD::FCOS,          ISD::FSIN,         ISD::FSINCOS,
                  ISD::FSINCOSPI,     ISD::FMODF,        ISD::FACOS,
                  ISD::FASIN,         ISD::FATAN,        ISD::FATAN2,
                  ISD::FCOSH,         ISD::FSINH,        ISD::FTANH,
                  ISD::FTAN,          ISD::FEXP,         ISD::FEXP2,
                  ISD::FEXP10,        ISD::FLOG,         ISD::FLOG2,
                  ISD::FLOG10,        ISD::STRICT_FREM,  ISD::STRICT_FPOW,
                  ISD::STRICT_FPOWI,  ISD::STRICT_FCOS,  ISD::STRICT_FSIN,
                  ISD::STRICT_FACOS,  ISD::STRICT_FASIN, ISD::STRICT_FATAN,
                  ISD::STRICT_FATAN2, ISD::STRICT_FCOSH, ISD::STRICT_FSINH,
                  ISD::STRICT_FTANH,  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,
                  ISD::STRICT_FLOG,   ISD::STRICT_FLOG2, ISD::STRICT_FLOG10,
                  ISD::STRICT_FTAN}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::v4f16, Expand);
    setOperationAction(Op, MVT::v8f16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    setOperationAction(Op, MVT::v4bf16, Expand);
    setOperationAction(Op, MVT::v8bf16, Expand);
  }

  // fpextend from f16 or bf16 to f32 is legal
  setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Legal);
  // fpextend from bf16 to f64 needs to be split into two fpextends
  setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);

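  // Helper that sets the operation actions (Promote/Expand/Custom/Legal) for a
  // narrow FP scalar type (f16 when FullFP16 is unavailable, and bf16) and its
  // 4- and 8-element vector types.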
  auto LegalizeNarrowFP = [this](MVT ScalarVT) {
    for (auto Op : {
             ISD::SETCC,
             ISD::SELECT_CC,
             ISD::BR_CC,
             ISD::FADD,
             ISD::FSUB,
             ISD::FMUL,
             ISD::FDIV,
             ISD::FMA,
             ISD::FCEIL,
             ISD::FSQRT,
             ISD::FFLOOR,
             ISD::FNEARBYINT,
             ISD::FRINT,
             ISD::FROUND,
             ISD::FROUNDEVEN,
             ISD::FTRUNC,
             ISD::FMINNUM,
             ISD::FMAXNUM,
             ISD::FMINIMUM,
             ISD::FMAXIMUM,
             ISD::FMINIMUMNUM,
             ISD::FMAXIMUMNUM,
             ISD::FCANONICALIZE,
             ISD::STRICT_FADD,
             ISD::STRICT_FSUB,
             ISD::STRICT_FMUL,
             ISD::STRICT_FDIV,
             ISD::STRICT_FMA,
             ISD::STRICT_FCEIL,
             ISD::STRICT_FFLOOR,
             ISD::STRICT_FSQRT,
             ISD::STRICT_FRINT,
             ISD::STRICT_FNEARBYINT,
             ISD::STRICT_FROUND,
             ISD::STRICT_FTRUNC,
             ISD::STRICT_FROUNDEVEN,
             ISD::STRICT_FMINNUM,
             ISD::STRICT_FMAXNUM,
             ISD::STRICT_FMINIMUM,
             ISD::STRICT_FMAXIMUM,
         })
      setOperationAction(Op, ScalarVT, Promote);

    for (auto Op : {ISD::FNEG, ISD::FABS})
      setOperationAction(Op, ScalarVT, Legal);

    // Round-to-integer operations need custom lowering for fp16, as Promote
    // doesn't work because the result type is integer.
    for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
                    ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
                    ISD::STRICT_LLRINT})
      setOperationAction(Op, ScalarVT, Custom);

    // promote v4f16 to v4f32 when that is known to be safe.
    auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
    setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
    setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);

    setOperationAction(ISD::FABS, V4Narrow, Legal);
    setOperationAction(ISD::FNEG, V4Narrow, Legal);
    setOperationAction(ISD::FMA, V4Narrow, Expand);
    setOperationAction(ISD::BR_CC, V4Narrow, Expand);
    setOperationAction(ISD::SELECT, V4Narrow, Expand);
    setOperationAction(ISD::SELECT_CC, V4Narrow, Expand);
    setOperationAction(ISD::FCOPYSIGN, V4Narrow, Custom);
    setOperationAction(ISD::FSQRT, V4Narrow, Expand);

    auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
    setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
    setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);

    setOperationAction(ISD::FABS, V8Narrow, Legal);
    setOperationAction(ISD::FADD, V8Narrow, Legal);
    setOperationAction(ISD::FCEIL, V8Narrow, Legal);
    setOperationAction(ISD::FCOPYSIGN, V8Narrow, Custom);
    setOperationAction(ISD::FDIV, V8Narrow, Legal);
    setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
    setOperationAction(ISD::FMA, V8Narrow, Expand);
    setOperationAction(ISD::FMUL, V8Narrow, Legal);
    setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
    setOperationAction(ISD::FNEG, V8Narrow, Legal);
    setOperationAction(ISD::FROUND, V8Narrow, Legal);
    setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
    setOperationAction(ISD::FRINT, V8Narrow, Legal);
    setOperationAction(ISD::FSQRT, V8Narrow, Expand);
    setOperationAction(ISD::FSUB, V8Narrow, Legal);
    setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
    setOperationAction(ISD::BR_CC, V8Narrow, Expand);
    setOperationAction(ISD::SELECT, V8Narrow, Expand);
    setOperationAction(ISD::SELECT_CC, V8Narrow, Expand);
    setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
  };

  if (!Subtarget->hasFullFP16()) {
    LegalizeNarrowFP(MVT::f16);
  }
  LegalizeNarrowFP(MVT::bf16);
  setOperationAction(ISD::FP_ROUND, MVT::v4f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::v4bf16, Custom);

  // AArch64 has implementations of a lot of rounding-like FP operations.
  // clang-format off
  for (auto Op :
       {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
        ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
        ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
        ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
        ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
        ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
        ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
        ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
        ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
        ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
        ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }
  // clang-format on

  // Basic strict FP operations are legal
  for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
                  ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
    for (MVT Ty : {MVT::f32, MVT::f64})
      setOperationAction(Op, Ty, Legal);
    if (Subtarget->hasFullFP16())
      setOperationAction(Op, MVT::f16, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
  setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
  setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
  setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
  setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);

  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
  if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
  } else {
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
  }
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);

  // Generate outline atomics library calls only if LSE was not specified for
  // the subtarget.
  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
  }

  if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);

    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
    setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
  }

  if (Subtarget->hasLSE128()) {
    // Custom lowering because i128 is not legal. Must be replaced by 2x64
    // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
  }

  // 128-bit loads and stores can be done without expanding
  setOperationAction(ISD::LOAD, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);

  // Aligned 128-bit loads and stores are single-copy atomic according to the
  // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
  if (Subtarget->hasLSE2()) {
    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
  }

  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
  // custom lowering, as there are no un-paired non-temporal stores and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::STORE, MVT::v32i8, Custom);
  setOperationAction(ISD::STORE, MVT::v16i16, Custom);
  setOperationAction(ISD::STORE, MVT::v16f16, Custom);
  setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v8f32, Custom);
  setOperationAction(ISD::STORE, MVT::v4f64, Custom);
  setOperationAction(ISD::STORE, MVT::v4i64, Custom);

  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
  // custom lowering, as there are no un-paired non-temporal loads and
  // legalization will break up 256 bit inputs.
  setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
  setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i64, Custom);

  // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
      getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
    // Issue __sincos_stret if available.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they don't
  // become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  for (MVT WideVT : MVT::fp_valuetypes()) {
    for (MVT NarrowVT : MVT::fp_valuetypes()) {
      if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
        setTruncStoreAction(WideVT, NarrowVT, Expand);
      }
    }
  }

  if (Subtarget->hasFPARMv8()) {
    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
    setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
  }

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedLoadAction(im, MVT::f16, Legal);
    setIndexedLoadAction(im, MVT::bf16, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::f16, Legal);
    setIndexedStoreAction(im, MVT::bf16, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
  // Try to create BICs for vector ANDs.
  setTargetDAGCombine(ISD::AND);

  // llvm.init.trampoline and llvm.adjust.trampoline
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
1125 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
1126 ISD::UINT_TO_FP});
1127
1128 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1129 ISD::FP_TO_UINT_SAT, ISD::FADD});
1130
1131 // Try and combine setcc with csel
1132 setTargetDAGCombine(ISD::SETCC);
1133
1134 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
1135
1136 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
1137 ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS,
1138 ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR,
1139 ISD::STORE, ISD::BUILD_VECTOR});
1140 setTargetDAGCombine(ISD::TRUNCATE);
1141 setTargetDAGCombine(ISD::LOAD);
1142
1143 setTargetDAGCombine(ISD::MSTORE);
1144
1145 setTargetDAGCombine(ISD::MUL);
1146
1147 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
1148
1149 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
1150 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
1151 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1152
1153 setTargetDAGCombine(
1154 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1155
1156 setTargetDAGCombine(ISD::FP_EXTEND);
1157
1158 setTargetDAGCombine(ISD::GlobalAddress);
1159
1160 setTargetDAGCombine(ISD::CTLZ);
1161
1162 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1163
1164 setTargetDAGCombine(ISD::VECREDUCE_AND);
1165 setTargetDAGCombine(ISD::VECREDUCE_OR);
1166 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1167
1168 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1169
1170 setTargetDAGCombine(ISD::SHL);
1171
1172 // In case of strict alignment, avoid an excessive number of byte wide stores.
1173 MaxStoresPerMemsetOptSize = 8;
1174 MaxStoresPerMemset =
1175 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1176
1177 MaxGluedStoresPerMemcpy = 4;
1178 MaxStoresPerMemcpyOptSize = 4;
1179 MaxStoresPerMemcpy =
1180 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1181
1182 MaxStoresPerMemmoveOptSize = 4;
1183 MaxStoresPerMemmove =
1184 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1185
1186 MaxLoadsPerMemcmpOptSize = 4;
1187 MaxLoadsPerMemcmp =
1188 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
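// These limits bound how many inline loads/stores SelectionDAG will emit when
// expanding memset/memcpy/memmove/memcmp before falling back to the library
// call; strict-alignment targets get lower limits because the expansion
// cannot use wide unaligned accesses.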
1189
1190 setStackPointerRegisterToSaveRestore(AArch64::SP);
1191
1192 setSchedulingPreference(Sched::Hybrid);
1193
1194 EnableExtLdPromotion = true;
1195
1196 // Set required alignment.
1197 setMinFunctionAlignment(Align(4));
1198 // Set preferred alignments.
1199
1200 // Don't align loops on Windows. The SEH unwind info generation needs to
1201 // know the exact length of functions before the alignments have been
1202 // expanded.
1203 if (!Subtarget->isTargetWindows())
1204 setPrefLoopAlignment(STI.getPrefLoopAlignment());
1205 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
1206 setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
1207
1208 // Only change the limit for entries in a jump table if specified by
1209 // the subtarget and not overridden on the command line.
1210 unsigned MaxJT = STI.getMaximumJumpTableSize();
1211 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1212 setMaximumJumpTableSize(MaxJT);
1213
1214 setHasExtractBitsInsn(true);
1215
1216 setMaxDivRemBitWidthSupported(128);
1217
1218 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom);
1219 if (Subtarget->hasSME())
1220 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i1, Action: Custom);
1221
1222 if (Subtarget->isNeonAvailable()) {
1223 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1224 // silliness like this:
1225 // clang-format off
1226 for (auto Op :
1227 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1228 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1229 ISD::FMUL, ISD::FDIV, ISD::FMA,
1230 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1231 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1232 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1233 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1234 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1235 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1236 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1237 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1238 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1239 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1240 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1241 ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
1242 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL,
1243 ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT,
1244 ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC,
1245 ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
1246 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM})
1247 setOperationAction(Op, VT: MVT::v1f64, Action: Expand);
1248 // clang-format on
1249
1250 for (auto Op :
1251 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
1252 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
1253 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
1254 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
1255 setOperationAction(Op, VT: MVT::v1i64, Action: Expand);
1256
1257 // AArch64 doesn't have direct vector -> f32 conversion instructions for
1258 // elements smaller than i32, so promote the input to i32 first.
1259 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32);
1260 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32);
1261
1262 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1263 // nor a direct i32 -> f16 vector conversion. Set these to Custom so the
1264 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16.
1265 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1266 ISD::STRICT_UINT_TO_FP})
1267 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1268 setOperationAction(Op, VT, Action: Custom);
1269
1270 if (Subtarget->hasFullFP16()) {
1271 setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal);
1272 setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal);
1273
1274 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i8, Action: Custom);
1275 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i8, Action: Custom);
1276 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v16i8, Action: Custom);
1277 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v16i8, Action: Custom);
1278 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1279 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom);
1280 setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i16, Action: Custom);
1281 setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i16, Action: Custom);
1282 } else {
1283 // When AArch64 doesn't have full FP16 support, promote the input
1284 // to i32 first.
1285 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32);
1286 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32);
1287 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32);
1288 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32);
1289 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32);
1290 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32);
1291 setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32);
1292 setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32);
1293 }
1294
1295 setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Expand);
1296 setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Expand);
1297 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v8i8, Action: Legal);
1298 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v16i8, Action: Legal);
1299 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i32, Action: Custom);
1300 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v4i32, Action: Custom);
1301 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom);
1302 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i64, Action: Custom);
1303 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1304 setOperationAction(Op: ISD::UMAX, VT, Action: Custom);
1305 setOperationAction(Op: ISD::SMAX, VT, Action: Custom);
1306 setOperationAction(Op: ISD::UMIN, VT, Action: Custom);
1307 setOperationAction(Op: ISD::SMIN, VT, Action: Custom);
1308 }
1309
1310 // Custom handling for some quad-vector types to detect MULL.
1311 setOperationAction(Op: ISD::MUL, VT: MVT::v8i16, Action: Custom);
1312 setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Custom);
1313 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom);
1314 setOperationAction(Op: ISD::MUL, VT: MVT::v4i16, Action: Custom);
1315 setOperationAction(Op: ISD::MUL, VT: MVT::v2i32, Action: Custom);
1316 setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom);
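// The custom MUL lowering looks through sign/zero-extended operands so that,
// e.g., (mul (sext v4i16), (sext v4i16)) can be selected as SMULL rather than
// a full-width multiply.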
1317
1318 // Saturates
1319 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1320 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1321 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
1322 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
1323 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
1324 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
1325 }
1326
1327 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1328 MVT::v4i32}) {
1329 setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Legal);
1330 setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Legal);
1331 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Legal);
1332 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Legal);
1333 setOperationAction(Op: ISD::ABDS, VT, Action: Legal);
1334 setOperationAction(Op: ISD::ABDU, VT, Action: Legal);
1335 }
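// These correspond directly to the NEON halving adds ([SU]HADD), rounding
// halving adds ([SU]RHADD) and absolute differences ([SU]ABD).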
1336
1337 // Vector reductions
1338 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1339 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1340 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1341 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Legal);
1342 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Legal);
1343 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Legal);
1344 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Legal);
1345
1346 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Legal);
1347 }
1348 }
1349 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1350 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1351 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom);
1352 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom);
1353 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom);
1354 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom);
1355 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom);
1356 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1357 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1358 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1359 }
1360 setOperationAction(Op: ISD::VECREDUCE_ADD, VT: MVT::v2i64, Action: Custom);
1361 setOperationAction(Op: ISD::VECREDUCE_AND, VT: MVT::v2i64, Action: Custom);
1362 setOperationAction(Op: ISD::VECREDUCE_OR, VT: MVT::v2i64, Action: Custom);
1363 setOperationAction(Op: ISD::VECREDUCE_XOR, VT: MVT::v2i64, Action: Custom);
1364
1365 setOperationAction(Op: ISD::ANY_EXTEND, VT: MVT::v4i32, Action: Legal);
1366 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand);
1367 // Likewise, narrowing and extending vector loads/stores aren't handled
1368 // directly.
1369 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1370 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Expand);
1371
1372 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1373 setOperationAction(Op: ISD::MULHS, VT, Action: Legal);
1374 setOperationAction(Op: ISD::MULHU, VT, Action: Legal);
1375 } else {
1376 setOperationAction(Op: ISD::MULHS, VT, Action: Expand);
1377 setOperationAction(Op: ISD::MULHU, VT, Action: Expand);
1378 }
1379 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
1380 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
1381
1382 setOperationAction(Op: ISD::BSWAP, VT, Action: Expand);
1383 setOperationAction(Op: ISD::CTTZ, VT, Action: Expand);
1384
1385 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1386 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1387 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1388 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1389 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1390 }
1391 }
1392
1393 for (auto Op :
1394 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1395 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1396 ISD::STRICT_FFLOOR, ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL,
1397 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUND,
1398 ISD::STRICT_FROUNDEVEN}) {
1399 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1400 setOperationAction(Op, VT: Ty, Action: Legal);
1401 if (Subtarget->hasFullFP16())
1402 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1403 setOperationAction(Op, VT: Ty, Action: Legal);
1404 }
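// The rounding operations above correspond to the vector FRINT* family
// (FRINTM/FRINTP/FRINTA/FRINTN/FRINTZ/FRINTX/FRINTI), and the IEEE min/max
// forms to FMINNM/FMAXNM.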
1405
1406 // LRINT and LLRINT.
1407 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1408 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1409 setOperationAction(Op, VT: Ty, Action: Custom);
1410 if (Subtarget->hasFullFP16())
1411 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1412 setOperationAction(Op, VT: Ty, Action: Custom);
1413 }
1414
1415 setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1416
1417 setOperationAction(Op: ISD::BITCAST, VT: MVT::i2, Action: Custom);
1418 setOperationAction(Op: ISD::BITCAST, VT: MVT::i4, Action: Custom);
1419 setOperationAction(Op: ISD::BITCAST, VT: MVT::i8, Action: Custom);
1420 setOperationAction(Op: ISD::BITCAST, VT: MVT::i16, Action: Custom);
1421
1422 setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i8, Action: Custom);
1423 setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i16, Action: Custom);
1424 setOperationAction(Op: ISD::BITCAST, VT: MVT::v4i8, Action: Custom);
1425
1426 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1427 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1428 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom);
1429 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1430 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1431 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom);
1432
1433 // ADDP custom lowering
1434 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1435 setOperationAction(Op: ISD::ADD, VT, Action: Custom);
1436 // FADDP custom lowering
1437 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1438 setOperationAction(Op: ISD::FADD, VT, Action: Custom);
1439
1440 if (Subtarget->hasDotProd()) {
1441 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1442 ISD::PARTIAL_REDUCE_UMLA};
1443
1444 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v4i32, InputVT: MVT::v16i8, Action: Legal);
1445 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i32, InputVT: MVT::v8i8, Action: Legal);
1446 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v16i8, Action: Custom);
1447
1448 if (Subtarget->hasMatMulInt8()) {
1449 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v4i32,
1450 InputVT: MVT::v16i8, Action: Legal);
1451 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v2i64,
1452 InputVT: MVT::v16i8, Action: Custom);
1453
1454 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::v2i32,
1455 InputVT: MVT::v8i8, Action: Legal);
1456 }
1457 }
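// With +dotprod the v16i8 -> v4i32 and v8i8 -> v2i32 cases can use UDOT/SDOT
// directly; the mixed-signedness SUMLA form additionally needs USDOT from
// +i8mm.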
1458
1459 } else /* !isNeonAvailable */ {
1460 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1461 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1462 setOperationAction(Op, VT, Action: Expand);
1463
1464 if (VT.is128BitVector() || VT.is64BitVector()) {
1465 setOperationAction(Op: ISD::LOAD, VT, Action: Legal);
1466 setOperationAction(Op: ISD::STORE, VT, Action: Legal);
1467 setOperationAction(Op: ISD::BITCAST, VT,
1468 Action: Subtarget->isLittleEndian() ? Legal : Expand);
1469 }
1470 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1471 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1472 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1473 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1474 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1475 }
1476 }
1477 }
1478
1479 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1480 setOperationAction(Op: ISD::TRUNCATE_SSAT_S, VT, Action: Legal);
1481 setOperationAction(Op: ISD::TRUNCATE_SSAT_U, VT, Action: Legal);
1482 setOperationAction(Op: ISD::TRUNCATE_USAT_U, VT, Action: Legal);
1483 }
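// These saturating truncations roughly correspond to the SQXTN (signed ->
// signed), SQXTUN (signed -> unsigned) and UQXTN (unsigned -> unsigned)
// narrowing instructions.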
1484
1485 if (Subtarget->hasSME()) {
1486 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom);
1487 }
1488
1489 // FIXME: Move lowering for more nodes here if those are common between
1490 // SVE and SME.
1491 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1492 for (auto VT :
1493 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1494 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Custom);
1495 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom);
1496 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1497 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1498 }
1499 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1500 setOperationAction(Op: ISD::VECTOR_FIND_LAST_ACTIVE, VT, Action: Legal);
1501 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT, Action: Legal);
1502 }
1503
1504 if (Subtarget->hasSVE2p1() ||
1505 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1506 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT: MVT::nxv32i1, Action: Custom);
1507
1508 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1509 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT, Action: Custom);
1510 }
1511
1512 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1513 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1514 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom);
1515 setOperationAction(Op: ISD::BSWAP, VT, Action: Custom);
1516 setOperationAction(Op: ISD::CTLZ, VT, Action: Custom);
1517 setOperationAction(Op: ISD::CTPOP, VT, Action: Custom);
1518 setOperationAction(Op: ISD::CTTZ, VT, Action: Custom);
1519 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1520 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom);
1521 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom);
1522 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Custom);
1523 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Custom);
1524 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1525 setOperationAction(Op: ISD::MUL, VT, Action: Custom);
1526 setOperationAction(Op: ISD::MULHS, VT, Action: Custom);
1527 setOperationAction(Op: ISD::MULHU, VT, Action: Custom);
1528 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1529 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1530 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1531 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1532 setOperationAction(Op: ISD::SDIV, VT, Action: Custom);
1533 setOperationAction(Op: ISD::UDIV, VT, Action: Custom);
1534 setOperationAction(Op: ISD::SMIN, VT, Action: Custom);
1535 setOperationAction(Op: ISD::UMIN, VT, Action: Custom);
1536 setOperationAction(Op: ISD::SMAX, VT, Action: Custom);
1537 setOperationAction(Op: ISD::UMAX, VT, Action: Custom);
1538 setOperationAction(Op: ISD::SHL, VT, Action: Custom);
1539 setOperationAction(Op: ISD::SRL, VT, Action: Custom);
1540 setOperationAction(Op: ISD::SRA, VT, Action: Custom);
1541 setOperationAction(Op: ISD::ABS, VT, Action: Custom);
1542 setOperationAction(Op: ISD::ABDS, VT, Action: Custom);
1543 setOperationAction(Op: ISD::ABDU, VT, Action: Custom);
1544 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom);
1545 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1546 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1547 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1548 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom);
1549 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom);
1550 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom);
1551 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom);
1552 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1553 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1554
1555 setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand);
1556 setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand);
1557 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1558 setOperationAction(Op: ISD::ROTL, VT, Action: Expand);
1559 setOperationAction(Op: ISD::ROTR, VT, Action: Expand);
1560
1561 setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal);
1562 setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal);
1563 setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal);
1564 setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal);
1565 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
1566 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
1567 setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand);
1568 setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand);
1569
1570 setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Custom);
1571 setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Custom);
1572 setOperationAction(Op: ISD::AVGCEILS, VT, Action: Custom);
1573 setOperationAction(Op: ISD::AVGCEILU, VT, Action: Custom);
1574
1575 if (!Subtarget->isLittleEndian())
1576 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1577
1578 if (Subtarget->hasSVE2() ||
1579 (Subtarget->hasSME() && Subtarget->isStreaming()))
1580 // For SLI/SRI.
1581 setOperationAction(Op: ISD::OR, VT, Action: Custom);
1582 }
1583
1584 // Illegal unpacked integer vector types.
1585 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1586 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom);
1587 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1588 }
1589
1590 // Type legalize unpacked bitcasts.
1591 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1592 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1593
1594 for (auto VT :
1595 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1596 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1597 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Legal);
1598
1599 for (auto VT :
1600 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1601 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1602 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1603 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1604 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1605 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1606 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1607 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1608
1609 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1610 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom);
1611 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1612
1613 // There are no legal MVT::nxv16f## based types.
1614 if (VT != MVT::nxv16i1) {
1615 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Custom);
1616 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Custom);
1617 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom);
1618 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom);
1619 }
1620 }
1621
1622 // NEON doesn't support masked loads/stores, but SME and SVE do.
1623 for (auto VT :
1624 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1625 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1626 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1627 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1628 setOperationAction(Op: ISD::MSTORE, VT, Action: Custom);
1629 }
1630
1631 // First, mark all scalable-vector extending loads and truncating stores as
1632 // Expand, covering both integer and floating-point scalable vectors.
1633 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1634 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1635 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand);
1636 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1637 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1638 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand);
1639 }
1640 }
1641
1642 // Then, selectively enable those which we directly support.
1643 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal);
1644 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal);
1645 setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal);
1646 setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal);
1647 setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal);
1648 setTruncStoreAction(ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal);
1649 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1650 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal);
1651 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal);
1652 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal);
1653 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal);
1654 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal);
1655 setLoadExtAction(ExtType: Op, ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal);
1656 }
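// The combinations above are the ones the SVE widening loads (LD1B/LD1H/LD1W
// and their sign-extending forms) and narrowing stores (ST1B/ST1H/ST1W) can
// handle directly.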
1657
1658 // SVE supports truncating stores of 64-bit and 128-bit vectors.
1659 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Custom);
1660 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Custom);
1661 setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Custom);
1662 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Custom);
1663 setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Custom);
1664
1665 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1666 MVT::nxv4f32, MVT::nxv2f64}) {
1667 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1668 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1669 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1670 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1671 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1672 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1673 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
1674 setOperationAction(Op: ISD::FADD, VT, Action: Custom);
1675 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
1676 setOperationAction(Op: ISD::FDIV, VT, Action: Custom);
1677 setOperationAction(Op: ISD::FMA, VT, Action: Custom);
1678 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Custom);
1679 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Custom);
1680 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Custom);
1681 setOperationAction(Op: ISD::FMINNUM, VT, Action: Custom);
1682 setOperationAction(Op: ISD::FMUL, VT, Action: Custom);
1683 setOperationAction(Op: ISD::FNEG, VT, Action: Custom);
1684 setOperationAction(Op: ISD::FSUB, VT, Action: Custom);
1685 setOperationAction(Op: ISD::FCEIL, VT, Action: Custom);
1686 setOperationAction(Op: ISD::FFLOOR, VT, Action: Custom);
1687 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Custom);
1688 setOperationAction(Op: ISD::FRINT, VT, Action: Custom);
1689 setOperationAction(Op: ISD::LRINT, VT, Action: Custom);
1690 setOperationAction(Op: ISD::LLRINT, VT, Action: Custom);
1691 setOperationAction(Op: ISD::FROUND, VT, Action: Custom);
1692 setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Custom);
1693 setOperationAction(Op: ISD::FTRUNC, VT, Action: Custom);
1694 setOperationAction(Op: ISD::FSQRT, VT, Action: Custom);
1695 setOperationAction(Op: ISD::FABS, VT, Action: Custom);
1696 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom);
1697 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1698 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom);
1699 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Custom);
1700 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Custom);
1701 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Custom);
1702 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Custom);
1703 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1704 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1705 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1706
1707 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1708 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
1709 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
1710 setOperationAction(Op: ISD::FPOWI, VT, Action: Expand);
1711 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
1712 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
1713 setOperationAction(Op: ISD::FSINCOS, VT, Action: Expand);
1714 setOperationAction(Op: ISD::FTAN, VT, Action: Expand);
1715 setOperationAction(Op: ISD::FACOS, VT, Action: Expand);
1716 setOperationAction(Op: ISD::FASIN, VT, Action: Expand);
1717 setOperationAction(Op: ISD::FATAN, VT, Action: Expand);
1718 setOperationAction(Op: ISD::FATAN2, VT, Action: Expand);
1719 setOperationAction(Op: ISD::FCOSH, VT, Action: Expand);
1720 setOperationAction(Op: ISD::FSINH, VT, Action: Expand);
1721 setOperationAction(Op: ISD::FTANH, VT, Action: Expand);
1722 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
1723 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
1724 setOperationAction(Op: ISD::FEXP10, VT, Action: Expand);
1725 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
1726 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
1727 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
1728
1729 setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand);
1730 setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand);
1731 setCondCodeAction(CCs: ISD::SETLT, VT, Action: Expand);
1732 setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand);
1733 setCondCodeAction(CCs: ISD::SETLE, VT, Action: Expand);
1734 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
1735 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
1736 setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand);
1737 setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand);
1738 setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand);
1739 setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand);
1740 }
1741
1742 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1743 setOperationAction(Op: ISD::BITCAST, VT, Action: Custom);
1744 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom);
1745 setOperationAction(Op: ISD::FABS, VT, Action: Legal);
1746 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
1747 setOperationAction(Op: ISD::FNEG, VT, Action: Legal);
1748 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom);
1749 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1750 setOperationAction(Op: ISD::MLOAD, VT, Action: Custom);
1751 setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom);
1752 setOperationAction(Op: ISD::SELECT, VT, Action: Custom);
1753 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
1754 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal);
1755 setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom);
1756 setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom);
1757 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom);
1758
1759 if (Subtarget->hasSVEB16B16()) {
1760 setOperationAction(Op: ISD::FADD, VT, Action: Legal);
1761 setOperationAction(Op: ISD::FMA, VT, Action: Custom);
1762 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Custom);
1763 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Custom);
1764 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Custom);
1765 setOperationAction(Op: ISD::FMINNUM, VT, Action: Custom);
1766 setOperationAction(Op: ISD::FMUL, VT, Action: Legal);
1767 setOperationAction(Op: ISD::FSUB, VT, Action: Legal);
1768 }
1769 }
1770
1771 for (auto Opcode :
1772 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1773 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1774 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1775 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1776 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv2bf16, DestVT: MVT::nxv2f32);
1777 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv4bf16, DestVT: MVT::nxv4f32);
1778 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv8bf16, DestVT: MVT::nxv8f32);
1779 }
1780
1781 if (!Subtarget->hasSVEB16B16()) {
1782 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1783 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1784 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv2bf16, DestVT: MVT::nxv2f32);
1785 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv4bf16, DestVT: MVT::nxv4f32);
1786 setOperationPromotedToType(Opc: Opcode, OrigVT: MVT::nxv8bf16, DestVT: MVT::nxv8f32);
1787 }
1788 }
1789
1790 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i8, Action: Custom);
1791 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i16, Action: Custom);
1792
1793 // NEON doesn't support integer divides, but SVE does
1794 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1795 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1796 setOperationAction(Op: ISD::SDIV, VT, Action: Custom);
1797 setOperationAction(Op: ISD::UDIV, VT, Action: Custom);
1798 }
1799
1800 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1801 setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom);
1802 setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom);
1803
1804 // NOTE: Currently this has to happen after computeRegisterProperties rather
1805 // than the preferred option of combining it with the addRegisterClass call.
1806 if (Subtarget->useSVEForFixedLengthVectors()) {
1807 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
1808 if (useSVEForFixedLengthVectorVT(
1809 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1810 addTypeForFixedLengthSVE(VT);
1811 }
1812 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
1813 if (useSVEForFixedLengthVectorVT(
1814 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1815 addTypeForFixedLengthSVE(VT);
1816 }
1817
1818 // 64-bit results can mean a bigger-than-NEON input.
1819 for (auto VT : {MVT::v8i8, MVT::v4i16})
1820 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1821 setOperationAction(Op: ISD::FP_ROUND, VT: MVT::v4f16, Action: Custom);
1822
1823 // 128-bit results imply a bigger-than-NEON input.
1824 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1825 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom);
1826 for (auto VT : {MVT::v8f16, MVT::v4f32})
1827 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom);
1828
1829 // These operations are not supported on NEON but SVE can do them.
1830 setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom);
1831 setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Custom);
1832 setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Custom);
1833 setOperationAction(Op: ISD::CTTZ, VT: MVT::v1i64, Action: Custom);
1834 setOperationAction(Op: ISD::MULHS, VT: MVT::v1i64, Action: Custom);
1835 setOperationAction(Op: ISD::MULHS, VT: MVT::v2i64, Action: Custom);
1836 setOperationAction(Op: ISD::MULHU, VT: MVT::v1i64, Action: Custom);
1837 setOperationAction(Op: ISD::MULHU, VT: MVT::v2i64, Action: Custom);
1838 setOperationAction(Op: ISD::SMAX, VT: MVT::v1i64, Action: Custom);
1839 setOperationAction(Op: ISD::SMAX, VT: MVT::v2i64, Action: Custom);
1840 setOperationAction(Op: ISD::SMIN, VT: MVT::v1i64, Action: Custom);
1841 setOperationAction(Op: ISD::SMIN, VT: MVT::v2i64, Action: Custom);
1842 setOperationAction(Op: ISD::UMAX, VT: MVT::v1i64, Action: Custom);
1843 setOperationAction(Op: ISD::UMAX, VT: MVT::v2i64, Action: Custom);
1844 setOperationAction(Op: ISD::UMIN, VT: MVT::v1i64, Action: Custom);
1845 setOperationAction(Op: ISD::UMIN, VT: MVT::v2i64, Action: Custom);
1846 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT: MVT::v2i64, Action: Custom);
1847 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT: MVT::v2i64, Action: Custom);
1848 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT: MVT::v2i64, Action: Custom);
1849 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT: MVT::v2i64, Action: Custom);
1850
1851 // Int operations with no NEON support.
1852 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1853 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1854 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom);
1855 setOperationAction(Op: ISD::CTTZ, VT, Action: Custom);
1856 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom);
1857 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom);
1858 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom);
1859 setOperationAction(Op: ISD::MULHS, VT, Action: Custom);
1860 setOperationAction(Op: ISD::MULHU, VT, Action: Custom);
1861 }
1862
1863 // Use SVE for vectors with more than 2 elements.
1864 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1865 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom);
1866 }
1867
1868 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv2i1, DestVT: MVT::nxv2i64);
1869 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv4i1, DestVT: MVT::nxv4i32);
1870 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv8i1, DestVT: MVT::nxv8i16);
1871 setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv16i1, DestVT: MVT::nxv16i8);
1872
1873 setOperationAction(Op: ISD::VSCALE, VT: MVT::i32, Action: Custom);
1874
1875 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1876 setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT, Action: Custom);
1877 }
1878
1879 // Handle partial reduction operations
1880 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1881 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1882 // Other pairs will default to 'Expand'.
1883 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1884 ISD::PARTIAL_REDUCE_UMLA};
1885 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv2i64, InputVT: MVT::nxv8i16, Action: Legal);
1886 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv4i32, InputVT: MVT::nxv16i8, Action: Legal);
1887
1888 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv2i64, InputVT: MVT::nxv16i8, Action: Custom);
1889
1890 if (Subtarget->hasMatMulInt8()) {
1891 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::nxv4i32,
1892 InputVT: MVT::nxv16i8, Action: Legal);
1893 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: MVT::nxv2i64,
1894 InputVT: MVT::nxv16i8, Action: Custom);
1895 }
1896
1897 // Wide add types
1898 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1899 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv2i64, InputVT: MVT::nxv4i32, Action: Legal);
1900 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv4i32, InputVT: MVT::nxv8i16, Action: Legal);
1901 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::nxv8i16, InputVT: MVT::nxv16i8, Action: Legal);
1902 }
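// These 2x-widening accumulations can typically be selected as the SVE2
// wide-add bottom/top pairs ([SU]ADDWB/[SU]ADDWT) instead of a dot product.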
1903 }
1904
1905 // Handle operations that are only available in non-streaming SVE mode.
1906 if (Subtarget->isSVEAvailable()) {
1907 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1908 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1909 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1910 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1911 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1912 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1913 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1914 setOperationAction(Op: ISD::MGATHER, VT, Action: Custom);
1915 setOperationAction(Op: ISD::MSCATTER, VT, Action: Custom);
1916 }
1917
1918 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1919 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1920 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1921 setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: Custom);
1922
1923 // We can lower types that have <vscale x {2|4}> elements to the SVE COMPACT instruction.
1924 for (auto VT :
1925 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1926 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1927 setOperationAction(Op: ISD::VECTOR_COMPRESS, VT, Action: Custom);
1928
1929 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1930 // NEON vectors in the lowest bits of the SVE register.
1931 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1932 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1933 setOperationAction(Op: ISD::VECTOR_COMPRESS, VT, Action: Custom);
1934
1935 // Histcnt is SVE2 only
1936 if (Subtarget->hasSVE2()) {
1937 setOperationAction(Op: ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VT: MVT::nxv4i32,
1938 Action: Custom);
1939 setOperationAction(Op: ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VT: MVT::nxv2i64,
1940 Action: Custom);
1941
1942 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1943 ISD::PARTIAL_REDUCE_UMLA};
1944 // Must be lowered to SVE instructions.
1945 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v4i32, Action: Custom);
1946 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v8i16, Action: Custom);
1947 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v2i64, InputVT: MVT::v16i8, Action: Custom);
1948 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v4i32, InputVT: MVT::v8i16, Action: Custom);
1949 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v4i32, InputVT: MVT::v16i8, Action: Custom);
1950 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: MVT::v8i16, InputVT: MVT::v16i8, Action: Custom);
1951 }
1952 }
1953
1954 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1955 // Only required for llvm.aarch64.mops.memset.tag
1956 setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i8, Action: Custom);
1957 }
1958
1959 setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::Other, Action: Custom);
1960
1961 if (Subtarget->hasSVE()) {
1962 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f64, Action: Custom);
1963 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f32, Action: Custom);
1964 setOperationAction(Op: ISD::FLDEXP, VT: MVT::f16, Action: Custom);
1965 setOperationAction(Op: ISD::FLDEXP, VT: MVT::bf16, Action: Custom);
1966 }
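// With SVE available, scalar FLDEXP can be lowered using the SVE FSCALE
// instruction rather than a libcall.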
1967
1968 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1969
1970 IsStrictFPEnabled = true;
1971 setMaxAtomicSizeInBitsSupported(128);
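// Atomic operations up to 128 bits are expanded inline (e.g. via CASP with
// LSE or an LDXP/STXP loop); anything larger becomes an atomic library call.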
1972
1973 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1974 // it, but it's just a wrapper around ldexp.
1975 if (Subtarget->isTargetWindows()) {
1976 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1977 if (isOperationExpand(Op, VT: MVT::f32))
1978 setOperationAction(Op, VT: MVT::f32, Action: Promote);
1979 }
1980
1981 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1982 // isn't legal.
1983 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1984 if (isOperationExpand(Op, VT: MVT::f16))
1985 setOperationAction(Op, VT: MVT::f16, Action: Promote);
1986}
1987
1988void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1989 assert(VT.isVector() && "VT should be a vector type");
1990
1991 if (VT.isFloatingPoint()) {
1992 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1993 setOperationPromotedToType(Opc: ISD::LOAD, OrigVT: VT, DestVT: PromoteTo);
1994 setOperationPromotedToType(Opc: ISD::STORE, OrigVT: VT, DestVT: PromoteTo);
1995 }
1996
1997 // Mark vector float intrinsics as expand.
1998 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1999 setOperationAction(Op: ISD::FSIN, VT, Action: Expand);
2000 setOperationAction(Op: ISD::FCOS, VT, Action: Expand);
2001 setOperationAction(Op: ISD::FTAN, VT, Action: Expand);
2002 setOperationAction(Op: ISD::FASIN, VT, Action: Expand);
2003 setOperationAction(Op: ISD::FACOS, VT, Action: Expand);
2004 setOperationAction(Op: ISD::FATAN, VT, Action: Expand);
2005 setOperationAction(Op: ISD::FATAN2, VT, Action: Expand);
2006 setOperationAction(Op: ISD::FSINH, VT, Action: Expand);
2007 setOperationAction(Op: ISD::FCOSH, VT, Action: Expand);
2008 setOperationAction(Op: ISD::FTANH, VT, Action: Expand);
2009 setOperationAction(Op: ISD::FPOW, VT, Action: Expand);
2010 setOperationAction(Op: ISD::FLOG, VT, Action: Expand);
2011 setOperationAction(Op: ISD::FLOG2, VT, Action: Expand);
2012 setOperationAction(Op: ISD::FLOG10, VT, Action: Expand);
2013 setOperationAction(Op: ISD::FEXP, VT, Action: Expand);
2014 setOperationAction(Op: ISD::FEXP2, VT, Action: Expand);
2015 setOperationAction(Op: ISD::FEXP10, VT, Action: Expand);
2016 }
2017
2018 // But we do support custom-lowering for FCOPYSIGN.
2019 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2020 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2021 VT == MVT::v8f16) &&
2022 Subtarget->hasFullFP16()))
2023 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom);
2024
2025 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom);
2026 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom);
2027 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Custom);
2028 setOperationAction(Op: ISD::ZERO_EXTEND_VECTOR_INREG, VT, Action: Custom);
2029 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Custom);
2030 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom);
2031 setOperationAction(Op: ISD::SRA, VT, Action: Custom);
2032 setOperationAction(Op: ISD::SRL, VT, Action: Custom);
2033 setOperationAction(Op: ISD::SHL, VT, Action: Custom);
2034 setOperationAction(Op: ISD::OR, VT, Action: Custom);
2035 setOperationAction(Op: ISD::SETCC, VT, Action: Custom);
2036 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Legal);
2037
2038 setOperationAction(Op: ISD::SELECT, VT, Action: Expand);
2039 setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand);
2040 setOperationAction(Op: ISD::VSELECT, VT, Action: Expand);
2041 for (MVT InnerVT : MVT::all_valuetypes())
2042 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: InnerVT, MemVT: VT, Action: Expand);
2043
2044 // CNT supports only B element sizes; for wider elements, count bytes and then widen with UADDLP.
2045 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2046 setOperationAction(Op: ISD::CTPOP, VT, Action: Custom);
2047
2048 setOperationAction(Op: ISD::UDIV, VT, Action: Expand);
2049 setOperationAction(Op: ISD::SDIV, VT, Action: Expand);
2050 setOperationAction(Op: ISD::UREM, VT, Action: Expand);
2051 setOperationAction(Op: ISD::SREM, VT, Action: Expand);
2052 setOperationAction(Op: ISD::FREM, VT, Action: Expand);
2053
2054 for (unsigned Opcode :
2055 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
2056 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
2057 setOperationAction(Op: Opcode, VT, Action: Custom);
2058
2059 if (!VT.isFloatingPoint())
2060 setOperationAction(Op: ISD::ABS, VT, Action: Legal);
2061
2062 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2063 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2064 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2065 setOperationAction(Op: Opcode, VT, Action: Legal);
2066
2067 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2068 // NEON types.
2069 if (VT.isFloatingPoint() &&
2070 VT.getVectorElementType() != MVT::bf16 &&
2071 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2072 for (unsigned Opcode :
2073 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2074 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2075 ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
2076 ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
2077 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT})
2078 setOperationAction(Op: Opcode, VT, Action: Legal);
2079
2080 // Strict fp extend and trunc are legal
2081 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2082 setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT, Action: Legal);
2083 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2084 setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Legal);
2085
2086 // FIXME: We could potentially make use of the vector comparison instructions
2087 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2088 // complications:
2089 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2090 // so we would need to expand when the condition code doesn't match the
2091 // kind of comparison.
2092 // * Some kinds of comparison require more than one FCMXY instruction so
2093 // would need to be expanded instead.
2094 // * The lowering of the non-strict versions involves target-specific ISD
2095 // nodes so we would likely need to add strict versions of all of them and
2096 // handle them appropriately.
2097 setOperationAction(Op: ISD::STRICT_FSETCC, VT, Action: Expand);
2098 setOperationAction(Op: ISD::STRICT_FSETCCS, VT, Action: Expand);
2099
2100 // When little-endian we can use ordinary d and q register loads/stores for
2101 // vector types, but when big-endian we need to use structure loads/stores,
2102 // which only allow post-index addressing.
2103 if (Subtarget->isLittleEndian()) {
2104 for (unsigned im = (unsigned)ISD::PRE_INC;
2105 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2106 setIndexedLoadAction(IdxModes: im, VT, Action: Legal);
2107 setIndexedStoreAction(IdxModes: im, VT, Action: Legal);
2108 }
2109 } else {
2110 setIndexedLoadAction(IdxModes: ISD::POST_INC, VT, Action: Legal);
2111 setIndexedStoreAction(IdxModes: ISD::POST_INC, VT, Action: Legal);
2112 }
2113
2114 if (Subtarget->hasD128()) {
2115 setOperationAction(Op: ISD::READ_REGISTER, VT: MVT::i128, Action: Custom);
2116 setOperationAction(Op: ISD::WRITE_REGISTER, VT: MVT::i128, Action: Custom);
2117 }
2118
2119 if (VT.isInteger()) {
2120 // Let common code emit inverted variants of compares we do support.
2121 setCondCodeAction(CCs: ISD::SETNE, VT, Action: Expand);
2122 setCondCodeAction(CCs: ISD::SETLE, VT, Action: Expand);
2123 setCondCodeAction(CCs: ISD::SETLT, VT, Action: Expand);
2124 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
2125 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
2126 }
2127}
2128
2129bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
2130 EVT OpVT) const {
2131 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2132 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2133 ResVT.getVectorElementType() != MVT::i1)
2134 return true;
2135
2136 // Only support illegal types if the result is scalable and min elements > 1.
2137 if (ResVT.getVectorMinNumElements() == 1 ||
2138 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2139 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2140 return true;
2141
2142 // 32- and 64-bit operands are supported. We can promote anything < 64 bits,
2143 // but anything larger should be expanded.
2144 if (OpVT.getFixedSizeInBits() > 64)
2145 return true;
2146
2147 return false;
2148}
2149
2150bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
2151 const IntrinsicInst *I) const {
2152 assert(I->getIntrinsicID() ==
2153 Intrinsic::experimental_vector_partial_reduce_add &&
2154 "Unexpected intrinsic!");
2155 return true;
2156}
2157
2158bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
2159 if (!Subtarget->isSVEorStreamingSVEAvailable())
2160 return true;
2161
2162 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2163 // also support fixed-width predicates.
2164 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2165 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2166 VT != MVT::v4i1 && VT != MVT::v2i1;
2167}
2168
2169bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
2170 unsigned SearchSize) const {
2171 // MATCH is SVE2 and only available in non-streaming mode.
2172 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2173 return true;
2174 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2175 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2176 return SearchSize != 8;
2177 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2178 return SearchSize != 8 && SearchSize != 16;
2179 return true;
2180}
2181
2182void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2183 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2184
2185 // By default everything must be expanded.
2186 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2187 setOperationAction(Op, VT, Action: Expand);
2188
2189 if (VT.isFloatingPoint()) {
2190 setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand);
2191 setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand);
2192 setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand);
2193 setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand);
2194 setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand);
2195 setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand);
2196 setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand);
2197 setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand);
2198 setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand);
2199 }
2200
2201 TargetLoweringBase::LegalizeAction Default =
2202 VT == MVT::v1f64 ? Expand : Custom;
2203
2204 // Mark integer truncating stores/extending loads as having custom lowering
2205 if (VT.isInteger()) {
2206 MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::i8);
2207 while (InnerVT != VT) {
2208 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Default);
2209 setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2210 setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2211 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2212 InnerVT = InnerVT.changeVectorElementType(
2213 EltVT: MVT::getIntegerVT(BitWidth: 2 * InnerVT.getScalarSizeInBits()));
2214 }
2215 }
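// For example, for v8i32 the loop above marks v8i8 and v8i16 truncating
// stores and extending loads with the Default action (Custom here, since the
// v1f64 special case only applies to floating-point types).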
2216
2217 // Mark floating-point truncating stores/extending loads as having custom
2218 // lowering
2219 if (VT.isFloatingPoint()) {
2220 MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::f16);
2221 while (InnerVT != VT) {
2222 setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Custom);
2223 setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default);
2224 InnerVT = InnerVT.changeVectorElementType(
2225 EltVT: MVT::getFloatingPointVT(BitWidth: 2 * InnerVT.getScalarSizeInBits()));
2226 }
2227 }
2228
2229 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2230 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2231
2232 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2233 ISD::PARTIAL_REDUCE_UMLA};
2234 unsigned NumElts = VT.getVectorNumElements();
2235 if (VT.getVectorElementType() == MVT::i64) {
2236 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2237 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 8), Action: Custom);
2238 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2239 InputVT: MVT::getVectorVT(VT: MVT::i16, NumElements: NumElts * 4), Action: Custom);
2240 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2241 InputVT: MVT::getVectorVT(VT: MVT::i32, NumElements: NumElts * 2), Action: Custom);
2242 } else if (VT.getVectorElementType() == MVT::i32) {
2243 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2244 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 4), Action: Custom);
2245 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2246 InputVT: MVT::getVectorVT(VT: MVT::i16, NumElements: NumElts * 2), Action: Custom);
2247 } else if (VT.getVectorElementType() == MVT::i16) {
2248 setPartialReduceMLAAction(Opcodes: MLAOps, AccVT: VT,
2249 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 2), Action: Custom);
2250 }
2251 if (Subtarget->hasMatMulInt8()) {
2252 if (VT.getVectorElementType() == MVT::i32)
2253 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: VT,
2254 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 4), Action: Custom);
2255 else if (VT.getVectorElementType() == MVT::i64)
2256 setPartialReduceMLAAction(Opc: ISD::PARTIAL_REDUCE_SUMLA, AccVT: VT,
2257 InputVT: MVT::getVectorVT(VT: MVT::i8, NumElements: NumElts * 8), Action: Custom);
2258 }
2259
2260 // Lower fixed length vector operations to scalable equivalents.
2261 setOperationAction(Op: ISD::ABDS, VT, Action: Default);
2262 setOperationAction(Op: ISD::ABDU, VT, Action: Default);
2263 setOperationAction(Op: ISD::ABS, VT, Action: Default);
2264 setOperationAction(Op: ISD::ADD, VT, Action: Default);
2265 setOperationAction(Op: ISD::AND, VT, Action: Default);
2266 setOperationAction(Op: ISD::ANY_EXTEND, VT, Action: Default);
2267 setOperationAction(Op: ISD::BITCAST, VT, Action: PreferNEON ? Legal : Default);
2268 setOperationAction(Op: ISD::BITREVERSE, VT, Action: Default);
2269 setOperationAction(Op: ISD::BSWAP, VT, Action: Default);
2270 setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Custom);
2271 setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Default);
2272 setOperationAction(Op: ISD::CTLZ, VT, Action: Default);
2273 setOperationAction(Op: ISD::CTPOP, VT, Action: Default);
2274 setOperationAction(Op: ISD::CTTZ, VT, Action: Default);
2275 setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Default);
2276 setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Default);
2277 setOperationAction(Op: ISD::FABS, VT, Action: Default);
2278 setOperationAction(Op: ISD::FADD, VT, Action: Default);
2279 setOperationAction(Op: ISD::FCEIL, VT, Action: Default);
2280 setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Default);
2281 setOperationAction(Op: ISD::FDIV, VT, Action: Default);
2282 setOperationAction(Op: ISD::FFLOOR, VT, Action: Default);
2283 setOperationAction(Op: ISD::FMA, VT, Action: Default);
2284 setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Default);
2285 setOperationAction(Op: ISD::FMAXNUM, VT, Action: Default);
2286 setOperationAction(Op: ISD::FMINIMUM, VT, Action: Default);
2287 setOperationAction(Op: ISD::FMINNUM, VT, Action: Default);
2288 setOperationAction(Op: ISD::FMUL, VT, Action: Default);
2289 setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Default);
2290 setOperationAction(Op: ISD::FNEG, VT, Action: Default);
2291 setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Default);
2292 setOperationAction(Op: ISD::FP_ROUND, VT, Action: Default);
2293 setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Default);
2294 setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Default);
2295 setOperationAction(Op: ISD::FRINT, VT, Action: Default);
2296 setOperationAction(Op: ISD::LRINT, VT, Action: Default);
2297 setOperationAction(Op: ISD::LLRINT, VT, Action: Default);
2298 setOperationAction(Op: ISD::FROUND, VT, Action: Default);
2299 setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Default);
2300 setOperationAction(Op: ISD::FSQRT, VT, Action: Default);
2301 setOperationAction(Op: ISD::FSUB, VT, Action: Default);
2302 setOperationAction(Op: ISD::FTRUNC, VT, Action: Default);
2303 setOperationAction(Op: ISD::GET_ACTIVE_LANE_MASK, VT, Action: Default);
2304 setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Default);
2305 setOperationAction(Op: ISD::LOAD, VT, Action: PreferNEON ? Legal : Default);
2306 setOperationAction(Op: ISD::MGATHER, VT, Action: PreferSVE ? Default : Expand);
2307 setOperationAction(Op: ISD::MLOAD, VT, Action: Default);
2308 setOperationAction(Op: ISD::MSCATTER, VT, Action: PreferSVE ? Default : Expand);
2309 setOperationAction(Op: ISD::MSTORE, VT, Action: Default);
2310 setOperationAction(Op: ISD::MUL, VT, Action: Default);
2311 setOperationAction(Op: ISD::MULHS, VT, Action: Default);
2312 setOperationAction(Op: ISD::MULHU, VT, Action: Default);
2313 setOperationAction(Op: ISD::OR, VT, Action: Default);
2314 setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT, Action: PreferNEON ? Legal : Expand);
2315 setOperationAction(Op: ISD::SDIV, VT, Action: Default);
2316 setOperationAction(Op: ISD::SELECT, VT, Action: Default);
2317 setOperationAction(Op: ISD::SETCC, VT, Action: Default);
2318 setOperationAction(Op: ISD::SHL, VT, Action: Default);
2319 setOperationAction(Op: ISD::SIGN_EXTEND, VT, Action: Default);
2320 setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Default);
2321 setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Default);
2322 setOperationAction(Op: ISD::SMAX, VT, Action: Default);
2323 setOperationAction(Op: ISD::SMIN, VT, Action: Default);
2324 setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Default);
2325 setOperationAction(Op: ISD::SRA, VT, Action: Default);
2326 setOperationAction(Op: ISD::SRL, VT, Action: Default);
2327 setOperationAction(Op: ISD::STORE, VT, Action: PreferNEON ? Legal : Default);
2328 setOperationAction(Op: ISD::SUB, VT, Action: Default);
2329 setOperationAction(Op: ISD::TRUNCATE, VT, Action: Default);
2330 setOperationAction(Op: ISD::UDIV, VT, Action: Default);
2331 setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Default);
2332 setOperationAction(Op: ISD::UMAX, VT, Action: Default);
2333 setOperationAction(Op: ISD::UMIN, VT, Action: Default);
2334 setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Default);
2335 setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Default);
2336 setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Default);
2337 setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Default);
2338 setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Default);
2339 setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Default);
2340 setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Default);
2341 setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Default);
2342 setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: PreferSVE ? Default : Expand);
2343 setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Default);
2344 setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Default);
2345 setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Default);
2346 setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Default);
2347 setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Default);
2348 setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Default);
2349 setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Default);
2350 setOperationAction(Op: ISD::VSELECT, VT, Action: Default);
2351 setOperationAction(Op: ISD::XOR, VT, Action: Default);
2352 setOperationAction(Op: ISD::ZERO_EXTEND, VT, Action: Default);
2353}
2354
2355void AArch64TargetLowering::addDRType(MVT VT) {
2356 addRegisterClass(VT, RC: &AArch64::FPR64RegClass);
2357 if (Subtarget->isNeonAvailable())
2358 addTypeForNEON(VT);
2359}
2360
2361void AArch64TargetLowering::addQRType(MVT VT) {
2362 addRegisterClass(VT, RC: &AArch64::FPR128RegClass);
2363 if (Subtarget->isNeonAvailable())
2364 addTypeForNEON(VT);
2365}
2366
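// Scalar SETCC results are i32. For scalable vectors the result is a
// same-length vector of i1 predicate elements; for fixed-width vectors it is
// an integer vector with the same element width as the compared type.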
2367EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
2368 LLVMContext &C, EVT VT) const {
2369 if (!VT.isVector())
2370 return MVT::i32;
2371 if (VT.isScalableVector())
2372 return EVT::getVectorVT(Context&: C, VT: MVT::i1, EC: VT.getVectorElementCount());
2373 return VT.changeVectorElementTypeToInteger();
2374}
2375
2376// isIntImmediate - This method tests to see if the node is a constant
2377 // operand. If so, Imm will receive the value.
2378static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2379 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(Val: N)) {
2380 Imm = C->getZExtValue();
2381 return true;
2382 }
2383 return false;
2384}
2385
2386// isOpcWithIntImmediate - This method tests to see if the node is a specific
2387 // opcode and that it has an immediate integer right operand.
2388 // If so, Imm will receive the value.
2389static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2390 uint64_t &Imm) {
2391 return N->getOpcode() == Opc &&
2392 isIntImmediate(N: N->getOperand(Num: 1).getNode(), Imm);
2393}
2394
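// optimizeLogicalImm - Attempt to replace the immediate of a logical
// operation (AND/OR/XOR) that is not a valid bitmask immediate. Bits that are
// not demanded are filled in so that, after replicating the resulting element
// across the register width, the constant becomes encodable and the node can
// be rewritten as the machine instruction NewOpc (ANDWri/ORRWri/EORWri or
// their 64-bit equivalents). Returns true and commits the combine on success.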
2395static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2396 const APInt &Demanded,
2397 TargetLowering::TargetLoweringOpt &TLO,
2398 unsigned NewOpc) {
2399 uint64_t OldImm = Imm, NewImm, Enc;
2400 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2401
2402 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2403 // bimm64.
2404 if (Imm == 0 || Imm == Mask ||
2405 AArch64_AM::isLogicalImmediate(imm: Imm & Mask, regSize: Size))
2406 return false;
2407
2408 unsigned EltSize = Size;
2409 uint64_t DemandedBits = Demanded.getZExtValue();
2410
2411 // Clear bits that are not demanded.
2412 Imm &= DemandedBits;
2413
2414 while (true) {
2415 // The goal here is to set the non-demanded bits in a way that minimizes
2416 // the number of switching between 0 and 1. In order to achieve this goal,
2417 // we set the non-demanded bits to the value of the preceding demanded bits.
2418 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2419 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2420 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2421 // The final result is 0b11000011.
2422 uint64_t NonDemandedBits = ~DemandedBits;
2423 uint64_t InvertedImm = ~Imm & DemandedBits;
2424 uint64_t RotatedImm =
2425 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2426 NonDemandedBits;
2427 uint64_t Sum = RotatedImm + NonDemandedBits;
2428 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2429 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2430 NewImm = (Imm | Ones) & Mask;
2431
2432 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2433 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2434 // we halve the element size and continue the search.
2435 if (isShiftedMask_64(Value: NewImm) || isShiftedMask_64(Value: ~(NewImm | ~Mask)))
2436 break;
2437
2438 // We cannot shrink the element size any further if it is 2-bits.
2439 if (EltSize == 2)
2440 return false;
2441
2442 EltSize /= 2;
2443 Mask >>= EltSize;
2444 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2445
2446 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2447 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2448 return false;
2449
2450 // Merge the upper and lower halves of Imm and DemandedBits.
2451 Imm |= Hi;
2452 DemandedBits |= DemandedBitsHi;
2453 }
2454
2455 ++NumOptimizedImms;
2456
2457 // Replicate the element across the register width.
2458 while (EltSize < Size) {
2459 NewImm |= NewImm << EltSize;
2460 EltSize *= 2;
2461 }
2462
2463 (void)OldImm;
2464 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2465 "demanded bits should never be altered");
2466 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2467
2468 // Create the new constant immediate node.
2469 EVT VT = Op.getValueType();
2470 SDLoc DL(Op);
2471 SDValue New;
2472
2473 // If the new constant immediate is all-zeros or all-ones, let the target
2474 // independent DAG combine optimize this node.
2475 if (NewImm == 0 || NewImm == OrigMask) {
2476 New = TLO.DAG.getNode(Opcode: Op.getOpcode(), DL, VT, N1: Op.getOperand(i: 0),
2477 N2: TLO.DAG.getConstant(Val: NewImm, DL, VT));
2478 // Otherwise, create a machine node so that target independent DAG combine
2479 // doesn't undo this optimization.
2480 } else {
2481 Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm, regSize: Size);
2482 SDValue EncConst = TLO.DAG.getTargetConstant(Val: Enc, DL, VT);
2483 New = SDValue(
2484 TLO.DAG.getMachineNode(Opcode: NewOpc, dl: DL, VT, Op1: Op.getOperand(i: 0), Op2: EncConst), 0);
2485 }
2486
2487 return TLO.CombineTo(O: Op, N: New);
2488}
2489
2490bool AArch64TargetLowering::targetShrinkDemandedConstant(
2491 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2492 TargetLoweringOpt &TLO) const {
2493 // Delay this optimization to as late as possible.
2494 if (!TLO.LegalOps)
2495 return false;
2496
2497 if (!EnableOptimizeLogicalImm)
2498 return false;
2499
2500 EVT VT = Op.getValueType();
2501 if (VT.isVector())
2502 return false;
2503
2504 unsigned Size = VT.getSizeInBits();
2505
2506 if (Size != 32 && Size != 64)
2507 return false;
2508
2509 // Exit early if we demand all bits.
2510 if (DemandedBits.popcount() == Size)
2511 return false;
2512
2513 unsigned NewOpc;
2514 switch (Op.getOpcode()) {
2515 default:
2516 return false;
2517 case ISD::AND:
2518 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2519 break;
2520 case ISD::OR:
2521 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2522 break;
2523 case ISD::XOR:
2524 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2525 break;
2526 }
2527 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
2528 if (!C)
2529 return false;
2530 uint64_t Imm = C->getZExtValue();
2531 return optimizeLogicalImm(Op, Size, Imm, Demanded: DemandedBits, TLO, NewOpc);
2532}
2533
2534/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2535 /// Mask are known to be either zero or one and return them in Known.
2536void AArch64TargetLowering::computeKnownBitsForTargetNode(
2537 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2538 const SelectionDAG &DAG, unsigned Depth) const {
2539 switch (Op.getOpcode()) {
2540 default:
2541 break;
2542 case AArch64ISD::DUP: {
2543 SDValue SrcOp = Op.getOperand(i: 0);
2544 Known = DAG.computeKnownBits(Op: SrcOp, Depth: Depth + 1);
2545 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2546 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2547 "Expected DUP implicit truncation");
2548 Known = Known.trunc(BitWidth: Op.getScalarValueSizeInBits());
2549 }
2550 break;
2551 }
2552 case AArch64ISD::CSEL: {
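// CSEL returns one of its first two operands, so only the bits known in both
// operands are known in the result.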
2553 KnownBits Known2;
2554 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2555 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2556 Known = Known.intersectWith(RHS: Known2);
2557 break;
2558 }
2559 case AArch64ISD::BICi: {
2560 // Compute the bit cleared value.
2561 APInt Mask =
2562 ~(Op->getConstantOperandAPInt(Num: 1) << Op->getConstantOperandAPInt(Num: 2))
2563 .trunc(width: Known.getBitWidth());
2564 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2565 Known &= KnownBits::makeConstant(C: Mask);
2566 break;
2567 }
2568 case AArch64ISD::VLSHR: {
2569 KnownBits Known2;
2570 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2571 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2572 Known = KnownBits::lshr(LHS: Known, RHS: Known2);
2573 break;
2574 }
2575 case AArch64ISD::VASHR: {
2576 KnownBits Known2;
2577 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2578 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2579 Known = KnownBits::ashr(LHS: Known, RHS: Known2);
2580 break;
2581 }
2582 case AArch64ISD::VSHL: {
2583 KnownBits Known2;
2584 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2585 Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1);
2586 Known = KnownBits::shl(LHS: Known, RHS: Known2);
2587 break;
2588 }
2589 case AArch64ISD::MOVI: {
2590 Known = KnownBits::makeConstant(
2591 C: APInt(Known.getBitWidth(), Op->getConstantOperandVal(Num: 0)));
2592 break;
2593 }
2594 case AArch64ISD::LOADgot:
2595 case AArch64ISD::ADDlow: {
2596 if (!Subtarget->isTargetILP32())
2597 break;
2598 // In ILP32 mode all valid pointers are in the low 4GB of the address space.
2599 Known.Zero = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32);
2600 break;
2601 }
2602 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2603 Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1);
2604 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2605 break;
2606 }
2607 case ISD::INTRINSIC_W_CHAIN: {
2608 Intrinsic::ID IntID =
2609 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(Num: 1));
2610 switch (IntID) {
2611 default: return;
2612 case Intrinsic::aarch64_ldaxr:
2613 case Intrinsic::aarch64_ldxr: {
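// LDXR/LDAXR of a type narrower than the result register zero-extend the
// loaded value, so all bits above the memory width are known to be zero.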
2614 unsigned BitWidth = Known.getBitWidth();
2615 EVT VT = cast<MemIntrinsicSDNode>(Val: Op)->getMemoryVT();
2616 unsigned MemBits = VT.getScalarSizeInBits();
2617 Known.Zero |= APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - MemBits);
2618 return;
2619 }
2620 }
2621 break;
2622 }
2623 case ISD::INTRINSIC_WO_CHAIN:
2624 case ISD::INTRINSIC_VOID: {
2625 unsigned IntNo = Op.getConstantOperandVal(i: 0);
2626 switch (IntNo) {
2627 default:
2628 break;
2629 case Intrinsic::aarch64_neon_uaddlv: {
2630 MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT();
2631 unsigned BitWidth = Known.getBitWidth();
2632 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2633 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
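// The unsigned sum of 8 bytes is at most 8 * 255 = 2040 (< 2^11) and of 16
// bytes at most 16 * 255 = 4080 (< 2^12), so the bits above Bound are zero.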
2634 assert(BitWidth >= Bound && "Unexpected width!");
2635 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - Bound);
2636 Known.Zero |= Mask;
2637 }
2638 break;
2639 }
2640 case Intrinsic::aarch64_neon_umaxv:
2641 case Intrinsic::aarch64_neon_uminv: {
2642 // Figure out the datatype of the vector operand. The UMINV instruction
2643 // will zero extend the result, so we can mark as known zero all the
2644 // bits larger than the element datatype. 32-bit or larger doesn't need
2645 // this as those are legal types and will be handled by isel directly.
2646 MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT();
2647 unsigned BitWidth = Known.getBitWidth();
2648 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2649 assert(BitWidth >= 8 && "Unexpected width!");
2650 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 8);
2651 Known.Zero |= Mask;
2652 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2653 assert(BitWidth >= 16 && "Unexpected width!");
2654 APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16);
2655 Known.Zero |= Mask;
2656 }
2657 break;
2658 }
2659 }
2660 }
2661 }
2662}
2663
2664unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2665 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2666 unsigned Depth) const {
2667 EVT VT = Op.getValueType();
2668 unsigned VTBits = VT.getScalarSizeInBits();
2669 unsigned Opcode = Op.getOpcode();
2670 switch (Opcode) {
2671 case AArch64ISD::FCMEQ:
2672 case AArch64ISD::FCMGE:
2673 case AArch64ISD::FCMGT:
2674 // Compares return either 0 or all-ones
2675 return VTBits;
2676 case AArch64ISD::VASHR: {
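// An arithmetic shift right duplicates the sign bit into every vacated
// position, adding the shift amount to the known sign bits (capped at the
// element width).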
2677 unsigned Tmp =
2678 DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0), DemandedElts, Depth: Depth + 1);
2679 return std::min<uint64_t>(a: Tmp + Op.getConstantOperandVal(i: 1), b: VTBits);
2680 }
2681 }
2682
2683 return 1;
2684}
2685
2686MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2687 EVT) const {
2688 return MVT::i64;
2689}
2690
2691bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2692 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2693 unsigned *Fast) const {
2694
2695 // Allow SVE loads/stores where the alignment >= the size of the element type,
2696 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2697 // for stores that come from IR, only require element-size alignment (even if
2698 // unaligned accesses are disabled). Without this, these will be forced to
2699 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2700 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2701 if (VT.isScalableVector()) {
2702 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2703 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2704 return true;
2705 }
2706
2707 if (Subtarget->requiresStrictAlign())
2708 return false;
2709
2710 if (Fast) {
2711 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2712 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2713 // See comments in performSTORECombine() for more details about
2714 // these conditions.
2715
2716 // Code that uses clang vector extensions can mark that it
2717 // wants unaligned accesses to be treated as fast by
2718 // underspecifying alignment to be 1 or 2.
2719 Alignment <= 2 ||
2720
2721 // Disregard v2i64. Memcpy lowering produces those and splitting
2722 // them regresses performance on micro-benchmarks and olden/bh.
2723 VT == MVT::v2i64;
2724 }
2725 return true;
2726}
2727
2728// Same as above but handling LLTs instead.
2729bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2730 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2731 unsigned *Fast) const {
2732 if (Subtarget->requiresStrictAlign())
2733 return false;
2734
2735 if (Fast) {
2736 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2737 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2738 Ty.getSizeInBytes() != 16 ||
2739 // See comments in performSTORECombine() for more details about
2740 // these conditions.
2741
2742 // Code that uses clang vector extensions can mark that it
2743 // wants unaligned accesses to be treated as fast by
2744 // underspecifying alignment to be 1 or 2.
2745 Alignment <= 2 ||
2746
2747 // Disregard v2i64. Memcpy lowering produces those and splitting
2748 // them regresses performance on micro-benchmarks and olden/bh.
2749 Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
2750 }
2751 return true;
2752}
2753
2754FastISel *
2755AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2756 const TargetLibraryInfo *libInfo) const {
2757 return AArch64::createFastISel(funcInfo, libInfo);
2758}
2759
2760MachineBasicBlock *
2761AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2762 MachineBasicBlock *MBB) const {
2763 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2764 // phi node:
2765
2766 // OrigBB:
2767 // [... previous instrs leading to comparison ...]
2768 // b.ne TrueBB
2769 // b EndBB
2770 // TrueBB:
2771 // ; Fallthrough
2772 // EndBB:
2773 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2774
2775 MachineFunction *MF = MBB->getParent();
2776 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2777 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2778 DebugLoc DL = MI.getDebugLoc();
2779 MachineFunction::iterator It = ++MBB->getIterator();
2780
2781 Register DestReg = MI.getOperand(i: 0).getReg();
2782 Register IfTrueReg = MI.getOperand(i: 1).getReg();
2783 Register IfFalseReg = MI.getOperand(i: 2).getReg();
2784 unsigned CondCode = MI.getOperand(i: 3).getImm();
2785 bool NZCVKilled = MI.getOperand(i: 4).isKill();
2786
2787 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2788 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(BB: LLVM_BB);
2789 MF->insert(MBBI: It, MBB: TrueBB);
2790 MF->insert(MBBI: It, MBB: EndBB);
2791
2792 // Transfer rest of current basic-block to EndBB
2793 EndBB->splice(Where: EndBB->begin(), Other: MBB, From: std::next(x: MachineBasicBlock::iterator(MI)),
2794 To: MBB->end());
2795 EndBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB);
2796
2797 BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc)).addImm(Val: CondCode).addMBB(MBB: TrueBB);
2798 BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::B)).addMBB(MBB: EndBB);
2799 MBB->addSuccessor(Succ: TrueBB);
2800 MBB->addSuccessor(Succ: EndBB);
2801
2802 // TrueBB falls through to the end.
2803 TrueBB->addSuccessor(Succ: EndBB);
2804
2805 if (!NZCVKilled) {
2806 TrueBB->addLiveIn(PhysReg: AArch64::NZCV);
2807 EndBB->addLiveIn(PhysReg: AArch64::NZCV);
2808 }
2809
2810 BuildMI(BB&: *EndBB, I: EndBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AArch64::PHI), DestReg)
2811 .addReg(RegNo: IfTrueReg)
2812 .addMBB(MBB: TrueBB)
2813 .addReg(RegNo: IfFalseReg)
2814 .addMBB(MBB);
2815
2816 MI.eraseFromParent();
2817 return EndBB;
2818}
2819
2820MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2821 MachineInstr &MI, MachineBasicBlock *BB) const {
2822 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2823 BB->getParent()->getFunction().getPersonalityFn())) &&
2824 "SEH does not use catchret!");
2825 return BB;
2826}
2827
2828MachineBasicBlock *
2829AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
2830 MachineBasicBlock *MBB) const {
2831 MachineFunction &MF = *MBB->getParent();
2832 MachineBasicBlock::iterator MBBI = MI.getIterator();
2833 const AArch64InstrInfo &TII =
2834 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2835 Register TargetReg = MI.getOperand(i: 0).getReg();
2836 MachineBasicBlock::iterator NextInst =
2837 TII.probedStackAlloc(MBBI, TargetReg, FrameSetup: false);
2838
2839 MI.eraseFromParent();
2840 return NextInst->getParent();
2841}
2842
2843MachineBasicBlock *
2844AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2845 MachineInstr &MI,
2846 MachineBasicBlock *BB) const {
2847 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2848 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2849
2850 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define);
2851 MIB.add(MO: MI.getOperand(i: 1)); // slice index register
2852 MIB.add(MO: MI.getOperand(i: 2)); // slice index offset
2853 MIB.add(MO: MI.getOperand(i: 3)); // pg
2854 MIB.add(MO: MI.getOperand(i: 4)); // base
2855 MIB.add(MO: MI.getOperand(i: 5)); // offset
2856
2857 MI.eraseFromParent(); // The pseudo is gone now.
2858 return BB;
2859}
2860
2861MachineBasicBlock *
2862AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2863 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2864 MachineInstrBuilder MIB =
2865 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::LDR_ZA));
2866
2867 MIB.addReg(RegNo: AArch64::ZA, flags: RegState::Define);
2868 MIB.add(MO: MI.getOperand(i: 0)); // Vector select register
2869 MIB.add(MO: MI.getOperand(i: 1)); // Vector select offset
2870 MIB.add(MO: MI.getOperand(i: 2)); // Base
2871 MIB.add(MO: MI.getOperand(i: 1)); // Offset, same as vector select offset
2872
2873 MI.eraseFromParent(); // The pseudo is gone now.
2874 return BB;
2875}
2876
2877MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI,
2878 MachineBasicBlock *BB,
2879 unsigned Opcode,
2880 bool Op0IsDef) const {
2881 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2882 MachineInstrBuilder MIB;
2883
2884 MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode))
2885 .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: Op0IsDef ? RegState::Define : 0);
2886 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2887 MIB.add(MO: MI.getOperand(i: I));
2888
2889 MI.eraseFromParent(); // The pseudo is gone now.
2890 return BB;
2891}
2892
2893MachineBasicBlock *
2894AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2895 MachineInstr &MI,
2896 MachineBasicBlock *BB) const {
2897 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2898 MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc));
2899 unsigned StartIdx = 0;
2900
2901 bool HasTile = BaseReg != AArch64::ZA;
2902 bool HasZPROut = HasTile && MI.getOperand(i: 0).isReg();
2903 if (HasZPROut) {
2904 MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR
2905 ++StartIdx;
2906 }
2907 if (HasTile) {
2908 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm(),
2909 flags: RegState::Define); // Output ZA Tile
2910 MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm()); // Input Za Tile
2911 StartIdx++;
2912 } else {
2913 // Avoid the ZA-array forms written as za.<sz>[Reg, Imm, ...] (a slice register followed by an immediate); only add an output ZPR when the destination operand is a register.
2914 if (MI.getOperand(i: 0).isReg() && !MI.getOperand(i: 1).isImm()) {
2915 MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR
2916 ++StartIdx;
2917 }
2918 MIB.addReg(RegNo: BaseReg, flags: RegState::Define).addReg(RegNo: BaseReg);
2919 }
2920 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
2921 MIB.add(MO: MI.getOperand(i: I));
2922
2923 MI.eraseFromParent(); // The pseudo is gone now.
2924 return BB;
2925}
2926
2927MachineBasicBlock *
2928AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2929 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2930 MachineInstrBuilder MIB =
2931 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::ZERO_M));
2932 MIB.add(MO: MI.getOperand(i: 0)); // Mask
2933
2934 unsigned Mask = MI.getOperand(i: 0).getImm();
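// Each bit of the 8-bit mask selects one 64-bit ZA tile (ZAD0..ZAD7); record
// every selected tile as implicitly defined by the zeroing instruction.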
2935 for (unsigned I = 0; I < 8; I++) {
2936 if (Mask & (1 << I))
2937 MIB.addDef(RegNo: AArch64::ZAD0 + I, Flags: RegState::ImplicitDefine);
2938 }
2939
2940 MI.eraseFromParent(); // The pseudo is gone now.
2941 return BB;
2942}
2943
2944MachineBasicBlock *
2945AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
2946 MachineBasicBlock *BB) const {
2947 MachineFunction *MF = BB->getParent();
2948 MachineFrameInfo &MFI = MF->getFrameInfo();
2949 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
2950 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
2951 if (TPIDR2.Uses > 0) {
2952 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2953 // Store the buffer pointer to the TPIDR2 stack object.
2954 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRXui))
2955 .addReg(RegNo: MI.getOperand(i: 0).getReg())
2956 .addFrameIndex(Idx: TPIDR2.FrameIndex)
2957 .addImm(Val: 0);
2958 // Set the reserved bytes (10-15) to zero
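// (The halfword store at unsigned offset 5 covers bytes 10-11 and the word
// store at unsigned offset 3 covers bytes 12-15, as the offsets are scaled by
// the access size.)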
2959 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRHHui))
2960 .addReg(RegNo: AArch64::WZR)
2961 .addFrameIndex(Idx: TPIDR2.FrameIndex)
2962 .addImm(Val: 5);
2963 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRWui))
2964 .addReg(RegNo: AArch64::WZR)
2965 .addFrameIndex(Idx: TPIDR2.FrameIndex)
2966 .addImm(Val: 3);
2967 } else
2968 MFI.RemoveStackObject(ObjectIdx: TPIDR2.FrameIndex);
2969
2970 BB->remove_instr(I: &MI);
2971 return BB;
2972}
2973
2974MachineBasicBlock *
2975AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
2976 MachineBasicBlock *BB) const {
2977 MachineFunction *MF = BB->getParent();
2978 MachineFrameInfo &MFI = MF->getFrameInfo();
2979 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
2980 // TODO This function grows the stack with a subtraction, which doesn't work
2981 // on Windows. Some refactoring to share the functionality in
2982 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
2983 // supports SME
2984 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
2985 "Lazy ZA save is not yet supported on Windows");
2986
2987 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
2988
2989 if (TPIDR2.Uses > 0) {
2990 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2991 MachineRegisterInfo &MRI = MF->getRegInfo();
2992
2993 // The SUBXrs below won't always be emitted in a form that accepts SP
2994 // directly
2995 Register SP = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
2996 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SP)
2997 .addReg(RegNo: AArch64::SP);
2998
2999 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3000 auto Size = MI.getOperand(i: 1).getReg();
3001 auto Dest = MI.getOperand(i: 0).getReg();
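// MSUB computes Dest = SP - Size * Size, reserving the Size * Size byte
// buffer noted above below the current stack pointer.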
3002 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::MSUBXrrr), DestReg: Dest)
3003 .addReg(RegNo: Size)
3004 .addReg(RegNo: Size)
3005 .addReg(RegNo: SP);
3006 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3007 DestReg: AArch64::SP)
3008 .addReg(RegNo: Dest);
3009
3010 // We have just allocated a variable sized object, tell this to PEI.
3011 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
3012 }
3013
3014 BB->remove_instr(I: &MI);
3015 return BB;
3016}
3017
3018// TODO: Find a way to merge this with EmitAllocateZABuffer.
3019MachineBasicBlock *
3020AArch64TargetLowering::EmitAllocateSMESaveBuffer(MachineInstr &MI,
3021 MachineBasicBlock *BB) const {
3022 MachineFunction *MF = BB->getParent();
3023 MachineFrameInfo &MFI = MF->getFrameInfo();
3024 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3025 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3026 "Lazy ZA save is not yet supported on Windows");
3027
3028 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3029 if (FuncInfo->isSMESaveBufferUsed()) {
3030 // Allocate a buffer object of the size given by MI.getOperand(1).
3031 auto Size = MI.getOperand(i: 1).getReg();
3032 auto Dest = MI.getOperand(i: 0).getReg();
3033 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::SUBXrx64), DestReg: AArch64::SP)
3034 .addReg(RegNo: AArch64::SP)
3035 .addReg(RegNo: Size)
3036 .addImm(Val: AArch64_AM::getArithExtendImm(ET: AArch64_AM::UXTX, Imm: 0));
3037 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: Dest)
3038 .addReg(RegNo: AArch64::SP);
3039
3040 // We have just allocated a variable sized object, tell this to PEI.
3041 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
3042 } else
3043 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::IMPLICIT_DEF),
3044 DestReg: MI.getOperand(i: 0).getReg());
3045
3046 BB->remove_instr(I: &MI);
3047 return BB;
3048}
3049
3050MachineBasicBlock *
3051AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
3052 MachineBasicBlock *BB) const {
3053 // If the buffer is used, emit a call to __arm_sme_state_size(), which returns the required save-buffer size in X0; otherwise the size is simply zero.
3054 MachineFunction *MF = BB->getParent();
3055 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3056 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3057 if (FuncInfo->isSMESaveBufferUsed()) {
3058 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3059 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::BL))
3060 .addExternalSymbol(FnName: "__arm_sme_state_size")
3061 .addReg(RegNo: AArch64::X0, flags: RegState::ImplicitDefine)
3062 .addRegMask(Mask: TRI->getCallPreservedMask(
3063 MF: *MF, CallingConv::
3064 AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
3065 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3066 DestReg: MI.getOperand(i: 0).getReg())
3067 .addReg(RegNo: AArch64::X0);
3068 } else
3069 BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY),
3070 DestReg: MI.getOperand(i: 0).getReg())
3071 .addReg(RegNo: AArch64::XZR);
3072 BB->remove_instr(I: &MI);
3073 return BB;
3074}
3075
3076MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3077 MachineInstr &MI, MachineBasicBlock *BB) const {
3078
3079 int SMEOrigInstr = AArch64::getSMEPseudoMap(Opcode: MI.getOpcode());
3080 if (SMEOrigInstr != -1) {
3081 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3082 uint64_t SMEMatrixType =
3083 TII->get(Opcode: MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3084 switch (SMEMatrixType) {
3085 case (AArch64::SMEMatrixArray):
3086 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZA, MI, BB);
3087 case (AArch64::SMEMatrixTileB):
3088 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAB0, MI, BB);
3089 case (AArch64::SMEMatrixTileH):
3090 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAH0, MI, BB);
3091 case (AArch64::SMEMatrixTileS):
3092 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAS0, MI, BB);
3093 case (AArch64::SMEMatrixTileD):
3094 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAD0, MI, BB);
3095 case (AArch64::SMEMatrixTileQ):
3096 return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAQ0, MI, BB);
3097 }
3098 }
3099
3100 switch (MI.getOpcode()) {
3101 default:
3102#ifndef NDEBUG
3103 MI.dump();
3104#endif
3105 llvm_unreachable("Unexpected instruction for custom inserter!");
3106 case AArch64::InitTPIDR2Obj:
3107 return EmitInitTPIDR2Object(MI, BB);
3108 case AArch64::AllocateZABuffer:
3109 return EmitAllocateZABuffer(MI, BB);
3110 case AArch64::AllocateSMESaveBuffer:
3111 return EmitAllocateSMESaveBuffer(MI, BB);
3112 case AArch64::GetSMESaveSize:
3113 return EmitGetSMESaveSize(MI, BB);
3114 case AArch64::F128CSEL:
3115 return EmitF128CSEL(MI, MBB: BB);
3116 case TargetOpcode::STATEPOINT:
3117 // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
3118 // while the BL call instruction (to which the statepoint is lowered at
3119 // the end) has an implicit def of LR. This def is early-clobber as it is
3120 // set at the moment of the call, earlier than any use is read.
3121 // Add this implicit dead def here as a workaround.
3122 MI.addOperand(MF&: *MI.getMF(),
3123 Op: MachineOperand::CreateReg(
3124 Reg: AArch64::LR, /*isDef*/ true,
3125 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3126 /*isUndef*/ false, /*isEarlyClobber*/ true));
3127 [[fallthrough]];
3128 case TargetOpcode::STACKMAP:
3129 case TargetOpcode::PATCHPOINT:
3130 return emitPatchPoint(MI, MBB: BB);
3131
3132 case TargetOpcode::PATCHABLE_EVENT_CALL:
3133 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3134 return BB;
3135
3136 case AArch64::CATCHRET:
3137 return EmitLoweredCatchRet(MI, BB);
3138
3139 case AArch64::PROBED_STACKALLOC_DYN:
3140 return EmitDynamicProbedAlloc(MI, MBB: BB);
3141
3142 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3143 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_B, BaseReg: AArch64::ZAB0, MI, BB);
3144 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3145 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_H, BaseReg: AArch64::ZAH0, MI, BB);
3146 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3147 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_S, BaseReg: AArch64::ZAS0, MI, BB);
3148 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3149 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_D, BaseReg: AArch64::ZAD0, MI, BB);
3150 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3151 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_Q, BaseReg: AArch64::ZAQ0, MI, BB);
3152 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3153 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_B, BaseReg: AArch64::ZAB0, MI, BB);
3154 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3155 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_H, BaseReg: AArch64::ZAH0, MI, BB);
3156 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3157 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_S, BaseReg: AArch64::ZAS0, MI, BB);
3158 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3159 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_D, BaseReg: AArch64::ZAD0, MI, BB);
3160 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3161 return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_Q, BaseReg: AArch64::ZAQ0, MI, BB);
3162 case AArch64::LDR_ZA_PSEUDO:
3163 return EmitFill(MI, BB);
3164 case AArch64::LDR_TX_PSEUDO:
3165 return EmitZTInstr(MI, BB, Opcode: AArch64::LDR_TX, /*Op0IsDef=*/true);
3166 case AArch64::STR_TX_PSEUDO:
3167 return EmitZTInstr(MI, BB, Opcode: AArch64::STR_TX, /*Op0IsDef=*/false);
3168 case AArch64::ZERO_M_PSEUDO:
3169 return EmitZero(MI, BB);
3170 case AArch64::ZERO_T_PSEUDO:
3171 return EmitZTInstr(MI, BB, Opcode: AArch64::ZERO_T, /*Op0IsDef=*/true);
3172 case AArch64::MOVT_TIZ_PSEUDO:
3173 return EmitZTInstr(MI, BB, Opcode: AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3174 }
3175}
3176
3177//===----------------------------------------------------------------------===//
3178// AArch64 Lowering private implementation.
3179//===----------------------------------------------------------------------===//
3180
3181//===----------------------------------------------------------------------===//
3182// Lowering Code
3183//===----------------------------------------------------------------------===//
3184
3185// Forward declarations of SVE fixed length lowering helpers
3186static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
3187static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3188static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
3189static SDValue convertFixedMaskToScalableVector(SDValue Mask,
3190 SelectionDAG &DAG);
3191static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT);
3192static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
3193 EVT VT);
3194
3195/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3196static bool isZerosVector(const SDNode *N) {
3197 // Look through a bit convert.
3198 while (N->getOpcode() == ISD::BITCAST)
3199 N = N->getOperand(Num: 0).getNode();
3200
3201 if (ISD::isConstantSplatVectorAllZeros(N))
3202 return true;
3203
3204 if (N->getOpcode() != AArch64ISD::DUP)
3205 return false;
3206
3207 auto Opnd0 = N->getOperand(Num: 0);
3208 return isNullConstant(V: Opnd0) || isNullFPConstant(V: Opnd0);
3209}
3210
3211/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3212/// CC
3213static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
3214 switch (CC) {
3215 default:
3216 llvm_unreachable("Unknown condition code!");
3217 case ISD::SETNE:
3218 return AArch64CC::NE;
3219 case ISD::SETEQ:
3220 return AArch64CC::EQ;
3221 case ISD::SETGT:
3222 return AArch64CC::GT;
3223 case ISD::SETGE:
3224 return AArch64CC::GE;
3225 case ISD::SETLT:
3226 return AArch64CC::LT;
3227 case ISD::SETLE:
3228 return AArch64CC::LE;
3229 case ISD::SETUGT:
3230 return AArch64CC::HI;
3231 case ISD::SETUGE:
3232 return AArch64CC::HS;
3233 case ISD::SETULT:
3234 return AArch64CC::LO;
3235 case ISD::SETULE:
3236 return AArch64CC::LS;
3237 }
3238}
3239
3240/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
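/// If a single condition code cannot express the predicate, CondCode2 is set
/// to a second code that must be OR'ed with the first (e.g. SETONE maps to
/// MI or GT); otherwise CondCode2 is AL.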
3241static void changeFPCCToAArch64CC(ISD::CondCode CC,
3242 AArch64CC::CondCode &CondCode,
3243 AArch64CC::CondCode &CondCode2) {
3244 CondCode2 = AArch64CC::AL;
3245 switch (CC) {
3246 default:
3247 llvm_unreachable("Unknown FP condition!");
3248 case ISD::SETEQ:
3249 case ISD::SETOEQ:
3250 CondCode = AArch64CC::EQ;
3251 break;
3252 case ISD::SETGT:
3253 case ISD::SETOGT:
3254 CondCode = AArch64CC::GT;
3255 break;
3256 case ISD::SETGE:
3257 case ISD::SETOGE:
3258 CondCode = AArch64CC::GE;
3259 break;
3260 case ISD::SETOLT:
3261 CondCode = AArch64CC::MI;
3262 break;
3263 case ISD::SETOLE:
3264 CondCode = AArch64CC::LS;
3265 break;
3266 case ISD::SETONE:
3267 CondCode = AArch64CC::MI;
3268 CondCode2 = AArch64CC::GT;
3269 break;
3270 case ISD::SETO:
3271 CondCode = AArch64CC::VC;
3272 break;
3273 case ISD::SETUO:
3274 CondCode = AArch64CC::VS;
3275 break;
3276 case ISD::SETUEQ:
3277 CondCode = AArch64CC::EQ;
3278 CondCode2 = AArch64CC::VS;
3279 break;
3280 case ISD::SETUGT:
3281 CondCode = AArch64CC::HI;
3282 break;
3283 case ISD::SETUGE:
3284 CondCode = AArch64CC::PL;
3285 break;
3286 case ISD::SETLT:
3287 case ISD::SETULT:
3288 CondCode = AArch64CC::LT;
3289 break;
3290 case ISD::SETLE:
3291 case ISD::SETULE:
3292 CondCode = AArch64CC::LE;
3293 break;
3294 case ISD::SETNE:
3295 case ISD::SETUNE:
3296 CondCode = AArch64CC::NE;
3297 break;
3298 }
3299}
3300
3301/// Convert a DAG fp condition code to an AArch64 CC.
3302/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3303/// should be AND'ed instead of OR'ed.
3304static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
3305 AArch64CC::CondCode &CondCode,
3306 AArch64CC::CondCode &CondCode2) {
3307 CondCode2 = AArch64CC::AL;
3308 switch (CC) {
3309 default:
3310 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3311 assert(CondCode2 == AArch64CC::AL);
3312 break;
3313 case ISD::SETONE:
3314 // (a one b)
3315 // == ((a olt b) || (a ogt b))
3316 // == ((a ord b) && (a une b))
3317 CondCode = AArch64CC::VC;
3318 CondCode2 = AArch64CC::NE;
3319 break;
3320 case ISD::SETUEQ:
3321 // (a ueq b)
3322 // == ((a uno b) || (a oeq b))
3323 // == ((a ule b) && (a uge b))
3324 CondCode = AArch64CC::PL;
3325 CondCode2 = AArch64CC::LE;
3326 break;
3327 }
3328}
3329
3330/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3331/// CC usable with the vector instructions. Fewer operations are available
3332/// without a real NZCV register, so we have to use less efficient combinations
3333/// to get the same effect.
3334static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
3335 AArch64CC::CondCode &CondCode,
3336 AArch64CC::CondCode &CondCode2,
3337 bool &Invert) {
3338 Invert = false;
3339 switch (CC) {
3340 default:
3341 // Mostly the scalar mappings work fine.
3342 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3343 break;
3344 case ISD::SETUO:
3345 Invert = true;
3346 [[fallthrough]];
3347 case ISD::SETO:
3348 CondCode = AArch64CC::MI;
3349 CondCode2 = AArch64CC::GE;
3350 break;
3351 case ISD::SETUEQ:
3352 case ISD::SETULT:
3353 case ISD::SETULE:
3354 case ISD::SETUGT:
3355 case ISD::SETUGE:
3356 // All of the compare-mask comparisons are ordered, but we can switch
3357 // between the two by a double inversion. E.g. ULE == !OGT.
3358 Invert = true;
3359 changeFPCCToAArch64CC(CC: getSetCCInverse(Operation: CC, /* FP inverse */ Type: MVT::f32),
3360 CondCode, CondCode2);
3361 break;
3362 }
3363}
3364
3365static bool isLegalArithImmed(uint64_t C) {
3366 // Matches AArch64DAGToDAGISel::SelectArithImmed().
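// i.e. an unsigned 12-bit immediate, optionally shifted left by 12 bits
// (e.g. 0xFFF and 0xABC000 are legal arithmetic immediates, 0x1001 is not).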
3367 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3368 LLVM_DEBUG(dbgs() << "Is imm " << C
3369 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3370 return IsLegal;
3371}
3372
3373bool isLegalCmpImmed(APInt C) {
3374 // Works for negative immediates too, as it can be written as an ADDS
3375 // instruction with a negated immediate.
3376 return isLegalArithImmed(C: C.abs().getZExtValue());
3377}
3378
3379static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) {
3380 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3381 if (Op->getFlags().hasNoSignedWrap())
3382 return true;
3383
3384 // We can still figure out if the second operand is safe to use
3385 // in a CMN instruction by checking if it is known to be not the minimum
3386 // signed value. If it is not, then we can safely use CMN.
3387 // Note: We can eventually remove this check and simply rely on
3388 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3389 // consistently sets them appropriately when making said nodes.
3390
3391 KnownBits KnownSrc = DAG.computeKnownBits(Op: Op.getOperand(i: 1));
3392 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3393}
3394
3395 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3396// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3397// can be set differently by this operation. It comes down to whether
3398// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3399// everything is fine. If not then the optimization is wrong. Thus general
3400// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3401//
3402// So, finally, the only LLVM-native comparisons that don't mention C or V
3403// are the ones that aren't unsigned comparisons. They're the only ones we can
3404// safely use CMN for in the absence of information about op2.
3405static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3406 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)) &&
3407 (isIntEqualitySetCC(Code: CC) ||
3408 (isUnsignedIntSetCC(Code: CC) && DAG.isKnownNeverZero(Op: Op.getOperand(i: 1))) ||
3409 (isSignedIntSetCC(Code: CC) && isSafeSignedCMN(Op, DAG)));
3410}
3411
3412static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL,
3413 SelectionDAG &DAG, SDValue Chain,
3414 bool IsSignaling) {
3415 EVT VT = LHS.getValueType();
3416 assert(VT != MVT::f128);
3417
3418 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3419
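// There is no bf16 FCMP, and no f16 FCMP without full FP16 support, so
// promote such operands to f32 before emitting the strict comparison.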
3420 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3421 LHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
3422 Ops: {Chain, LHS});
3423 RHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
3424 Ops: {LHS.getValue(R: 1), RHS});
3425 Chain = RHS.getValue(R: 1);
3426 }
3427 unsigned Opcode =
3428 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3429 return DAG.getNode(Opcode, DL, ResultTys: {MVT::i32, MVT::Other}, Ops: {Chain, LHS, RHS});
3430}
3431
3432static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3433 const SDLoc &DL, SelectionDAG &DAG) {
3434 EVT VT = LHS.getValueType();
3435 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3436
3437 if (VT.isFloatingPoint()) {
3438 assert(VT != MVT::f128);
3439 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3440 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
3441 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
3442 }
3443 return DAG.getNode(Opcode: AArch64ISD::FCMP, DL, VT: MVT::i32, N1: LHS, N2: RHS);
3444 }
3445
3446 // The CMP instruction is just an alias for SUBS, and representing it as
3447 // SUBS means that it's possible to get CSE with subtract operations.
3448 // A later phase can perform the optimization of setting the destination
3449 // register to WZR/XZR if it ends up being unused.
3450 unsigned Opcode = AArch64ISD::SUBS;
3451
3452 if (isCMN(Op: RHS, CC, DAG)) {
3453 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3454 Opcode = AArch64ISD::ADDS;
3455 RHS = RHS.getOperand(i: 1);
3456 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
3457 isIntEqualitySetCC(Code: CC)) {
3458 // As we are looking for EQ/NE compares, the operands can be commuted; can
3459 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3460 Opcode = AArch64ISD::ADDS;
3461 LHS = LHS.getOperand(i: 1);
3462 } else if (isNullConstant(V: RHS) && !isUnsignedIntSetCC(Code: CC)) {
3463 if (LHS.getOpcode() == ISD::AND) {
3464 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3465 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3466 // of the signed comparisons.
3467 const SDValue ANDSNode =
3468 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC),
3469 N1: LHS.getOperand(i: 0), N2: LHS.getOperand(i: 1));
3470 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3471 DAG.ReplaceAllUsesWith(From: LHS, To: ANDSNode);
3472 return ANDSNode.getValue(R: 1);
3473 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3474 // Use result of ANDS
3475 return LHS.getValue(R: 1);
3476 }
3477 }
3478
3479 return DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: LHS, N2: RHS)
3480 .getValue(R: 1);
3481}
3482
3483/// \defgroup AArch64CCMP CMP;CCMP matching
3484///
3485/// These functions deal with the formation of CMP;CCMP;... sequences.
3486/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3487/// a comparison. They set the NZCV flags to a predefined value if their
3488 /// predicate is false. This makes it possible to express arbitrary conjunctions, for
3489/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3490/// expressed as:
3491/// cmp A
3492/// ccmp B, inv(CB), CA
3493/// check for CB flags
3494///
3495/// This naturally lets us implement chains of AND operations with SETCC
3496/// operands. And we can even implement some other situations by transforming
3497/// them:
3498/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3499 /// negating the flags used in the CCMP/FCCMP operation.
3500/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3501/// by negating the flags we test for afterwards. i.e.
3502/// NEG (CMP CCMP CCCMP ...) can be implemented.
3503/// - Note that we can only ever negate all previously processed results.
3504 /// What we cannot implement by flipping the flags to test is a negation
3505/// of two sub-trees (because the negation affects all sub-trees emitted so
3506/// far, so the 2nd sub-tree we emit would also affect the first).
3507/// With those tools we can implement some OR operations:
3508/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3509/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3510/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3511/// elimination rules from earlier to implement the whole thing as a
3512/// CCMP/FCCMP chain.
3513///
3514/// As complete example:
3515/// or (or (setCA (cmp A)) (setCB (cmp B)))
3516/// (and (setCC (cmp C)) (setCD (cmp D)))"
3517/// can be reassociated to:
3518 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3519 /// (or (setCA (cmp A)) (setCB (cmp B)))
3520/// can be transformed to:
3521/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3522/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3523/// which can be implemented as:
3524/// cmp C
3525/// ccmp D, inv(CD), CC
3526/// ccmp A, CA, inv(CD)
3527/// ccmp B, CB, inv(CA)
3528/// check for CB flags
3529///
3530/// A counterexample is "or (and A B) (and C D)" which translates to
3531 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); here we
3532 /// can implement only one of the inner (not) operations, not both!
3533/// @{
3534
3535/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3536static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3537 ISD::CondCode CC, SDValue CCOp,
3538 AArch64CC::CondCode Predicate,
3539 AArch64CC::CondCode OutCC,
3540 const SDLoc &DL, SelectionDAG &DAG) {
3541 unsigned Opcode = 0;
3542 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3543
3544 if (LHS.getValueType().isFloatingPoint()) {
3545 assert(LHS.getValueType() != MVT::f128);
3546 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3547 LHS.getValueType() == MVT::bf16) {
3548 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
3549 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
3550 }
3551 Opcode = AArch64ISD::FCCMP;
3552 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val&: RHS)) {
3553 APInt Imm = Const->getAPIntValue();
3554 if (Imm.isNegative() && Imm.sgt(RHS: -32)) {
3555 Opcode = AArch64ISD::CCMN;
3556 RHS = DAG.getConstant(Val: Imm.abs(), DL, VT: Const->getValueType(ResNo: 0));
3557 }
3558 } else if (isCMN(Op: RHS, CC, DAG)) {
3559 Opcode = AArch64ISD::CCMN;
3560 RHS = RHS.getOperand(i: 1);
3561 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) &&
3562 isIntEqualitySetCC(Code: CC)) {
3563 // As we are looking for EQ/NE compares, the operands can be commuted; can
3564 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3565 Opcode = AArch64ISD::CCMN;
3566 LHS = LHS.getOperand(i: 1);
3567 }
3568 if (Opcode == 0)
3569 Opcode = AArch64ISD::CCMP;
3570
3571 SDValue Condition = DAG.getConstant(Val: Predicate, DL, VT: MVT_CC);
3572 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3573 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
3574 SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32);
3575 return DAG.getNode(Opcode, DL, VT: MVT_CC, N1: LHS, N2: RHS, N3: NZCVOp, N4: Condition, N5: CCOp);
3576}
3577
3578/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3579/// expressed as a conjunction. See \ref AArch64CCMP.
3580/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3581/// changing the conditions on the SETCC tests.
3582/// (this means we can call emitConjunctionRec() with
3583/// Negate==true on this sub-tree)
3584/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3585/// cannot do the negation naturally. We are required to
3586/// emit the subtree first in this case.
3587 /// \param WillNegate Is true if we are called when the result of this
3588/// subexpression must be negated. This happens when the
3589/// outer expression is an OR. We can use this fact to know
3590/// that we have a double negation (or (or ...) ...) that
3591/// can be implemented for free.
3592static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3593 bool &MustBeFirst, bool WillNegate,
3594 unsigned Depth = 0) {
3595 if (!Val.hasOneUse())
3596 return false;
3597 unsigned Opcode = Val->getOpcode();
3598 if (Opcode == ISD::SETCC) {
3599 if (Val->getOperand(Num: 0).getValueType() == MVT::f128)
3600 return false;
3601 CanNegate = true;
3602 MustBeFirst = false;
3603 return true;
3604 }
3605 // Protect against exponential runtime and stack overflow.
3606 if (Depth > 6)
3607 return false;
3608 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3609 bool IsOR = Opcode == ISD::OR;
3610 SDValue O0 = Val->getOperand(Num: 0);
3611 SDValue O1 = Val->getOperand(Num: 1);
3612 bool CanNegateL;
3613 bool MustBeFirstL;
3614 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, Depth: Depth+1))
3615 return false;
3616 bool CanNegateR;
3617 bool MustBeFirstR;
3618 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, Depth: Depth+1))
3619 return false;
3620
3621 if (MustBeFirstL && MustBeFirstR)
3622 return false;
3623
3624 if (IsOR) {
3625 // For an OR expression we need to be able to naturally negate at least
3626 // one side or we cannot do the transformation at all.
3627 if (!CanNegateL && !CanNegateR)
3628 return false;
3629 // If the result of the OR will be negated and we can naturally negate
3630 // the leaves, then this sub-tree as a whole negates naturally.
3631 CanNegate = WillNegate && CanNegateL && CanNegateR;
3632 // If we cannot naturally negate the whole sub-tree, then this must be
3633 // emitted first.
3634 MustBeFirst = !CanNegate;
3635 } else {
3636 assert(Opcode == ISD::AND && "Must be OR or AND");
3637 // We cannot naturally negate an AND operation.
3638 CanNegate = false;
3639 MustBeFirst = MustBeFirstL || MustBeFirstR;
3640 }
3641 return true;
3642 }
3643 return false;
3644}
3645
3646 /// Emit a conjunction or disjunction tree with the CMP/FCMP followed by a
3647 /// chain of CCMP/CFCMP ops. See \ref AArch64CCMP.
3648 /// Tries to transform the given i1 producing node @p Val to a series of
3649 /// compare and conditional compare operations. @returns an NZCV flags
3650 /// producing node and sets @p OutCC to the flags that should be tested, or
3651 /// returns SDValue() if the transformation was not possible.
3652 /// \p Negate is true if we want this sub-tree to be negated just by changing
3653 /// SETCC conditions.
3654static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3655 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3656 AArch64CC::CondCode Predicate) {
3657 // If we're at a tree leaf, produce a compare or conditional compare operation.
3658 unsigned Opcode = Val->getOpcode();
3659 if (Opcode == ISD::SETCC) {
3660 SDValue LHS = Val->getOperand(Num: 0);
3661 SDValue RHS = Val->getOperand(Num: 1);
3662 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Val->getOperand(Num: 2))->get();
3663 bool isInteger = LHS.getValueType().isInteger();
3664 if (Negate)
3665 CC = getSetCCInverse(Operation: CC, Type: LHS.getValueType());
3666 SDLoc DL(Val);
3667 // Determine OutCC and handle FP special case.
3668 if (isInteger) {
3669 OutCC = changeIntCCToAArch64CC(CC);
3670 } else {
3671 assert(LHS.getValueType().isFloatingPoint());
3672 AArch64CC::CondCode ExtraCC;
3673 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
3674 // Some floating point conditions can't be tested with a single condition
3675 // code. Construct an additional comparison in this case.
3676 if (ExtraCC != AArch64CC::AL) {
3677 SDValue ExtraCmp;
3678 if (!CCOp.getNode())
3679 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3680 else
3681 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3682 OutCC: ExtraCC, DL, DAG);
3683 CCOp = ExtraCmp;
3684 Predicate = ExtraCC;
3685 }
3686 }
3687
3688 // Produce a normal comparison if we are first in the chain
3689 if (!CCOp)
3690 return emitComparison(LHS, RHS, CC, DL, DAG);
3691 // Otherwise produce a ccmp.
3692 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3693 DAG);
3694 }
3695 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3696
3697 bool IsOR = Opcode == ISD::OR;
3698
3699 SDValue LHS = Val->getOperand(Num: 0);
3700 bool CanNegateL;
3701 bool MustBeFirstL;
3702 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR);
3703 assert(ValidL && "Valid conjunction/disjunction tree");
3704 (void)ValidL;
3705
3706 SDValue RHS = Val->getOperand(Num: 1);
3707 bool CanNegateR;
3708 bool MustBeFirstR;
3709 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR);
3710 assert(ValidR && "Valid conjunction/disjunction tree");
3711 (void)ValidR;
3712
3713 // Swap sub-tree that must come first to the right side.
3714 if (MustBeFirstL) {
3715 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3716 std::swap(a&: LHS, b&: RHS);
3717 std::swap(a&: CanNegateL, b&: CanNegateR);
3718 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
3719 }
3720
3721 bool NegateR;
3722 bool NegateAfterR;
3723 bool NegateL;
3724 bool NegateAfterAll;
3725 if (Opcode == ISD::OR) {
3726 // Swap the sub-tree that we can negate naturally to the left.
3727 if (!CanNegateL) {
3728 assert(CanNegateR && "at least one side must be negatable");
3729 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3730 assert(!Negate);
3731 std::swap(a&: LHS, b&: RHS);
3732 NegateR = false;
3733 NegateAfterR = true;
3734 } else {
3735 // Negate the right sub-tree if possible, otherwise negate its result afterwards.
3736 NegateR = CanNegateR;
3737 NegateAfterR = !CanNegateR;
3738 }
3739 NegateL = true;
3740 NegateAfterAll = !Negate;
3741 } else {
3742 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3743 assert(!Negate && "Valid conjunction/disjunction tree");
3744
3745 NegateL = false;
3746 NegateR = false;
3747 NegateAfterR = false;
3748 NegateAfterAll = false;
3749 }
3750
3751 // Emit sub-trees.
3752 AArch64CC::CondCode RHSCC;
3753 SDValue CmpR = emitConjunctionRec(DAG, Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate);
3754 if (NegateAfterR)
3755 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
3756 SDValue CmpL = emitConjunctionRec(DAG, Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR, Predicate: RHSCC);
3757 if (NegateAfterAll)
3758 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
3759 return CmpL;
3760}
3761
3762/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
3763/// In some cases this is even possible with OR operations in the expression.
3764/// See \ref AArch64CCMP.
3765/// \see emitConjunctionRec().
3766static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3767 AArch64CC::CondCode &OutCC) {
3768 bool DummyCanNegate;
3769 bool DummyMustBeFirst;
3770 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false))
3771 return SDValue();
3772
3773 return emitConjunctionRec(DAG, Val, OutCC, Negate: false, CCOp: SDValue(), Predicate: AArch64CC::AL);
3774}
3775
3776/// @}
3777
3778/// Returns how profitable it is to fold a comparison's operand's shift and/or
3779/// extension operations.
3780static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3781 auto isSupportedExtend = [&](SDValue V) {
3782 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3783 return true;
3784
3785 if (V.getOpcode() == ISD::AND)
3786 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1))) {
3787 uint64_t Mask = MaskCst->getZExtValue();
3788 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3789 }
3790
3791 return false;
3792 };
3793
3794 if (!Op.hasOneUse())
3795 return 0;
3796
3797 if (isSupportedExtend(Op))
3798 return 1;
3799
3800 unsigned Opc = Op.getOpcode();
3801 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3802 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) {
3803 uint64_t Shift = ShiftCst->getZExtValue();
3804 if (isSupportedExtend(Op.getOperand(i: 0)))
3805 return (Shift <= 4) ? 2 : 1;
3806 EVT VT = Op.getValueType();
3807 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3808 return 1;
3809 }
3810
3811 return 0;
3812}
3813
3814 // Convert a comparison against one or negative one into a comparison against
3815 // zero, so that it folds better with emitComparison(). Note that this only
3816 // works for signed comparisons because of how ANDS works.
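// For example, ((x & m) slt 1) is rewritten as ((x & m) sle 0), and
// ((x & m) sgt -1) as ((x & m) sge 0), so the AND can later be selected as an
// ANDS whose flags are compared against zero.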
3817static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
3818 // Only works for ANDS and AND.
3819 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
3820 return false;
3821
3822 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
3823 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3824 return true;
3825 }
3826
3827 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
3828 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3829 return true;
3830 }
3831
3832 return false;
3833}
3834
3835static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3836 SDValue &AArch64cc, SelectionDAG &DAG,
3837 const SDLoc &DL) {
3838 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) {
3839 EVT VT = RHS.getValueType();
3840 APInt C = RHSC->getAPIntValue();
3841 // shouldBeAdjustedToZero is a special case to better fold with
3842 // emitComparison().
3843 if (shouldBeAdjustedToZero(LHS, C, CC)) {
3844 // Adjust the constant to zero.
3845 // CC has already been adjusted.
3846 RHS = DAG.getConstant(Val: 0, DL, VT);
3847 } else if (!isLegalCmpImmed(C)) {
3848 // Constant does not fit; try adjusting it by one.
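// For example (illustrative), 4097 is not a legal compare immediate (it is
// neither a 12-bit value nor a 12-bit value shifted left by 12), but
// "w0 slt 4097" is equivalent to "w0 sle 4096", and 4096 (0x1 << 12) is
// encodable, so the SETLT case below retries with C - 1.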
3849 switch (CC) {
3850 default:
3851 break;
3852 case ISD::SETLT:
3853 case ISD::SETGE:
3854 if (!C.isMinSignedValue()) {
3855 APInt CMinusOne = C - 1;
3856 if (isLegalCmpImmed(C: CMinusOne)) {
3857 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3858 RHS = DAG.getConstant(Val: CMinusOne, DL, VT);
3859 }
3860 }
3861 break;
3862 case ISD::SETULT:
3863 case ISD::SETUGE:
3864 if (!C.isZero()) {
3865 APInt CMinusOne = C - 1;
3866 if (isLegalCmpImmed(C: CMinusOne)) {
3867 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3868 RHS = DAG.getConstant(Val: CMinusOne, DL, VT);
3869 }
3870 }
3871 break;
3872 case ISD::SETLE:
3873 case ISD::SETGT:
3874 if (!C.isMaxSignedValue()) {
3875 APInt CPlusOne = C + 1;
3876 if (isLegalCmpImmed(C: CPlusOne)) {
3877 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3878 RHS = DAG.getConstant(Val: CPlusOne, DL, VT);
3879 }
3880 }
3881 break;
3882 case ISD::SETULE:
3883 case ISD::SETUGT:
3884 if (!C.isAllOnes()) {
3885 APInt CPlusOne = C + 1;
3886 if (isLegalCmpImmed(C: CPlusOne)) {
3887 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3888 RHS = DAG.getConstant(Val: CPlusOne, DL, VT);
3889 }
3890 }
3891 break;
3892 }
3893 }
3894 }
3895
3896 // Comparisons are canonicalized so that the RHS operand is simpler than the
3897 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3898 // can fold some shift+extend operations on the RHS operand, so swap the
3899 // operands if that can be done.
3900 //
3901 // For example:
3902 // lsl w13, w11, #1
3903 // cmp w13, w12
3904 // can be turned into:
3905 // cmp w12, w11, lsl #1
3906 if (!isa<ConstantSDNode>(Val: RHS) || !isLegalCmpImmed(C: RHS->getAsAPIntVal())) {
3907 bool LHSIsCMN = isCMN(Op: LHS, CC, DAG);
3908 bool RHSIsCMN = isCMN(Op: RHS, CC, DAG);
3909 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(i: 1) : LHS;
3910 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(i: 1) : RHS;
3911
3912 if (getCmpOperandFoldingProfit(Op: TheLHS) + (LHSIsCMN ? 1 : 0) >
3913 getCmpOperandFoldingProfit(Op: TheRHS) + (RHSIsCMN ? 1 : 0)) {
3914 std::swap(a&: LHS, b&: RHS);
3915 CC = ISD::getSetCCSwappedOperands(Operation: CC);
3916 }
3917 }
3918
3919 SDValue Cmp;
3920 AArch64CC::CondCode AArch64CC;
3921 if (isIntEqualitySetCC(Code: CC) && isa<ConstantSDNode>(Val: RHS)) {
3922 const ConstantSDNode *RHSC = cast<ConstantSDNode>(Val&: RHS);
3923
3924 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3925 // For the i8 operand, the largest immediate is 255, so this can be easily
3926 // encoded in the compare instruction. For the i16 operand, however, the
3927 // largest immediate cannot be encoded in the compare.
3928 // Therefore, use a sign extending load and cmn to avoid materializing the
3929 // -1 constant. For example,
3930 // movz w1, #65535
3931 // ldrh w0, [x0, #0]
3932 // cmp w0, w1
3933 // >
3934 // ldrsh w0, [x0, #0]
3935 // cmn w0, #1
3936 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3937 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3938 // ensure both the LHS and RHS are truly zero extended and to make sure the
3939 // transformation is profitable.
3940 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(Val: LHS) &&
3941 cast<LoadSDNode>(Val&: LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3942 cast<LoadSDNode>(Val&: LHS)->getMemoryVT() == MVT::i16 &&
3943 LHS.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) {
3944 int16_t ValueofRHS = RHS->getAsZExtVal();
3945 if (ValueofRHS < 0 && isLegalArithImmed(C: -ValueofRHS)) {
3946 SDValue SExt =
3947 DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: LHS.getValueType(), N1: LHS,
3948 N2: DAG.getValueType(MVT::i16));
3949 Cmp = emitComparison(
3950 LHS: SExt, RHS: DAG.getSignedConstant(Val: ValueofRHS, DL, VT: RHS.getValueType()), CC,
3951 DL, DAG);
3952 AArch64CC = changeIntCCToAArch64CC(CC);
3953 }
3954 }
3955
3956 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3957 if ((Cmp = emitConjunction(DAG, Val: LHS, OutCC&: AArch64CC))) {
3958 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3959 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
3960 }
3961 }
3962 }
3963
3964 if (!Cmp) {
3965 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
3966 AArch64CC = changeIntCCToAArch64CC(CC);
3967 }
3968 AArch64cc = DAG.getConstant(Val: AArch64CC, DL, VT: MVT_CC);
3969 return Cmp;
3970}
3971
3972static std::pair<SDValue, SDValue>
3973getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3974 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3975 "Unsupported value type");
3976 SDValue Value, Overflow;
3977 SDLoc DL(Op);
3978 SDValue LHS = Op.getOperand(i: 0);
3979 SDValue RHS = Op.getOperand(i: 1);
3980 unsigned Opc = 0;
3981 switch (Op.getOpcode()) {
3982 default:
3983 llvm_unreachable("Unknown overflow instruction!");
3984 case ISD::SADDO:
3985 Opc = AArch64ISD::ADDS;
3986 CC = AArch64CC::VS;
3987 break;
3988 case ISD::UADDO:
3989 Opc = AArch64ISD::ADDS;
3990 CC = AArch64CC::HS;
3991 break;
3992 case ISD::SSUBO:
3993 Opc = AArch64ISD::SUBS;
3994 CC = AArch64CC::VS;
3995 break;
3996 case ISD::USUBO:
3997 Opc = AArch64ISD::SUBS;
3998 CC = AArch64CC::LO;
3999 break;
4000 // Multiply needs a little bit of extra work.
4001 case ISD::SMULO:
4002 case ISD::UMULO: {
4003 CC = AArch64CC::NE;
4004 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4005 if (Op.getValueType() == MVT::i32) {
4006 // Extend to 64-bits, then perform a 64-bit multiply.
4007 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4008 LHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: LHS);
4009 RHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: RHS);
4010 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4011 Value = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: Mul);
4012
4013 // Check that the result fits into a 32-bit integer.
4014 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT_CC);
4015 if (IsSigned) {
4016 // cmp xreg, wreg, sxtw
4017 SDValue SExtMul = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Value);
4018 Overflow =
4019 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Mul, N2: SExtMul).getValue(R: 1);
4020 } else {
4021 // tst xreg, #0xffffffff00000000
4022 SDValue UpperBits = DAG.getConstant(Val: 0xFFFFFFFF00000000, DL, VT: MVT::i64);
4023 Overflow =
4024 DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: VTs, N1: Mul, N2: UpperBits).getValue(R: 1);
4025 }
4026 break;
4027 }
4028 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4029 // For the 64-bit multiply:
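// Overflow is detected from the high half of the 128-bit product: for the
// signed case the product overflows iff the high 64 bits differ from the
// sign-extension of the low 64 bits, and for the unsigned case iff the high
// 64 bits are non-zero (e.g. 2^32 * 2^32 has a low half of 0 and a high half
// of 1, so it overflows).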
4030 Value = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4031 if (IsSigned) {
4032 SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHS, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4033 SDValue LowerBits = DAG.getNode(Opcode: ISD::SRA, DL, VT: MVT::i64, N1: Value,
4034 N2: DAG.getConstant(Val: 63, DL, VT: MVT::i64));
4035 // It is important that LowerBits is last, otherwise the arithmetic
4036 // shift will not be folded into the compare (SUBS).
4037 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32);
4038 Overflow = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: UpperBits, N2: LowerBits)
4039 .getValue(R: 1);
4040 } else {
4041 SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHU, DL, VT: MVT::i64, N1: LHS, N2: RHS);
4042 SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32);
4043 Overflow =
4044 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs,
4045 N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64),
4046 N2: UpperBits).getValue(R: 1);
4047 }
4048 break;
4049 }
4050 } // switch (...)
4051
4052 if (Opc) {
4053 SDVTList VTs = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::i32);
4054
4055 // Emit the AArch64 operation with overflow check.
4056 Value = DAG.getNode(Opcode: Opc, DL, VTList: VTs, N1: LHS, N2: RHS);
4057 Overflow = Value.getValue(R: 1);
4058 }
4059 return std::make_pair(x&: Value, y&: Overflow);
4060}
4061
4062SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4063 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
4064 OverrideNEON: !Subtarget->isNeonAvailable()))
4065 return LowerToScalableOp(Op, DAG);
4066
4067 SDValue Sel = Op.getOperand(i: 0);
4068 SDValue Other = Op.getOperand(i: 1);
4069 SDLoc DL(Sel);
4070
4071 // If the operand is an overflow checking operation, invert the condition
4072 // code and kill the Not operation. I.e., transform:
4073 // (xor overflow_op_bool, 1)
4074 // -->
4075 // (csel 1, 0, invert(cc), overflow_op_bool)
4076 // ... which later gets transformed to just a cset instruction with an
4077 // inverted condition code, rather than a cset + eor sequence.
4078 if (isOneConstant(V: Other) && ISD::isOverflowIntrOpRes(Op: Sel)) {
4079 // Only lower legal XALUO ops.
4080 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Sel->getValueType(ResNo: 0)))
4081 return SDValue();
4082
4083 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
4084 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
4085 AArch64CC::CondCode CC;
4086 SDValue Value, Overflow;
4087 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op: Sel.getValue(R: 0), DAG);
4088 SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32);
4089 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
4090 N3: CCVal, N4: Overflow);
4091 }
4092 // If neither operand is a SELECT_CC, give up.
4093 if (Sel.getOpcode() != ISD::SELECT_CC)
4094 std::swap(a&: Sel, b&: Other);
4095 if (Sel.getOpcode() != ISD::SELECT_CC)
4096 return Op;
4097
4098 // The folding we want to perform is:
4099 // (xor x, (select_cc a, b, cc, 0, -1) )
4100 // -->
4101 // (csel x, (xor x, -1), cc ...)
4102 //
4103 // The latter will get matched to a CSINV instruction.
4104
4105 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Sel.getOperand(i: 4))->get();
4106 SDValue LHS = Sel.getOperand(i: 0);
4107 SDValue RHS = Sel.getOperand(i: 1);
4108 SDValue TVal = Sel.getOperand(i: 2);
4109 SDValue FVal = Sel.getOperand(i: 3);
4110
4111 // FIXME: This could be generalized to non-integer comparisons.
4112 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4113 return Op;
4114
4115 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
4116 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
4117
4118 // The values aren't constants, this isn't the pattern we're looking for.
4119 if (!CFVal || !CTVal)
4120 return Op;
4121
4122 // We can commute the SELECT_CC by inverting the condition. This
4123 // might be needed to make this fit into a CSINV pattern.
4124 if (CTVal->isAllOnes() && CFVal->isZero()) {
4125 std::swap(a&: TVal, b&: FVal);
4126 std::swap(a&: CTVal, b&: CFVal);
4127 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
4128 }
4129
4130 // If the constants line up, perform the transform!
4131 if (CTVal->isZero() && CFVal->isAllOnes()) {
4132 SDValue CCVal;
4133 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, DL);
4134
4135 FVal = Other;
4136 TVal = DAG.getNode(Opcode: ISD::XOR, DL, VT: Other.getValueType(), N1: Other,
4137 N2: DAG.getAllOnesConstant(DL, VT: Other.getValueType()));
4138
4139 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Sel.getValueType(), N1: FVal, N2: TVal,
4140 N3: CCVal, N4: Cmp);
4141 }
4142
4143 return Op;
4144}
4145
4146// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4147// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4148// sets 'C' bit to 0.
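// This relies on how SUBS computes the carry flag: C is set iff the
// subtraction does not borrow (unsigned Op0 >= Op1). With Invert == false we
// emit SUBS Value, #1, so C == 1 exactly when Value != 0; with Invert == true
// we emit SUBS 0, Value, so C == 1 exactly when Value == 0.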
4149static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4150 SDLoc DL(Value);
4151 EVT VT = Value.getValueType();
4152 SDValue Op0 = Invert ? DAG.getConstant(Val: 0, DL, VT) : Value;
4153 SDValue Op1 = Invert ? Value : DAG.getConstant(Val: 1, DL, VT);
4154 SDValue Cmp =
4155 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue), N1: Op0, N2: Op1);
4156 return Cmp.getValue(R: 1);
4157}
4158
4159// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4160// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4161static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4162 bool Invert) {
4163 assert(Glue.getResNo() == 1);
4164 SDLoc DL(Glue);
4165 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
4166 SDValue One = DAG.getConstant(Val: 1, DL, VT);
4167 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4168 SDValue CC = DAG.getConstant(Val: Cond, DL, VT: MVT::i32);
4169 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
4170}
4171
4172// Value is 1 if 'V' bit of NZCV is 1, else 0
4173static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4174 assert(Glue.getResNo() == 1);
4175 SDLoc DL(Glue);
4176 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
4177 SDValue One = DAG.getConstant(Val: 1, DL, VT);
4178 SDValue CC = DAG.getConstant(Val: AArch64CC::VS, DL, VT: MVT::i32);
4179 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue);
4180}
4181
4182// This lowering is inefficient, but it will get cleaned up by
4183// `foldOverflowCheck`
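// The shape produced here is: a SUBS to turn the incoming carry value back
// into the C flag, the ADCS/SBCS itself, and a CSEL to turn the resulting
// carry/overflow flag back into a value. When the outgoing flag only feeds
// another carry-using operation, foldOverflowCheck is expected to remove the
// value<->flag round trips again.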
4184static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4185 unsigned Opcode, bool IsSigned) {
4186 EVT VT0 = Op.getValue(R: 0).getValueType();
4187 EVT VT1 = Op.getValue(R: 1).getValueType();
4188
4189 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4190 return SDValue();
4191
4192 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4193 SDValue OpLHS = Op.getOperand(i: 0);
4194 SDValue OpRHS = Op.getOperand(i: 1);
4195 SDValue OpCarryIn = valueToCarryFlag(Value: Op.getOperand(i: 2), DAG, Invert: InvertCarry);
4196
4197 SDLoc DL(Op);
4198 SDVTList VTs = DAG.getVTList(VT1: VT0, VT2: VT1);
4199
4200 SDValue Sum = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: VT0, VT2: MVT::Glue), N1: OpLHS,
4201 N2: OpRHS, N3: OpCarryIn);
4202
4203 SDValue OutFlag =
4204 IsSigned ? overflowFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG)
4205 : carryFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG, Invert: InvertCarry);
4206
4207 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Sum, N2: OutFlag);
4208}
4209
4210static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4211 // Let legalize expand this if it isn't a legal type yet.
4212 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType()))
4213 return SDValue();
4214
4215 SDLoc DL(Op);
4216 AArch64CC::CondCode CC;
4217 // The actual operation that sets the overflow or carry flag.
4218 SDValue Value, Overflow;
4219 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4220
4221 // We use 0 and 1 as false and true values.
4222 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
4223 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: MVT::i32);
4224
4225 // We use an inverted condition, because the conditional select is inverted
4226 // too. This will allow it to be selected to a single instruction:
4227 // CSINC Wd, WZR, WZR, invert(cond).
4228 SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32);
4229 Overflow =
4230 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: MVT::i32, N1: FVal, N2: TVal, N3: CCVal, N4: Overflow);
4231
4232 SDVTList VTs = DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::i32);
4233 return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Value, N2: Overflow);
4234}
4235
4236// Prefetch operands are:
4237// 1: Address to prefetch
4238// 2: bool isWrite
4239// 3: int locality (0 = no locality ... 3 = extreme locality)
4240// 4: bool isDataCache
4241static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4242 SDLoc DL(Op);
4243 unsigned IsWrite = Op.getConstantOperandVal(i: 2);
4244 unsigned Locality = Op.getConstantOperandVal(i: 3);
4245 unsigned IsData = Op.getConstantOperandVal(i: 4);
4246
4247 bool IsStream = !Locality;
4248 // When the locality number is non-zero, remap it to the target cache level.
4249 if (Locality) {
4250 // The front-end should have filtered out the out-of-range values
4251 assert(Locality <= 3 && "Prefetch locality out-of-range");
4252 // The locality degree is the opposite of the cache level: higher locality
4253 // means a closer (faster) cache. Invert the number, since the PRFM
4254 // encoding starts at 0 for the L1 cache.
4255 Locality = 3 - Locality;
4256 }
4257
4258 // Build the mask value encoding the expected behavior.
4259 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4260 (!IsData << 3) | // IsDataCache bit
4261 (Locality << 1) | // Cache level bits
4262 (unsigned)IsStream; // Stream bit
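// For example (illustrative), __builtin_prefetch(p, /*rw=*/0, /*locality=*/3)
// on the data cache arrives here as IsWrite=0, Locality=3, IsData=1; Locality
// is remapped to 0 and IsStream is false, giving PrfOp = 0b00000, i.e. the
// PLDL1KEEP hint. A locality of 0 instead sets the stream bit and selects
// PLDL1STRM.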
4263 return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Op.getOperand(i: 0),
4264 N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32),
4265 N3: Op.getOperand(i: 1));
4266}
4267
4268 // Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4269 // is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4270 // SUBS (AND X Y) Z, which works better with emitComparison().
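// For example, "(x & 0xff) u< 16" becomes "(x & 0xf0) == 0": bits of x below
// bit 4 cannot push the masked value to 16 or above, so only the remaining
// mask bits need to be tested.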
4271static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4272 SelectionDAG &DAG, const SDLoc DL) {
4273 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4274 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
4275 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(Val&: RHS);
4276 if (LHSConstOp && RHSConst) {
4277 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4278 uint64_t RHSConstant = RHSConst->getZExtValue();
4279 if (isPowerOf2_64(Value: RHSConstant)) {
4280 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4281 LHS =
4282 DAG.getNode(Opcode: ISD::AND, DL, VT: LHS.getValueType(), N1: LHS.getOperand(i: 0),
4283 N2: DAG.getConstant(Val: NewMaskValue, DL, VT: LHS.getValueType()));
4284 RHS = DAG.getConstant(Val: 0, DL, VT: RHS.getValueType());
4285 CC = ISD::SETEQ;
4286 }
4287 }
4288 }
4289}
4290
4291SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4292 SelectionDAG &DAG) const {
4293 EVT VT = Op.getValueType();
4294 if (VT.isScalableVector()) {
4295 SDValue SrcVal = Op.getOperand(i: 0);
4296
4297 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4298 // Break conversion in two with the first part converting to f32 and the
4299 // second using native f32->VT instructions.
4300 SDLoc DL(Op);
4301 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT,
4302 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::nxv2f32, Operand: SrcVal));
4303 }
4304
4305 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4306 }
4307
4308 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
4309 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4310
4311 bool IsStrict = Op->isStrictFPOpcode();
4312 SDValue Op0 = Op.getOperand(i: IsStrict ? 1 : 0);
4313 EVT Op0VT = Op0.getValueType();
4314 if (VT == MVT::f64) {
4315 // Extensions from f32 or f16 to f64 are legal.
4316 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4317 return Op;
4318 // Split bf16->f64 extends into two fpextends.
4319 if (Op0VT == MVT::bf16 && IsStrict) {
4320 SDValue Ext1 =
4321 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: SDLoc(Op), ResultTys: {MVT::f32, MVT::Other},
4322 Ops: {Op0, Op.getOperand(i: 0)});
4323 return DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: SDLoc(Op), ResultTys: {VT, MVT::Other},
4324 Ops: {Ext1, Ext1.getValue(R: 1)});
4325 }
4326 if (Op0VT == MVT::bf16)
4327 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT,
4328 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: MVT::f32, Operand: Op0));
4329 return SDValue();
4330 }
4331
4332 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4333 return SDValue();
4334}
4335
4336SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4337 SelectionDAG &DAG) const {
4338 EVT VT = Op.getValueType();
4339 bool IsStrict = Op->isStrictFPOpcode();
4340 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4341 EVT SrcVT = SrcVal.getValueType();
4342 bool Trunc = Op.getConstantOperandVal(i: IsStrict ? 2 : 1) == 1;
4343
4344 if (VT.isScalableVector()) {
4345 // Let common code split the operation.
4346 if (SrcVT == MVT::nxv8f32)
4347 return Op;
4348
4349 if (VT.getScalarType() != MVT::bf16)
4350 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4351
4352 SDLoc DL(Op);
4353 constexpr EVT I32 = MVT::nxv4i32;
4354 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(Val: I, DL, VT: I32); };
4355
4356 SDValue NaN;
4357 SDValue Narrow;
4358
4359 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4360 if (Subtarget->hasBF16())
4361 return LowerToPredicatedOp(Op, DAG,
4362 NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4363
4364 Narrow = getSVESafeBitCast(VT: I32, Op: SrcVal, DAG);
4365
4366 // Set the quiet bit (bit 22) so a signalling NaN input yields a quiet NaN.
4367 if (!DAG.isKnownNeverSNaN(Op: SrcVal))
4368 NaN = DAG.getNode(Opcode: ISD::OR, DL, VT: I32, N1: Narrow, N2: ImmV(0x400000));
4369 } else if (SrcVT == MVT::nxv2f64 &&
4370 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4371 // Narrow to f32 with FCVTX (round to odd) to avoid double rounding, then try again.
4372 SDValue Pg = getPredicateForVector(DAG, DL, VT: MVT::nxv2f32);
4373 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, VT: MVT::nxv2f32,
4374 N1: Pg, N2: SrcVal, N3: DAG.getUNDEF(VT: MVT::nxv2f32));
4375
4376 SmallVector<SDValue, 3> NewOps;
4377 if (IsStrict)
4378 NewOps.push_back(Elt: Op.getOperand(i: 0));
4379 NewOps.push_back(Elt: Narrow);
4380 NewOps.push_back(Elt: Op.getOperand(i: IsStrict ? 2 : 1));
4381 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT, Ops: NewOps, Flags: Op->getFlags());
4382 } else
4383 return SDValue();
4384
4385 if (!Trunc) {
4386 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow, N2: ImmV(16));
4387 Lsb = DAG.getNode(Opcode: ISD::AND, DL, VT: I32, N1: Lsb, N2: ImmV(1));
4388 SDValue RoundingBias = DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: Lsb, N2: ImmV(0x7fff));
4389 Narrow = DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: Narrow, N2: RoundingBias);
4390 }
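// The bias above implements round-to-nearest-even on the f32 bit pattern:
// RoundingBias is 0x7fff plus the would-be bf16 lsb, so discarded low halves
// above 0x8000 always carry into the kept bits, halves below 0x8000 never do,
// and an exact 0x8000 tie carries only when the kept lsb is already 1
// (rounding to even).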
4391
4392 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4393 // 0x80000000.
4394 if (NaN) {
4395 EVT I1 = I32.changeElementType(EltVT: MVT::i1);
4396 EVT CondVT = VT.changeElementType(EltVT: MVT::i1);
4397 SDValue IsNaN = DAG.getSetCC(DL, VT: CondVT, LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4398 IsNaN = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: I1, Operand: IsNaN);
4399 Narrow = DAG.getSelect(DL, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4400 }
4401
4402 // Now that we have rounded, shift the bits into position.
4403 Narrow = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow, N2: ImmV(16));
4404 return getSVESafeBitCast(VT, Op: Narrow, DAG);
4405 }
4406
4407 if (useSVEForFixedLengthVectorVT(VT: SrcVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4408 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4409
4410 // Expand cases where the result type is BF16 but we don't have hardware
4411 // instructions to lower it.
4412 if (VT.getScalarType() == MVT::bf16 &&
4413 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4414 Subtarget->hasBF16())) {
4415 SDLoc DL(Op);
4416 SDValue Narrow = SrcVal;
4417 SDValue NaN;
4418 EVT I32 = SrcVT.changeElementType(EltVT: MVT::i32);
4419 EVT F32 = SrcVT.changeElementType(EltVT: MVT::f32);
4420 if (SrcVT.getScalarType() == MVT::f32) {
4421 bool NeverSNaN = DAG.isKnownNeverSNaN(Op: Narrow);
4422 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: I32, Operand: Narrow);
4423 if (!NeverSNaN) {
4424 // Set the quiet bit (bit 22) so a signalling NaN input yields a quiet NaN.
4425 NaN = DAG.getNode(Opcode: ISD::OR, DL, VT: I32, N1: Narrow,
4426 N2: DAG.getConstant(Val: 0x400000, DL, VT: I32));
4427 }
4428 } else if (SrcVT.getScalarType() == MVT::f64) {
4429 Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: F32, Operand: Narrow);
4430 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: I32, Operand: Narrow);
4431 } else {
4432 return SDValue();
4433 }
4434 if (!Trunc) {
4435 SDValue One = DAG.getConstant(Val: 1, DL, VT: I32);
4436 SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow,
4437 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL));
4438 Lsb = DAG.getNode(Opcode: ISD::AND, DL, VT: I32, N1: Lsb, N2: One);
4439 SDValue RoundingBias =
4440 DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: DAG.getConstant(Val: 0x7fff, DL, VT: I32), N2: Lsb);
4441 Narrow = DAG.getNode(Opcode: ISD::ADD, DL, VT: I32, N1: Narrow, N2: RoundingBias);
4442 }
4443
4444 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4445 // 0x80000000.
4446 if (NaN) {
4447 SDValue IsNaN = DAG.getSetCC(
4448 DL, VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT),
4449 LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO);
4450 Narrow = DAG.getSelect(DL, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow);
4451 }
4452
4453 // Now that we have rounded, shift the bits into position.
4454 Narrow = DAG.getNode(Opcode: ISD::SRL, DL, VT: I32, N1: Narrow,
4455 N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL));
4456 if (VT.isVector()) {
4457 EVT I16 = I32.changeVectorElementType(EltVT: MVT::i16);
4458 Narrow = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: I16, Operand: Narrow);
4459 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Narrow);
4460 }
4461 Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: F32, Operand: Narrow);
4462 SDValue Result = DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT, Operand: Narrow);
4463 return IsStrict ? DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl: DL)
4464 : Result;
4465 }
4466
4467 if (SrcVT != MVT::f128) {
4468 // Expand cases where the input is a vector bigger than NEON.
4469 if (useSVEForFixedLengthVectorVT(VT: SrcVT))
4470 return SDValue();
4471
4472 // It's legal except when f128 is involved
4473 return Op;
4474 }
4475
4476 return SDValue();
4477}
4478
4479SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4480 SelectionDAG &DAG) const {
4481 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4482 // Any additional optimization in this function should be recorded
4483 // in the cost tables.
4484 bool IsStrict = Op->isStrictFPOpcode();
4485 EVT InVT = Op.getOperand(i: IsStrict ? 1 : 0).getValueType();
4486 EVT VT = Op.getValueType();
4487
4488 assert(!(IsStrict && VT.isScalableVector()) &&
4489 "Unimplemented SVE support for STRICT_FP_to_INT!");
4490
4491 // f16 conversions are promoted to f32 when full fp16 is not supported.
4492 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4493 InVT.getVectorElementType() == MVT::bf16) {
4494 EVT NewVT = VT.changeElementType(EltVT: MVT::f32);
4495 SDLoc DL(Op);
4496 if (IsStrict) {
4497 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {NewVT, MVT::Other},
4498 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4499 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {VT, MVT::Other},
4500 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4501 }
4502 return DAG.getNode(
4503 Opcode: Op.getOpcode(), DL, VT: Op.getValueType(),
4504 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: NewVT, Operand: Op.getOperand(i: 0)));
4505 }
4506
4507 if (VT.isScalableVector()) {
4508 if (VT.getVectorElementType() == MVT::i1) {
4509 SDLoc DL(Op);
4510 EVT CvtVT = getPromotedVTForPredicate(VT);
4511 SDValue Cvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: CvtVT, Operand: Op.getOperand(i: 0));
4512 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: CvtVT);
4513 return DAG.getSetCC(DL, VT, LHS: Cvt, RHS: Zero, Cond: ISD::SETNE);
4514 }
4515
4516 // Let common code split the operation.
4517 if (InVT == MVT::nxv8f32)
4518 return Op;
4519
4520 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4521 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4522 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4523 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4524 }
4525
4526 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4527 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4528 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4529
4530 uint64_t VTSize = VT.getFixedSizeInBits();
4531 uint64_t InVTSize = InVT.getFixedSizeInBits();
4532 if (VTSize < InVTSize) {
4533 SDLoc DL(Op);
4534 if (IsStrict) {
4535 InVT = InVT.changeVectorElementTypeToInteger();
4536 SDValue Cv = DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {InVT, MVT::Other},
4537 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4538 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Cv);
4539 return DAG.getMergeValues(Ops: {Trunc, Cv.getValue(R: 1)}, dl: DL);
4540 }
4541 SDValue Cv =
4542 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: InVT.changeVectorElementTypeToInteger(),
4543 Operand: Op.getOperand(i: 0));
4544 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Cv);
4545 }
4546
4547 if (VTSize > InVTSize) {
4548 SDLoc DL(Op);
4549 MVT ExtVT =
4550 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: VT.getScalarSizeInBits()),
4551 NumElements: VT.getVectorNumElements());
4552 if (IsStrict) {
4553 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {ExtVT, MVT::Other},
4554 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
4555 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {VT, MVT::Other},
4556 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4557 }
4558 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: ExtVT, Operand: Op.getOperand(i: 0));
4559 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT, Operand: Ext);
4560 }
4561
4562 // Use a scalar operation for conversions between single-element vectors of
4563 // the same size.
4564 if (InVT.getVectorNumElements() == 1) {
4565 SDLoc DL(Op);
4566 SDValue Extract = DAG.getNode(
4567 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: InVT.getScalarType(),
4568 N1: Op.getOperand(i: IsStrict ? 1 : 0), N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
4569 EVT ScalarVT = VT.getScalarType();
4570 if (IsStrict)
4571 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {ScalarVT, MVT::Other},
4572 Ops: {Op.getOperand(i: 0), Extract});
4573 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ScalarVT, Operand: Extract);
4574 }
4575
4576 // Type changing conversions are illegal.
4577 return Op;
4578}
4579
4580SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4581 SelectionDAG &DAG) const {
4582 bool IsStrict = Op->isStrictFPOpcode();
4583 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4584
4585 if (SrcVal.getValueType().isVector())
4586 return LowerVectorFP_TO_INT(Op, DAG);
4587
4588 // f16 conversions are promoted to f32 when full fp16 is not supported.
4589 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4590 SrcVal.getValueType() == MVT::bf16) {
4591 SDLoc DL(Op);
4592 if (IsStrict) {
4593 SDValue Ext =
4594 DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
4595 Ops: {Op.getOperand(i: 0), SrcVal});
4596 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {Op.getValueType(), MVT::Other},
4597 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
4598 }
4599 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(),
4600 Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: SrcVal));
4601 }
4602
4603 if (SrcVal.getValueType() != MVT::f128) {
4604 // It's legal except when f128 is involved
4605 return Op;
4606 }
4607
4608 return SDValue();
4609}
4610
4611SDValue
4612AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4613 SelectionDAG &DAG) const {
4614 // AArch64 FP-to-int conversions saturate to the destination element size, so
4615 // we can lower common saturating conversions to simple instructions.
4616 SDValue SrcVal = Op.getOperand(i: 0);
4617 EVT SrcVT = SrcVal.getValueType();
4618 EVT DstVT = Op.getValueType();
4619 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4620
4621 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4622 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4623 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4624 assert(SatWidth <= DstElementWidth &&
4625 "Saturation width cannot exceed result width");
4626
4627 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4628 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4629 // types, so this is hard to reach.
4630 if (DstVT.isScalableVector())
4631 return SDValue();
4632
4633 EVT SrcElementVT = SrcVT.getVectorElementType();
4634
4635 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4636 SDLoc DL(Op);
4637 SDValue SrcVal2;
4638 if ((SrcElementVT == MVT::f16 &&
4639 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4640 SrcElementVT == MVT::bf16) {
4641 MVT F32VT = MVT::getVectorVT(VT: MVT::f32, NumElements: SrcVT.getVectorNumElements());
4642 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: F32VT, Operand: SrcVal);
4643 // If we are extending to a v8f32, split into two v4f32 to produce legal
4644 // types.
4645 if (F32VT.getSizeInBits() > 128) {
4646 std::tie(args&: SrcVal, args&: SrcVal2) = DAG.SplitVector(N: SrcVal, DL);
4647 F32VT = F32VT.getHalfNumVectorElementsVT();
4648 }
4649 SrcVT = F32VT;
4650 SrcElementVT = MVT::f32;
4651 SrcElementWidth = 32;
4652 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4653 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4654 return SDValue();
4655
4656 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4657 // width and produce a fcvtzu.
4658 if (SatWidth == 64 && SrcElementWidth < 64) {
4659 MVT F64VT = MVT::getVectorVT(VT: MVT::f64, NumElements: SrcVT.getVectorNumElements());
4660 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: F64VT, Operand: SrcVal);
4661 SrcVT = F64VT;
4662 SrcElementVT = MVT::f64;
4663 SrcElementWidth = 64;
4664 }
4665 // Cases that we can emit directly.
4666 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4667 SDValue Res = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4668 N2: DAG.getValueType(DstVT.getScalarType()));
4669 if (SrcVal2) {
4670 SDValue Res2 = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal2,
4671 N2: DAG.getValueType(DstVT.getScalarType()));
4672 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DstVT, N1: Res, N2: Res2);
4673 }
4674 return Res;
4675 }
4676
4677 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4678 // result. This is only valid if the legal cvt is larger than the saturate
4679 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4680 // (at least until sqxtn is selected).
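// For example (illustrative), a v4f32 fptosi_sat with an i8 saturation width
// is emitted as an fcvtzs to v4i32 (which already saturates at the i32
// bounds), clamped with an ISD::SMIN against 127 and ISD::SMAX against -128,
// and then truncated to the destination type.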
4681 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4682 return SDValue();
4683
4684 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4685 SDValue NativeCvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal,
4686 N2: DAG.getValueType(IntVT.getScalarType()));
4687 SDValue NativeCvt2 =
4688 SrcVal2 ? DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal2,
4689 N2: DAG.getValueType(IntVT.getScalarType()))
4690 : SDValue();
4691 SDValue Sat, Sat2;
4692 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4693 SDValue MinC = DAG.getConstant(
4694 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4695 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4696 SDValue Min2 = SrcVal2 ? DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt2, N2: MinC) : SDValue();
4697 SDValue MaxC = DAG.getConstant(
4698 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT);
4699 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min, N2: MaxC);
4700 Sat2 = SrcVal2 ? DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min2, N2: MaxC) : SDValue();
4701 } else {
4702 SDValue MinC = DAG.getConstant(
4703 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: SrcElementWidth), DL, VT: IntVT);
4704 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC);
4705 Sat2 = SrcVal2 ? DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt2, N2: MinC) : SDValue();
4706 }
4707
4708 if (SrcVal2)
4709 Sat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
4710 VT: IntVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
4711 N1: Sat, N2: Sat2);
4712
4713 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4714}
4715
4716SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4717 SelectionDAG &DAG) const {
4718 // AArch64 FP-to-int conversions saturate to the destination register size, so
4719 // we can lower common saturating conversions to simple instructions.
4720 SDValue SrcVal = Op.getOperand(i: 0);
4721 EVT SrcVT = SrcVal.getValueType();
4722
4723 if (SrcVT.isVector())
4724 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4725
4726 EVT DstVT = Op.getValueType();
4727 EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
4728 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4729 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4730 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4731
4732 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4733 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4734 SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: MVT::f32, Operand: SrcVal);
4735 SrcVT = MVT::f32;
4736 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4737 SrcVT != MVT::bf16)
4738 return SDValue();
4739
4740 SDLoc DL(Op);
4741 // Cases that we can emit directly.
4742 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4743 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4744 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4745 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal,
4746 N2: DAG.getValueType(DstVT));
4747
4748 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4749 // result. This is only valid if the legal cvt is larger than the saturate
4750 // width.
4751 if (DstWidth < SatWidth)
4752 return SDValue();
4753
4754 SDValue NativeCvt =
4755 DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, N2: DAG.getValueType(DstVT));
4756 SDValue Sat;
4757 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4758 SDValue MinC = DAG.getConstant(
4759 Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4760 SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4761 SDValue MaxC = DAG.getConstant(
4762 Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT);
4763 Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: DstVT, N1: Min, N2: MaxC);
4764 } else {
4765 SDValue MinC = DAG.getConstant(
4766 Val: APInt::getAllOnes(numBits: SatWidth).zext(width: DstWidth), DL, VT: DstVT);
4767 Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC);
4768 }
4769
4770 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat);
4771}
4772
4773SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4774 SelectionDAG &DAG) const {
4775 EVT VT = Op.getValueType();
4776 SDValue Src = Op.getOperand(i: 0);
4777 SDLoc DL(Op);
4778
4779 assert(VT.isVector() && "Expected vector type");
4780
4781 EVT CastVT =
4782 VT.changeVectorElementType(EltVT: Src.getValueType().getVectorElementType());
4783
4784 // Round the floating-point value into a floating-point register with the
4785 // current rounding mode.
4786 SDValue FOp = DAG.getNode(Opcode: ISD::FRINT, DL, VT: CastVT, Operand: Src);
4787
4788 // Truncate the rounded floating point to an integer.
4789 return DAG.getNode(Opcode: ISD::FP_TO_SINT_SAT, DL, VT, N1: FOp,
4790 N2: DAG.getValueType(VT.getVectorElementType()));
4791}
4792
4793SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4794 SelectionDAG &DAG) const {
4795 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4796 // Any additional optimization in this function should be recorded
4797 // in the cost tables.
4798 bool IsStrict = Op->isStrictFPOpcode();
4799 EVT VT = Op.getValueType();
4800 SDLoc DL(Op);
4801 SDValue In = Op.getOperand(i: IsStrict ? 1 : 0);
4802 EVT InVT = In.getValueType();
4803 unsigned Opc = Op.getOpcode();
4804 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4805
4806 assert(!(IsStrict && VT.isScalableVector()) &&
4807 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
4808
4809 // NOTE: i1->bf16 does not require promotion to f32.
4810 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
4811 SDValue FalseVal = DAG.getConstantFP(Val: 0.0, DL, VT);
4812 SDValue TrueVal = IsSigned ? DAG.getConstantFP(Val: -1.0, DL, VT)
4813 : DAG.getConstantFP(Val: 1.0, DL, VT);
4814 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: In, N2: TrueVal, N3: FalseVal);
4815 }
4816
4817 // Promote bf16 conversions to f32.
4818 if (VT.getVectorElementType() == MVT::bf16) {
4819 EVT F32 = VT.changeElementType(EltVT: MVT::f32);
4820 if (IsStrict) {
4821 SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {F32, MVT::Other},
4822 Ops: {Op.getOperand(i: 0), In});
4823 return DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL,
4824 ResultTys: {Op.getValueType(), MVT::Other},
4825 Ops: {Val.getValue(R: 1), Val.getValue(R: 0),
4826 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)});
4827 }
4828 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(),
4829 N1: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: F32, Operand: In),
4830 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
4831 }
4832
4833 if (VT.isScalableVector()) {
4834 // Let common code split the operation.
4835 if (VT == MVT::nxv8f32)
4836 return Op;
4837
4838 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4839 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
4840 return LowerToPredicatedOp(Op, DAG, NewOp: Opcode);
4841 }
4842
4843 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) ||
4844 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable()))
4845 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4846
4847 uint64_t VTSize = VT.getFixedSizeInBits();
4848 uint64_t InVTSize = InVT.getFixedSizeInBits();
4849 if (VTSize < InVTSize) {
4850 // AArch64 doesn't have a direct vector instruction to convert
4851 // fixed point to floating point AND narrow it at the same time.
4852 // Additional rounding when the target is f32/f64 causes double
4853 // rounding issues. Conversion to f16 is fine due to narrow width.
4854 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
4855 bool IsTargetf16 = false;
4856 if (Op.hasOneUse() &&
4857 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
4858 // Some vector types are split in half during legalization, then concatenated
4859 // and rounded back to the original vector type. If we end up resolving to an
4860 // f16 type, we don't need to worry about double rounding errors.
4861 SDNode *U = *Op->user_begin();
4862 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
4863 EVT TmpVT = U->user_begin()->getValueType(ResNo: 0);
4864 if (TmpVT.getScalarType() == MVT::f16)
4865 IsTargetf16 = true;
4866 }
4867 }
4868
4869 if (IsTargetf32 && !IsTargetf16) {
4870 return !IsStrict ? DAG.UnrollVectorOp(N: Op.getNode()) : SDValue();
4871 }
4872
4873 MVT CastVT =
4874 MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: InVT.getScalarSizeInBits()),
4875 NumElements: InVT.getVectorNumElements());
4876 if (IsStrict) {
4877 In = DAG.getNode(Opcode: Opc, DL, ResultTys: {CastVT, MVT::Other}, Ops: {Op.getOperand(i: 0), In});
4878 return DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL, ResultTys: {VT, MVT::Other},
4879 Ops: {In.getValue(R: 1), In.getValue(R: 0),
4880 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)});
4881 }
4882 In = DAG.getNode(Opcode: Opc, DL, VT: CastVT, Operand: In);
4883 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: In,
4884 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
4885 }
4886
4887 if (VTSize > InVTSize) {
4888 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4889 EVT CastVT = VT.changeVectorElementTypeToInteger();
4890 In = DAG.getNode(Opcode: CastOpc, DL, VT: CastVT, Operand: In);
4891 if (IsStrict)
4892 return DAG.getNode(Opcode: Opc, DL, ResultTys: {VT, MVT::Other}, Ops: {Op.getOperand(i: 0), In});
4893 return DAG.getNode(Opcode: Opc, DL, VT, Operand: In);
4894 }
4895
4896 // Use a scalar operation for conversions between single-element vectors of
4897 // the same size.
4898 if (VT.getVectorNumElements() == 1) {
4899 SDValue Extract =
4900 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: InVT.getScalarType(), N1: In,
4901 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
4902 EVT ScalarVT = VT.getScalarType();
4903 if (IsStrict)
4904 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {ScalarVT, MVT::Other},
4905 Ops: {Op.getOperand(i: 0), Extract});
4906 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ScalarVT, Operand: Extract);
4907 }
4908
4909 return Op;
4910}
4911
4912SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4913 SelectionDAG &DAG) const {
4914 if (Op.getValueType().isVector())
4915 return LowerVectorINT_TO_FP(Op, DAG);
4916
4917 bool IsStrict = Op->isStrictFPOpcode();
4918 SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0);
4919
4920 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4921 Op->getOpcode() == ISD::SINT_TO_FP;
4922
4923 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4924 SDLoc DL(Op);
4925 if (IsStrict) {
4926 SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {PromoteVT, MVT::Other},
4927 Ops: {Op.getOperand(i: 0), SrcVal});
4928 return DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL,
4929 ResultTys: {Op.getValueType(), MVT::Other},
4930 Ops: {Val.getValue(R: 1), Val.getValue(R: 0),
4931 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)});
4932 }
4933 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(),
4934 N1: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: PromoteVT, Operand: SrcVal),
4935 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
4936 };
4937
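// For bf16 results we pick the promotion type from the number of significant
// input bits: an f32 has a 24-bit significand and an f64 a 53-bit one, so an
// integer with at most that many significant bits converts exactly and the
// only rounding happens in the final FP_ROUND to bf16.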
4938 if (Op.getValueType() == MVT::bf16) {
4939 unsigned MaxWidth = IsSigned
4940 ? DAG.ComputeMaxSignificantBits(Op: SrcVal)
4941 : DAG.computeKnownBits(Op: SrcVal).countMaxActiveBits();
4942 // bf16 conversions are promoted to f32 when converting from i16.
4943 if (MaxWidth <= 24) {
4944 return IntToFpViaPromotion(MVT::f32);
4945 }
4946
4947 // bf16 conversions are promoted to f64 when converting from i32.
4948 if (MaxWidth <= 53) {
4949 return IntToFpViaPromotion(MVT::f64);
4950 }
4951
4952 // We need to be careful about i64 -> bf16.
4953 // Consider an i32 22216703.
4954 // This number cannot be represented exactly as an f32, so an itofp will
4955 // turn it into 22216704.0, and an fptrunc to bf16 will turn this into
4956 // 22282240.0. However, the correct bf16 result was supposed to be 22151168.0.
4957 // We need to use sticky rounding to get this correct.
4958 if (SrcVal.getValueType() == MVT::i64) {
4959 SDLoc DL(Op);
4960 // This algorithm is equivalent to the following:
4961 // uint64_t SrcHi = SrcVal & ~0xfffull;
4962 // uint64_t SrcLo = SrcVal & 0xfffull;
4963 // uint64_t Highest = SrcVal >> 53;
4964 // bool HasHighest = Highest != 0;
4965 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4966 // double Rounded = static_cast<double>(ToRound);
4967 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4968 // uint64_t HasLo = SrcLo != 0;
4969 // bool NeedsAdjustment = HasHighest & HasLo;
4970 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4971 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4972 // return static_cast<__bf16>(Adjusted);
4973 //
4974 // Essentially, what happens is that SrcVal either fits perfectly in a
4975 // double-precision value or it is too big. If it is sufficiently small,
4976 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4977 // ensure that u64 -> double has no rounding error by only using the 52
4978 // most significant bits of the input. The low-order bits will get merged
4979 // into a sticky bit, which avoids the issues incurred by double rounding.
4980
4981 // Signed conversion is more or less like so:
4982 // copysign((__bf16)abs(SrcVal), SrcVal)
4983 SDValue SignBit;
4984 if (IsSigned) {
4985 SignBit = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4986 N2: DAG.getConstant(Val: 1ull << 63, DL, VT: MVT::i64));
4987 SrcVal = DAG.getNode(Opcode: ISD::ABS, DL, VT: MVT::i64, Operand: SrcVal);
4988 }
4989 SDValue SrcHi = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4990 N2: DAG.getConstant(Val: ~0xfffull, DL, VT: MVT::i64));
4991 SDValue SrcLo = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal,
4992 N2: DAG.getConstant(Val: 0xfffull, DL, VT: MVT::i64));
4993 SDValue Highest =
4994 DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SrcVal,
4995 N2: DAG.getShiftAmountConstant(Val: 53, VT: MVT::i64, DL));
4996 SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
4997 SDValue ToRound =
4998 DAG.getSelectCC(DL, LHS: Highest, RHS: Zero64, True: SrcHi, False: SrcVal, Cond: ISD::SETNE);
4999 SDValue Rounded =
5000 IsStrict ? DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {MVT::f64, MVT::Other},
5001 Ops: {Op.getOperand(i: 0), ToRound})
5002 : DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f64, Operand: ToRound);
5003
5004 SDValue RoundedBits = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Rounded);
5005 if (SignBit) {
5006 RoundedBits = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: SignBit);
5007 }
5008
5009 SDValue HasHighest = DAG.getSetCC(
5010 DL,
5011 VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
5012 LHS: Highest, RHS: Zero64, Cond: ISD::SETNE);
5013
5014 SDValue HasLo = DAG.getSetCC(
5015 DL,
5016 VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64),
5017 LHS: SrcLo, RHS: Zero64, Cond: ISD::SETNE);
5018
5019 SDValue NeedsAdjustment =
5020 DAG.getNode(Opcode: ISD::AND, DL, VT: HasLo.getValueType(), N1: HasHighest, N2: HasLo);
5021 NeedsAdjustment = DAG.getZExtOrTrunc(Op: NeedsAdjustment, DL, VT: MVT::i64);
5022
5023 SDValue AdjustedBits =
5024 DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: NeedsAdjustment);
5025 SDValue Adjusted = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: AdjustedBits);
5026 return IsStrict
5027 ? DAG.getNode(
5028 Opcode: ISD::STRICT_FP_ROUND, DL,
5029 ResultTys: {Op.getValueType(), MVT::Other},
5030 Ops: {Rounded.getValue(R: 1), Adjusted,
5031 DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true)})
5032 : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(), N1: Adjusted,
5033 N2: DAG.getIntPtrConstant(Val: 0, DL, /*isTarget=*/true));
5034 }
5035 }
5036
5037 // f16 conversions are promoted to f32 when full fp16 is not supported.
5038 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5039 return IntToFpViaPromotion(MVT::f32);
5040 }
5041
5042 // i128 conversions are libcalls.
5043 if (SrcVal.getValueType() == MVT::i128)
5044 return SDValue();
5045
5046 // Other conversions are legal, unless it's to the completely software-based
5047 // fp128.
5048 if (Op.getValueType() != MVT::f128)
5049 return Op;
5050 return SDValue();
5051}
5052
5053SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
5054 SelectionDAG &DAG) const {
5055 // For iOS, we want to call an alternative entry point: __sincos_stret,
5056 // which returns the values in two S / D registers.
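  // Roughly, the call is lowered as if it were declared as
  //   { float, float } __sincos_stret(float);   // or the f64/double variant
  // with both results coming back in floating-point registers.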
5057 SDLoc DL(Op);
5058 SDValue Arg = Op.getOperand(i: 0);
5059 EVT ArgVT = Arg.getValueType();
5060 Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext());
5061
5062 ArgListTy Args;
5063 ArgListEntry Entry;
5064
5065 Entry.Node = Arg;
5066 Entry.Ty = ArgTy;
5067 Entry.IsSExt = false;
5068 Entry.IsZExt = false;
5069 Args.push_back(x: Entry);
5070
5071 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
5072 : RTLIB::SINCOS_STRET_F32;
5073 const char *LibcallName = getLibcallName(Call: LC);
5074 SDValue Callee =
5075 DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL: DAG.getDataLayout()));
5076
5077 StructType *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy);
5078 TargetLowering::CallLoweringInfo CLI(DAG);
5079 CallingConv::ID CC = getLibcallCallingConv(Call: LC);
5080 CLI.setDebugLoc(DL)
5081 .setChain(DAG.getEntryNode())
5082 .setLibCallee(CC, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5083
5084 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5085 return CallResult.first;
5086}
5087
5088static MVT getSVEContainerType(EVT ContentTy);
5089
5090SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5091 SelectionDAG &DAG) const {
5092 EVT OpVT = Op.getValueType();
5093 EVT ArgVT = Op.getOperand(i: 0).getValueType();
5094
5095 if (useSVEForFixedLengthVectorVT(VT: OpVT))
5096 return LowerFixedLengthBitcastToSVE(Op, DAG);
5097
5098 if (OpVT.isScalableVector()) {
5099 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5100
5101 // Handle type legalisation first.
5102 if (!isTypeLegal(VT: ArgVT)) {
5103 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5104 "Expected int->fp bitcast!");
5105
5106 // Bitcasting between unpacked vector types of different element counts is
5107 // not a NOP because the live elements are laid out differently.
5108 // 01234567
5109 // e.g. nxv2i32 = XX??XX??
5110 // nxv4f16 = X?X?X?X?
5111 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5112 return SDValue();
5113
5114 SDValue ExtResult =
5115 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SDLoc(Op), VT: getSVEContainerType(ContentTy: ArgVT),
5116 Operand: Op.getOperand(i: 0));
5117 return getSVESafeBitCast(VT: OpVT, Op: ExtResult, DAG);
5118 }
5119
5120 // Bitcasts between legal types with the same element count are legal.
5121 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5122 return Op;
5123
5124 // getSVESafeBitCast does not support casting between unpacked types.
5125 if (!isPackedVectorType(VT: OpVT, DAG))
5126 return SDValue();
5127
5128 return getSVESafeBitCast(VT: OpVT, Op: Op.getOperand(i: 0), DAG);
5129 }
5130
5131 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5132 return SDValue();
5133
5134 // Bitcasts between f16 and bf16 are legal.
5135 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5136 return Op;
5137
5138 assert(ArgVT == MVT::i16);
5139 SDLoc DL(Op);
5140
5141 Op = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Op.getOperand(i: 0));
5142 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Op);
5143 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: OpVT, Operand: Op);
5144}
5145
5146// Returns lane if Op extracts from a two-element vector and lane is constant
5147// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5148static std::optional<uint64_t>
5149getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5150 SDNode *OpNode = Op.getNode();
5151 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5152 return std::nullopt;
5153
5154 EVT VT = OpNode->getOperand(Num: 0).getValueType();
5155 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: OpNode->getOperand(Num: 1));
5156 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5157 return std::nullopt;
5158
5159 return C->getZExtValue();
5160}
5161
5162static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5163 bool isSigned) {
5164 EVT VT = N.getValueType();
5165
5166 if (N.getOpcode() != ISD::BUILD_VECTOR)
5167 return false;
5168
5169 for (const SDValue &Elt : N->op_values()) {
5170 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) {
5171 unsigned EltSize = VT.getScalarSizeInBits();
5172 unsigned HalfSize = EltSize / 2;
5173 if (isSigned) {
5174 if (!isIntN(N: HalfSize, x: C->getSExtValue()))
5175 return false;
5176 } else {
5177 if (!isUIntN(N: HalfSize, x: C->getZExtValue()))
5178 return false;
5179 }
5180 continue;
5181 }
5182 return false;
5183 }
5184
5185 return true;
5186}
5187
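// Drop the extension (or known-to-be-narrow bits) from a MULL operand by
// truncating each lane to half width, e.g. v4i32 -> v4i16. If the operand was
// an extend the truncate folds away; otherwise the callers have already
// established that the lanes fit in the narrower type.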
5188static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5189 EVT VT = N.getValueType();
5190 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5191 EVT HalfVT = EVT::getVectorVT(
5192 Context&: *DAG.getContext(),
5193 VT: VT.getScalarType().getHalfSizedIntegerVT(Context&: *DAG.getContext()),
5194 EC: VT.getVectorElementCount());
5195 return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: HalfVT, Operand: N);
5196}
5197
5198static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5199 return N.getOpcode() == ISD::SIGN_EXTEND ||
5200 N.getOpcode() == ISD::ANY_EXTEND ||
5201 isExtendedBUILD_VECTOR(N, DAG, isSigned: true);
5202}
5203
5204static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5205 return N.getOpcode() == ISD::ZERO_EXTEND ||
5206 N.getOpcode() == ISD::ANY_EXTEND ||
5207 isExtendedBUILD_VECTOR(N, DAG, isSigned: false);
5208}
5209
5210static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5211 unsigned Opcode = N.getOpcode();
5212 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5213 SDValue N0 = N.getOperand(i: 0);
5214 SDValue N1 = N.getOperand(i: 1);
5215 return N0->hasOneUse() && N1->hasOneUse() &&
5216 isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG);
5217 }
5218 return false;
5219}
5220
5221static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5222 unsigned Opcode = N.getOpcode();
5223 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5224 SDValue N0 = N.getOperand(i: 0);
5225 SDValue N1 = N.getOperand(i: 1);
5226 return N0->hasOneUse() && N1->hasOneUse() &&
5227 isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG);
5228 }
5229 return false;
5230}
5231
5232SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5233 SelectionDAG &DAG) const {
5234  // The rounding mode is in bits 23:22 of the FPCR.
5235  // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3,
5236  // 3->0. The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5237  // so that the shift and the AND get folded into a bitfield extract.
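  //
  // For reference, the full mapping is:
  //   FPCR[23:22] (RMode)        FLT_ROUNDS
  //   0 (round to nearest)           1
  //   1 (round towards +Inf)         2
  //   2 (round towards -Inf)         3
  //   3 (round towards zero)         0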
5238 SDLoc DL(Op);
5239
5240 SDValue Chain = Op.getOperand(i: 0);
5241 SDValue FPCR_64 = DAG.getNode(
5242 Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other},
5243 Ops: {Chain, DAG.getConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)});
5244 Chain = FPCR_64.getValue(R: 1);
5245 SDValue FPCR_32 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: FPCR_64);
5246 SDValue FltRounds = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, N1: FPCR_32,
5247 N2: DAG.getConstant(Val: 1U << 22, DL, VT: MVT::i32));
5248 SDValue RMODE = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i32, N1: FltRounds,
5249 N2: DAG.getConstant(Val: 22, DL, VT: MVT::i32));
5250 SDValue AND = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: RMODE,
5251 N2: DAG.getConstant(Val: 3, DL, VT: MVT::i32));
5252 return DAG.getMergeValues(Ops: {AND, Chain}, dl: DL);
5253}
5254
5255SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5256 SelectionDAG &DAG) const {
5257 SDLoc DL(Op);
5258 SDValue Chain = Op->getOperand(Num: 0);
5259 SDValue RMValue = Op->getOperand(Num: 1);
5260
5261 // The rounding mode is in bits 23:22 of the FPCR.
5262  // The mapping from the llvm.set.rounding argument value to the rounding mode
5263  // in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5264  // ((arg - 1) & 3) << 22.
5265  //
5266  // The argument of llvm.set.rounding must be within the range [0, 3], so
5267  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5268  // code that generates llvm.set.rounding to ensure this condition.
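  //
  // For example, llvm.set.rounding(0) (round towards zero) computes
  // ((0 - 1) & 3) = 3, which is the FPCR encoding of round-towards-zero.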
5269
5270 // Calculate new value of FPCR[23:22].
5271 RMValue = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: RMValue,
5272 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
5273 RMValue = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: RMValue,
5274 N2: DAG.getConstant(Val: 0x3, DL, VT: MVT::i32));
5275 RMValue =
5276 DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: RMValue,
5277 N2: DAG.getConstant(Val: AArch64::RoundingBitsPos, DL, VT: MVT::i32));
5278 RMValue = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: RMValue);
5279
5280 // Get current value of FPCR.
5281 SDValue Ops[] = {
5282 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5283 SDValue FPCR =
5284 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5285 Chain = FPCR.getValue(R: 1);
5286 FPCR = FPCR.getValue(R: 0);
5287
5288  // Put the new rounding mode into FPCR[23:22].
5289 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5290 FPCR = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR,
5291 N2: DAG.getConstant(Val: RMMask, DL, VT: MVT::i64));
5292 FPCR = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: FPCR, N2: RMValue);
5293 SDValue Ops2[] = {
5294 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64),
5295 FPCR};
5296 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5297}
5298
5299SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5300 SelectionDAG &DAG) const {
5301 SDLoc DL(Op);
5302 SDValue Chain = Op->getOperand(Num: 0);
5303
5304 // Get current value of FPCR.
5305 SDValue Ops[] = {
5306 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5307 SDValue FPCR =
5308 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5309 Chain = FPCR.getValue(R: 1);
5310 FPCR = FPCR.getValue(R: 0);
5311
5312 // Truncate FPCR to 32 bits.
5313 SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: FPCR);
5314
5315 return DAG.getMergeValues(Ops: {Result, Chain}, dl: DL);
5316}
5317
5318SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5319 SelectionDAG &DAG) const {
5320 SDLoc DL(Op);
5321 SDValue Chain = Op->getOperand(Num: 0);
5322 SDValue Mode = Op->getOperand(Num: 1);
5323
5324 // Extend the specified value to 64 bits.
5325 SDValue FPCR = DAG.getZExtOrTrunc(Op: Mode, DL, VT: MVT::i64);
5326
5327 // Set new value of FPCR.
5328 SDValue Ops2[] = {
5329 Chain, DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64), FPCR};
5330 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5331}
5332
5333SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5334 SelectionDAG &DAG) const {
5335 SDLoc DL(Op);
5336 SDValue Chain = Op->getOperand(Num: 0);
5337
5338 // Get current value of FPCR.
5339 SDValue Ops[] = {
5340 Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)};
5341 SDValue FPCR =
5342 DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops);
5343 Chain = FPCR.getValue(R: 1);
5344 FPCR = FPCR.getValue(R: 0);
5345
5346 // Clear bits that are not reserved.
5347 SDValue FPSCRMasked = DAG.getNode(
5348 Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR,
5349 N2: DAG.getConstant(Val: AArch64::ReservedFPControlBits, DL, VT: MVT::i64));
5350
5351 // Set new value of FPCR.
5352 SDValue Ops2[] = {Chain,
5353 DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64),
5354 FPSCRMasked};
5355 return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2);
5356}
5357
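// Decide whether a vector multiply can be lowered to a widening SMULL/UMULL,
// based on how its operands are extended or on their known bits. If the
// multiply is of the form (ext A +/- ext B) * (ext C), IsMLA is set so the
// caller can emit two MULLs feeding the add/sub instead. Returns 0 when no
// widening multiply applies.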
5358static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5359 SDLoc DL, bool &IsMLA) {
5360 bool IsN0SExt = isSignExtended(N: N0, DAG);
5361 bool IsN1SExt = isSignExtended(N: N1, DAG);
5362 if (IsN0SExt && IsN1SExt)
5363 return AArch64ISD::SMULL;
5364
5365 bool IsN0ZExt = isZeroExtended(N: N0, DAG);
5366 bool IsN1ZExt = isZeroExtended(N: N1, DAG);
5367
5368 if (IsN0ZExt && IsN1ZExt)
5369 return AArch64ISD::UMULL;
5370
5371 // Select UMULL if we can replace the other operand with an extend.
5372 EVT VT = N0.getValueType();
5373 unsigned EltSize = VT.getScalarSizeInBits();
5374 APInt Mask = APInt::getHighBitsSet(numBits: EltSize, hiBitsSet: EltSize / 2);
5375 if (IsN0ZExt || IsN1ZExt) {
5376 if (DAG.MaskedValueIsZero(Op: IsN0ZExt ? N1 : N0, Mask))
5377 return AArch64ISD::UMULL;
5378 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(Op: N0, Mask) &&
5379 DAG.MaskedValueIsZero(Op: N1, Mask)) {
5380 // For v2i64 we look more aggressively at both operands being zero, to avoid
5381 // scalarization.
5382 return AArch64ISD::UMULL;
5383 }
5384
5385 if (IsN0SExt || IsN1SExt) {
5386 if (DAG.ComputeNumSignBits(Op: IsN0SExt ? N1 : N0) > EltSize / 2)
5387 return AArch64ISD::SMULL;
5388 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(Op: N0) > EltSize / 2 &&
5389 DAG.ComputeNumSignBits(Op: N1) > EltSize / 2) {
5390 return AArch64ISD::SMULL;
5391 }
5392
5393 if (!IsN1SExt && !IsN1ZExt)
5394 return 0;
5395
5396 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5397 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5398 if (IsN1SExt && isAddSubSExt(N: N0, DAG)) {
5399 IsMLA = true;
5400 return AArch64ISD::SMULL;
5401 }
5402 if (IsN1ZExt && isAddSubZExt(N: N0, DAG)) {
5403 IsMLA = true;
5404 return AArch64ISD::UMULL;
5405 }
5406 if (IsN0ZExt && isAddSubZExt(N: N1, DAG)) {
5407 std::swap(a&: N0, b&: N1);
5408 IsMLA = true;
5409 return AArch64ISD::UMULL;
5410 }
5411 return 0;
5412}
5413
5414SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5415 EVT VT = Op.getValueType();
5416
5417 bool OverrideNEON = !Subtarget->isNeonAvailable();
5418 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5419 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5420
5421 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5422 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5423 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5424 "unexpected type for custom-lowering ISD::MUL");
5425 SDValue N0 = Op.getOperand(i: 0);
5426 SDValue N1 = Op.getOperand(i: 1);
5427 bool isMLA = false;
5428 EVT OVT = VT;
5429 if (VT.is64BitVector()) {
5430 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5431 isNullConstant(V: N0.getOperand(i: 1)) &&
5432 N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5433 isNullConstant(V: N1.getOperand(i: 1))) {
5434 N0 = N0.getOperand(i: 0);
5435 N1 = N1.getOperand(i: 0);
5436 VT = N0.getValueType();
5437 } else {
5438 if (VT == MVT::v1i64) {
5439 if (Subtarget->hasSVE())
5440 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5441 // Fall through to expand this. It is not legal.
5442 return SDValue();
5443 } else
5444 // Other vector multiplications are legal.
5445 return Op;
5446 }
5447 }
5448
5449 SDLoc DL(Op);
5450 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, IsMLA&: isMLA);
5451
5452 if (!NewOpc) {
5453 if (VT.getVectorElementType() == MVT::i64) {
5454 // If SVE is available then i64 vector multiplications can also be made
5455 // legal.
5456 if (Subtarget->hasSVE())
5457 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED);
5458 // Fall through to expand this. It is not legal.
5459 return SDValue();
5460 } else
5461 // Other vector multiplications are legal.
5462 return Op;
5463 }
5464
5465 // Legalize to a S/UMULL instruction
5466 SDValue Op0;
5467 SDValue Op1 = skipExtensionForVectorMULL(N: N1, DAG);
5468 if (!isMLA) {
5469 Op0 = skipExtensionForVectorMULL(N: N0, DAG);
5470 assert(Op0.getValueType().is64BitVector() &&
5471 Op1.getValueType().is64BitVector() &&
5472 "unexpected types for extended operands to VMULL");
5473 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT,
5474 N1: DAG.getNode(Opcode: NewOpc, DL, VT, N1: Op0, N2: Op1),
5475 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5476 }
5477  // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5478  // isel lowering to take advantage of no-stall back-to-back s/umull + s/umlal.
5479  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
5480 SDValue N00 = skipExtensionForVectorMULL(N: N0.getOperand(i: 0), DAG);
5481 SDValue N01 = skipExtensionForVectorMULL(N: N0.getOperand(i: 1), DAG);
5482 EVT Op1VT = Op1.getValueType();
5483 return DAG.getNode(
5484 Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT,
5485 N1: DAG.getNode(Opcode: N0.getOpcode(), DL, VT,
5486 N1: DAG.getNode(Opcode: NewOpc, DL, VT,
5487 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N00), N2: Op1),
5488 N2: DAG.getNode(Opcode: NewOpc, DL, VT,
5489 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N01), N2: Op1)),
5490 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5491}
5492
5493static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5494 int Pattern) {
5495 if (Pattern == AArch64SVEPredPattern::all)
5496 return DAG.getConstant(Val: 1, DL, VT);
5497 return DAG.getNode(Opcode: AArch64ISD::PTRUE, DL, VT,
5498 Operand: DAG.getTargetConstant(Val: Pattern, DL, VT: MVT::i32));
5499}
5500
5501static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5502 bool IsSigned, bool IsEqual) {
5503 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5504 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5505
5506 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: Op1)))
5507 return SDValue();
5508
5509 SDLoc DL(N);
5510 APInt Y = N->getConstantOperandAPInt(Num: Op1);
5511
5512  // When the second operand is the maximum value, comparisons that include
5513  // equality can never fail, and thus we can return an all-active predicate.
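  // For example, whilele(x, INT64_MAX) and whilels(x, UINT64_MAX) hold for
  // every element regardless of x, so they lower to an all-active predicate.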
5514 if (IsEqual)
5515 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5516 return DAG.getConstant(Val: 1, DL, VT: N->getValueType(ResNo: 0));
5517
5518 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: Op0)))
5519 return SDValue();
5520
5521 APInt X = N->getConstantOperandAPInt(Num: Op0);
5522
5523 bool Overflow;
5524 APInt NumActiveElems =
5525 IsSigned ? Y.ssub_ov(RHS: X, Overflow) : Y.usub_ov(RHS: X, Overflow);
5526
5527 if (Overflow)
5528 return SDValue();
5529
5530 if (IsEqual) {
5531 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5532 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(RHS: One, Overflow)
5533 : NumActiveElems.uadd_ov(RHS: One, Overflow);
5534 if (Overflow)
5535 return SDValue();
5536 }
5537
5538 std::optional<unsigned> PredPattern =
5539 getSVEPredPatternFromNumElements(MinNumElts: NumActiveElems.getZExtValue());
5540 unsigned MinSVEVectorSize = std::max(
5541 a: DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), b: 128u);
5542 unsigned ElementSize = 128 / N->getValueType(ResNo: 0).getVectorMinNumElements();
5543 if (PredPattern != std::nullopt &&
5544 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5545 return getPTrue(DAG, DL, VT: N->getValueType(ResNo: 0), Pattern: *PredPattern);
5546
5547 return SDValue();
5548}
5549
5550// Returns a safe bitcast between two scalable vector predicates, where
5551// any newly created lanes from a widening bitcast are defined as zero.
5552static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5553 SDLoc DL(Op);
5554 EVT InVT = Op.getValueType();
5555
5556 assert(InVT.getVectorElementType() == MVT::i1 &&
5557 VT.getVectorElementType() == MVT::i1 &&
5558 "Expected a predicate-to-predicate bitcast");
5559 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5560 InVT.isScalableVector() &&
5561 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5562 "Only expect to cast between legal scalable predicate types!");
5563
5564  // Return the operand if the cast isn't changing type.
5565 if (InVT == VT)
5566 return Op;
5567
5568 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5569 // than VT. This will increase the chances of removing casts that introduce
5570 // new lanes, which have to be explicitly zero'd.
5571 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5572 Op.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5573 Op.getOperand(i: 1).getValueType().bitsGT(VT))
5574 Op = Op.getOperand(i: 1);
5575
5576 SDValue Reinterpret = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
5577
5578 // We only have to zero the lanes if new lanes are being defined, e.g. when
5579 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5580 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5581 // we can return here.
5582 if (InVT.bitsGT(VT))
5583 return Reinterpret;
5584
5585 // Check if the other lanes are already known to be zeroed by
5586 // construction.
5587 if (isZeroingInactiveLanes(Op))
5588 return Reinterpret;
5589
5590 // Zero the newly introduced lanes.
5591 SDValue Mask = DAG.getConstant(Val: 1, DL, VT: InVT);
5592 Mask = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Mask);
5593 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Reinterpret, N2: Mask);
5594}
5595
5596SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5597 SDValue Chain, SDLoc DL,
5598 EVT VT) const {
5599 SDValue Callee = DAG.getExternalSymbol(Sym: "__arm_sme_state",
5600 VT: getPointerTy(DL: DAG.getDataLayout()));
5601 Type *Int64Ty = Type::getInt64Ty(C&: *DAG.getContext());
5602 Type *RetTy = StructType::get(elt1: Int64Ty, elts: Int64Ty);
5603 TargetLowering::CallLoweringInfo CLI(DAG);
5604 ArgListTy Args;
5605 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5606 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2,
5607 ResultType: RetTy, Target: Callee, ArgsList: std::move(Args));
5608 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5609 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ Val: 1, DL, VT: MVT::i64);
5610 return DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: CallResult.first.getOperand(i: 0),
5611 N2: Mask);
5612}
5613
5614// Lower an SME LDR/STR ZA intrinsic
5615// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5616// folded into the instruction
5617// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5618// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5619// and tile slice registers
5620// ldr(%tileslice, %ptr, %vecnum)
5621// ->
5622// %svl = rdsvl
5623// %ptr2 = %ptr + %svl * %vecnum
5624// %tileslice2 = %tileslice + %vecnum
5625// ldr [%tileslice2, 0], [%ptr2, 0]
5626 // Case 3: If the vecnum is an immediate out of range, then the same is done
5627 // as case 2, but the base and slice registers are modified by the vecnum
5628 // rounded towards zero to a multiple of 16 and the remainder is folded into
5629 // the instruction. This means that successive loads and stores that are
5630 // offset from each other can share the same base and slice register updates.
5631 // ldr(%tileslice, %ptr, 22)
5632 // ldr(%tileslice, %ptr, 23)
5633 // ->
5634 // %svl = rdsvl
5635 // %ptr2 = %ptr + %svl * 16
5636 // %tileslice2 = %tileslice + 16
5637 // ldr [%tileslice2, 6], [%ptr2, 6]
5638 // ldr [%tileslice2, 7], [%ptr2, 7]
5639// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5640// operand and the immediate can be folded into the instruction, like case 2.
5641// ldr(%tileslice, %ptr, %vecnum + 7)
5642// ldr(%tileslice, %ptr, %vecnum + 8)
5643// ->
5644// %svl = rdsvl
5645// %ptr2 = %ptr + %svl * %vecnum
5646// %tileslice2 = %tileslice + %vecnum
5647// ldr [%tileslice2, 7], [%ptr2, 7]
5648// ldr [%tileslice2, 8], [%ptr2, 8]
5649// Case 5: The vecnum being an add of an immediate out of range is also handled,
5650// in which case the same remainder logic as case 3 is used.
5651SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5652 SDLoc DL(N);
5653
5654 SDValue TileSlice = N->getOperand(Num: 2);
5655 SDValue Base = N->getOperand(Num: 3);
5656 SDValue VecNum = N->getOperand(Num: 4);
5657 int32_t ConstAddend = 0;
5658 SDValue VarAddend = VecNum;
5659
5660 // If the vnum is an add of an immediate, we can fold it into the instruction
5661 if (VecNum.getOpcode() == ISD::ADD &&
5662 isa<ConstantSDNode>(Val: VecNum.getOperand(i: 1))) {
5663 ConstAddend = cast<ConstantSDNode>(Val: VecNum.getOperand(i: 1))->getSExtValue();
5664 VarAddend = VecNum.getOperand(i: 0);
5665 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(Val&: VecNum)) {
5666 ConstAddend = ImmNode->getSExtValue();
5667 VarAddend = SDValue();
5668 }
5669
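  // Split the constant part into a multiple of 16, which is applied to the
  // base and tile slice below, and a remainder that fits the instruction's
  // immediate. For example, ConstAddend = 22 gives C = 16 and ImmAddend = 6.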
5670 int32_t ImmAddend = ConstAddend % 16;
5671 if (int32_t C = (ConstAddend - ImmAddend)) {
5672 SDValue CVal = DAG.getTargetConstant(Val: C, DL, VT: MVT::i32);
5673 VarAddend = VarAddend
5674 ? DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {VarAddend, CVal})
5675 : CVal;
5676 }
5677
5678 if (VarAddend) {
5679 // Get the vector length that will be multiplied by vnum
5680 auto SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
5681 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
5682
5683 // Multiply SVL and vnum then add it to the base
5684 SDValue Mul = DAG.getNode(
5685 Opcode: ISD::MUL, DL, VT: MVT::i64,
5686 Ops: {SVL, DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: VarAddend)});
5687 Base = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, Ops: {Base, Mul});
5688 // Just add vnum to the tileslice
5689 TileSlice = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {TileSlice, VarAddend});
5690 }
5691
5692 return DAG.getNode(Opcode: IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
5693 DL, VT: MVT::Other,
5694 Ops: {/*Chain=*/N.getOperand(i: 0), TileSlice, Base,
5695 DAG.getTargetConstant(Val: ImmAddend, DL, VT: MVT::i32)});
5696}
5697
5698SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
5699 SDLoc DL(Op);
5700 SDValue ID =
5701 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_match, DL, VT: MVT::i64);
5702
5703 auto Op1 = Op.getOperand(i: 1);
5704 auto Op2 = Op.getOperand(i: 2);
5705 auto Mask = Op.getOperand(i: 3);
5706
5707 EVT Op1VT = Op1.getValueType();
5708 EVT Op2VT = Op2.getValueType();
5709 EVT ResVT = Op.getValueType();
5710
5711 assert((Op1VT.getVectorElementType() == MVT::i8 ||
5712 Op1VT.getVectorElementType() == MVT::i16) &&
5713 "Expected 8-bit or 16-bit characters.");
5714
5715 // Scalable vector type used to wrap operands.
5716 // A single container is enough for both operands because ultimately the
5717 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
5718 EVT OpContainerVT = Op1VT.isScalableVector()
5719 ? Op1VT
5720 : getContainerForFixedLengthVector(DAG, VT: Op1VT);
5721
5722 if (Op2VT.is128BitVector()) {
5723 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
5724 Op2 = convertToScalableVector(DAG, VT: OpContainerVT, V: Op2);
5725 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
5726 if (ResVT.isScalableVector())
5727 Op2 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: OpContainerVT, N1: Op2,
5728 N2: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64));
5729 } else {
5730 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
5731 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
5732 MVT Op2IntVT = MVT::getIntegerVT(BitWidth: Op2BitWidth);
5733 EVT Op2PromotedVT = getPackedSVEVectorVT(VT: Op2IntVT);
5734 Op2 = DAG.getBitcast(VT: MVT::getVectorVT(VT: Op2IntVT, NumElements: 1), V: Op2);
5735 Op2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: Op2IntVT, N1: Op2,
5736 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
5737 Op2 = DAG.getSplatVector(VT: Op2PromotedVT, DL, Op: Op2);
5738 Op2 = DAG.getBitcast(VT: OpContainerVT, V: Op2);
5739 }
5740
5741 // If the result is scalable, we just need to carry out the MATCH.
5742 if (ResVT.isScalableVector())
5743 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ResVT, N1: ID, N2: Mask, N3: Op1, N4: Op2);
5744
5745 // If the result is fixed, we can still use MATCH but we need to wrap the
5746 // first operand and the mask in scalable vectors before doing so.
5747
5748 // Wrap the operands.
5749 Op1 = convertToScalableVector(DAG, VT: OpContainerVT, V: Op1);
5750 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: Op1VT, Operand: Mask);
5751 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5752
5753 // Carry out the match.
5754 SDValue Match = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Mask.getValueType(),
5755 N1: ID, N2: Mask, N3: Op1, N4: Op2);
5756
5757 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
5758 // (v16i8/v8i8).
5759 Match = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: OpContainerVT, Operand: Match);
5760 Match = convertFromScalableVector(DAG, VT: Op1VT, V: Match);
5761 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Match);
5762}
5763
5764SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5765 SelectionDAG &DAG) const {
5766 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5767 SDLoc DL(Op);
5768 switch (IntNo) {
5769 default:
5770 return SDValue(); // Don't custom lower most intrinsics.
5771 case Intrinsic::aarch64_prefetch: {
5772 SDValue Chain = Op.getOperand(i: 0);
5773 SDValue Addr = Op.getOperand(i: 2);
5774
5775 unsigned IsWrite = Op.getConstantOperandVal(i: 3);
5776 unsigned Locality = Op.getConstantOperandVal(i: 4);
5777 unsigned IsStream = Op.getConstantOperandVal(i: 5);
5778 unsigned IsData = Op.getConstantOperandVal(i: 6);
5779 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5780 (!IsData << 3) | // IsDataCache bit
5781 (Locality << 1) | // Cache level bits
5782 (unsigned)IsStream; // Stream bit
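    // For example, a read prefetch of data into L1 with the keep policy
    // (IsWrite=0, IsData=1, Locality=0, IsStream=0) encodes as 0b00000,
    // i.e. PLDL1KEEP.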
5783
5784 return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Chain,
5785 N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32), N3: Addr);
5786 }
5787 case Intrinsic::aarch64_sme_str:
5788 case Intrinsic::aarch64_sme_ldr: {
5789 return LowerSMELdrStr(N: Op, DAG, IsLoad: IntNo == Intrinsic::aarch64_sme_ldr);
5790 }
5791 case Intrinsic::aarch64_sme_za_enable:
5792 return DAG.getNode(
5793 Opcode: AArch64ISD::SMSTART, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
5794 N1: Op->getOperand(Num: 0), // Chain
5795 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
5796 case Intrinsic::aarch64_sme_za_disable:
5797 return DAG.getNode(
5798 Opcode: AArch64ISD::SMSTOP, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
5799 N1: Op->getOperand(Num: 0), // Chain
5800 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
5801 }
5802}
5803
5804SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5805 SelectionDAG &DAG) const {
5806 unsigned IntNo = Op.getConstantOperandVal(i: 1);
5807 SDLoc DL(Op);
5808 switch (IntNo) {
5809 default:
5810 return SDValue(); // Don't custom lower most intrinsics.
5811 case Intrinsic::aarch64_mops_memset_tag: {
5812 auto Node = cast<MemIntrinsicSDNode>(Val: Op.getNode());
5813 SDValue Chain = Node->getChain();
5814 SDValue Dst = Op.getOperand(i: 2);
5815 SDValue Val = Op.getOperand(i: 3);
5816 Val = DAG.getAnyExtOrTrunc(Op: Val, DL, VT: MVT::i64);
5817 SDValue Size = Op.getOperand(i: 4);
5818 auto Alignment = Node->getMemOperand()->getAlign();
5819 bool IsVol = Node->isVolatile();
5820 auto DstPtrInfo = Node->getPointerInfo();
5821
5822 const auto &SDI =
5823 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5824 SDValue MS = SDI.EmitMOPS(Opcode: AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
5825 Chain, Dst, SrcOrValue: Val, Size, Alignment, isVolatile: IsVol,
5826 DstPtrInfo, SrcPtrInfo: MachinePointerInfo{});
5827
5828 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5829 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5830 // LowerOperationWrapper will complain that the number of results has
5831 // changed.
5832 return DAG.getMergeValues(Ops: {MS.getValue(R: 0), MS.getValue(R: 2)}, dl: DL);
5833 }
5834 }
5835}
5836
5837SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5838 SelectionDAG &DAG) const {
5839 unsigned IntNo = Op.getConstantOperandVal(i: 0);
5840 SDLoc DL(Op);
5841 switch (IntNo) {
5842 default: return SDValue(); // Don't custom lower most intrinsics.
5843 case Intrinsic::thread_pointer: {
5844 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
5845 return DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
5846 }
5847 case Intrinsic::aarch64_neon_abs: {
5848 EVT Ty = Op.getValueType();
5849 if (Ty == MVT::i64) {
5850 SDValue Result =
5851 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v1i64, Operand: Op.getOperand(i: 1));
5852 Result = DAG.getNode(Opcode: ISD::ABS, DL, VT: MVT::v1i64, Operand: Result);
5853 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Result);
5854 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(VT: Ty)) {
5855 return DAG.getNode(Opcode: ISD::ABS, DL, VT: Ty, Operand: Op.getOperand(i: 1));
5856 } else {
5857 report_fatal_error(reason: "Unexpected type for AArch64 NEON intrinsic");
5858 }
5859 }
5860 case Intrinsic::aarch64_neon_pmull64: {
5861 SDValue LHS = Op.getOperand(i: 1);
5862 SDValue RHS = Op.getOperand(i: 2);
5863
5864 std::optional<uint64_t> LHSLane =
5865 getConstantLaneNumOfExtractHalfOperand(Op&: LHS);
5866 std::optional<uint64_t> RHSLane =
5867 getConstantLaneNumOfExtractHalfOperand(Op&: RHS);
5868
5869 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5870 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5871
5872    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
5873    // instructions execute on SIMD registers, so canonicalize i64 to v1i64,
5874    // which ISel recognizes better. For example, prefer an ldr straight into a
5875    // d* register over a GPR load followed by an fmov:
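    //   ldr  d0, [x0]
    // rather than
    //   ldr  x8, [x0]
    //   fmov d0, x8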
5876 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5877 std::optional<uint64_t> OtherLane,
5878 const SDLoc &DL,
5879 SelectionDAG &DAG) -> SDValue {
5880      // If the operand is a higher half itself, rewrite it to
5881      // extract_high_v2i64; this way aarch64_neon_pmull64 can reuse
5882      // the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5883 if (NLane == 1)
5884 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v1i64,
5885 N1: N.getOperand(i: 0), N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
5886
5887 // Operand N is not a higher half but the other operand is.
5888 if (OtherLane == 1) {
5889 // If this operand is a lower half, rewrite it to
5890 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5891 // align lanes of two operands. A roundtrip sequence (to move from lane
5892 // 1 to lane 0) is like this:
5893 // mov x8, v0.d[1]
5894 // fmov d0, x8
5895 if (NLane == 0)
5896 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v1i64,
5897 N1: DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL, VT: MVT::v2i64,
5898 N1: N.getOperand(i: 0),
5899 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)),
5900 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
5901
5902 // Otherwise just dup from main to all lanes.
5903 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT: MVT::v1i64, Operand: N);
5904 }
5905
5906      // Neither operand is an extract of the higher half, so codegen may just
5907      // use the non-high version of the PMULL instruction; use v1i64 for i64.
5908 assert(N.getValueType() == MVT::i64 &&
5909 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5910 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: N);
5911 };
5912
5913 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
5914 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
5915
5916 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL, VT: Op.getValueType(), N1: LHS, N2: RHS);
5917 }
5918 case Intrinsic::aarch64_neon_smax:
5919 return DAG.getNode(Opcode: ISD::SMAX, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5920 N2: Op.getOperand(i: 2));
5921 case Intrinsic::aarch64_neon_umax:
5922 return DAG.getNode(Opcode: ISD::UMAX, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5923 N2: Op.getOperand(i: 2));
5924 case Intrinsic::aarch64_neon_smin:
5925 return DAG.getNode(Opcode: ISD::SMIN, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5926 N2: Op.getOperand(i: 2));
5927 case Intrinsic::aarch64_neon_umin:
5928 return DAG.getNode(Opcode: ISD::UMIN, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5929 N2: Op.getOperand(i: 2));
5930 case Intrinsic::aarch64_neon_scalar_sqxtn:
5931 case Intrinsic::aarch64_neon_scalar_sqxtun:
5932 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5933 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5934 if (Op.getValueType() == MVT::i32)
5935 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32,
5936 Operand: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::f32,
5937 N1: Op.getOperand(i: 0),
5938 N2: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64,
5939 Operand: Op.getOperand(i: 1))));
5940 return SDValue();
5941 }
5942 case Intrinsic::aarch64_neon_sqxtn:
5943 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_S, DL, VT: Op.getValueType(),
5944 Operand: Op.getOperand(i: 1));
5945 case Intrinsic::aarch64_neon_sqxtun:
5946 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_U, DL, VT: Op.getValueType(),
5947 Operand: Op.getOperand(i: 1));
5948 case Intrinsic::aarch64_neon_uqxtn:
5949 return DAG.getNode(Opcode: ISD::TRUNCATE_USAT_U, DL, VT: Op.getValueType(),
5950 Operand: Op.getOperand(i: 1));
5951 case Intrinsic::aarch64_neon_sqshrn:
5952 if (Op.getValueType().isVector())
5953 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_S, DL, VT: Op.getValueType(),
5954 Operand: DAG.getNode(Opcode: AArch64ISD::VASHR, DL,
5955 VT: Op.getOperand(i: 1).getValueType(),
5956 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5957 return SDValue();
5958 case Intrinsic::aarch64_neon_sqshrun:
5959 if (Op.getValueType().isVector())
5960 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_U, DL, VT: Op.getValueType(),
5961 Operand: DAG.getNode(Opcode: AArch64ISD::VASHR, DL,
5962 VT: Op.getOperand(i: 1).getValueType(),
5963 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5964 return SDValue();
5965 case Intrinsic::aarch64_neon_uqshrn:
5966 if (Op.getValueType().isVector())
5967 return DAG.getNode(Opcode: ISD::TRUNCATE_USAT_U, DL, VT: Op.getValueType(),
5968 Operand: DAG.getNode(Opcode: AArch64ISD::VLSHR, DL,
5969 VT: Op.getOperand(i: 1).getValueType(),
5970 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5971 return SDValue();
5972 case Intrinsic::aarch64_neon_sqrshrn:
5973 if (Op.getValueType().isVector())
5974 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_S, DL, VT: Op.getValueType(),
5975 Operand: DAG.getNode(Opcode: AArch64ISD::SRSHR_I, DL,
5976 VT: Op.getOperand(i: 1).getValueType(),
5977 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5978 return SDValue();
5979 case Intrinsic::aarch64_neon_sqrshrun:
5980 if (Op.getValueType().isVector())
5981 return DAG.getNode(Opcode: ISD::TRUNCATE_SSAT_U, DL, VT: Op.getValueType(),
5982 Operand: DAG.getNode(Opcode: AArch64ISD::SRSHR_I, DL,
5983 VT: Op.getOperand(i: 1).getValueType(),
5984 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5985 return SDValue();
5986 case Intrinsic::aarch64_neon_uqrshrn:
5987 if (Op.getValueType().isVector())
5988 return DAG.getNode(Opcode: ISD::TRUNCATE_USAT_U, DL, VT: Op.getValueType(),
5989 Operand: DAG.getNode(Opcode: AArch64ISD::URSHR_I, DL,
5990 VT: Op.getOperand(i: 1).getValueType(),
5991 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)));
5992 return SDValue();
5993 case Intrinsic::aarch64_neon_sqadd:
5994 if (Op.getValueType().isVector())
5995 return DAG.getNode(Opcode: ISD::SADDSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
5996 N2: Op.getOperand(i: 2));
5997 return SDValue();
5998 case Intrinsic::aarch64_neon_sqsub:
5999 if (Op.getValueType().isVector())
6000 return DAG.getNode(Opcode: ISD::SSUBSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6001 N2: Op.getOperand(i: 2));
6002 return SDValue();
6003 case Intrinsic::aarch64_neon_uqadd:
6004 if (Op.getValueType().isVector())
6005 return DAG.getNode(Opcode: ISD::UADDSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6006 N2: Op.getOperand(i: 2));
6007 return SDValue();
6008 case Intrinsic::aarch64_neon_uqsub:
6009 if (Op.getValueType().isVector())
6010 return DAG.getNode(Opcode: ISD::USUBSAT, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6011 N2: Op.getOperand(i: 2));
6012 return SDValue();
6013 case Intrinsic::aarch64_sve_whilelt:
6014 return optimizeIncrementingWhile(N: Op.getNode(), DAG, /*IsSigned=*/true,
6015 /*IsEqual=*/false);
6016 case Intrinsic::aarch64_sve_whilels:
6017 return optimizeIncrementingWhile(N: Op.getNode(), DAG, /*IsSigned=*/false,
6018 /*IsEqual=*/true);
6019 case Intrinsic::aarch64_sve_whilele:
6020 return optimizeIncrementingWhile(N: Op.getNode(), DAG, /*IsSigned=*/true,
6021 /*IsEqual=*/true);
6022 case Intrinsic::aarch64_sve_sunpkhi:
6023 return DAG.getNode(Opcode: AArch64ISD::SUNPKHI, DL, VT: Op.getValueType(),
6024 Operand: Op.getOperand(i: 1));
6025 case Intrinsic::aarch64_sve_sunpklo:
6026 return DAG.getNode(Opcode: AArch64ISD::SUNPKLO, DL, VT: Op.getValueType(),
6027 Operand: Op.getOperand(i: 1));
6028 case Intrinsic::aarch64_sve_uunpkhi:
6029 return DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: Op.getValueType(),
6030 Operand: Op.getOperand(i: 1));
6031 case Intrinsic::aarch64_sve_uunpklo:
6032 return DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: Op.getValueType(),
6033 Operand: Op.getOperand(i: 1));
6034 case Intrinsic::aarch64_sve_clasta_n:
6035 return DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL, VT: Op.getValueType(),
6036 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6037 case Intrinsic::aarch64_sve_clastb_n:
6038 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL, VT: Op.getValueType(),
6039 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6040 case Intrinsic::aarch64_sve_lasta:
6041 return DAG.getNode(Opcode: AArch64ISD::LASTA, DL, VT: Op.getValueType(),
6042 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6043 case Intrinsic::aarch64_sve_lastb:
6044 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL, VT: Op.getValueType(),
6045 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6046 case Intrinsic::aarch64_sve_rev:
6047 return DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: Op.getValueType(),
6048 Operand: Op.getOperand(i: 1));
6049 case Intrinsic::aarch64_sve_tbl:
6050 return DAG.getNode(Opcode: AArch64ISD::TBL, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6051 N2: Op.getOperand(i: 2));
6052 case Intrinsic::aarch64_sve_trn1:
6053 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL, VT: Op.getValueType(),
6054 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6055 case Intrinsic::aarch64_sve_trn2:
6056 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL, VT: Op.getValueType(),
6057 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6058 case Intrinsic::aarch64_sve_uzp1:
6059 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: Op.getValueType(),
6060 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6061 case Intrinsic::aarch64_sve_uzp2:
6062 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: Op.getValueType(),
6063 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6064 case Intrinsic::aarch64_sve_zip1:
6065 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: Op.getValueType(),
6066 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6067 case Intrinsic::aarch64_sve_zip2:
6068 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: Op.getValueType(),
6069 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6070 case Intrinsic::aarch64_sve_splice:
6071 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Op.getValueType(),
6072 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6073 case Intrinsic::aarch64_sve_ptrue:
6074 return getPTrue(DAG, DL, VT: Op.getValueType(), Pattern: Op.getConstantOperandVal(i: 1));
6075 case Intrinsic::aarch64_sve_clz:
6076 return DAG.getNode(Opcode: AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6077 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6078 case Intrinsic::aarch64_sme_cntsb:
6079 return DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(),
6080 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
6081 case Intrinsic::aarch64_sme_cntsh: {
6082 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i32);
6083 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(), Operand: One);
6084 return DAG.getNode(Opcode: ISD::SRL, DL, VT: Op.getValueType(), N1: Bytes, N2: One);
6085 }
6086 case Intrinsic::aarch64_sme_cntsw: {
6087 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(),
6088 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
6089 return DAG.getNode(Opcode: ISD::SRL, DL, VT: Op.getValueType(), N1: Bytes,
6090 N2: DAG.getConstant(Val: 2, DL, VT: MVT::i32));
6091 }
6092 case Intrinsic::aarch64_sme_cntsd: {
6093 SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: Op.getValueType(),
6094 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
6095 return DAG.getNode(Opcode: ISD::SRL, DL, VT: Op.getValueType(), N1: Bytes,
6096 N2: DAG.getConstant(Val: 3, DL, VT: MVT::i32));
6097 }
6098 case Intrinsic::aarch64_sve_cnt: {
6099 SDValue Data = Op.getOperand(i: 3);
6100 // CTPOP only supports integer operands.
6101 if (Data.getValueType().isFloatingPoint())
6102 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Data);
6103 return DAG.getNode(Opcode: AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6104 N1: Op.getOperand(i: 2), N2: Data, N3: Op.getOperand(i: 1));
6105 }
6106 case Intrinsic::aarch64_sve_dupq_lane:
6107 return LowerDUPQLane(Op, DAG);
6108 case Intrinsic::aarch64_sve_convert_from_svbool:
6109 if (Op.getValueType() == MVT::aarch64svcount)
6110 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
6111 return getSVEPredicateBitCast(VT: Op.getValueType(), Op: Op.getOperand(i: 1), DAG);
6112 case Intrinsic::aarch64_sve_convert_to_svbool:
6113 if (Op.getOperand(i: 1).getValueType() == MVT::aarch64svcount)
6114 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: Op.getOperand(i: 1));
6115 return getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Op.getOperand(i: 1), DAG);
6116 case Intrinsic::aarch64_sve_fneg:
6117 return DAG.getNode(Opcode: AArch64ISD::FNEG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6118 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6119 case Intrinsic::aarch64_sve_frintp:
6120 return DAG.getNode(Opcode: AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6121 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6122 case Intrinsic::aarch64_sve_frintm:
6123 return DAG.getNode(Opcode: AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6124 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6125 case Intrinsic::aarch64_sve_frinti:
6126 return DAG.getNode(Opcode: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6127 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6128 N3: Op.getOperand(i: 1));
6129 case Intrinsic::aarch64_sve_frintx:
6130 return DAG.getNode(Opcode: AArch64ISD::FRINT_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6131 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6132 case Intrinsic::aarch64_sve_frinta:
6133 return DAG.getNode(Opcode: AArch64ISD::FROUND_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6134 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6135 case Intrinsic::aarch64_sve_frintn:
6136 return DAG.getNode(Opcode: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6137 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6138 N3: Op.getOperand(i: 1));
6139 case Intrinsic::aarch64_sve_frintz:
6140 return DAG.getNode(Opcode: AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6141 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6142 case Intrinsic::aarch64_sve_ucvtf:
6143 return DAG.getNode(Opcode: AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6144 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6145 N3: Op.getOperand(i: 1));
6146 case Intrinsic::aarch64_sve_scvtf:
6147 return DAG.getNode(Opcode: AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6148 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6149 N3: Op.getOperand(i: 1));
6150 case Intrinsic::aarch64_sve_fcvtzu:
6151 return DAG.getNode(Opcode: AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6152 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6153 case Intrinsic::aarch64_sve_fcvtzs:
6154 return DAG.getNode(Opcode: AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6155 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6156 case Intrinsic::aarch64_sve_fsqrt:
6157 return DAG.getNode(Opcode: AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6158 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6159 case Intrinsic::aarch64_sve_frecpx:
6160 return DAG.getNode(Opcode: AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6161 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6162 case Intrinsic::aarch64_sve_frecpe_x:
6163 return DAG.getNode(Opcode: AArch64ISD::FRECPE, DL, VT: Op.getValueType(),
6164 Operand: Op.getOperand(i: 1));
6165 case Intrinsic::aarch64_sve_frecps_x:
6166 return DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT: Op.getValueType(),
6167 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6168 case Intrinsic::aarch64_sve_frsqrte_x:
6169 return DAG.getNode(Opcode: AArch64ISD::FRSQRTE, DL, VT: Op.getValueType(),
6170 Operand: Op.getOperand(i: 1));
6171 case Intrinsic::aarch64_sve_frsqrts_x:
6172 return DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT: Op.getValueType(),
6173 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2));
6174 case Intrinsic::aarch64_sve_fabs:
6175 return DAG.getNode(Opcode: AArch64ISD::FABS_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6176 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6177 case Intrinsic::aarch64_sve_abs:
6178 return DAG.getNode(Opcode: AArch64ISD::ABS_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6179 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6180 case Intrinsic::aarch64_sve_neg:
6181 return DAG.getNode(Opcode: AArch64ISD::NEG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6182 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6183 case Intrinsic::aarch64_sve_insr: {
6184 SDValue Scalar = Op.getOperand(i: 2);
6185 EVT ScalarTy = Scalar.getValueType();
6186 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6187 Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Scalar);
6188
6189 return DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: Op.getValueType(),
6190 N1: Op.getOperand(i: 1), N2: Scalar);
6191 }
6192 case Intrinsic::aarch64_sve_rbit:
6193 return DAG.getNode(Opcode: AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6194 VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6195 N3: Op.getOperand(i: 1));
6196 case Intrinsic::aarch64_sve_revb:
6197 return DAG.getNode(Opcode: AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6198 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6199 case Intrinsic::aarch64_sve_revh:
6200 return DAG.getNode(Opcode: AArch64ISD::REVH_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6201 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6202 case Intrinsic::aarch64_sve_revw:
6203 return DAG.getNode(Opcode: AArch64ISD::REVW_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6204 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6205 case Intrinsic::aarch64_sve_revd:
6206 return DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6207 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1));
6208 case Intrinsic::aarch64_sve_sxtb:
6209 return DAG.getNode(
6210 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6211 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6212 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)),
6213 N4: Op.getOperand(i: 1));
6214 case Intrinsic::aarch64_sve_sxth:
6215 return DAG.getNode(
6216 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6217 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6218 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)),
6219 N4: Op.getOperand(i: 1));
6220 case Intrinsic::aarch64_sve_sxtw:
6221 return DAG.getNode(
6222 Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6223 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6224 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)),
6225 N4: Op.getOperand(i: 1));
6226 case Intrinsic::aarch64_sve_uxtb:
6227 return DAG.getNode(
6228 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6229 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6230 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)),
6231 N4: Op.getOperand(i: 1));
6232 case Intrinsic::aarch64_sve_uxth:
6233 return DAG.getNode(
6234 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6235 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6236 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)),
6237 N4: Op.getOperand(i: 1));
6238 case Intrinsic::aarch64_sve_uxtw:
6239 return DAG.getNode(
6240 Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, VT: Op.getValueType(),
6241 N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3),
6242 N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)),
6243 N4: Op.getOperand(i: 1));
6244 case Intrinsic::localaddress: {
6245 const auto &MF = DAG.getMachineFunction();
6246 const auto *RegInfo = Subtarget->getRegisterInfo();
6247 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6248 return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg,
6249 VT: Op.getSimpleValueType());
6250 }
6251
6252 case Intrinsic::eh_recoverfp: {
6253 // FIXME: This needs to be implemented to correctly handle highly aligned
    // stack objects. For now we simply return the incoming FP. Refer to
    // D53541 for more details.
6256 SDValue FnOp = Op.getOperand(i: 1);
6257 SDValue IncomingFPOp = Op.getOperand(i: 2);
6258 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: FnOp);
6259 auto *Fn = dyn_cast_or_null<Function>(Val: GSD ? GSD->getGlobal() : nullptr);
6260 if (!Fn)
6261 report_fatal_error(
6262 reason: "llvm.eh.recoverfp must take a function as the first argument");
6263 return IncomingFPOp;
6264 }
6265
6266 case Intrinsic::aarch64_neon_vsri:
6267 case Intrinsic::aarch64_neon_vsli:
6268 case Intrinsic::aarch64_sve_sri:
6269 case Intrinsic::aarch64_sve_sli: {
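    // The NEON and SVE shift-and-insert intrinsics share one lowering: pick
    // VSRI or VSLI based on the shift direction and forward the destination,
    // source and immediate shift amount unchanged.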
6270 EVT Ty = Op.getValueType();
6271
6272 if (!Ty.isVector())
6273 report_fatal_error(reason: "Unexpected type for aarch64_neon_vsli");
6274
6275 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6276
6277 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6278 IntNo == Intrinsic::aarch64_sve_sri;
6279 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6280 return DAG.getNode(Opcode, DL, VT: Ty, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2),
6281 N3: Op.getOperand(i: 3));
6282 }
6283
6284 case Intrinsic::aarch64_neon_srhadd:
6285 case Intrinsic::aarch64_neon_urhadd:
6286 case Intrinsic::aarch64_neon_shadd:
6287 case Intrinsic::aarch64_neon_uhadd: {
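    // Map the NEON (rounding) halving adds onto the generic averaging nodes:
    // s/u selects the signedness, and the rounding variants round up
    // (AVGCEIL) while the plain halving variants truncate (AVGFLOOR).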
6288 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6289 IntNo == Intrinsic::aarch64_neon_shadd);
6290 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6291 IntNo == Intrinsic::aarch64_neon_urhadd);
6292 unsigned Opcode = IsSignedAdd
6293 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6294 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6295 return DAG.getNode(Opcode, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6296 N2: Op.getOperand(i: 2));
6297 }
6298 case Intrinsic::aarch64_neon_saddlp:
6299 case Intrinsic::aarch64_neon_uaddlp: {
6300 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6301 ? AArch64ISD::UADDLP
6302 : AArch64ISD::SADDLP;
6303 return DAG.getNode(Opcode, DL, VT: Op.getValueType(), Operand: Op.getOperand(i: 1));
6304 }
6305 case Intrinsic::aarch64_neon_sdot:
6306 case Intrinsic::aarch64_neon_udot:
6307 case Intrinsic::aarch64_sve_sdot:
6308 case Intrinsic::aarch64_sve_udot: {
6309 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6310 IntNo == Intrinsic::aarch64_sve_udot)
6311 ? AArch64ISD::UDOT
6312 : AArch64ISD::SDOT;
6313 return DAG.getNode(Opcode, DL, VT: Op.getValueType(), N1: Op.getOperand(i: 1),
6314 N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6315 }
6316 case Intrinsic::aarch64_neon_usdot:
6317 case Intrinsic::aarch64_sve_usdot: {
6318 return DAG.getNode(Opcode: AArch64ISD::USDOT, DL, VT: Op.getValueType(),
6319 N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3));
6320 }
6321 case Intrinsic::aarch64_neon_saddlv:
6322 case Intrinsic::aarch64_neon_uaddlv: {
6323 EVT OpVT = Op.getOperand(i: 1).getValueType();
6324 EVT ResVT = Op.getValueType();
6325 assert(
6326 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6327 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6328 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6329 "Unexpected aarch64_neon_u/saddlv type");
6330 (void)OpVT;
6331 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6332 SDValue ADDLV = DAG.getNode(
6333 Opcode: IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6334 : AArch64ISD::SADDLV,
6335 DL, VT: ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Operand: Op.getOperand(i: 1));
6336 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6337 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6338 N1: ADDLV, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6339 return EXTRACT_VEC_ELT;
6340 }
6341 case Intrinsic::experimental_cttz_elts: {
6342 SDValue CttzOp = Op.getOperand(i: 1);
6343 EVT VT = CttzOp.getValueType();
6344 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6345
6346 if (VT.isFixedLengthVector()) {
6347 // We can use SVE instructions to lower this intrinsic by first creating
6348 // an SVE predicate register mask from the fixed-width vector.
6349 EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
6350 SDValue Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: NewVT, Operand: CttzOp);
6351 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6352 }
6353
6354 SDValue NewCttzElts =
6355 DAG.getNode(Opcode: AArch64ISD::CTTZ_ELTS, DL, VT: MVT::i64, Operand: CttzOp);
6356 return DAG.getZExtOrTrunc(Op: NewCttzElts, DL, VT: Op.getValueType());
6357 }
6358 case Intrinsic::experimental_vector_match: {
6359 return LowerVectorMatch(Op, DAG);
6360 }
6361 }
6362}
6363
6364bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6365 if (VT.getVectorElementType() == MVT::i8 ||
6366 VT.getVectorElementType() == MVT::i16) {
6367 EltTy = MVT::i32;
6368 return true;
6369 }
6370 return false;
6371}
6372
6373bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6374 EVT DataVT) const {
6375 const EVT IndexVT = Extend.getOperand(i: 0).getValueType();
6376 // SVE only supports implicit extension of 32-bit indices.
6377 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6378 return false;
6379
6380 // Indices cannot be smaller than the main data type.
6381 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6382 return false;
6383
6384 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6385 // element container type, which would violate the previous clause.
6386 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6387}
6388
6389bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6390 EVT ExtVT = ExtVal.getValueType();
6391 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6392 return false;
6393
6394 // It may be worth creating extending masked loads if there are multiple
6395 // masked loads using the same predicate. That way we'll end up creating
6396 // extending masked loads that may then get split by the legaliser. This
6397 // results in just one set of predicate unpacks at the start, instead of
6398 // multiple sets of vector unpacks after each load.
6399 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal->getOperand(Num: 0))) {
6400 if (!isLoadExtLegalOrCustom(ExtType: ISD::ZEXTLOAD, ValVT: ExtVT, MemVT: Ld->getValueType(ResNo: 0))) {
6401 // Disable extending masked loads for fixed-width for now, since the code
6402 // quality doesn't look great.
6403 if (!ExtVT.isScalableVector())
6404 return false;
6405
6406 unsigned NumExtMaskedLoads = 0;
6407 for (auto *U : Ld->getMask()->users())
6408 if (isa<MaskedLoadSDNode>(Val: U))
6409 NumExtMaskedLoads++;
6410
6411 if (NumExtMaskedLoads <= 1)
6412 return false;
6413 }
6414 }
6415
6416 return true;
6417}
6418
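// Returns the GLD1* opcode that implements an unsigned gather load for the
// given (scaled, signed index, needs extend) combination. The signedness of
// the index only matters when an extension is needed, which is why the
// signed and unsigned non-extending entries map to the same opcode.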
6419unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6420 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6421 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: false),
6422 AArch64ISD::GLD1_MERGE_ZERO},
6423 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: true),
6424 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6425 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: false),
6426 AArch64ISD::GLD1_MERGE_ZERO},
6427 {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: true),
6428 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6429 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: false),
6430 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6431 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: true),
6432 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6433 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: false),
6434 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6435 {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: true),
6436 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6437 };
6438 auto Key = std::make_tuple(args&: IsScaled, args&: IsSigned, args&: NeedsExtend);
6439 return AddrModes.find(x: Key)->second;
6440}
6441
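// Convert a zero-extending gather load opcode into its sign-extending
// (GLD1S*) equivalent.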
6442unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6443 switch (Opcode) {
6444 default:
6445 llvm_unreachable("unimplemented opcode");
6446 return Opcode;
6447 case AArch64ISD::GLD1_MERGE_ZERO:
6448 return AArch64ISD::GLD1S_MERGE_ZERO;
6449 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6450 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6451 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6452 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6453 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6454 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6455 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6456 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6457 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6458 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6459 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6460 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6461 }
6462}
6463
6464SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6465 SelectionDAG &DAG) const {
6466 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Val&: Op);
6467
6468 SDLoc DL(Op);
6469 SDValue Chain = MGT->getChain();
6470 SDValue PassThru = MGT->getPassThru();
6471 SDValue Mask = MGT->getMask();
6472 SDValue BasePtr = MGT->getBasePtr();
6473 SDValue Index = MGT->getIndex();
6474 SDValue Scale = MGT->getScale();
6475 EVT VT = Op.getValueType();
6476 EVT MemVT = MGT->getMemoryVT();
6477 ISD::LoadExtType ExtType = MGT->getExtensionType();
6478 ISD::MemIndexType IndexType = MGT->getIndexType();
6479
  // SVE supports zero (and so undef) passthrough values only; everything else
  // must be handled manually by an explicit select on the load's output.
6482 if (!PassThru->isUndef() && !isZerosVector(N: PassThru.getNode())) {
6483 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6484 SDValue Load =
6485 DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
6486 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6487 SDValue Select = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru);
6488 return DAG.getMergeValues(Ops: {Select, Load.getValue(R: 1)}, dl: DL);
6489 }
6490
6491 bool IsScaled = MGT->isIndexScaled();
6492 bool IsSigned = MGT->isIndexSigned();
6493
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
6496 uint64_t ScaleVal = Scale->getAsZExtVal();
6497 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6498 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6499 EVT IndexVT = Index.getValueType();
6500 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
6501 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
6502 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
6503
6504 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6505 return DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops,
6506 MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6507 }
6508
6509 // Lower fixed length gather to a scalable equivalent.
6510 if (VT.isFixedLengthVector()) {
6511 assert(Subtarget->useSVEForFixedLengthVectors() &&
6512 "Cannot lower when not using SVE for fixed vectors!");
6513
6514 // NOTE: Handle floating-point as if integer then bitcast the result.
6515 EVT DataVT = VT.changeVectorElementTypeToInteger();
6516 MemVT = MemVT.changeVectorElementTypeToInteger();
6517
6518 // Find the smallest integer fixed length vector we can use for the gather.
6519 EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32);
6520 if (DataVT.getVectorElementType() == MVT::i64 ||
6521 Index.getValueType().getVectorElementType() == MVT::i64 ||
6522 Mask.getValueType().getVectorElementType() == MVT::i64)
6523 PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64);
6524
6525 // Promote vector operands except for passthrough, which we know is either
6526 // undef or zero, and thus best constructed directly.
6527 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6528 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
6529 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
6530
6531 // A promoted result type forces the need for an extending load.
6532 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6533 ExtType = ISD::EXTLOAD;
6534
6535 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
6536
6537 // Convert fixed length vector operands to scalable.
6538 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
6539 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
6540 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6541 PassThru = PassThru->isUndef() ? DAG.getUNDEF(VT: ContainerVT)
6542 : DAG.getConstant(Val: 0, DL, VT: ContainerVT);
6543
6544 // Emit equivalent scalable vector gather.
6545 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6546 SDValue Load =
6547 DAG.getMaskedGather(VTs: DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other), MemVT, dl: DL,
6548 Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType);
6549
6550 // Extract fixed length data then convert to the required result type.
6551 SDValue Result = convertFromScalableVector(DAG, VT: PromotedVT, V: Load);
6552 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DataVT, Operand: Result);
6553 if (VT.isFloatingPoint())
6554 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Result);
6555
6556 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
6557 }
6558
6559 // Everything else is legal.
6560 return Op;
6561}
6562
6563SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6564 SelectionDAG &DAG) const {
6565 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Val&: Op);
6566
6567 SDLoc DL(Op);
6568 SDValue Chain = MSC->getChain();
6569 SDValue StoreVal = MSC->getValue();
6570 SDValue Mask = MSC->getMask();
6571 SDValue BasePtr = MSC->getBasePtr();
6572 SDValue Index = MSC->getIndex();
6573 SDValue Scale = MSC->getScale();
6574 EVT VT = StoreVal.getValueType();
6575 EVT MemVT = MSC->getMemoryVT();
6576 ISD::MemIndexType IndexType = MSC->getIndexType();
6577 bool Truncating = MSC->isTruncatingStore();
6578
6579 bool IsScaled = MSC->isIndexScaled();
6580 bool IsSigned = MSC->isIndexSigned();
6581
  // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
  // must be calculated beforehand.
6584 uint64_t ScaleVal = Scale->getAsZExtVal();
6585 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6586 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6587 EVT IndexVT = Index.getValueType();
6588 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index,
6589 N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT));
6590 Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType());
6591
6592 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6593 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
6594 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
6595 }
6596
6597 // Lower fixed length scatter to a scalable equivalent.
6598 if (VT.isFixedLengthVector()) {
6599 assert(Subtarget->useSVEForFixedLengthVectors() &&
6600 "Cannot lower when not using SVE for fixed vectors!");
6601
    // Once bitcast, we treat floating-point scatters as if they were integer.
6603 if (VT.isFloatingPoint()) {
6604 VT = VT.changeVectorElementTypeToInteger();
6605 MemVT = MemVT.changeVectorElementTypeToInteger();
6606 StoreVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: StoreVal);
6607 }
6608
6609 // Find the smallest integer fixed length vector we can use for the scatter.
6610 EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32);
6611 if (VT.getVectorElementType() == MVT::i64 ||
6612 Index.getValueType().getVectorElementType() == MVT::i64 ||
6613 Mask.getValueType().getVectorElementType() == MVT::i64)
6614 PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64);
6615
6616 // Promote vector operands.
6617 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6618 Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index);
6619 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask);
6620 StoreVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: PromotedVT, Operand: StoreVal);
6621
6622 // A promoted value type forces the need for a truncating store.
6623 if (PromotedVT != VT)
6624 Truncating = true;
6625
6626 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT);
6627
6628 // Convert fixed length vector operands to scalable.
6629 MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType());
6630 Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index);
6631 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6632 StoreVal = convertToScalableVector(DAG, VT: ContainerVT, V: StoreVal);
6633
6634 // Emit equivalent scalable vector scatter.
6635 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6636 return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops,
6637 MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating);
6638 }
6639
6640 // Everything else is legal.
6641 return Op;
6642}
6643
6644SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6645 SDLoc DL(Op);
6646 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Val&: Op);
6647 assert(LoadNode && "Expected custom lowering of a masked load node");
6648 EVT VT = Op->getValueType(ResNo: 0);
6649
6650 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6651 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6652
6653 SDValue PassThru = LoadNode->getPassThru();
6654 SDValue Mask = LoadNode->getMask();
6655
6656 if (PassThru->isUndef() || isZerosVector(N: PassThru.getNode()))
6657 return Op;
6658
6659 SDValue Load = DAG.getMaskedLoad(
6660 VT, dl: DL, Chain: LoadNode->getChain(), Base: LoadNode->getBasePtr(),
6661 Offset: LoadNode->getOffset(), Mask, Src0: DAG.getUNDEF(VT), MemVT: LoadNode->getMemoryVT(),
6662 MMO: LoadNode->getMemOperand(), AM: LoadNode->getAddressingMode(),
6663 LoadNode->getExtensionType());
6664
6665 SDValue Result = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru);
6666
6667 return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL);
6668}
6669
6670// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6671static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6672 EVT VT, EVT MemVT,
6673 SelectionDAG &DAG) {
6674 assert(VT.isVector() && "VT should be a vector type");
6675 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6676
6677 SDValue Value = ST->getValue();
6678
  // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
  // extracts the word lane which represents the v4i8 subvector. This
  // optimizes the store to:
6682 //
6683 // xtn v0.8b, v0.8h
6684 // str s0, [x0]
6685
6686 SDValue Undef = DAG.getUNDEF(VT: MVT::i16);
6687 SDValue UndefVec = DAG.getBuildVector(VT: MVT::v4i16, DL,
6688 Ops: {Undef, Undef, Undef, Undef});
6689
6690 SDValue TruncExt = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16,
6691 N1: Value, N2: UndefVec);
6692 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: TruncExt);
6693
6694 Trunc = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Trunc);
6695 SDValue ExtractTrunc = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32,
6696 N1: Trunc, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6697
6698 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: ExtractTrunc,
6699 Ptr: ST->getBasePtr(), MMO: ST->getMemOperand());
6700}
6701
6702static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
6703 SDLoc DL(Op);
6704 SDValue Src = Op.getOperand(i: 0);
6705 MVT DestVT = Op.getSimpleValueType();
6706 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6707 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Val: Op.getNode());
6708
6709 unsigned SrcAS = N->getSrcAddressSpace();
6710 unsigned DestAS = N->getDestAddressSpace();
6711 assert(SrcAS != DestAS &&
6712 "addrspacecast must be between different address spaces");
6713 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
6714 TLI.getTargetMachine().getPointerSize(DestAS) &&
6715 "addrspacecast must be between different ptr sizes");
6716 (void)TLI;
6717
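  // Casts out of a 32-bit pointer address space widen the value, using a
  // sign-extend for PTR32_SPTR and a zero-extend for PTR32_UPTR. Casts into
  // one of those address spaces narrow the value and clear any bits above
  // the 32-bit pointer.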
6718 if (SrcAS == ARM64AS::PTR32_SPTR) {
6719 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: DestVT, N1: Src,
6720 N2: DAG.getTargetConstant(Val: 0, DL, VT: DestVT));
6721 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
6722 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: DestVT, N1: Src,
6723 N2: DAG.getTargetConstant(Val: 0, DL, VT: DestVT));
6724 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
6725 (DestAS == ARM64AS::PTR32_UPTR)) {
6726 SDValue Ext = DAG.getAnyExtOrTrunc(Op: Src, DL, VT: DestVT);
6727 SDValue Trunc = DAG.getZeroExtendInReg(Op: Ext, DL, VT: DestVT);
6728 return Trunc;
6729 } else {
6730 return Src;
6731 }
6732}
6733
// Custom lowering for any store, vector or scalar, plain or truncating.
// Currently we only custom lower truncating stores from v4i16 to v4i8 and
// volatile stores of i128.
6737SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6738 SelectionDAG &DAG) const {
6739 SDLoc Dl(Op);
6740 StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op);
  assert(StoreNode && "Can only custom lower store nodes");
6742
6743 SDValue Value = StoreNode->getValue();
6744
6745 EVT VT = Value.getValueType();
6746 EVT MemVT = StoreNode->getMemoryVT();
6747
6748 if (VT.isVector()) {
6749 if (useSVEForFixedLengthVectorVT(
6750 VT,
6751 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6752 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6753
6754 unsigned AS = StoreNode->getAddressSpace();
6755 Align Alignment = StoreNode->getAlign();
6756 if (Alignment < MemVT.getStoreSize() &&
6757 !allowsMisalignedMemoryAccesses(VT: MemVT, AddrSpace: AS, Alignment,
6758 Flags: StoreNode->getMemOperand()->getFlags(),
6759 Fast: nullptr)) {
6760 return scalarizeVectorStore(ST: StoreNode, DAG);
6761 }
6762
6763 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6764 MemVT == MVT::v4i8) {
6765 return LowerTruncateVectorStore(DL: Dl, ST: StoreNode, VT, MemVT, DAG);
6766 }
6767 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6768 // the custom lowering, as there are no un-paired non-temporal stores and
6769 // legalization will break up 256 bit inputs.
6770 ElementCount EC = MemVT.getVectorElementCount();
6771 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6772 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6773 (MemVT.getScalarSizeInBits() == 8u ||
6774 MemVT.getScalarSizeInBits() == 16u ||
6775 MemVT.getScalarSizeInBits() == 32u ||
6776 MemVT.getScalarSizeInBits() == 64u)) {
6777 SDValue Lo =
6778 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl,
6779 VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
6780 N1: StoreNode->getValue(), N2: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i64));
6781 SDValue Hi =
6782 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl,
6783 VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
6784 N1: StoreNode->getValue(),
6785 N2: DAG.getConstant(Val: EC.getKnownMinValue() / 2, DL: Dl, VT: MVT::i64));
6786 SDValue Result = DAG.getMemIntrinsicNode(
6787 Opcode: AArch64ISD::STNP, dl: Dl, VTList: DAG.getVTList(VT: MVT::Other),
6788 Ops: {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6789 MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand());
6790 return Result;
6791 }
6792 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6793 return LowerStore128(Op, DAG);
6794 } else if (MemVT == MVT::i64x8) {
6795 SDValue Value = StoreNode->getValue();
6796 assert(Value->getValueType(0) == MVT::i64x8);
6797 SDValue Chain = StoreNode->getChain();
6798 SDValue Base = StoreNode->getBasePtr();
6799 EVT PtrVT = Base.getValueType();
6800 for (unsigned i = 0; i < 8; i++) {
6801 SDValue Part = DAG.getNode(Opcode: AArch64ISD::LS64_EXTRACT, DL: Dl, VT: MVT::i64,
6802 N1: Value, N2: DAG.getConstant(Val: i, DL: Dl, VT: MVT::i32));
6803 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: Base,
6804 N2: DAG.getConstant(Val: i * 8, DL: Dl, VT: PtrVT));
6805 Chain = DAG.getStore(Chain, dl: Dl, Val: Part, Ptr, PtrInfo: StoreNode->getPointerInfo(),
6806 Alignment: StoreNode->getBaseAlign());
6807 }
6808 return Chain;
6809 }
6810
6811 return SDValue();
6812}
6813
6814/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6815SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6816 SelectionDAG &DAG) const {
6817 MemSDNode *StoreNode = cast<MemSDNode>(Val&: Op);
6818 assert(StoreNode->getMemoryVT() == MVT::i128);
6819 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6820
6821 bool IsStoreRelease =
6822 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6823 if (StoreNode->isAtomic())
6824 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6825 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6826 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6827 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6828
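  // Plain and atomic stores keep the value at operand 1; other memory nodes
  // routed through here carry it at operand 2. The 128-bit value is split
  // into two 64-bit halves (swapped on big-endian targets) and written with
  // a single STP, or STILP when release semantics are required.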
6829 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6830 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6831 ? StoreNode->getOperand(Num: 1)
6832 : StoreNode->getOperand(Num: 2);
6833 SDLoc DL(Op);
6834 auto StoreValue = DAG.SplitScalar(N: Value, DL, LoVT: MVT::i64, HiVT: MVT::i64);
6835 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
6836 if (DAG.getDataLayout().isBigEndian())
6837 std::swap(a&: StoreValue.first, b&: StoreValue.second);
6838 SDValue Result = DAG.getMemIntrinsicNode(
6839 Opcode, dl: DL, VTList: DAG.getVTList(VT: MVT::Other),
6840 Ops: {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6841 StoreNode->getBasePtr()},
6842 MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand());
6843 return Result;
6844}
6845
6846SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6847 SelectionDAG &DAG) const {
6848 SDLoc DL(Op);
6849 LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op);
6850 assert(LoadNode && "Expected custom lowering of a load node");
6851
6852 if (LoadNode->getMemoryVT() == MVT::i64x8) {
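    // i64x8 is the MVT used for the 64-byte LS64 accesses. Expand the load
    // into eight consecutive i64 loads and combine the parts with
    // LS64_BUILD.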
6853 SmallVector<SDValue, 8> Ops;
6854 SDValue Base = LoadNode->getBasePtr();
6855 SDValue Chain = LoadNode->getChain();
6856 EVT PtrVT = Base.getValueType();
6857 for (unsigned i = 0; i < 8; i++) {
6858 SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Base,
6859 N2: DAG.getConstant(Val: i * 8, DL, VT: PtrVT));
6860 SDValue Part =
6861 DAG.getLoad(VT: MVT::i64, dl: DL, Chain, Ptr, PtrInfo: LoadNode->getPointerInfo(),
6862 Alignment: LoadNode->getBaseAlign());
6863 Ops.push_back(Elt: Part);
6864 Chain = SDValue(Part.getNode(), 1);
6865 }
6866 SDValue Loaded = DAG.getNode(Opcode: AArch64ISD::LS64_BUILD, DL, VT: MVT::i64x8, Ops);
6867 return DAG.getMergeValues(Ops: {Loaded, Chain}, dl: DL);
6868 }
6869
6870 // Custom lowering for extending v4i8 vector loads.
6871 EVT VT = Op->getValueType(ResNo: 0);
6872 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6873
6874 if (LoadNode->getMemoryVT() != MVT::v4i8)
6875 return SDValue();
6876
6877 // Avoid generating unaligned loads.
6878 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
6879 return SDValue();
6880
6881 unsigned ExtType;
6882 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6883 ExtType = ISD::SIGN_EXTEND;
6884 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6885 LoadNode->getExtensionType() == ISD::EXTLOAD)
6886 ExtType = ISD::ZERO_EXTEND;
6887 else
6888 return SDValue();
6889
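  // Load the four bytes with a single 32-bit (s-register) load, move the
  // value into a vector lane and widen it with NEON extends, rather than
  // issuing four separate byte loads.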
6890 SDValue Load = DAG.getLoad(VT: MVT::f32, dl: DL, Chain: LoadNode->getChain(),
6891 Ptr: LoadNode->getBasePtr(), PtrInfo: MachinePointerInfo());
6892 SDValue Chain = Load.getValue(R: 1);
6893 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v2f32, Operand: Load);
6894 SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Vec);
6895 SDValue Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v8i16, Operand: BC);
6896 Ext = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v4i16, N1: Ext,
6897 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6898 if (VT == MVT::v4i32)
6899 Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v4i32, Operand: Ext);
6900 return DAG.getMergeValues(Ops: {Ext, Chain}, dl: DL);
6901}
6902
6903SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
6904 SelectionDAG &DAG) const {
6905 SDLoc DL(Op);
6906 SDValue Vec = Op.getOperand(i: 0);
6907 SDValue Mask = Op.getOperand(i: 1);
6908 SDValue Passthru = Op.getOperand(i: 2);
6909 EVT VecVT = Vec.getValueType();
6910 EVT MaskVT = Mask.getValueType();
6911 EVT ElmtVT = VecVT.getVectorElementType();
6912 const bool IsFixedLength = VecVT.isFixedLengthVector();
6913 const bool HasPassthru = !Passthru.isUndef();
6914 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
6915 EVT FixedVecVT = MVT::getVectorVT(VT: ElmtVT.getSimpleVT(), NumElements: MinElmts);
6916
6917 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
6918
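  // The lowering is built around the SVE COMPACT instruction, which only
  // operates on 32-bit and 64-bit elements: NEON-sized fixed vectors are
  // first placed in the low bits of an SVE register, narrower elements are
  // widened, and a non-zero passthru is reinstated afterwards with a select
  // over the lanes COMPACT did not fill.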
6919 if (!Subtarget->isSVEAvailable())
6920 return SDValue();
6921
6922 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
6923 return SDValue();
6924
6925 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
6926 if (MinElmts != 2 && MinElmts != 4)
6927 return SDValue();
6928
6929 // We can use the SVE register containing the NEON vector in its lowest bits.
6930 if (IsFixedLength) {
6931 EVT ScalableVecVT =
6932 MVT::getScalableVectorVT(VT: ElmtVT.getSimpleVT(), NumElements: MinElmts);
6933 EVT ScalableMaskVT = MVT::getScalableVectorVT(
6934 VT: MaskVT.getVectorElementType().getSimpleVT(), NumElements: MinElmts);
6935
6936 Vec = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: ScalableVecVT,
6937 N1: DAG.getUNDEF(VT: ScalableVecVT), N2: Vec,
6938 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6939 Mask = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: ScalableMaskVT,
6940 N1: DAG.getUNDEF(VT: ScalableMaskVT), N2: Mask,
6941 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6942 Mask = DAG.getNode(Opcode: ISD::TRUNCATE, DL,
6943 VT: ScalableMaskVT.changeVectorElementType(EltVT: MVT::i1), Operand: Mask);
6944 Passthru = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: ScalableVecVT,
6945 N1: DAG.getUNDEF(VT: ScalableVecVT), N2: Passthru,
6946 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6947
6948 VecVT = Vec.getValueType();
6949 MaskVT = Mask.getValueType();
6950 }
6951
  // Get a legal type for the compact instruction.
6953 EVT ContainerVT = getSVEContainerType(ContentTy: VecVT);
6954 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
6955
6956 // Convert to i32 or i64 for smaller types, as these are the only supported
6957 // sizes for compact.
6958 if (ContainerVT != VecVT) {
6959 Vec = DAG.getBitcast(VT: CastVT, V: Vec);
6960 Vec = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ContainerVT, Operand: Vec);
6961 }
6962
6963 SDValue Compressed = DAG.getNode(
6964 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: Vec.getValueType(),
6965 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_compact, DL, VT: MVT::i64), N2: Mask, N3: Vec);
6966
  // COMPACT fills the remaining lanes with 0s, so if our passthru is all 0s,
  // there is nothing to do here.
6968 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(N: Passthru.getNode())) {
6969 SDValue Offset = DAG.getNode(
6970 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64,
6971 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64), N2: Mask, N3: Mask);
6972
6973 SDValue IndexMask = DAG.getNode(
6974 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MaskVT,
6975 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_whilelo, DL, VT: MVT::i64),
6976 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64), N3: Offset);
6977
6978 Compressed =
6979 DAG.getNode(Opcode: ISD::VSELECT, DL, VT: VecVT, N1: IndexMask, N2: Compressed, N3: Passthru);
6980 }
6981
6982 // Extracting from a legal SVE type before truncating produces better code.
6983 if (IsFixedLength) {
6984 Compressed = DAG.getNode(
6985 Opcode: ISD::EXTRACT_SUBVECTOR, DL,
6986 VT: FixedVecVT.changeVectorElementType(EltVT: ContainerVT.getVectorElementType()),
6987 N1: Compressed, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6988 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
6989 VecVT = FixedVecVT;
6990 }
6991
6992 // If we changed the element type before, we need to convert it back.
6993 if (ContainerVT != VecVT) {
6994 Compressed = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: CastVT, Operand: Compressed);
6995 Compressed = DAG.getBitcast(VT: VecVT, V: Compressed);
6996 }
6997
6998 return Compressed;
6999}
7000
7001// Generate SUBS and CSEL for integer abs.
7002SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7003 MVT VT = Op.getSimpleValueType();
7004
7005 if (VT.isVector())
7006 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABS_MERGE_PASSTHRU);
7007
7008 SDLoc DL(Op);
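  // Compute Neg = 0 - X, compare X against zero, and select X when the PL
  // (positive or zero) condition holds, otherwise Neg; this typically
  // selects down to a cmp + cneg pair.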
7009 SDValue Neg = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
7010 N2: Op.getOperand(i: 0));
7011 // Generate SUBS & CSEL.
7012 SDValue Cmp =
7013 DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32),
7014 N1: Op.getOperand(i: 0), N2: DAG.getConstant(Val: 0, DL, VT));
7015 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: Op.getOperand(i: 0), N2: Neg,
7016 N3: DAG.getConstant(Val: AArch64CC::PL, DL, VT: MVT::i32),
7017 N4: Cmp.getValue(R: 1));
7018}
7019
7020static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7021 SDValue Chain = Op.getOperand(i: 0);
7022 SDValue Cond = Op.getOperand(i: 1);
7023 SDValue Dest = Op.getOperand(i: 2);
7024
7025 AArch64CC::CondCode CC;
7026 if (SDValue Cmp = emitConjunction(DAG, Val: Cond, OutCC&: CC)) {
7027 SDLoc DL(Op);
7028 SDValue CCVal = DAG.getConstant(Val: CC, DL, VT: MVT::i32);
7029 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
7030 N4: Cmp);
7031 }
7032
7033 return SDValue();
7034}
7035
// Treat FSHR with constant shifts as a legal operation, otherwise it is
// expanded. FSHL is converted to FSHR before deciding what to do with it.
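// For example, fshl(x, y, 3) on i32 becomes fshr(x, y, 29), which can then be
// matched as a single EXTR instruction for scalar types.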
7038static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7039 SDValue Shifts = Op.getOperand(i: 2);
  // Check if the shift amount is a constant and normalise it to
  // [0, SrcBitLen). If the opcode is FSHL, convert it to FSHR.
7042 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Val&: Shifts)) {
7043 SDLoc DL(Op);
7044 MVT VT = Op.getSimpleValueType();
7045 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7046
7047 if (Op.getOpcode() == ISD::FSHL) {
7048 if (NewShiftNo == 0)
7049 return Op.getOperand(i: 0);
7050
7051 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7052 return DAG.getNode(
7053 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
7054 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
7055 }
7056
7057 if (Op.getOpcode() == ISD::FSHR) {
7058 if (NewShiftNo == 0)
7059 return Op.getOperand(i: 1);
7060
7061 if (ShiftNo->getZExtValue() == NewShiftNo)
7062 return Op;
7063
7064 // Rewrite using the normalised shift amount.
7065 return DAG.getNode(
7066 Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1),
7067 N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType()));
7068 }
7069 }
7070
7071 return SDValue();
7072}
7073
7074static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7075 SDValue X = Op.getOperand(i: 0);
7076 EVT XScalarTy = X.getValueType();
7077 SDValue Exp = Op.getOperand(i: 1);
7078
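  // Lower FLDEXP via the SVE fscale intrinsic: insert X and Exp into lane 0
  // of scalable vectors, apply fscale under an all-true predicate, and
  // extract the scalar result again. Half-precision inputs are widened to
  // f32 first and rounded back at the end.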
7079 SDLoc DL(Op);
7080 EVT XVT, ExpVT;
7081 switch (Op.getSimpleValueType().SimpleTy) {
7082 default:
7083 return SDValue();
7084 case MVT::bf16:
7085 case MVT::f16:
7086 X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X);
7087 [[fallthrough]];
7088 case MVT::f32:
7089 XVT = MVT::nxv4f32;
7090 ExpVT = MVT::nxv4i32;
7091 break;
7092 case MVT::f64:
7093 XVT = MVT::nxv2f64;
7094 ExpVT = MVT::nxv2i64;
7095 Exp = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Exp);
7096 break;
7097 }
7098
7099 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
7100 SDValue VX =
7101 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: XVT, N1: DAG.getUNDEF(VT: XVT), N2: X, N3: Zero);
7102 SDValue VExp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpVT,
7103 N1: DAG.getUNDEF(VT: ExpVT), N2: Exp, N3: Zero);
7104 SDValue VPg = getPTrue(DAG, DL, VT: XVT.changeVectorElementType(EltVT: MVT::i1),
7105 Pattern: AArch64SVEPredPattern::all);
7106 SDValue FScale =
7107 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: XVT,
7108 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_fscale, DL, VT: MVT::i64),
7109 N2: VPg, N3: VX, N4: VExp);
7110 SDValue Final =
7111 DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: X.getValueType(), N1: FScale, N2: Zero);
7112 if (X.getValueType() != XScalarTy)
7113 Final = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: XScalarTy, N1: Final,
7114 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(Op), /*isTarget=*/true));
7115 return Final;
7116}
7117
7118SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7119 SelectionDAG &DAG) const {
7120 return Op.getOperand(i: 0);
7121}
7122
7123SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7124 SelectionDAG &DAG) const {
7125 SDValue Chain = Op.getOperand(i: 0);
7126 SDValue Trmp = Op.getOperand(i: 1); // trampoline, >=32 bytes
7127 SDValue FPtr = Op.getOperand(i: 2); // nested function
7128 SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value
7129
7130 const Value *TrmpAddr = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
7131
7132 // ldr NestReg, .+16
7133 // ldr x17, .+20
7134 // br x17
7135 // .word 0
7136 // .nest: .qword nest
7137 // .fptr: .qword fptr
7138 SDValue OutChains[5];
7139
7140 const Function *Func =
7141 cast<Function>(Val: cast<SrcValueSDNode>(Val: Op.getOperand(i: 5))->getValue());
7142 CallingConv::ID CC = Func->getCallingConv();
7143 unsigned NestReg;
7144
7145 switch (CC) {
7146 default:
7147 NestReg = 0x0f; // X15
7148 break;
7149 case CallingConv::ARM64EC_Thunk_X64:
7150 // Must be kept in sync with AArch64CallingConv.td
7151 NestReg = 0x04; // X4
7152 break;
7153 }
7154
7155 const char FptrReg = 0x11; // X17
7156
7157 SDValue Addr = Trmp;
7158
7159 SDLoc DL(Op);
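  // The 32-bit constants below are fixed instruction encodings: an LDR
  // (literal) loading the nest value into NestReg, an LDR (literal) loading
  // the function pointer into x17, and "br x17" (0xd61f0220). The two data
  // quadwords written afterwards live at offsets 16 and 24 from the start of
  // the trampoline.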
7160 OutChains[0] = DAG.getStore(
7161 Chain, dl: DL, Val: DAG.getConstant(Val: 0x58000080u | NestReg, DL, VT: MVT::i32), Ptr: Addr,
7162 PtrInfo: MachinePointerInfo(TrmpAddr));
7163
7164 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7165 N2: DAG.getConstant(Val: 4, DL, VT: MVT::i64));
7166 OutChains[1] = DAG.getStore(
7167 Chain, dl: DL, Val: DAG.getConstant(Val: 0x580000b0u | FptrReg, DL, VT: MVT::i32), Ptr: Addr,
7168 PtrInfo: MachinePointerInfo(TrmpAddr, 4));
7169
7170 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7171 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
7172 OutChains[2] =
7173 DAG.getStore(Chain, dl: DL, Val: DAG.getConstant(Val: 0xd61f0220u, DL, VT: MVT::i32), Ptr: Addr,
7174 PtrInfo: MachinePointerInfo(TrmpAddr, 8));
7175
7176 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7177 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i64));
7178 OutChains[3] =
7179 DAG.getStore(Chain, dl: DL, Val: Nest, Ptr: Addr, PtrInfo: MachinePointerInfo(TrmpAddr, 16));
7180
7181 Addr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7182 N2: DAG.getConstant(Val: 24, DL, VT: MVT::i64));
7183 OutChains[4] =
7184 DAG.getStore(Chain, dl: DL, Val: FPtr, Ptr: Addr, PtrInfo: MachinePointerInfo(TrmpAddr, 24));
7185
7186 SDValue StoreToken = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: OutChains);
7187
7188 SDValue EndOfTrmp = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Trmp,
7189 N2: DAG.getConstant(Val: 12, DL, VT: MVT::i64));
7190
7191 // Call clear cache on the trampoline instructions.
7192 return DAG.getNode(Opcode: ISD::CLEAR_CACHE, DL, VT: MVT::Other, N1: StoreToken, N2: Trmp,
7193 N3: EndOfTrmp);
7194}
7195
7196SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7197 SelectionDAG &DAG) const {
7198 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7199 LLVM_DEBUG(Op.dump());
7200
7201 switch (Op.getOpcode()) {
7202 default:
7203 llvm_unreachable("unimplemented operand");
7204 return SDValue();
7205 case ISD::BITCAST:
7206 return LowerBITCAST(Op, DAG);
7207 case ISD::GlobalAddress:
7208 return LowerGlobalAddress(Op, DAG);
7209 case ISD::GlobalTLSAddress:
7210 return LowerGlobalTLSAddress(Op, DAG);
7211 case ISD::PtrAuthGlobalAddress:
7212 return LowerPtrAuthGlobalAddress(Op, DAG);
7213 case ISD::ADJUST_TRAMPOLINE:
7214 return LowerADJUST_TRAMPOLINE(Op, DAG);
7215 case ISD::INIT_TRAMPOLINE:
7216 return LowerINIT_TRAMPOLINE(Op, DAG);
7217 case ISD::SETCC:
7218 case ISD::STRICT_FSETCC:
7219 case ISD::STRICT_FSETCCS:
7220 return LowerSETCC(Op, DAG);
7221 case ISD::SETCCCARRY:
7222 return LowerSETCCCARRY(Op, DAG);
7223 case ISD::BRCOND:
7224 return LowerBRCOND(Op, DAG);
7225 case ISD::BR_CC:
7226 return LowerBR_CC(Op, DAG);
7227 case ISD::SELECT:
7228 return LowerSELECT(Op, DAG);
7229 case ISD::SELECT_CC:
7230 return LowerSELECT_CC(Op, DAG);
7231 case ISD::JumpTable:
7232 return LowerJumpTable(Op, DAG);
7233 case ISD::BR_JT:
7234 return LowerBR_JT(Op, DAG);
7235 case ISD::BRIND:
7236 return LowerBRIND(Op, DAG);
7237 case ISD::ConstantPool:
7238 return LowerConstantPool(Op, DAG);
7239 case ISD::BlockAddress:
7240 return LowerBlockAddress(Op, DAG);
7241 case ISD::VASTART:
7242 return LowerVASTART(Op, DAG);
7243 case ISD::VACOPY:
7244 return LowerVACOPY(Op, DAG);
7245 case ISD::VAARG:
7246 return LowerVAARG(Op, DAG);
7247 case ISD::UADDO_CARRY:
7248 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: false /*unsigned*/);
7249 case ISD::USUBO_CARRY:
7250 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: false /*unsigned*/);
7251 case ISD::SADDO_CARRY:
7252 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: true /*signed*/);
7253 case ISD::SSUBO_CARRY:
7254 return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: true /*signed*/);
7255 case ISD::SADDO:
7256 case ISD::UADDO:
7257 case ISD::SSUBO:
7258 case ISD::USUBO:
7259 case ISD::SMULO:
7260 case ISD::UMULO:
7261 return LowerXALUO(Op, DAG);
7262 case ISD::FADD:
7263 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FADD_PRED);
7264 case ISD::FSUB:
7265 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSUB_PRED);
7266 case ISD::FMUL:
7267 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMUL_PRED);
7268 case ISD::FMA:
7269 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMA_PRED);
7270 case ISD::FDIV:
7271 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FDIV_PRED);
7272 case ISD::FNEG:
7273 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEG_MERGE_PASSTHRU);
7274 case ISD::FCEIL:
7275 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FCEIL_MERGE_PASSTHRU);
7276 case ISD::FFLOOR:
7277 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7278 case ISD::FNEARBYINT:
7279 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7280 case ISD::FRINT:
7281 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FRINT_MERGE_PASSTHRU);
7282 case ISD::FROUND:
7283 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUND_MERGE_PASSTHRU);
7284 case ISD::FROUNDEVEN:
7285 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7286 case ISD::FTRUNC:
7287 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7288 case ISD::FSQRT:
7289 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSQRT_MERGE_PASSTHRU);
7290 case ISD::FABS:
7291 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FABS_MERGE_PASSTHRU);
7292 case ISD::FP_ROUND:
7293 case ISD::STRICT_FP_ROUND:
7294 return LowerFP_ROUND(Op, DAG);
7295 case ISD::FP_EXTEND:
7296 case ISD::STRICT_FP_EXTEND:
7297 return LowerFP_EXTEND(Op, DAG);
7298 case ISD::FRAMEADDR:
7299 return LowerFRAMEADDR(Op, DAG);
7300 case ISD::SPONENTRY:
7301 return LowerSPONENTRY(Op, DAG);
7302 case ISD::RETURNADDR:
7303 return LowerRETURNADDR(Op, DAG);
7304 case ISD::ADDROFRETURNADDR:
7305 return LowerADDROFRETURNADDR(Op, DAG);
7306 case ISD::CONCAT_VECTORS:
7307 return LowerCONCAT_VECTORS(Op, DAG);
7308 case ISD::INSERT_VECTOR_ELT:
7309 return LowerINSERT_VECTOR_ELT(Op, DAG);
7310 case ISD::EXTRACT_VECTOR_ELT:
7311 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7312 case ISD::BUILD_VECTOR:
7313 return LowerBUILD_VECTOR(Op, DAG);
7314 case ISD::ZERO_EXTEND_VECTOR_INREG:
7315 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7316 case ISD::VECTOR_SHUFFLE:
7317 return LowerVECTOR_SHUFFLE(Op, DAG);
7318 case ISD::SPLAT_VECTOR:
7319 return LowerSPLAT_VECTOR(Op, DAG);
7320 case ISD::EXTRACT_SUBVECTOR:
7321 return LowerEXTRACT_SUBVECTOR(Op, DAG);
7322 case ISD::INSERT_SUBVECTOR:
7323 return LowerINSERT_SUBVECTOR(Op, DAG);
7324 case ISD::SDIV:
7325 case ISD::UDIV:
7326 return LowerDIV(Op, DAG);
7327 case ISD::SMIN:
7328 case ISD::UMIN:
7329 case ISD::SMAX:
7330 case ISD::UMAX:
7331 return LowerMinMax(Op, DAG);
7332 case ISD::SRA:
7333 case ISD::SRL:
7334 case ISD::SHL:
7335 return LowerVectorSRA_SRL_SHL(Op, DAG);
7336 case ISD::SHL_PARTS:
7337 case ISD::SRL_PARTS:
7338 case ISD::SRA_PARTS:
7339 return LowerShiftParts(Op, DAG);
7340 case ISD::CTPOP:
7341 case ISD::PARITY:
7342 return LowerCTPOP_PARITY(Op, DAG);
7343 case ISD::FCOPYSIGN:
7344 return LowerFCOPYSIGN(Op, DAG);
7345 case ISD::OR:
7346 return LowerVectorOR(Op, DAG);
7347 case ISD::XOR:
7348 return LowerXOR(Op, DAG);
7349 case ISD::PREFETCH:
7350 return LowerPREFETCH(Op, DAG);
7351 case ISD::SINT_TO_FP:
7352 case ISD::UINT_TO_FP:
7353 case ISD::STRICT_SINT_TO_FP:
7354 case ISD::STRICT_UINT_TO_FP:
7355 return LowerINT_TO_FP(Op, DAG);
7356 case ISD::FP_TO_SINT:
7357 case ISD::FP_TO_UINT:
7358 case ISD::STRICT_FP_TO_SINT:
7359 case ISD::STRICT_FP_TO_UINT:
7360 return LowerFP_TO_INT(Op, DAG);
7361 case ISD::FP_TO_SINT_SAT:
7362 case ISD::FP_TO_UINT_SAT:
7363 return LowerFP_TO_INT_SAT(Op, DAG);
7364 case ISD::FSINCOS:
7365 return LowerFSINCOS(Op, DAG);
7366 case ISD::GET_ROUNDING:
7367 return LowerGET_ROUNDING(Op, DAG);
7368 case ISD::SET_ROUNDING:
7369 return LowerSET_ROUNDING(Op, DAG);
7370 case ISD::GET_FPMODE:
7371 return LowerGET_FPMODE(Op, DAG);
7372 case ISD::SET_FPMODE:
7373 return LowerSET_FPMODE(Op, DAG);
7374 case ISD::RESET_FPMODE:
7375 return LowerRESET_FPMODE(Op, DAG);
7376 case ISD::MUL:
7377 return LowerMUL(Op, DAG);
7378 case ISD::MULHS:
7379 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHS_PRED);
7380 case ISD::MULHU:
7381 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHU_PRED);
7382 case ISD::INTRINSIC_W_CHAIN:
7383 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7384 case ISD::INTRINSIC_WO_CHAIN:
7385 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7386 case ISD::INTRINSIC_VOID:
7387 return LowerINTRINSIC_VOID(Op, DAG);
7388 case ISD::ATOMIC_STORE:
7389 if (cast<MemSDNode>(Val&: Op)->getMemoryVT() == MVT::i128) {
7390 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7391 return LowerStore128(Op, DAG);
7392 }
7393 return SDValue();
7394 case ISD::STORE:
7395 return LowerSTORE(Op, DAG);
7396 case ISD::MSTORE:
7397 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7398 case ISD::MGATHER:
7399 return LowerMGATHER(Op, DAG);
7400 case ISD::MSCATTER:
7401 return LowerMSCATTER(Op, DAG);
7402 case ISD::VECREDUCE_SEQ_FADD:
7403 return LowerVECREDUCE_SEQ_FADD(ScalarOp: Op, DAG);
7404 case ISD::VECREDUCE_ADD:
7405 case ISD::VECREDUCE_AND:
7406 case ISD::VECREDUCE_OR:
7407 case ISD::VECREDUCE_XOR:
7408 case ISD::VECREDUCE_SMAX:
7409 case ISD::VECREDUCE_SMIN:
7410 case ISD::VECREDUCE_UMAX:
7411 case ISD::VECREDUCE_UMIN:
7412 case ISD::VECREDUCE_FADD:
7413 case ISD::VECREDUCE_FMAX:
7414 case ISD::VECREDUCE_FMIN:
7415 case ISD::VECREDUCE_FMAXIMUM:
7416 case ISD::VECREDUCE_FMINIMUM:
7417 return LowerVECREDUCE(Op, DAG);
7418 case ISD::ATOMIC_LOAD_AND:
7419 return LowerATOMIC_LOAD_AND(Op, DAG);
7420 case ISD::DYNAMIC_STACKALLOC:
7421 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7422 case ISD::VSCALE:
7423 return LowerVSCALE(Op, DAG);
7424 case ISD::VECTOR_COMPRESS:
7425 return LowerVECTOR_COMPRESS(Op, DAG);
7426 case ISD::ANY_EXTEND:
7427 case ISD::SIGN_EXTEND:
7428 case ISD::ZERO_EXTEND:
7429 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7430 case ISD::ADDRSPACECAST:
7431 return LowerADDRSPACECAST(Op, DAG);
7432 case ISD::SIGN_EXTEND_INREG: {
7433 // Only custom lower when ExtraVT has a legal byte based element type.
7434 EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
7435 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7436 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7437 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7438 return SDValue();
7439
7440 return LowerToPredicatedOp(Op, DAG,
7441 NewOp: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7442 }
7443 case ISD::TRUNCATE:
7444 return LowerTRUNCATE(Op, DAG);
7445 case ISD::MLOAD:
7446 return LowerMLOAD(Op, DAG);
7447 case ISD::LOAD:
7448 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
7449 OverrideNEON: !Subtarget->isNeonAvailable()))
7450 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7451 return LowerLOAD(Op, DAG);
7452 case ISD::ADD:
7453 case ISD::AND:
7454 case ISD::SUB:
7455 return LowerToScalableOp(Op, DAG);
7456 case ISD::FMAXIMUM:
7457 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAX_PRED);
7458 case ISD::FMAXNUM:
7459 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAXNM_PRED);
7460 case ISD::FMINIMUM:
7461 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMIN_PRED);
7462 case ISD::FMINNUM:
7463 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMINNM_PRED);
7464 case ISD::VSELECT:
7465 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7466 case ISD::ABS:
7467 return LowerABS(Op, DAG);
7468 case ISD::ABDS:
7469 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDS_PRED);
7470 case ISD::ABDU:
7471 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDU_PRED);
7472 case ISD::AVGFLOORS:
7473 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDS_PRED);
7474 case ISD::AVGFLOORU:
7475 return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDU_PRED);
7476 case ISD::AVGCEILS:
7477 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDS_PRED);
7478 case ISD::AVGCEILU:
7479 return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDU_PRED);
7480 case ISD::BITREVERSE:
7481 return LowerBitreverse(Op, DAG);
7482 case ISD::BSWAP:
7483 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BSWAP_MERGE_PASSTHRU);
7484 case ISD::CTLZ:
7485 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTLZ_MERGE_PASSTHRU);
7486 case ISD::CTTZ:
7487 return LowerCTTZ(Op, DAG);
7488 case ISD::VECTOR_SPLICE:
7489 return LowerVECTOR_SPLICE(Op, DAG);
7490 case ISD::VECTOR_DEINTERLEAVE:
7491 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7492 case ISD::VECTOR_INTERLEAVE:
7493 return LowerVECTOR_INTERLEAVE(Op, DAG);
7494 case ISD::GET_ACTIVE_LANE_MASK:
7495 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7496 case ISD::LRINT:
7497 case ISD::LLRINT:
7498 if (Op.getValueType().isVector())
7499 return LowerVectorXRINT(Op, DAG);
7500 [[fallthrough]];
7501 case ISD::LROUND:
7502 case ISD::LLROUND: {
7503 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7504 Op.getOperand(0).getValueType() == MVT::bf16) &&
7505 "Expected custom lowering of rounding operations only for f16");
7506 SDLoc DL(Op);
7507 SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Op.getOperand(i: 0));
7508 return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(), Operand: Ext);
7509 }
7510 case ISD::STRICT_LROUND:
7511 case ISD::STRICT_LLROUND:
7512 case ISD::STRICT_LRINT:
7513 case ISD::STRICT_LLRINT: {
7514 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7515 Op.getOperand(1).getValueType() == MVT::bf16) &&
7516 "Expected custom lowering of rounding operations only for f16");
7517 SDLoc DL(Op);
7518 SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other},
7519 Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)});
7520 return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {Op.getValueType(), MVT::Other},
7521 Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)});
7522 }
7523 case ISD::WRITE_REGISTER: {
7524 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7525 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7526 SDLoc DL(Op);
7527
7528 SDValue Chain = Op.getOperand(i: 0);
7529 SDValue SysRegName = Op.getOperand(i: 1);
7530 std::pair<SDValue, SDValue> Pair =
7531 DAG.SplitScalar(N: Op.getOperand(i: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64);
7532
7533 // chain = MSRR(chain, sysregname, lo, hi)
7534 SDValue Result = DAG.getNode(Opcode: AArch64ISD::MSRR, DL, VT: MVT::Other, N1: Chain,
7535 N2: SysRegName, N3: Pair.first, N4: Pair.second);
7536
7537 return Result;
7538 }
7539 case ISD::FSHL:
7540 case ISD::FSHR:
7541 return LowerFunnelShift(Op, DAG);
7542 case ISD::FLDEXP:
7543 return LowerFLDEXP(Op, DAG);
7544 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7545 return LowerVECTOR_HISTOGRAM(Op, DAG);
7546 case ISD::PARTIAL_REDUCE_SMLA:
7547 case ISD::PARTIAL_REDUCE_UMLA:
7548 case ISD::PARTIAL_REDUCE_SUMLA:
7549 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7550 }
7551}
7552
7553bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7554 return !Subtarget->useSVEForFixedLengthVectors();
7555}
7556
7557bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7558 EVT VT, bool OverrideNEON) const {
7559 if (!VT.isFixedLengthVector() || !VT.isSimple())
7560 return false;
7561
7562 // Don't use SVE for vectors we cannot scalarize if required.
7563 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7564 // Fixed length predicates should be promoted to i8.
7565 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7566 case MVT::i1:
7567 default:
7568 return false;
7569 case MVT::i8:
7570 case MVT::i16:
7571 case MVT::i32:
7572 case MVT::i64:
7573 case MVT::f16:
7574 case MVT::f32:
7575 case MVT::f64:
7576 break;
7577 }
7578
7579 // NEON-sized vectors can be emulated using SVE instructions.
7580 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7581 return Subtarget->isSVEorStreamingSVEAvailable();
7582
7583 // Ensure NEON MVTs only belong to a single register class.
7584 if (VT.getFixedSizeInBits() <= 128)
7585 return false;
7586
7587 // Ensure wider than NEON code generation is enabled.
7588 if (!Subtarget->useSVEForFixedLengthVectors())
7589 return false;
7590
7591 // Don't use SVE for types that don't fit.
7592 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7593 return false;
7594
7595 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7596 // the base fixed length SVE support in place.
7597 if (!VT.isPow2VectorType())
7598 return false;
7599
7600 return true;
7601}
7602
7603//===----------------------------------------------------------------------===//
7604// Calling Convention Implementation
7605//===----------------------------------------------------------------------===//
7606
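// Return the intrinsic ID for an ISD::INTRINSIC_WO_CHAIN node, or
// Intrinsic::not_intrinsic if the node is not a recognized intrinsic call.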
7607static unsigned getIntrinsicID(const SDNode *N) {
7608 unsigned Opcode = N->getOpcode();
7609 switch (Opcode) {
7610 default:
7611 return Intrinsic::not_intrinsic;
7612 case ISD::INTRINSIC_WO_CHAIN: {
7613 unsigned IID = N->getConstantOperandVal(Num: 0);
7614 if (IID < Intrinsic::num_intrinsics)
7615 return IID;
7616 return Intrinsic::not_intrinsic;
7617 }
7618 }
7619}
7620
7621bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7622 SDValue N1) const {
7623 if (!N0.hasOneUse())
7624 return false;
7625
7626 unsigned IID = getIntrinsicID(N: N1.getNode());
7627 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7628 if (IID == Intrinsic::aarch64_neon_umull ||
7629 N1.getOpcode() == AArch64ISD::UMULL ||
7630 IID == Intrinsic::aarch64_neon_smull ||
7631 N1.getOpcode() == AArch64ISD::SMULL)
7632 return N0.getOpcode() != ISD::ADD;
7633
7634 return true;
7635}
7636
7637/// Selects the correct CCAssignFn for a given CallingConvention value.
7638CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7639 bool IsVarArg) const {
7640 switch (CC) {
7641 default:
7642 reportFatalUsageError(reason: "unsupported calling convention");
7643 case CallingConv::GHC:
7644 return CC_AArch64_GHC;
7645 case CallingConv::PreserveNone:
7646 // The VarArg implementation makes assumptions about register
7647 // argument passing that do not hold for preserve_none, so we
7648 // instead fall back to C argument passing.
7649 // The non-vararg case is handled in the CC function itself.
7650 if (!IsVarArg)
7651 return CC_AArch64_Preserve_None;
7652 [[fallthrough]];
7653 case CallingConv::C:
7654 case CallingConv::Fast:
7655 case CallingConv::PreserveMost:
7656 case CallingConv::PreserveAll:
7657 case CallingConv::CXX_FAST_TLS:
7658 case CallingConv::Swift:
7659 case CallingConv::SwiftTail:
7660 case CallingConv::Tail:
7661 case CallingConv::GRAAL:
7662 if (Subtarget->isTargetWindows()) {
7663 if (IsVarArg) {
7664 if (Subtarget->isWindowsArm64EC())
7665 return CC_AArch64_Arm64EC_VarArg;
7666 return CC_AArch64_Win64_VarArg;
7667 }
7668 return CC_AArch64_Win64PCS;
7669 }
7670 if (!Subtarget->isTargetDarwin())
7671 return CC_AArch64_AAPCS;
7672 if (!IsVarArg)
7673 return CC_AArch64_DarwinPCS;
7674 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7675 : CC_AArch64_DarwinPCS_VarArg;
7676 case CallingConv::Win64:
7677 if (IsVarArg) {
7678 if (Subtarget->isWindowsArm64EC())
7679 return CC_AArch64_Arm64EC_VarArg;
7680 return CC_AArch64_Win64_VarArg;
7681 }
7682 return CC_AArch64_Win64PCS;
7683 case CallingConv::CFGuard_Check:
7684 if (Subtarget->isWindowsArm64EC())
7685 return CC_AArch64_Arm64EC_CFGuard_Check;
7686 return CC_AArch64_Win64_CFGuard_Check;
7687 case CallingConv::AArch64_VectorCall:
7688 case CallingConv::AArch64_SVE_VectorCall:
7689 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7690 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1:
7691 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7692 return CC_AArch64_AAPCS;
7693 case CallingConv::ARM64EC_Thunk_X64:
7694 return CC_AArch64_Arm64EC_Thunk;
7695 case CallingConv::ARM64EC_Thunk_Native:
7696 return CC_AArch64_Arm64EC_Thunk_Native;
7697 }
7698}
7699
7700CCAssignFn *
7701AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7702 switch (CC) {
7703 default:
7704 return RetCC_AArch64_AAPCS;
7705 case CallingConv::ARM64EC_Thunk_X64:
7706 return RetCC_AArch64_Arm64EC_Thunk;
7707 case CallingConv::CFGuard_Check:
7708 if (Subtarget->isWindowsArm64EC())
7709 return RetCC_AArch64_Arm64EC_CFGuard_Check;
7710 return RetCC_AArch64_AAPCS;
7711 }
7712}
7713
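// Returns true if a value of this type is passed in a floating-point/SIMD
// register: fixed-length vectors and non-scalable floating-point types.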
7714static bool isPassedInFPR(EVT VT) {
7715 return VT.isFixedLengthVector() ||
7716 (VT.isFloatingPoint() && !VT.isScalableVector());
7717}
7718
7719SDValue AArch64TargetLowering::LowerFormalArguments(
7720 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7721 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7722 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7723 MachineFunction &MF = DAG.getMachineFunction();
7724 const Function &F = MF.getFunction();
7725 MachineFrameInfo &MFI = MF.getFrameInfo();
7726 bool IsWin64 =
7727 Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
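// Arm64EC entry thunks and Arm64EC variadic calls locate their stack
// arguments relative to x4 rather than sp (see the stack-argument handling
// below).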
7728 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7729 (isVarArg && Subtarget->isWindowsArm64EC());
7730 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7731
7732 SmallVector<ISD::OutputArg, 4> Outs;
7733 GetReturnInfo(CC: CallConv, ReturnType: F.getReturnType(), attr: F.getAttributes(), Outs,
7734 TLI: DAG.getTargetLoweringInfo(), DL: MF.getDataLayout());
7735 if (any_of(Range&: Outs, P: [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7736 FuncInfo->setIsSVECC(true);
7737
7738 // Assign locations to all of the incoming arguments.
7739 SmallVector<CCValAssign, 16> ArgLocs;
7740 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7741
7742 // At this point, Ins[].VT may already be promoted to i32. To correctly
7743 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7744 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7745 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7746 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7747 // LocVT.
7748 unsigned NumArgs = Ins.size();
7749 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7750 unsigned CurArgIdx = 0;
7751 bool UseVarArgCC = false;
7752 if (IsWin64)
7753 UseVarArgCC = isVarArg;
7754
7755 CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: UseVarArgCC);
7756
7757 for (unsigned i = 0; i != NumArgs; ++i) {
7758 MVT ValVT = Ins[i].VT;
7759 if (Ins[i].isOrigArg()) {
7760 std::advance(i&: CurOrigArg, n: Ins[i].getOrigArgIndex() - CurArgIdx);
7761 CurArgIdx = Ins[i].getOrigArgIndex();
7762
7763 // Get type of the original argument.
7764 EVT ActualVT = getValueType(DL: DAG.getDataLayout(), Ty: CurOrigArg->getType(),
7765 /*AllowUnknown*/ true);
7766 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7767 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7768 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7769 ValVT = MVT::i8;
7770 else if (ActualMVT == MVT::i16)
7771 ValVT = MVT::i16;
7772 }
7773 bool Res =
7774 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7775 assert(!Res && "Call operand has unhandled type");
7776 (void)Res;
7777 }
7778
7779 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
7780 bool IsLocallyStreaming =
7781 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7782 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7783 SDValue Glue = Chain.getValue(R: 1);
7784
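// An indirectly passed argument with multiple parts (e.g. an SVE tuple) uses
// a single CCValAssign for all of its Ins entries; ExtraArgLocs tracks the
// difference so that ArgLocs[i - ExtraArgLocs] stays in sync with Ins[i].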
7785 unsigned ExtraArgLocs = 0;
7786 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7787 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7788
7789 if (Ins[i].Flags.isByVal()) {
7790 // Byval is used for HFAs in the PCS, but the system should work in a
7791 // non-compliant manner for larger structs.
7792 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
7793 int Size = Ins[i].Flags.getByValSize();
7794 unsigned NumRegs = (Size + 7) / 8;
7795
7796 // FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types.
7798 unsigned FrameIdx =
7799 MFI.CreateFixedObject(Size: 8 * NumRegs, SPOffset: VA.getLocMemOffset(), IsImmutable: false);
7800 SDValue FrameIdxN = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT);
7801 InVals.push_back(Elt: FrameIdxN);
7802
7803 continue;
7804 }
7805
7806 if (Ins[i].Flags.isSwiftAsync())
7807 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7808
7809 SDValue ArgValue;
7810 if (VA.isRegLoc()) {
7811 // Arguments stored in registers.
7812 EVT RegVT = VA.getLocVT();
7813 const TargetRegisterClass *RC;
7814
7815 if (RegVT == MVT::i32)
7816 RC = &AArch64::GPR32RegClass;
7817 else if (RegVT == MVT::i64)
7818 RC = &AArch64::GPR64RegClass;
7819 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7820 RC = &AArch64::FPR16RegClass;
7821 else if (RegVT == MVT::f32)
7822 RC = &AArch64::FPR32RegClass;
7823 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7824 RC = &AArch64::FPR64RegClass;
7825 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7826 RC = &AArch64::FPR128RegClass;
7827 else if (RegVT.isScalableVector() &&
7828 RegVT.getVectorElementType() == MVT::i1) {
7829 FuncInfo->setIsSVECC(true);
7830 RC = &AArch64::PPRRegClass;
7831 } else if (RegVT == MVT::aarch64svcount) {
7832 FuncInfo->setIsSVECC(true);
7833 RC = &AArch64::PPRRegClass;
7834 } else if (RegVT.isScalableVector()) {
7835 FuncInfo->setIsSVECC(true);
7836 RC = &AArch64::ZPRRegClass;
7837 } else
7838 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7839
7840 // Transform the arguments in physical registers into virtual ones.
7841 Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC);
7842
7843 if (IsLocallyStreaming) {
7844 // LocallyStreamingFunctions must insert the SMSTART in the correct
7845 // position, so we use Glue to ensure no instructions can be scheduled
7846 // between the chain of:
7847 // t0: ch,glue = EntryNode
7848 // t1: res,ch,glue = CopyFromReg
7849 // ...
7850 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7851 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7852 // ^^^^^^
7853 // This will be the new Chain/Root node.
7854 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT, Glue);
7855 Glue = ArgValue.getValue(R: 2);
7856 if (isPassedInFPR(VT: ArgValue.getValueType())) {
7857 ArgValue =
7858 DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
7859 VTList: DAG.getVTList(VT1: ArgValue.getValueType(), VT2: MVT::Glue),
7860 Ops: {ArgValue, Glue});
7861 Glue = ArgValue.getValue(R: 1);
7862 }
7863 } else
7864 ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT);
7865
7866 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7867 // to 64 bits. Insert an assert[sz]ext to capture this, then
7868 // truncate to the right size.
7869 switch (VA.getLocInfo()) {
7870 default:
7871 llvm_unreachable("Unknown loc info!");
7872 case CCValAssign::Full:
7873 break;
7874 case CCValAssign::Indirect:
7875 assert(
7876 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7877 "Indirect arguments should be scalable on most subtargets");
7878 break;
7879 case CCValAssign::BCvt:
7880 ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: ArgValue);
7881 break;
7882 case CCValAssign::AExt:
7883 case CCValAssign::SExt:
7884 case CCValAssign::ZExt:
7885 break;
7886 case CCValAssign::AExtUpper:
7887 ArgValue = DAG.getNode(Opcode: ISD::SRL, DL, VT: RegVT, N1: ArgValue,
7888 N2: DAG.getConstant(Val: 32, DL, VT: RegVT));
7889 ArgValue = DAG.getZExtOrTrunc(Op: ArgValue, DL, VT: VA.getValVT());
7890 break;
7891 }
7892 } else { // VA.isRegLoc()
7893 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7894 unsigned ArgOffset = VA.getLocMemOffset();
7895 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7896 ? VA.getLocVT().getSizeInBits()
7897 : VA.getValVT().getSizeInBits()) / 8;
7898
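// On big-endian targets, arguments smaller than 8 bytes are passed in the
// high-order bytes of their stack slot, so bias the offset accordingly.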
7899 uint32_t BEAlign = 0;
7900 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7901 !Ins[i].Flags.isInConsecutiveRegs())
7902 BEAlign = 8 - ArgSize;
7903
7904 SDValue FIN;
7905 MachinePointerInfo PtrInfo;
7906 if (StackViaX4) {
7907 // In both the ARM64EC varargs convention and the thunk convention,
7908 // arguments on the stack are accessed relative to x4, not sp. In
7909 // the thunk convention, there's an additional offset of 32 bytes
7910 // to account for the shadow store.
7911 unsigned ObjOffset = ArgOffset + BEAlign;
7912 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7913 ObjOffset += 32;
7914 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
7915 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
7916 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val,
7917 N2: DAG.getConstant(Val: ObjOffset, DL, VT: MVT::i64));
7918 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7919 } else {
7920 int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset + BEAlign, IsImmutable: true);
7921
7922 // Create load nodes to retrieve arguments from the stack.
7923 FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout()));
7924 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7925 }
7926
// For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
7928 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7929 MVT MemVT = VA.getValVT();
7930
7931 switch (VA.getLocInfo()) {
7932 default:
7933 break;
7934 case CCValAssign::Trunc:
7935 case CCValAssign::BCvt:
7936 MemVT = VA.getLocVT();
7937 break;
7938 case CCValAssign::Indirect:
7939 assert((VA.getValVT().isScalableVector() ||
7940 Subtarget->isWindowsArm64EC()) &&
7941 "Indirect arguments should be scalable on most subtargets");
7942 MemVT = VA.getLocVT();
7943 break;
7944 case CCValAssign::SExt:
7945 ExtType = ISD::SEXTLOAD;
7946 break;
7947 case CCValAssign::ZExt:
7948 ExtType = ISD::ZEXTLOAD;
7949 break;
7950 case CCValAssign::AExt:
7951 ExtType = ISD::EXTLOAD;
7952 break;
7953 }
7954
7955 ArgValue = DAG.getExtLoad(ExtType, dl: DL, VT: VA.getLocVT(), Chain, Ptr: FIN, PtrInfo,
7956 MemVT);
7957 }
7958
7959 if (VA.getLocInfo() == CCValAssign::Indirect) {
7960 assert((VA.getValVT().isScalableVT() ||
7961 Subtarget->isWindowsArm64EC()) &&
7962 "Indirect arguments should be scalable on most subtargets");
7963
7964 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7965 unsigned NumParts = 1;
7966 if (Ins[i].Flags.isInConsecutiveRegs()) {
7967 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7968 ++NumParts;
7969 }
7970
7971 MVT PartLoad = VA.getValVT();
7972 SDValue Ptr = ArgValue;
7973
7974 // Ensure we generate all loads for each tuple part, whilst updating the
7975 // pointer after each load correctly using vscale.
7976 while (NumParts > 0) {
7977 ArgValue = DAG.getLoad(VT: PartLoad, dl: DL, Chain, Ptr, PtrInfo: MachinePointerInfo());
7978 InVals.push_back(Elt: ArgValue);
7979 NumParts--;
7980 if (NumParts > 0) {
7981 SDValue BytesIncrement;
7982 if (PartLoad.isScalableVector()) {
7983 BytesIncrement = DAG.getVScale(
7984 DL, VT: Ptr.getValueType(),
7985 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7986 } else {
7987 BytesIncrement = DAG.getConstant(
7988 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7989 VT: Ptr.getValueType());
7990 }
7991 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
7992 N2: BytesIncrement, Flags: SDNodeFlags::NoUnsignedWrap);
7993 ExtraArgLocs++;
7994 i++;
7995 }
7996 }
7997 } else {
7998 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7999 ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: ArgValue.getValueType(),
8000 N1: ArgValue, N2: DAG.getValueType(MVT::i32));
8001
8002 // i1 arguments are zero-extended to i8 by the caller. Emit a
8003 // hint to reflect this.
8004 if (Ins[i].isOrigArg()) {
8005 Argument *OrigArg = F.getArg(i: Ins[i].getOrigArgIndex());
8006 if (OrigArg->getType()->isIntegerTy(Bitwidth: 1)) {
8007 if (!Ins[i].Flags.isZExt()) {
8008 ArgValue = DAG.getNode(Opcode: AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8009 VT: ArgValue.getValueType(), Operand: ArgValue);
8010 }
8011 }
8012 }
8013
8014 InVals.push_back(Elt: ArgValue);
8015 }
8016 }
8017 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8018
8019 // Insert the SMSTART if this is a locally streaming function and
8020 // make sure it is Glued to the last CopyFromReg value.
8021 if (IsLocallyStreaming) {
8022 SDValue PStateSM;
8023 if (Attrs.hasStreamingCompatibleInterface()) {
8024 PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64);
8025 Register Reg = MF.getRegInfo().createVirtualRegister(
8026 RegClass: getRegClassFor(VT: PStateSM.getValueType().getSimpleVT()));
8027 FuncInfo->setPStateSMReg(Reg);
8028 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: PStateSM);
8029 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
8030 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
8031 } else
8032 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue,
8033 Condition: AArch64SME::Always);
8034
// Thread each argument value through a CopyToReg/CopyFromReg pair on the new
// chain, so that uses of the arguments are ordered after the SMSTART (i.e.
// its chain result is actually used).
8037 for (unsigned I=0; I<InVals.size(); ++I) {
8038 Register Reg = MF.getRegInfo().createVirtualRegister(
8039 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
8040 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: InVals[I]);
8041 InVals[I] = DAG.getCopyFromReg(Chain, dl: DL, Reg,
8042 VT: InVals[I].getValueType());
8043 }
8044 }
8045
8046 // varargs
8047 if (isVarArg) {
8048 if (DAG.getMachineFunction().getFrameInfo().hasVAStart()) {
8049 if (!Subtarget->isTargetDarwin() || IsWin64) {
8050 // The AAPCS variadic function ABI is identical to the non-variadic
8051 // one. As a result there may be more arguments in registers and we
8052 // should save them for future reference.
8053 // Win64 variadic functions also pass arguments in registers, but all
8054 // float arguments are passed in integer registers.
8055 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8056 }
8057
8058 // This will point to the next argument passed via stack.
8059 unsigned VarArgsOffset = CCInfo.getStackSize();
8060 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8061 VarArgsOffset =
8062 alignTo(Value: VarArgsOffset, Align: Subtarget->isTargetILP32() ? 4 : 8);
8063 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8064 FuncInfo->setVarArgsStackIndex(
8065 MFI.CreateFixedObject(Size: 4, SPOffset: VarArgsOffset, IsImmutable: true));
8066 }
8067
8068 if (MFI.hasMustTailInVarArgFunc()) {
8069 SmallVector<MVT, 2> RegParmTypes;
8070 RegParmTypes.push_back(Elt: MVT::i64);
8071 RegParmTypes.push_back(Elt: MVT::f128);
8072 // Compute the set of forwarded registers. The rest are scratch.
8073 SmallVectorImpl<ForwardedRegister> &Forwards =
8074 FuncInfo->getForwardedMustTailRegParms();
8075 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8076 Fn: CC_AArch64_AAPCS);
8077
// Conservatively forward X8, since it might be used for an aggregate return.
8079 if (!CCInfo.isAllocated(Reg: AArch64::X8)) {
8080 Register X8VReg = MF.addLiveIn(PReg: AArch64::X8, RC: &AArch64::GPR64RegClass);
8081 Forwards.push_back(Elt: ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8082 }
8083 }
8084 }
8085
8086 // On Windows, InReg pointers must be returned, so record the pointer in a
8087 // virtual register at the start of the function so it can be returned in the
8088 // epilogue.
8089 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8090 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8091 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8092 Ins[I].Flags.isInReg()) &&
8093 Ins[I].Flags.isSRet()) {
8094 assert(!FuncInfo->getSRetReturnReg());
8095
8096 MVT PtrTy = getPointerTy(DL: DAG.getDataLayout());
8097 Register Reg =
8098 MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy));
8099 FuncInfo->setSRetReturnReg(Reg);
8100
8101 SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg, N: InVals[I]);
8102 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, N1: Copy, N2: Chain);
8103 break;
8104 }
8105 }
8106 }
8107
8108 unsigned StackArgSize = CCInfo.getStackSize();
8109 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8110 if (DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt)) {
8111 // This is a non-standard ABI so by fiat I say we're allowed to make full
8112 // use of the stack area to be popped, which must be aligned to 16 bytes in
8113 // any case:
8114 StackArgSize = alignTo(Value: StackArgSize, Align: 16);
8115
8116 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8117 // a multiple of 16.
8118 FuncInfo->setArgumentStackToRestore(StackArgSize);
8119
8120 // This realignment carries over to the available bytes below. Our own
8121 // callers will guarantee the space is free by giving an aligned value to
8122 // CALLSEQ_START.
8123 }
8124 // Even if we're not expected to free up the space, it's useful to know how
8125 // much is there while considering tail calls (because we can reuse it).
8126 FuncInfo->setBytesInStackArgArea(StackArgSize);
8127
8128 if (Subtarget->hasCustomCallingConv())
8129 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8130
// Create a 16-byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a pseudo node.
8133 if (Attrs.hasZAState()) {
8134 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8135 TPIDR2.FrameIndex = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false);
8136 SDValue SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
8137 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
8138
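// Allocate the lazy-save buffer for ZA (SVL x SVL bytes): use the
// ALLOCATE_ZA_BUFFER pseudo where possible, and fall back to a dynamic stack
// allocation when stack probing is required (e.g. on Windows).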
8139 SDValue Buffer;
8140 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8141 Buffer = DAG.getNode(Opcode: AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8142 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), Ops: {Chain, SVL});
8143 } else {
8144 SDValue Size = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: SVL, N2: SVL);
8145 Buffer = DAG.getNode(Opcode: ISD::DYNAMIC_STACKALLOC, DL,
8146 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other),
8147 Ops: {Chain, Size, DAG.getConstant(Val: 1, DL, VT: MVT::i64)});
8148 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
8149 }
8150 Chain = DAG.getNode(
8151 Opcode: AArch64ISD::INIT_TPIDR2OBJ, DL, VTList: DAG.getVTList(VT: MVT::Other),
8152 Ops: {/*Chain*/ Buffer.getValue(R: 1), /*Buffer ptr*/ Buffer.getValue(R: 0)});
8153 } else if (Attrs.hasAgnosticZAInterface()) {
8154 // Call __arm_sme_state_size().
8155 SDValue BufferSize =
8156 DAG.getNode(Opcode: AArch64ISD::GET_SME_SAVE_SIZE, DL,
8157 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), N: Chain);
8158 Chain = BufferSize.getValue(R: 1);
8159
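// Reserve a buffer of the returned size for __arm_sme_save/__arm_sme_restore,
// preferring the ALLOC_SME_SAVE_BUFFER pseudo over a dynamic allocation.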
8160 SDValue Buffer;
8161 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8162 Buffer =
8163 DAG.getNode(Opcode: AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8164 VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), Ops: {Chain, BufferSize});
8165 } else {
8166 // Allocate space dynamically.
8167 Buffer = DAG.getNode(
8168 Opcode: ISD::DYNAMIC_STACKALLOC, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other),
8169 Ops: {Chain, BufferSize, DAG.getConstant(Val: 1, DL, VT: MVT::i64)});
8170 MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr);
8171 }
8172
8173 // Copy the value to a virtual register, and save that in FuncInfo.
8174 Register BufferPtr =
8175 MF.getRegInfo().createVirtualRegister(RegClass: &AArch64::GPR64RegClass);
8176 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8177 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: BufferPtr, N: Buffer);
8178 }
8179
8180 if (CallConv == CallingConv::PreserveNone) {
8181 for (const ISD::InputArg &I : Ins) {
8182 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8183 I.Flags.isSwiftAsync()) {
8184 MachineFunction &MF = DAG.getMachineFunction();
8185 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
8186 MF.getFunction(),
8187 "Swift attributes can't be used with preserve_none",
8188 DL.getDebugLoc()));
8189 break;
8190 }
8191 }
8192 }
8193
8194 return Chain;
8195}
8196
8197void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8198 SelectionDAG &DAG,
8199 const SDLoc &DL,
8200 SDValue &Chain) const {
8201 MachineFunction &MF = DAG.getMachineFunction();
8202 MachineFrameInfo &MFI = MF.getFrameInfo();
8203 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8204 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
8205 Function &F = MF.getFunction();
8206 bool IsWin64 =
8207 Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg());
8208
8209 SmallVector<SDValue, 8> MemOps;
8210
8211 auto GPRArgRegs = AArch64::getGPRArgRegs();
8212 unsigned NumGPRArgRegs = GPRArgRegs.size();
8213 if (Subtarget->isWindowsArm64EC()) {
8214 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8215 // functions.
8216 NumGPRArgRegs = 4;
8217 }
8218 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(Regs: GPRArgRegs);
8219
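// Spill the remaining (unallocated) GPR argument registers so va_arg can find
// them: into a fixed object just below the incoming SP on Win64, otherwise
// into a plain stack object.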
8220 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8221 int GPRIdx = 0;
8222 if (GPRSaveSize != 0) {
8223 if (IsWin64) {
8224 GPRIdx = MFI.CreateFixedObject(Size: GPRSaveSize, SPOffset: -(int)GPRSaveSize, IsImmutable: false);
8225 if (GPRSaveSize & 15)
8226 // The extra size here, if triggered, will always be 8.
8227 MFI.CreateFixedObject(Size: 16 - (GPRSaveSize & 15), SPOffset: -(int)alignTo(Value: GPRSaveSize, Align: 16), IsImmutable: false);
8228 } else
8229 GPRIdx = MFI.CreateStackObject(Size: GPRSaveSize, Alignment: Align(8), isSpillSlot: false);
8230
8231 SDValue FIN;
8232 if (Subtarget->isWindowsArm64EC()) {
8233 // With the Arm64EC ABI, we reserve the save area as usual, but we
8234 // compute its address relative to x4. For a normal AArch64->AArch64
8235 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8236 // different address.
8237 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
8238 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
8239 FIN = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Val,
8240 N2: DAG.getConstant(Val: GPRSaveSize, DL, VT: MVT::i64));
8241 } else {
8242 FIN = DAG.getFrameIndex(FI: GPRIdx, VT: PtrVT);
8243 }
8244
8245 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8246 Register VReg = MF.addLiveIn(PReg: GPRArgRegs[i], RC: &AArch64::GPR64RegClass);
8247 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64);
8248 SDValue Store =
8249 DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
8250 PtrInfo: IsWin64 ? MachinePointerInfo::getFixedStack(
8251 MF, FI: GPRIdx, Offset: (i - FirstVariadicGPR) * 8)
8252 : MachinePointerInfo::getStack(MF, Offset: i * 8));
8253 MemOps.push_back(Elt: Store);
8254 FIN =
8255 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 8, DL, VT: PtrVT));
8256 }
8257 }
8258 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8259 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8260
8261 if (Subtarget->hasFPARMv8() && !IsWin64) {
8262 auto FPRArgRegs = AArch64::getFPRArgRegs();
8263 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8264 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(Regs: FPRArgRegs);
8265
8266 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8267 int FPRIdx = 0;
8268 if (FPRSaveSize != 0) {
8269 FPRIdx = MFI.CreateStackObject(Size: FPRSaveSize, Alignment: Align(16), isSpillSlot: false);
8270
8271 SDValue FIN = DAG.getFrameIndex(FI: FPRIdx, VT: PtrVT);
8272
8273 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8274 Register VReg = MF.addLiveIn(PReg: FPRArgRegs[i], RC: &AArch64::FPR128RegClass);
8275 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::f128);
8276
8277 SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN,
8278 PtrInfo: MachinePointerInfo::getStack(MF, Offset: i * 16));
8279 MemOps.push_back(Elt: Store);
8280 FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN,
8281 N2: DAG.getConstant(Val: 16, DL, VT: PtrVT));
8282 }
8283 }
8284 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8285 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8286 }
8287
8288 if (!MemOps.empty()) {
8289 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
8290 }
8291}
8292
8293/// LowerCallResult - Lower the result values of a call into the
8294/// appropriate copies out of appropriate physical registers.
8295SDValue AArch64TargetLowering::LowerCallResult(
8296 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8297 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8298 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8299 SDValue ThisVal, bool RequiresSMChange) const {
8300 DenseMap<unsigned, SDValue> CopiedRegs;
8301 // Copy all of the result registers out of their specified physreg.
8302 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8303 CCValAssign VA = RVLocs[i];
8304
8305 // Pass 'this' value directly from the argument to return value, to avoid
8306 // reg unit interference
8307 if (i == 0 && isThisReturn) {
8308 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8309 "unexpected return calling convention register assignment");
8310 InVals.push_back(Elt: ThisVal);
8311 continue;
8312 }
8313
8314 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8315 // allows one use of a physreg per block.
8316 SDValue Val = CopiedRegs.lookup(Val: VA.getLocReg());
8317 if (!Val) {
8318 Val =
8319 DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue);
8320 Chain = Val.getValue(R: 1);
8321 InGlue = Val.getValue(R: 2);
8322 CopiedRegs[VA.getLocReg()] = Val;
8323 }
8324
8325 switch (VA.getLocInfo()) {
8326 default:
8327 llvm_unreachable("Unknown loc info!");
8328 case CCValAssign::Full:
8329 break;
8330 case CCValAssign::BCvt:
8331 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val);
8332 break;
8333 case CCValAssign::AExtUpper:
8334 Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: VA.getLocVT(), N1: Val,
8335 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
8336 [[fallthrough]];
8337 case CCValAssign::AExt:
8338 [[fallthrough]];
8339 case CCValAssign::ZExt:
8340 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT: VA.getValVT());
8341 break;
8342 }
8343
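// If the call required a streaming-mode change, results returned in FPRs must
// not be coalesced across the SMSTART/SMSTOP, so wrap them in a barrier node.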
8344 if (RequiresSMChange && isPassedInFPR(VT: VA.getValVT()))
8345 Val = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
8346 VTList: DAG.getVTList(VT1: Val.getValueType(), VT2: MVT::Glue), N: Val);
8347
8348 InVals.push_back(Elt: Val);
8349 }
8350
8351 return Chain;
8352}
8353
8354/// Return true if the calling convention is one that we can guarantee TCO for.
8355static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8356 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8357 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8358}
8359
8360/// Return true if we might ever do TCO for calls with this calling convention.
8361static bool mayTailCallThisCC(CallingConv::ID CC) {
8362 switch (CC) {
8363 case CallingConv::C:
8364 case CallingConv::AArch64_SVE_VectorCall:
8365 case CallingConv::PreserveMost:
8366 case CallingConv::PreserveAll:
8367 case CallingConv::PreserveNone:
8368 case CallingConv::Swift:
8369 case CallingConv::SwiftTail:
8370 case CallingConv::Tail:
8371 case CallingConv::Fast:
8372 return true;
8373 default:
8374 return false;
8375 }
8376}
8377
/// Return true if the calling convention supports varargs.
/// Currently only conventions that pass varargs the way the C calling
/// convention does are eligible.
/// Calling conventions listed in this function must also be properly handled
/// in AArch64Subtarget::isCallingConvWin64.
8383static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8384 switch (CC) {
8385 case CallingConv::C:
8386 case CallingConv::PreserveNone:
// The SVE vector calling convention is only partially supported, but it
// should support named arguments being passed. Any arguments passed as
// varargs are still unsupported.
8390 case CallingConv::AArch64_SVE_VectorCall:
8391 return true;
8392 default:
8393 return false;
8394 }
8395}
8396
8397static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8398 const AArch64Subtarget *Subtarget,
8399 const TargetLowering::CallLoweringInfo &CLI,
8400 CCState &CCInfo) {
8401 const SelectionDAG &DAG = CLI.DAG;
8402 CallingConv::ID CalleeCC = CLI.CallConv;
8403 bool IsVarArg = CLI.IsVarArg;
8404 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8405 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CC: CalleeCC, IsVarArg);
8406
8407 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8408 // for the shadow store.
8409 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8410 CCInfo.AllocateStack(Size: 32, Alignment: Align(16));
8411
8412 unsigned NumArgs = Outs.size();
8413 for (unsigned i = 0; i != NumArgs; ++i) {
8414 MVT ArgVT = Outs[i].VT;
8415 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8416
8417 bool UseVarArgCC = false;
8418 if (IsVarArg) {
8419 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8420 // too, so use the vararg CC to force them to integer registers.
8421 if (IsCalleeWin64) {
8422 UseVarArgCC = true;
8423 } else {
8424 UseVarArgCC = !Outs[i].IsFixed;
8425 }
8426 }
8427
8428 if (!UseVarArgCC) {
8429 // Get type of the original argument.
8430 EVT ActualVT =
8431 TLI.getValueType(DL: DAG.getDataLayout(), Ty: CLI.Args[Outs[i].OrigArgIndex].Ty,
8432 /*AllowUnknown*/ true);
8433 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8434 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8435 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8436 ArgVT = MVT::i8;
8437 else if (ActualMVT == MVT::i16)
8438 ArgVT = MVT::i16;
8439 }
8440
8441 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8442 // argument. This logic should exactly mirror LowerFormalArguments.
8443 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC: CalleeCC, IsVarArg: UseVarArgCC);
8444 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
8445 assert(!Res && "Call operand has unhandled type");
8446 (void)Res;
8447 }
8448}
8449
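// Compute the SME attributes of the caller and callee for a call site. If the
// callee cannot be identified, assume it has no SME attributes.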
8450static SMECallAttrs
8451getSMECallAttrs(const Function &Caller,
8452 const TargetLowering::CallLoweringInfo &CLI) {
8453 if (CLI.CB)
8454 return SMECallAttrs(*CLI.CB);
8455 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val: CLI.Callee))
8456 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol()));
8457 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
8458}
8459
8460bool AArch64TargetLowering::isEligibleForTailCallOptimization(
8461 const CallLoweringInfo &CLI) const {
8462 CallingConv::ID CalleeCC = CLI.CallConv;
8463 if (!mayTailCallThisCC(CC: CalleeCC))
8464 return false;
8465
8466 SDValue Callee = CLI.Callee;
8467 bool IsVarArg = CLI.IsVarArg;
8468 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8469 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8470 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8471 const SelectionDAG &DAG = CLI.DAG;
8472 MachineFunction &MF = DAG.getMachineFunction();
8473 const Function &CallerF = MF.getFunction();
8474 CallingConv::ID CallerCC = CallerF.getCallingConv();
8475
8476 // SME Streaming functions are not eligible for TCO as they may require
8477 // the streaming mode or ZA to be restored after returning from the call.
8478 SMECallAttrs CallAttrs = getSMECallAttrs(Caller: CallerF, CLI);
8479 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
8480 CallAttrs.requiresPreservingAllZAState() ||
8481 CallAttrs.caller().hasStreamingBody())
8482 return false;
8483
8484 // Functions using the C or Fast calling convention that have an SVE signature
8485 // preserve more registers and should assume the SVE_VectorCall CC.
8486 // The check for matching callee-saved regs will determine whether it is
8487 // eligible for TCO.
8488 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
8489 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
8490 CallerCC = CallingConv::AArch64_SVE_VectorCall;
8491
8492 bool CCMatch = CallerCC == CalleeCC;
8493
8494 // When using the Windows calling convention on a non-windows OS, we want
8495 // to back up and restore X18 in such functions; we can't do a tail call
8496 // from those functions.
8497 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
8498 CalleeCC != CallingConv::Win64)
8499 return false;
8500
8501 // Byval parameters hand the function a pointer directly into the stack area
8502 // we want to reuse during a tail call. Working around this *is* possible (see
8503 // X86) but less efficient and uglier in LowerCall.
8504 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8505 e = CallerF.arg_end();
8506 i != e; ++i) {
8507 if (i->hasByValAttr())
8508 return false;
8509
8510 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8511 // In this case, it is necessary to save X0/X1 in the callee and return it
8512 // in X0. Tail call opt may interfere with this, so we disable tail call
8513 // opt when the caller has an "inreg" attribute -- except if the callee
8514 // also has that attribute on the same argument, and the same value is
8515 // passed.
8516 if (i->hasInRegAttr()) {
8517 unsigned ArgIdx = i - CallerF.arg_begin();
8518 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
8519 return false;
8520 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgNo: ArgIdx);
8521 if (!Attrs.hasAttribute(Kind: Attribute::InReg) ||
8522 !Attrs.hasAttribute(Kind: Attribute::StructRet) || !i->hasStructRetAttr() ||
8523 CLI.CB->getArgOperand(i: ArgIdx) != i) {
8524 return false;
8525 }
8526 }
8527 }
8528
8529 if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt))
8530 return CCMatch;
8531
8532 // Externally-defined functions with weak linkage should not be
8533 // tail-called on AArch64 when the OS does not support dynamic
8534 // pre-emption of symbols, as the AAELF spec requires normal calls
8535 // to undefined weak functions to be replaced with a NOP or jump to the
8536 // next instruction. The behaviour of branch instructions in this
8537 // situation (as used for tail calls) is implementation-defined, so we
8538 // cannot rely on the linker replacing the tail call with a return.
8539 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
8540 const GlobalValue *GV = G->getGlobal();
8541 const Triple &TT = getTargetMachine().getTargetTriple();
8542 if (GV->hasExternalWeakLinkage() &&
8543 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8544 return false;
8545 }
8546
8547 // Now we search for cases where we can use a tail call without changing the
8548 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8549 // concept.
8550
8551 // I want anyone implementing a new calling convention to think long and hard
// about this check.
8553 if (IsVarArg && !callConvSupportsVarArgs(CC: CalleeCC))
8554 report_fatal_error(reason: "Unsupported variadic calling convention");
8555
8556 LLVMContext &C = *DAG.getContext();
8557 // Check that the call results are passed in the same way.
8558 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8559 CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg),
8560 CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg)))
8561 return false;
8562 // The callee has to preserve all registers the caller needs to preserve.
8563 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8564 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8565 if (!CCMatch) {
8566 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8567 if (Subtarget->hasCustomCallingConv()) {
8568 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CallerPreserved);
8569 TRI->UpdateCustomCallPreservedMask(MF, Mask: &CalleePreserved);
8570 }
8571 if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved))
8572 return false;
8573 }
8574
8575 // Nothing more to check if the callee is taking no arguments
8576 if (Outs.empty())
8577 return true;
8578
8579 SmallVector<CCValAssign, 16> ArgLocs;
8580 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8581
8582 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
8583
8584 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
// When the call is musttail, additional checks have already been done, so we
// can safely skip this check.
8586 // At least two cases here: if caller is fastcc then we can't have any
8587 // memory arguments (we'd be expected to clean up the stack afterwards). If
8588 // caller is C then we could potentially use its argument area.
8589
8590 // FIXME: for now we take the most conservative of these in both cases:
8591 // disallow all variadic memory operands.
8592 for (const CCValAssign &ArgLoc : ArgLocs)
8593 if (!ArgLoc.isRegLoc())
8594 return false;
8595 }
8596
8597 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8598
8599 // If any of the arguments is passed indirectly, it must be SVE, so the
8600 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
// allocate space on the stack. That is why we check for this explicitly here:
// if so, the call cannot be a tail call.
8603 if (llvm::any_of(Range&: ArgLocs, P: [&](CCValAssign &A) {
8604 assert((A.getLocInfo() != CCValAssign::Indirect ||
8605 A.getValVT().isScalableVector() ||
8606 Subtarget->isWindowsArm64EC()) &&
8607 "Expected value to be scalable");
8608 return A.getLocInfo() == CCValAssign::Indirect;
8609 }))
8610 return false;
8611
8612 // If the stack arguments for this call do not fit into our own save area then
// the call cannot be made a tail call.
8614 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8615 return false;
8616
8617 const MachineRegisterInfo &MRI = MF.getRegInfo();
8618 if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals))
8619 return false;
8620
8621 return true;
8622}
8623
8624SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8625 SelectionDAG &DAG,
8626 MachineFrameInfo &MFI,
8627 int ClobberedFI) const {
8628 SmallVector<SDValue, 8> ArgChains;
8629 int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI);
8630 int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1;
8631
8632 // Include the original chain at the beginning of the list. When this is
8633 // used by target LowerCall hooks, this helps legalize find the
8634 // CALLSEQ_BEGIN node.
8635 ArgChains.push_back(Elt: Chain);
8636
// Add a chain value for each stack argument load that overlaps the clobbered
// frame object.
8638 for (SDNode *U : DAG.getEntryNode().getNode()->users())
8639 if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U))
8640 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr()))
8641 if (FI->getIndex() < 0) {
8642 int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex());
8643 int64_t InLastByte = InFirstByte;
8644 InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1;
8645
8646 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8647 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8648 ArgChains.push_back(Elt: SDValue(L, 1));
8649 }
8650
8651 // Build a tokenfactor for all the chains.
8652 return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains);
8653}
8654
8655bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8656 bool TailCallOpt) const {
8657 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8658 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8659}
8660
8661// Check if the value is zero-extended from i1 to i8
8662static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8663 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8664 if (SizeInBits < 8)
8665 return false;
8666
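// Bits 1-7 of the value must be known zero for it to be a zero-extension of
// an i1 into an i8.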
8667 APInt RequiredZero(SizeInBits, 0xFE);
8668 KnownBits Bits = DAG.computeKnownBits(Op: Arg, Depth: 4);
8669 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8670 return ZExtBool;
8671}
8672
8673void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8674 SDNode *Node) const {
8675 // Live-in physreg copies that are glued to SMSTART are applied as
// implicit-defs in the InstrEmitter. Here we remove them, allowing the
// register allocator to pass call arguments in callee-saved registers without
// inserting extra copies to work around these fake clobbers of
// actually-preserved GPRs.
8679 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8680 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8681 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8682 if (MachineOperand &MO = MI.getOperand(i: I);
8683 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8684 (AArch64::GPR32RegClass.contains(Reg: MO.getReg()) ||
8685 AArch64::GPR64RegClass.contains(Reg: MO.getReg())))
8686 MI.removeOperand(OpNo: I);
8687
8688 // The SVE vector length can change when entering/leaving streaming mode.
8689 // FPMR is set to 0 when entering/leaving streaming mode.
8690 if (MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSM ||
8691 MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSMZA) {
8692 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false,
8693 /*IsImplicit=*/isImp: true));
8694 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: true,
8695 /*IsImplicit=*/isImp: true));
8696 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::FPMR, /*IsDef=*/isDef: true,
8697 /*IsImplicit=*/isImp: true));
8698 }
8699 }
8700
8701 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8702 // have nothing to do with VG, were it not that they are used to materialise a
8703 // frame-address. If they contain a frame-index to a scalable vector, this
8704 // will likely require an ADDVL instruction to materialise the address, thus
8705 // reading VG.
8706 const MachineFunction &MF = *MI.getMF();
8707 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8708 (MI.getOpcode() == AArch64::ADDXri ||
8709 MI.getOpcode() == AArch64::SUBXri)) {
8710 const MachineOperand &MO = MI.getOperand(i: 1);
8711 if (MO.isFI() && MF.getFrameInfo().getStackID(ObjectIdx: MO.getIndex()) ==
8712 TargetStackID::ScalableVector)
8713 MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false,
8714 /*IsImplicit=*/isImp: true));
8715 }
8716}
8717
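// Emit an SMSTART or SMSTOP node to switch the streaming mode. If the switch
// is conditional (Condition != Always), the runtime PSTATE.SM value passed in
// PStateSM decides whether the switch actually takes place.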
8718SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8719 bool Enable, SDValue Chain,
8720 SDValue InGlue,
8721 unsigned Condition,
8722 SDValue PStateSM) const {
8723 MachineFunction &MF = DAG.getMachineFunction();
8724 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8725 FuncInfo->setHasStreamingModeChanges(true);
8726
8727 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8728 SDValue RegMask = DAG.getRegisterMask(RegMask: TRI->getSMStartStopCallPreservedMask());
8729 SDValue MSROp =
8730 DAG.getTargetConstant(Val: (int32_t)AArch64SVCR::SVCRSM, DL, VT: MVT::i32);
8731 SmallVector<SDValue> Ops = {Chain, MSROp};
8732 unsigned Opcode;
8733 if (Condition != AArch64SME::Always) {
8734 SDValue ConditionOp = DAG.getTargetConstant(Val: Condition, DL, VT: MVT::i64);
8735 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
8736 assert(PStateSM && "PStateSM should be defined");
8737 Ops.push_back(Elt: ConditionOp);
8738 Ops.push_back(Elt: PStateSM);
8739 } else {
8740 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8741 }
8742 Ops.push_back(Elt: RegMask);
8743
8744 if (InGlue)
8745 Ops.push_back(Elt: InGlue);
8746
8747 return DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops);
8748}
8749
8750// Emit a call to __arm_sme_save or __arm_sme_restore.
8751static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI,
8752 SelectionDAG &DAG,
8753 AArch64FunctionInfo *Info, SDLoc DL,
8754 SDValue Chain, bool IsSave) {
8755 MachineFunction &MF = DAG.getMachineFunction();
8756 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8757 FuncInfo->setSMESaveBufferUsed();
8758
8759 TargetLowering::ArgListTy Args;
8760 TargetLowering::ArgListEntry Entry;
8761 Entry.Ty = PointerType::getUnqual(C&: *DAG.getContext());
8762 Entry.Node =
8763 DAG.getCopyFromReg(Chain, dl: DL, Reg: Info->getSMESaveBufferAddr(), VT: MVT::i64);
8764 Args.push_back(x: Entry);
8765
8766 SDValue Callee =
8767 DAG.getExternalSymbol(Sym: IsSave ? "__arm_sme_save" : "__arm_sme_restore",
8768 VT: TLI.getPointerTy(DL: DAG.getDataLayout()));
8769 auto *RetTy = Type::getVoidTy(C&: *DAG.getContext());
8770 TargetLowering::CallLoweringInfo CLI(DAG);
8771 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8772 CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, ResultType: RetTy,
8773 Target: Callee, ArgsList: std::move(Args));
8774 return TLI.LowerCallTo(CLI).second;
8775}
8776
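// Determine under which condition the SMSTART/SMSTOP emitted around a call
// must actually execute, based on the streaming interfaces of the caller and
// callee.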
8777static AArch64SME::ToggleCondition
8778getSMToggleCondition(const SMECallAttrs &CallAttrs) {
8779 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
8780 CallAttrs.caller().hasStreamingBody())
8781 return AArch64SME::Always;
8782 if (CallAttrs.callee().hasNonStreamingInterface())
8783 return AArch64SME::IfCallerIsStreaming;
8784 if (CallAttrs.callee().hasStreamingInterface())
8785 return AArch64SME::IfCallerIsNonStreaming;
8786
8787 llvm_unreachable("Unsupported attributes");
8788}
8789
8790/// Check whether a stack argument requires lowering in a tail call.
8791static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
8792 const CCValAssign &VA, SDValue Arg,
8793 ISD::ArgFlagsTy Flags, int CallOffset) {
8794 // FIXME: We should be able to handle this case, but it's not clear how to.
8795 if (Flags.isZExt() || Flags.isSExt())
8796 return true;
8797
8798 for (;;) {
8799 // Look through nodes that don't alter the bits of the incoming value.
8800 unsigned Op = Arg.getOpcode();
8801 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
8802 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
8803 Arg = Arg.getOperand(i: 0);
8804 continue;
8805 }
8806 break;
8807 }
8808
8809 // If the argument is a load from the same immutable stack slot, we can reuse
8810 // it.
8811 if (auto *LoadNode = dyn_cast<LoadSDNode>(Val&: Arg)) {
8812 if (auto *FINode = dyn_cast<FrameIndexSDNode>(Val: LoadNode->getBasePtr())) {
8813 const MachineFrameInfo &MFI = MF.getFrameInfo();
8814 int FI = FINode->getIndex();
8815 if (!MFI.isImmutableObjectIndex(ObjectIdx: FI))
8816 return true;
8817 if (CallOffset != MFI.getObjectOffset(ObjectIdx: FI))
8818 return true;
8819 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
8820 if (SizeInBits / 8 != MFI.getObjectSize(ObjectIdx: FI))
8821 return true;
8822 return false;
8823 }
8824 }
8825
8826 return true;
8827}
8828
8829/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8830/// and add input and output parameter nodes.
8831SDValue
8832AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8833 SmallVectorImpl<SDValue> &InVals) const {
8834 SelectionDAG &DAG = CLI.DAG;
8835 SDLoc &DL = CLI.DL;
8836 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8837 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8838 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8839 SDValue Chain = CLI.Chain;
8840 SDValue Callee = CLI.Callee;
8841 bool &IsTailCall = CLI.IsTailCall;
8842 CallingConv::ID &CallConv = CLI.CallConv;
8843 bool IsVarArg = CLI.IsVarArg;
8844
8845 MachineFunction &MF = DAG.getMachineFunction();
8846 MachineFunction::CallSiteInfo CSInfo;
8847 bool IsThisReturn = false;
8848
8849 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8850 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8851 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8852 bool IsSibCall = false;
8853 bool GuardWithBTI = false;
8854
8855 if (CLI.CB && CLI.CB->hasFnAttr(Kind: Attribute::ReturnsTwice) &&
8856 !Subtarget->noBTIAtReturnTwice()) {
8857 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8858 }
8859
8860 // Analyze operands of the call, assigning locations to each operand.
8861 SmallVector<CCValAssign, 16> ArgLocs;
8862 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8863
8864 if (IsVarArg) {
8865 unsigned NumArgs = Outs.size();
8866
8867 for (unsigned i = 0; i != NumArgs; ++i) {
8868 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8869 report_fatal_error(reason: "Passing SVE types to variadic functions is "
8870 "currently not supported");
8871 }
8872 }
8873
8874 analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo);
8875
8876 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
8877 // Assign locations to each value returned by this call.
8878 SmallVector<CCValAssign, 16> RVLocs;
8879 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8880 *DAG.getContext());
8881 RetCCInfo.AnalyzeCallResult(Ins, Fn: RetCC);
8882
8883 // Check callee args/returns for SVE registers and set calling convention
8884 // accordingly.
8885 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8886 auto HasSVERegLoc = [](CCValAssign &Loc) {
8887 if (!Loc.isRegLoc())
8888 return false;
8889 return AArch64::ZPRRegClass.contains(Reg: Loc.getLocReg()) ||
8890 AArch64::PPRRegClass.contains(Reg: Loc.getLocReg());
8891 };
8892 if (any_of(Range&: RVLocs, P: HasSVERegLoc) || any_of(Range&: ArgLocs, P: HasSVERegLoc))
8893 CallConv = CallingConv::AArch64_SVE_VectorCall;
8894 }
8895
8896 if (IsTailCall) {
8897 // Check if it's really possible to do a tail call.
8898 IsTailCall = isEligibleForTailCallOptimization(CLI);
8899
8900 // A sibling call is one where we're under the usual C ABI and not planning
8901 // to change that but can still do a tail call:
8902 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8903 CallConv != CallingConv::SwiftTail)
8904 IsSibCall = true;
8905
8906 if (IsTailCall)
8907 ++NumTailCalls;
8908 }
8909
8910 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8911 report_fatal_error(reason: "failed to perform tail call elimination on a call "
8912 "site marked musttail");
8913
8914 // Get a count of how many bytes are to be pushed on the stack.
8915 unsigned NumBytes = CCInfo.getStackSize();
8916
8917 if (IsSibCall) {
8918 // Since we're not changing the ABI to make this a tail call, the memory
8919 // operands are already available in the caller's incoming argument space.
8920 NumBytes = 0;
8921 }
8922
8923 // FPDiff is the byte offset of the call's argument area from the callee's.
8924 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8925 // by this amount for a tail call. In a sibling call it must be 0 because the
8926 // caller will deallocate the entire stack and the callee still expects its
8927 // arguments to begin at SP+0. Completely unused for non-tail calls.
8928 int FPDiff = 0;
8929
8930 if (IsTailCall && !IsSibCall) {
8931 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8932
8933 // Since callee will pop argument stack as a tail call, we must keep the
8934 // popped size 16-byte aligned.
8935 NumBytes = alignTo(Value: NumBytes, Align: 16);
8936
8937 // FPDiff will be negative if this tail call requires more space than we
8938 // would automatically have in our incoming argument space. Positive if we
8939 // can actually shrink the stack.
8940 FPDiff = NumReusableBytes - NumBytes;
8941
8942 // Update the required reserved area if this is the tail call requiring the
8943 // most argument stack space.
8944 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8945 FuncInfo->setTailCallReservedStack(-FPDiff);
8946
8947 // The stack pointer must be 16-byte aligned at all times it's used for a
8948 // memory operation, which in practice means at *all* times and in
8949 // particular across call boundaries. Therefore our own arguments started at
8950 // a 16-byte aligned SP and the delta applied for the tail call should
8951 // satisfy the same constraint.
8952 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8953 }
8954
8955 // Determine whether we need any streaming mode changes.
8956 SMECallAttrs CallAttrs = getSMECallAttrs(Caller: MF.getFunction(), CLI);
8957
8958 auto DescribeCallsite =
8959 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
8960 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8961 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee))
8962 R << ore::NV("Callee", ES->getSymbol());
8963 else if (CLI.CB && CLI.CB->getCalledFunction())
8964 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8965 else
8966 R << "unknown callee";
8967 R << "'";
8968 return R;
8969 };
8970
8971 bool RequiresLazySave = CallAttrs.requiresLazySave();
8972 bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
8973 if (RequiresLazySave) {
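    // Per the SME lazy-save ABI (illustrative summary): the first 8 bytes of
    // the TPIDR2 block hold the ZA save-buffer pointer (filled in when the
    // object is allocated), and the 16-bit field at offset 8 holds the number
    // of save slices. The code below writes that field from RDSVL #1 and then
    // points TPIDR2_EL0 at the block, so a callee that needs ZA can spill our
    // contents via __arm_tpidr2_save.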
8974 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8975 MachinePointerInfo MPI =
8976 MachinePointerInfo::getStack(MF, Offset: TPIDR2.FrameIndex);
8977 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8978 FI: TPIDR2.FrameIndex,
8979 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
8980 SDValue NumZaSaveSlicesAddr =
8981 DAG.getNode(Opcode: ISD::ADD, DL, VT: TPIDR2ObjAddr.getValueType(), N1: TPIDR2ObjAddr,
8982 N2: DAG.getConstant(Val: 8, DL, VT: TPIDR2ObjAddr.getValueType()));
8983 SDValue NumZaSaveSlices = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64,
8984 Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32));
8985 Chain = DAG.getTruncStore(Chain, dl: DL, Val: NumZaSaveSlices, Ptr: NumZaSaveSlicesAddr,
8986 PtrInfo: MPI, SVT: MVT::i16);
8987 Chain = DAG.getNode(
8988 Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Chain,
8989 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32),
8990 N3: TPIDR2ObjAddr);
8991 OptimizationRemarkEmitter ORE(&MF.getFunction());
8992 ORE.emit(RemarkBuilder: [&]() {
8993 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8994 CLI.CB)
8995 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8996 &MF.getFunction());
8997 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8998 });
8999 } else if (RequiresSaveAllZA) {
9000 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9001 "Cannot share state that may not exist");
9002 Chain = emitSMEStateSaveRestore(TLI: *this, DAG, Info: FuncInfo, DL, Chain,
9003 /*IsSave=*/true);
9004 }
9005
9006 SDValue PStateSM;
9007 bool RequiresSMChange = CallAttrs.requiresSMChange();
9008 if (RequiresSMChange) {
9009 if (CallAttrs.caller().hasStreamingInterfaceOrBody())
9010 PStateSM = DAG.getConstant(Val: 1, DL, VT: MVT::i64);
9011 else if (CallAttrs.caller().hasNonStreamingInterface())
9012 PStateSM = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
9013 else
9014 PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64);
9015 OptimizationRemarkEmitter ORE(&MF.getFunction());
9016 ORE.emit(RemarkBuilder: [&]() {
9017 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9018 CLI.CB)
9019 : OptimizationRemarkAnalysis("sme", "SMETransition",
9020 &MF.getFunction());
9021 DescribeCallsite(R) << " requires a streaming mode transition";
9022 return R;
9023 });
9024 }
9025
9026 SDValue ZTFrameIdx;
9027 MachineFrameInfo &MFI = MF.getFrameInfo();
9028 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9029
9030 // If the caller has ZT0 state which will not be preserved by the callee,
9031 // spill ZT0 before the call.
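  // ZT0 is 512 bits wide, hence the 64-byte spill slot created below; the
  // SAVE_ZT/RESTORE_ZT pseudos are expected to expand to "str zt0"/"ldr zt0"
  // against that slot (illustrative; the exact expansion lives elsewhere).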
9032 if (ShouldPreserveZT0) {
9033 unsigned ZTObj = MFI.CreateSpillStackObject(Size: 64, Alignment: Align(16));
9034 ZTFrameIdx = DAG.getFrameIndex(
9035 FI: ZTObj,
9036 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
9037
9038 Chain = DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other),
9039 Ops: {Chain, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx});
9040 }
9041
9042  // If the caller shares ZT0 but the callee does not share ZA, we need to
9043  // stop PSTATE.ZA before the call if there is no lazy-save active.
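  // Illustrative: the SMSTOP node built below becomes a plain "smstop za",
  // leaving PSTATE.SM untouched.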
9044 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9045 assert((!DisableZA || !RequiresLazySave) &&
9046 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9047
9048 if (DisableZA)
9049 Chain = DAG.getNode(
9050 Opcode: AArch64ISD::SMSTOP, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N1: Chain,
9051 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
9052
9053 // Adjust the stack pointer for the new arguments...
9054 // These operations are automatically eliminated by the prolog/epilog pass
9055 if (!IsSibCall)
9056 Chain = DAG.getCALLSEQ_START(Chain, InSize: IsTailCall ? 0 : NumBytes, OutSize: 0, DL);
9057
9058 SDValue StackPtr = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP,
9059 VT: getPointerTy(DL: DAG.getDataLayout()));
9060
9061 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
9062 SmallSet<unsigned, 8> RegsUsed;
9063 SmallVector<SDValue, 8> MemOpChains;
9064 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
9065
9066 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9067 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9068 for (const auto &F : Forwards) {
9069 SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: F.VReg, VT: F.VT);
9070 RegsToPass.emplace_back(Args: F.PReg, Args&: Val);
9071 }
9072 }
9073
9074 // Walk the register/memloc assignments, inserting copies/loads.
9075 unsigned ExtraArgLocs = 0;
9076 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9077 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9078 SDValue Arg = OutVals[i];
9079 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9080
9081 // Promote the value if needed.
9082 switch (VA.getLocInfo()) {
9083 default:
9084 llvm_unreachable("Unknown loc info!");
9085 case CCValAssign::Full:
9086 break;
9087 case CCValAssign::SExt:
9088 Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9089 break;
9090 case CCValAssign::ZExt:
9091 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9092 break;
9093 case CCValAssign::AExt:
9094 if (Outs[i].ArgVT == MVT::i1) {
9095        // AAPCS requires i1 to be zero-extended to 8 bits by the caller.
9096 //
9097 // Check if we actually have to do this, because the value may
9098 // already be zero-extended.
9099 //
9100 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9101 // and rely on DAGCombiner to fold this, because the following
9102 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9103 //
9104 // (ext (zext x)) -> (zext x)
9105 //
9106 // This will give us (zext i32), which we cannot remove, so
9107 // try to check this beforehand.
9108 if (!checkZExtBool(Arg, DAG)) {
9109 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg);
9110 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i8, Operand: Arg);
9111 }
9112 }
9113 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9114 break;
9115 case CCValAssign::AExtUpper:
9116 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9117 Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9118 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
9119 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
9120 break;
9121 case CCValAssign::BCvt:
9122 Arg = DAG.getBitcast(VT: VA.getLocVT(), V: Arg);
9123 break;
9124 case CCValAssign::Trunc:
9125 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9126 break;
9127 case CCValAssign::FPExt:
9128 Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9129 break;
9130 case CCValAssign::Indirect:
9131 bool isScalable = VA.getValVT().isScalableVT();
9132 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9133 "Indirect arguments should be scalable on most subtargets");
9134
9135 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9136 uint64_t PartSize = StoreSize;
9137 unsigned NumParts = 1;
9138 if (Outs[i].Flags.isInConsecutiveRegs()) {
9139 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9140 ++NumParts;
9141 StoreSize *= NumParts;
9142 }
9143
9144 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(Context&: *DAG.getContext());
9145 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9146 MachineFrameInfo &MFI = MF.getFrameInfo();
9147 int FI = MFI.CreateStackObject(Size: StoreSize, Alignment, isSpillSlot: false);
9148 if (isScalable)
9149 MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalableVector);
9150
9151 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9152 SDValue Ptr = DAG.getFrameIndex(
9153 FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
9154 SDValue SpillSlot = Ptr;
9155
9156 // Ensure we generate all stores for each tuple part, whilst updating the
9157 // pointer after each store correctly using vscale.
9158 while (NumParts) {
9159 SDValue Store = DAG.getStore(Chain, dl: DL, Val: OutVals[i], Ptr, PtrInfo: MPI);
9160 MemOpChains.push_back(Elt: Store);
9161
9162 NumParts--;
9163 if (NumParts > 0) {
9164 SDValue BytesIncrement;
9165 if (isScalable) {
9166 BytesIncrement = DAG.getVScale(
9167 DL, VT: Ptr.getValueType(),
9168 MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9169 } else {
9170 BytesIncrement = DAG.getConstant(
9171 Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9172 VT: Ptr.getValueType());
9173 }
9174 MPI = MachinePointerInfo(MPI.getAddrSpace());
9175 Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr,
9176 N2: BytesIncrement, Flags: SDNodeFlags::NoUnsignedWrap);
9177 ExtraArgLocs++;
9178 i++;
9179 }
9180 }
9181
9182 Arg = SpillSlot;
9183 break;
9184 }
9185
9186 if (VA.isRegLoc()) {
9187 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9188 Outs[0].VT == MVT::i64) {
9189 assert(VA.getLocVT() == MVT::i64 &&
9190 "unexpected calling convention register assignment");
9191 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9192 "unexpected use of 'returned'");
9193 IsThisReturn = true;
9194 }
9195 if (RegsUsed.count(V: VA.getLocReg())) {
9196 // If this register has already been used then we're trying to pack
9197 // parts of an [N x i32] into an X-register. The extension type will
9198 // take care of putting the two halves in the right place but we have to
9199 // combine them.
9200 SDValue &Bits =
9201 llvm::find_if(Range&: RegsToPass,
9202 P: [=](const std::pair<unsigned, SDValue> &Elt) {
9203 return Elt.first == VA.getLocReg();
9204 })
9205 ->second;
9206 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
9207 // Call site info is used for function's parameter entry value
9208 // tracking. For now we track only simple cases when parameter
9209 // is transferred through whole register.
9210 llvm::erase_if(C&: CSInfo.ArgRegPairs,
9211 P: [&VA](MachineFunction::ArgRegPair ArgReg) {
9212 return ArgReg.Reg == VA.getLocReg();
9213 });
9214 } else {
9215 // Add an extra level of indirection for streaming mode changes by
9216 // using a pseudo copy node that cannot be rematerialised between a
9217 // smstart/smstop and the call by the simple register coalescer.
9218 if (RequiresSMChange && isPassedInFPR(VT: Arg.getValueType()))
9219 Arg = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
9220 VTList: DAG.getVTList(VT1: Arg.getValueType(), VT2: MVT::Glue), N: Arg);
9221 RegsToPass.emplace_back(Args: VA.getLocReg(), Args&: Arg);
9222 RegsUsed.insert(V: VA.getLocReg());
9223 const TargetOptions &Options = DAG.getTarget().Options;
9224 if (Options.EmitCallSiteInfo)
9225 CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i);
9226 }
9227 } else {
9228 assert(VA.isMemLoc());
9229
9230 SDValue DstAddr;
9231 MachinePointerInfo DstInfo;
9232
9233 // FIXME: This works on big-endian for composite byvals, which are the
9234      // common case. It should also work for fundamental types.
9235 uint32_t BEAlign = 0;
9236 unsigned OpSize;
9237 if (VA.getLocInfo() == CCValAssign::Indirect ||
9238 VA.getLocInfo() == CCValAssign::Trunc)
9239 OpSize = VA.getLocVT().getFixedSizeInBits();
9240 else
9241 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9242 : VA.getValVT().getSizeInBits();
9243 OpSize = (OpSize + 7) / 8;
9244 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9245 !Flags.isInConsecutiveRegs()) {
9246 if (OpSize < 8)
9247 BEAlign = 8 - OpSize;
9248 }
9249 unsigned LocMemOffset = VA.getLocMemOffset();
9250 int32_t Offset = LocMemOffset + BEAlign;
9251
9252 if (IsTailCall) {
9253        // When the outgoing argument area lines up exactly with the caller's
9254        // (FPDiff == 0) and a stack argument is passed down intact, reuse it.
9255 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, CallOffset: Offset))
9256 continue;
9257
9258 Offset = Offset + FPDiff;
9259 int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true);
9260
9261 DstAddr = DAG.getFrameIndex(FI, VT: PtrVT);
9262 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9263
9264 // Make sure any stack arguments overlapping with where we're storing
9265 // are loaded before this eventual operation. Otherwise they'll be
9266 // clobbered.
9267 Chain = addTokenForArgument(Chain, DAG, MFI&: MF.getFrameInfo(), ClobberedFI: FI);
9268 } else {
9269 SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL);
9270
9271 DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff);
9272 DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset);
9273 }
9274
9275 if (Outs[i].Flags.isByVal()) {
9276 SDValue SizeNode =
9277 DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i64);
9278 SDValue Cpy = DAG.getMemcpy(
9279 Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode,
9280 Alignment: Outs[i].Flags.getNonZeroByValAlign(),
9281 /*isVol = */ false, /*AlwaysInline = */ false,
9282 /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo, SrcPtrInfo: MachinePointerInfo());
9283
9284 MemOpChains.push_back(Elt: Cpy);
9285 } else {
9286 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
9287 // promoted to a legal register type i32, we should truncate Arg back to
9288 // i1/i8/i16.
9289 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9290 VA.getValVT() == MVT::i16)
9291 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Arg);
9292
9293 SDValue Store = DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo);
9294 MemOpChains.push_back(Elt: Store);
9295 }
9296 }
9297 }
9298
9299 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9300 !(CLI.CB && CLI.CB->isMustTailCall())) {
9301 SDValue ParamPtr = StackPtr;
9302 if (IsTailCall) {
9303 // Create a dummy object at the top of the stack that can be used to get
9304 // the SP after the epilogue
9305 int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: FPDiff, IsImmutable: true);
9306 ParamPtr = DAG.getFrameIndex(FI, VT: PtrVT);
9307 }
9308
9309 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9310 // describing the argument list. x4 contains the address of the
9311 // first stack parameter. x5 contains the size in bytes of all parameters
9312 // passed on the stack.
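    // Illustrative: even when every argument fits in registers, x4 still
    // points at (what would be) the first stack slot and x5 carries 0.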
9313 RegsToPass.emplace_back(Args: AArch64::X4, Args&: ParamPtr);
9314 RegsToPass.emplace_back(Args: AArch64::X5,
9315 Args: DAG.getConstant(Val: NumBytes, DL, VT: MVT::i64));
9316 }
9317
9318 if (!MemOpChains.empty())
9319 Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains);
9320
9321 SDValue InGlue;
9322 if (RequiresSMChange) {
9323 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9324 Chain = DAG.getNode(Opcode: AArch64ISD::VG_SAVE, DL,
9325 VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N: Chain);
9326 InGlue = Chain.getValue(R: 1);
9327 }
9328
9329 SDValue NewChain = changeStreamingMode(
9330 DAG, DL, Enable: CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9331 Condition: getSMToggleCondition(CallAttrs), PStateSM);
9332 Chain = NewChain.getValue(R: 0);
9333 InGlue = NewChain.getValue(R: 1);
9334 }
9335
9336 // Build a sequence of copy-to-reg nodes chained together with token chain
9337 // and flag operands which copy the outgoing args into the appropriate regs.
9338 for (auto &RegToPass : RegsToPass) {
9339 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first,
9340 N: RegToPass.second, Glue: InGlue);
9341 InGlue = Chain.getValue(R: 1);
9342 }
9343
9344 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9345 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9346 // node so that legalize doesn't hack it.
9347 const GlobalValue *CalledGlobal = nullptr;
9348 unsigned OpFlags = 0;
9349 if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) {
9350 CalledGlobal = G->getGlobal();
9351 OpFlags = Subtarget->classifyGlobalFunctionReference(GV: CalledGlobal,
9352 TM: getTargetMachine());
9353 if (OpFlags & AArch64II::MO_GOT) {
9354 Callee = DAG.getTargetGlobalAddress(GV: CalledGlobal, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
9355 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
9356 } else {
9357 const GlobalValue *GV = G->getGlobal();
9358 Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags);
9359 }
9360 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) {
9361 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9362 Subtarget->isTargetMachO()) ||
9363 MF.getFunction().getParent()->getRtLibUseGOT();
9364 const char *Sym = S->getSymbol();
9365 if (UseGot) {
9366 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: AArch64II::MO_GOT);
9367 Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee);
9368 } else {
9369 Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: 0);
9370 }
9371 }
9372
9373 // We don't usually want to end the call-sequence here because we would tidy
9374  // the frame up *after* the call; however, in the ABI-changing tail-call case
9375 // we've carefully laid out the parameters so that when sp is reset they'll be
9376 // in the correct location.
9377 if (IsTailCall && !IsSibCall) {
9378 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL);
9379 InGlue = Chain.getValue(R: 1);
9380 }
9381
9382 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9383
9384 std::vector<SDValue> Ops;
9385 Ops.push_back(x: Chain);
9386 Ops.push_back(x: Callee);
9387
9388 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9389 // be expanded to the call, directly followed by a special marker sequence and
9390 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
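  // The expansion is roughly (illustrative):
  //   bl   <callee>
  //   mov  x29, x29        ; marker recognised by the ObjC runtime
  //   bl   <retainRV/claimRV runtime function>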
9391 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) {
9392 assert(!IsTailCall &&
9393 "tail calls cannot be marked with clang.arc.attachedcall");
9394 Opc = AArch64ISD::CALL_RVMARKER;
9395
9396 // Add a target global address for the retainRV/claimRV runtime function
9397 // just before the call target.
9398 Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB);
9399 auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL, VT: PtrVT);
9400 Ops.insert(position: Ops.begin() + 1, x: GA);
9401
9402 // We may or may not need to emit both the marker and the retain/claim call.
9403 // Tell the pseudo expansion using an additional boolean op.
9404 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CB: CLI.CB);
9405 SDValue DoEmitMarker =
9406 DAG.getTargetConstant(Val: ShouldEmitMarker, DL, VT: MVT::i32);
9407 Ops.insert(position: Ops.begin() + 2, x: DoEmitMarker);
9408 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9409 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9410 } else if (GuardWithBTI) {
9411 Opc = AArch64ISD::CALL_BTI;
9412 }
9413
9414 if (IsTailCall) {
9415 // Each tail call may have to adjust the stack by a different amount, so
9416 // this information must travel along with the operation for eventual
9417 // consumption by emitEpilogue.
9418 Ops.push_back(x: DAG.getSignedTargetConstant(Val: FPDiff, DL, VT: MVT::i32));
9419 }
9420
9421 if (CLI.PAI) {
9422 const uint64_t Key = CLI.PAI->Key;
9423 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
9424 "Invalid auth call key");
9425
9426 // Split the discriminator into address/integer components.
9427 SDValue AddrDisc, IntDisc;
9428 std::tie(args&: IntDisc, args&: AddrDisc) =
9429 extractPtrauthBlendDiscriminators(Disc: CLI.PAI->Discriminator, DAG: &DAG);
9430
9431 if (Opc == AArch64ISD::CALL_RVMARKER)
9432 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9433 else
9434 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9435 Ops.push_back(x: DAG.getTargetConstant(Val: Key, DL, VT: MVT::i32));
9436 Ops.push_back(x: IntDisc);
9437 Ops.push_back(x: AddrDisc);
9438 }
9439
9440 // Add argument registers to the end of the list so that they are known live
9441 // into the call.
9442 for (auto &RegToPass : RegsToPass)
9443 Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first,
9444 VT: RegToPass.second.getValueType()));
9445
9446 // Add a register mask operand representing the call-preserved registers.
9447 const uint32_t *Mask;
9448 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9449 if (IsThisReturn) {
9450 // For 'this' returns, use the X0-preserving mask if applicable
9451 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
9452 if (!Mask) {
9453 IsThisReturn = false;
9454 Mask = TRI->getCallPreservedMask(MF, CallConv);
9455 }
9456 } else
9457 Mask = TRI->getCallPreservedMask(MF, CallConv);
9458
9459 if (Subtarget->hasCustomCallingConv())
9460 TRI->UpdateCustomCallPreservedMask(MF, Mask: &Mask);
9461
9462 if (TRI->isAnyArgRegReserved(MF))
9463 TRI->emitReservedArgRegCallError(MF);
9464
9465 assert(Mask && "Missing call preserved mask for calling convention");
9466 Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask));
9467
9468 if (InGlue.getNode())
9469 Ops.push_back(x: InGlue);
9470
9471  // If we're doing a tail call, use a TC_RETURN here rather than an
9472 // actual call instruction.
9473 if (IsTailCall) {
9474 MF.getFrameInfo().setHasTailCall();
9475 SDValue Ret = DAG.getNode(Opcode: Opc, DL, VT: MVT::Other, Ops);
9476 if (IsCFICall)
9477 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9478
9479 DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge);
9480 DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo));
9481 if (CalledGlobal &&
9482 MF.getFunction().getParent()->getModuleFlag(Key: "import-call-optimization"))
9483 DAG.addCalledGlobal(Node: Ret.getNode(), GV: CalledGlobal, OpFlags);
9484 return Ret;
9485 }
9486
9487 // Returns a chain and a flag for retval copy to use.
9488 Chain = DAG.getNode(Opcode: Opc, DL, ResultTys: {MVT::Other, MVT::Glue}, Ops);
9489 if (IsCFICall)
9490 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
9491
9492 DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge);
9493 InGlue = Chain.getValue(R: 1);
9494 DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo));
9495 if (CalledGlobal &&
9496 MF.getFunction().getParent()->getModuleFlag(Key: "import-call-optimization"))
9497 DAG.addCalledGlobal(Node: Chain.getNode(), GV: CalledGlobal, OpFlags);
9498
9499 uint64_t CalleePopBytes =
9500 DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : 0;
9501
9502 Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL);
9503 InGlue = Chain.getValue(R: 1);
9504
9505 // Handle result values, copying them out of physregs into vregs that we
9506 // return.
9507 SDValue Result = LowerCallResult(
9508 Chain, InGlue, CallConv, isVarArg: IsVarArg, RVLocs, DL, DAG, InVals, isThisReturn: IsThisReturn,
9509 ThisVal: IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
9510
9511 if (!Ins.empty())
9512 InGlue = Result.getValue(R: Result->getNumValues() - 1);
9513
9514 if (RequiresSMChange) {
9515 assert(PStateSM && "Expected a PStateSM to be set");
9516 Result = changeStreamingMode(
9517 DAG, DL, Enable: !CallAttrs.callee().hasStreamingInterface(), Chain: Result, InGlue,
9518 Condition: getSMToggleCondition(CallAttrs), PStateSM);
9519
9520 if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
9521 InGlue = Result.getValue(R: 1);
9522 Result =
9523 DAG.getNode(Opcode: AArch64ISD::VG_RESTORE, DL,
9524 VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops: {Result, InGlue});
9525 }
9526 }
9527
9528 if (CallAttrs.requiresEnablingZAAfterCall())
9529 // Unconditionally resume ZA.
9530 Result = DAG.getNode(
9531 Opcode: AArch64ISD::SMSTART, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N1: Result,
9532 N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32));
9533
9534 if (ShouldPreserveZT0)
9535 Result =
9536 DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other),
9537 Ops: {Result, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx});
9538
9539 if (RequiresLazySave) {
9540 // Conditionally restore the lazy save using a pseudo node.
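    // Per the SME lazy-save ABI: if the callee (or anything it called) needed
    // ZA, it saved our contents through the TPIDR2 block and zeroed
    // TPIDR2_EL0, in which case RESTORE_ZA calls __arm_tpidr2_restore;
    // otherwise ZA is still live and the restore call is skipped.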
9541 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9542 SDValue RegMask = DAG.getRegisterMask(
9543 RegMask: TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
9544 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
9545 Sym: "__arm_tpidr2_restore", VT: getPointerTy(DL: DAG.getDataLayout()));
9546 SDValue TPIDR2_EL0 = DAG.getNode(
9547 Opcode: ISD::INTRINSIC_W_CHAIN, DL, VT: MVT::i64, N1: Result,
9548 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_get_tpidr2, DL, VT: MVT::i32));
9549
9550 // Copy the address of the TPIDR2 block into X0 before 'calling' the
9551 // RESTORE_ZA pseudo.
9552 SDValue Glue;
9553 SDValue TPIDR2Block = DAG.getFrameIndex(
9554 FI: TPIDR2.FrameIndex,
9555 VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout()));
9556 Result = DAG.getCopyToReg(Chain: Result, dl: DL, Reg: AArch64::X0, N: TPIDR2Block, Glue);
9557 Result =
9558 DAG.getNode(Opcode: AArch64ISD::RESTORE_ZA, DL, VT: MVT::Other,
9559 Ops: {Result, TPIDR2_EL0, DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64),
9560 RestoreRoutine, RegMask, Result.getValue(R: 1)});
9561
9562 // Finally reset the TPIDR2_EL0 register to 0.
9563 Result = DAG.getNode(
9564 Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Result,
9565 N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32),
9566 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
9567 TPIDR2.Uses++;
9568 } else if (RequiresSaveAllZA) {
9569 Result = emitSMEStateSaveRestore(TLI: *this, DAG, Info: FuncInfo, DL, Chain: Result,
9570 /*IsSave=*/false);
9571 }
9572
9573 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
9574 RequiresSaveAllZA) {
9575 for (unsigned I = 0; I < InVals.size(); ++I) {
9576 // The smstart/smstop is chained as part of the call, but when the
9577 // resulting chain is discarded (which happens when the call is not part
9578 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
9579 // smstart/smstop is chained to the result value. We can do that by doing
9580 // a vreg -> vreg copy.
9581 Register Reg = MF.getRegInfo().createVirtualRegister(
9582 RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT()));
9583 SDValue X = DAG.getCopyToReg(Chain: Result, dl: DL, Reg, N: InVals[I]);
9584 InVals[I] = DAG.getCopyFromReg(Chain: X, dl: DL, Reg,
9585 VT: InVals[I].getValueType());
9586 }
9587 }
9588
9589 if (CallConv == CallingConv::PreserveNone) {
9590 for (const ISD::OutputArg &O : Outs) {
9591 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
9592 O.Flags.isSwiftAsync()) {
9593 MachineFunction &MF = DAG.getMachineFunction();
9594 DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported(
9595 MF.getFunction(),
9596 "Swift attributes can't be used with preserve_none",
9597 DL.getDebugLoc()));
9598 break;
9599 }
9600 }
9601 }
9602
9603 return Result;
9604}
9605
9606bool AArch64TargetLowering::CanLowerReturn(
9607 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9608 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
9609 const Type *RetTy) const {
9610 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
9611 SmallVector<CCValAssign, 16> RVLocs;
9612 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9613 return CCInfo.CheckReturn(Outs, Fn: RetCC);
9614}
9615
9616SDValue
9617AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9618 bool isVarArg,
9619 const SmallVectorImpl<ISD::OutputArg> &Outs,
9620 const SmallVectorImpl<SDValue> &OutVals,
9621 const SDLoc &DL, SelectionDAG &DAG) const {
9622 auto &MF = DAG.getMachineFunction();
9623 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9624
9625 CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv);
9626 SmallVector<CCValAssign, 16> RVLocs;
9627 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9628 CCInfo.AnalyzeReturn(Outs, Fn: RetCC);
9629
9630 // Copy the result values into the output registers.
9631 SDValue Glue;
9632 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9633 SmallSet<unsigned, 4> RegsUsed;
9634 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9635 ++i, ++realRVLocIdx) {
9636 CCValAssign &VA = RVLocs[i];
9637 assert(VA.isRegLoc() && "Can only return in registers!");
9638 SDValue Arg = OutVals[realRVLocIdx];
9639
9640 switch (VA.getLocInfo()) {
9641 default:
9642 llvm_unreachable("Unknown loc info!");
9643 case CCValAssign::Full:
9644 if (Outs[i].ArgVT == MVT::i1) {
9645 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9646 // value. This is strictly redundant on Darwin (which uses "zeroext
9647 // i1"), but will be optimised out before ISel.
9648 Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg);
9649 Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg);
9650 }
9651 break;
9652 case CCValAssign::BCvt:
9653 Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg);
9654 break;
9655 case CCValAssign::AExt:
9656 case CCValAssign::ZExt:
9657 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9658 break;
9659 case CCValAssign::AExtUpper:
9660 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9661 Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT());
9662 Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg,
9663 N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT()));
9664 break;
9665 }
9666
9667 if (RegsUsed.count(V: VA.getLocReg())) {
9668 SDValue &Bits =
9669 llvm::find_if(Range&: RetVals, P: [=](const std::pair<unsigned, SDValue> &Elt) {
9670 return Elt.first == VA.getLocReg();
9671 })->second;
9672 Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg);
9673 } else {
9674 RetVals.emplace_back(Args: VA.getLocReg(), Args&: Arg);
9675 RegsUsed.insert(V: VA.getLocReg());
9676 }
9677 }
9678
9679 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9680
9681 // Emit SMSTOP before returning from a locally streaming function
9682 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
9683 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9684 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9685 Register Reg = FuncInfo->getPStateSMReg();
9686 assert(Reg.isValid() && "PStateSM Register is invalid");
9687 SDValue PStateSM = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: MVT::i64);
9688 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9689 /*Glue*/ InGlue: SDValue(),
9690 Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM);
9691 } else
9692 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9693 /*Glue*/ InGlue: SDValue(), Condition: AArch64SME::Always);
9694 Glue = Chain.getValue(R: 1);
9695 }
9696
9697 SmallVector<SDValue, 4> RetOps(1, Chain);
9698 for (auto &RetVal : RetVals) {
9699 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9700 isPassedInFPR(VT: RetVal.second.getValueType()))
9701 RetVal.second =
9702 DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL,
9703 VTList: DAG.getVTList(VT1: RetVal.second.getValueType(), VT2: MVT::Glue),
9704 N: RetVal.second);
9705 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetVal.first, N: RetVal.second, Glue);
9706 Glue = Chain.getValue(R: 1);
9707 RetOps.push_back(
9708 Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType()));
9709 }
9710
9711 // Windows AArch64 ABIs require that for returning structs by value we copy
9712 // the sret argument into X0 for the return.
9713 // We saved the argument into a virtual register in the entry block,
9714 // so now we copy the value out and into X0.
9715 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9716 SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl: DL, Reg: SRetReg,
9717 VT: getPointerTy(DL: MF.getDataLayout()));
9718
9719 unsigned RetValReg = AArch64::X0;
9720 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9721 RetValReg = AArch64::X8;
9722 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetValReg, N: Val, Glue);
9723 Glue = Chain.getValue(R: 1);
9724
9725 RetOps.push_back(
9726 Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout())));
9727 }
9728
9729 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(MF: &MF);
9730 if (I) {
9731 for (; *I; ++I) {
9732 if (AArch64::GPR64RegClass.contains(Reg: *I))
9733 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64));
9734 else if (AArch64::FPR64RegClass.contains(Reg: *I))
9735 RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64)));
9736 else
9737 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9738 }
9739 }
9740
9741 RetOps[0] = Chain; // Update chain.
9742
9743 // Add the glue if we have it.
9744 if (Glue.getNode())
9745 RetOps.push_back(Elt: Glue);
9746
9747 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9748 // ARM64EC entry thunks use a special return sequence: instead of a regular
9749 // "ret" instruction, they need to explicitly call the emulator.
9750 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9751 SDValue Arm64ECRetDest =
9752 DAG.getExternalSymbol(Sym: "__os_arm64x_dispatch_ret", VT: PtrVT);
9753 Arm64ECRetDest =
9754 getAddr(N: cast<ExternalSymbolSDNode>(Val&: Arm64ECRetDest), DAG, Flags: 0);
9755 Arm64ECRetDest = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Arm64ECRetDest,
9756 PtrInfo: MachinePointerInfo());
9757 RetOps.insert(I: RetOps.begin() + 1, Elt: Arm64ECRetDest);
9758 RetOps.insert(I: RetOps.begin() + 2, Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32));
9759 return DAG.getNode(Opcode: AArch64ISD::TC_RETURN, DL, VT: MVT::Other, Ops: RetOps);
9760 }
9761
9762 return DAG.getNode(Opcode: AArch64ISD::RET_GLUE, DL, VT: MVT::Other, Ops: RetOps);
9763}
9764
9765//===----------------------------------------------------------------------===//
9766// Other Lowering Code
9767//===----------------------------------------------------------------------===//
9768
9769SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9770 SelectionDAG &DAG,
9771 unsigned Flag) const {
9772 return DAG.getTargetGlobalAddress(GV: N->getGlobal(), DL: SDLoc(N), VT: Ty,
9773 offset: N->getOffset(), TargetFlags: Flag);
9774}
9775
9776SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9777 SelectionDAG &DAG,
9778 unsigned Flag) const {
9779 return DAG.getTargetJumpTable(JTI: N->getIndex(), VT: Ty, TargetFlags: Flag);
9780}
9781
9782SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9783 SelectionDAG &DAG,
9784 unsigned Flag) const {
9785 return DAG.getTargetConstantPool(C: N->getConstVal(), VT: Ty, Align: N->getAlign(),
9786 Offset: N->getOffset(), TargetFlags: Flag);
9787}
9788
9789SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9790 SelectionDAG &DAG,
9791 unsigned Flag) const {
9792 return DAG.getTargetBlockAddress(BA: N->getBlockAddress(), VT: Ty, Offset: 0, TargetFlags: Flag);
9793}
9794
9795SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9796 SelectionDAG &DAG,
9797 unsigned Flag) const {
9798 return DAG.getTargetExternalSymbol(Sym: N->getSymbol(), VT: Ty, TargetFlags: Flag);
9799}
9800
9801// (loadGOT sym)
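// e.g. on ELF (illustrative):
//   adrp x0, :got:sym
//   ldr  x0, [x0, :got_lo12:sym]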
9802template <class NodeTy>
9803SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9804 unsigned Flags) const {
9805 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9806 SDLoc DL(N);
9807 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9808 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9809 // FIXME: Once remat is capable of dealing with instructions with register
9810 // operands, expand this into two nodes instead of using a wrapper node.
9811 if (DAG.getMachineFunction()
9812 .getInfo<AArch64FunctionInfo>()
9813 ->hasELFSignedGOT())
9814 return SDValue(DAG.getMachineNode(Opcode: AArch64::LOADgotAUTH, dl: DL, VT: Ty, Op1: GotAddr),
9815 0);
9816 return DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: Ty, Operand: GotAddr);
9817}
9818
9819// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
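// Materializes an absolute 64-bit address, roughly (illustrative):
//   movz x0, #:abs_g3:sym
//   movk x0, #:abs_g2_nc:sym
//   movk x0, #:abs_g1_nc:sym
//   movk x0, #:abs_g0_nc:sym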
9820template <class NodeTy>
9821SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9822 unsigned Flags) const {
9823 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9824 SDLoc DL(N);
9825 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9826 const unsigned char MO_NC = AArch64II::MO_NC;
9827 return DAG.getNode(
9828 AArch64ISD::WrapperLarge, DL, Ty,
9829 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9830 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9831 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9832 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9833}
9834
9835// (addlow (adrp %hi(sym)) %lo(sym))
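// Small code model materialization, e.g. (illustrative):
//   adrp x0, sym
//   add  x0, x0, :lo12:sym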
9836template <class NodeTy>
9837SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9838 unsigned Flags) const {
9839 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9840 SDLoc DL(N);
9841 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9842 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9843 SDValue Lo = getTargetNode(N, Ty, DAG,
9844 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9845 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: Ty, Operand: Hi);
9846 return DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: Ty, N1: ADRP, N2: Lo);
9847}
9848
9849// (adr sym)
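// Tiny code model materialization: a single "adr x0, sym"; the symbol must be
// within +/-1MiB of the code for this to be valid (illustrative).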
9850template <class NodeTy>
9851SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9852 unsigned Flags) const {
9853 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9854 SDLoc DL(N);
9855 EVT Ty = getPointerTy(DL: DAG.getDataLayout());
9856 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9857 return DAG.getNode(Opcode: AArch64ISD::ADR, DL, VT: Ty, Operand: Sym);
9858}
9859
9860SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9861 SelectionDAG &DAG) const {
9862 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Val&: Op);
9863 const GlobalValue *GV = GN->getGlobal();
9864 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM: getTargetMachine());
9865
9866 if (OpFlags != AArch64II::MO_NO_FLAG)
9867 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9868 "unexpected offset in global node");
9869
9870 // This also catches the large code model case for Darwin, and tiny code
9871 // model with got relocations.
9872 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9873 return getGOT(N: GN, DAG, Flags: OpFlags);
9874 }
9875
9876 SDValue Result;
9877 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9878 !getTargetMachine().isPositionIndependent()) {
9879 Result = getAddrLarge(N: GN, DAG, Flags: OpFlags);
9880 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9881 Result = getAddrTiny(N: GN, DAG, Flags: OpFlags);
9882 } else {
9883 Result = getAddr(N: GN, DAG, Flags: OpFlags);
9884 }
9885 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9886 SDLoc DL(GN);
9887 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9888 Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result,
9889 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()));
9890 return Result;
9891}
9892
9893/// Convert a TLS address reference into the correct sequence of loads
9894/// and calls to compute the variable's address (for Darwin, currently) and
9895/// return an SDValue containing the final node.
9896///
9897/// Darwin only has one TLS scheme which must be capable of dealing with the
9898/// fully general situation, in the worst case. This means:
9899/// + "extern __thread" declaration.
9900/// + Defined in a possibly unknown dynamic library.
9901///
9902/// The general system is that each __thread variable has a [3 x i64] descriptor
9903/// which contains information used by the runtime to calculate the address. The
9904/// only part of this the compiler needs to know about is the first xword, which
9905/// contains a function pointer that must be called with the address of the
9906/// entire descriptor in "x0".
9907///
9908/// Since this descriptor may be in a different unit, in general even the
9909/// descriptor must be accessed via an indirect load. The "ideal" code sequence
9910/// is:
9911/// adrp x0, _var@TLVPPAGE
9912/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
9913/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
9914/// ; the function pointer
9915/// blr x1 ; Uses descriptor address in x0
9916/// ; Address of _var is now in x0.
9917///
9918/// If the address of _var's descriptor *is* known to the linker, then it can
9919/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
9920/// a slight efficiency gain.
9921SDValue
9922AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
9923 SelectionDAG &DAG) const {
9924 assert(Subtarget->isTargetDarwin() &&
9925 "This function expects a Darwin target");
9926
9927 SDLoc DL(Op);
9928 MVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9929 MVT PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
9930 const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal();
9931
9932 SDValue TLVPAddr =
9933 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
9934 SDValue DescAddr = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TLVPAddr);
9935
9936 // The first entry in the descriptor is a function pointer that we must call
9937 // to obtain the address of the variable.
9938 SDValue Chain = DAG.getEntryNode();
9939 SDValue FuncTLVGet = DAG.getLoad(
9940 VT: PtrMemVT, dl: DL, Chain, Ptr: DescAddr,
9941 PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()),
9942 Alignment: Align(PtrMemVT.getSizeInBits() / 8),
9943 MMOFlags: MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
9944 Chain = FuncTLVGet.getValue(R: 1);
9945
9946 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
9947 FuncTLVGet = DAG.getZExtOrTrunc(Op: FuncTLVGet, DL, VT: PtrVT);
9948
9949 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9950 MFI.setAdjustsStack(true);
9951
9952 // TLS calls preserve all registers except those that absolutely must be
9953 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
9954 // silly).
9955 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9956 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
9957 if (Subtarget->hasCustomCallingConv())
9958 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
9959
9960 // Finally, we can make the call. This is just a degenerate version of a
9961 // normal AArch64 call node: x0 takes the address of the descriptor, and
9962 // returns the address of the variable in this thread.
9963 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::X0, N: DescAddr, Glue: SDValue());
9964
9965 unsigned Opcode = AArch64ISD::CALL;
9966 SmallVector<SDValue, 8> Ops;
9967 Ops.push_back(Elt: Chain);
9968 Ops.push_back(Elt: FuncTLVGet);
9969
9970 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
9971 if (DAG.getMachineFunction().getFunction().hasFnAttribute(Kind: "ptrauth-calls")) {
9972 Opcode = AArch64ISD::AUTH_CALL;
9973 Ops.push_back(Elt: DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32));
9974 Ops.push_back(Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64)); // Integer Disc.
9975 Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::NoRegister, VT: MVT::i64)); // Addr Disc.
9976 }
9977
9978 Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64));
9979 Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask));
9980 Ops.push_back(Elt: Chain.getValue(R: 1));
9981 Chain = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops);
9982 return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue: Chain.getValue(R: 1));
9983}
9984
9985/// Convert a thread-local variable reference into a sequence of instructions to
9986/// compute the variable's address for the local exec TLS model of ELF targets.
9987/// The sequence depends on the maximum TLS area size.
9988SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9989 SDValue ThreadBase,
9990 const SDLoc &DL,
9991 SelectionDAG &DAG) const {
9992 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
9993 SDValue TPOff, Addr;
9994
9995 switch (DAG.getTarget().Options.TLSSize) {
9996 default:
9997 llvm_unreachable("Unexpected TLS size");
9998
9999 case 12: {
10000 // mrs x0, TPIDR_EL0
10001 // add x0, x0, :tprel_lo12:a
10002 SDValue Var = DAG.getTargetGlobalAddress(
10003 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10004 return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase,
10005 Op2: Var,
10006 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10007 0);
10008 }
10009
10010 case 24: {
10011 // mrs x0, TPIDR_EL0
10012 // add x0, x0, :tprel_hi12:a
10013 // add x0, x0, :tprel_lo12_nc:a
10014 SDValue HiVar = DAG.getTargetGlobalAddress(
10015 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
10016 SDValue LoVar = DAG.getTargetGlobalAddress(
10017 GV, DL, VT: PtrVT, offset: 0,
10018 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10019 Addr = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase,
10020 Op2: HiVar,
10021 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10022 0);
10023 return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: Addr,
10024 Op2: LoVar,
10025 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10026 0);
10027 }
10028
10029 case 32: {
10030 // mrs x1, TPIDR_EL0
10031 // movz x0, #:tprel_g1:a
10032 // movk x0, #:tprel_g0_nc:a
10033 // add x0, x1, x0
10034 SDValue HiVar = DAG.getTargetGlobalAddress(
10035 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1);
10036 SDValue LoVar = DAG.getTargetGlobalAddress(
10037 GV, DL, VT: PtrVT, offset: 0,
10038 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10039 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar,
10040 Op2: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)),
10041 0);
10042 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
10043 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10044 0);
10045 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
10046 }
10047
10048 case 48: {
10049 // mrs x1, TPIDR_EL0
10050 // movz x0, #:tprel_g2:a
10051 // movk x0, #:tprel_g1_nc:a
10052 // movk x0, #:tprel_g0_nc:a
10053 // add x0, x1, x0
10054 SDValue HiVar = DAG.getTargetGlobalAddress(
10055 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G2);
10056 SDValue MiVar = DAG.getTargetGlobalAddress(
10057 GV, DL, VT: PtrVT, offset: 0,
10058 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10059 SDValue LoVar = DAG.getTargetGlobalAddress(
10060 GV, DL, VT: PtrVT, offset: 0,
10061 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10062 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar,
10063 Op2: DAG.getTargetConstant(Val: 32, DL, VT: MVT::i32)),
10064 0);
10065 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: MiVar,
10066 Op3: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)),
10067 0);
10068 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
10069 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10070 0);
10071 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
10072 }
10073 }
10074}
10075
10076/// When accessing thread-local variables under either the general-dynamic or
10077/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10078/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10079/// is a function pointer to carry out the resolution.
10080///
10081/// The sequence is:
10082/// adrp x0, :tlsdesc:var
10083/// ldr x1, [x0, #:tlsdesc_lo12:var]
10084/// add x0, x0, #:tlsdesc_lo12:var
10085/// .tlsdesccall var
10086/// blr x1
10087/// (TPIDR_EL0 offset now in x0)
10088///
10089/// The above sequence must be produced unscheduled, to enable the linker to
10090/// optimize/relax this sequence.
10091/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10092/// above sequence; it is expanded very late in the compilation flow, to ensure
10093/// the sequence is emitted exactly as shown above.
10094SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10095 const SDLoc &DL,
10096 SelectionDAG &DAG) const {
10097 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
10098
10099 SDValue Chain = DAG.getEntryNode();
10100 SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue);
10101
10102 unsigned Opcode =
10103 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10104 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10105 : AArch64ISD::TLSDESC_CALLSEQ;
10106 Chain = DAG.getNode(Opcode, DL, VTList: NodeTys, Ops: {Chain, SymAddr});
10107 SDValue Glue = Chain.getValue(R: 1);
10108
10109 return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue);
10110}
10111
10112SDValue
10113AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10114 SelectionDAG &DAG) const {
10115 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10116
10117 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
10118 AArch64FunctionInfo *MFI =
10119 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10120
10121 TLSModel::Model Model = MFI->hasELFSignedGOT()
10122 ? TLSModel::GeneralDynamic
10123 : getTargetMachine().getTLSModel(GV: GA->getGlobal());
10124
10125 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
10126 if (Model == TLSModel::LocalDynamic)
10127 Model = TLSModel::GeneralDynamic;
10128 }
10129
10130 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10131 Model != TLSModel::LocalExec)
10132 report_fatal_error(reason: "ELF TLS only supported in small memory model or "
10133 "in local exec TLS model");
10134 // Different choices can be made for the maximum size of the TLS area for a
10135 // module. For the small address model, the default TLS size is 16MiB and the
10136 // maximum TLS size is 4GiB.
10137 // FIXME: add tiny and large code model support for TLS access models other
10138 // than local exec. We currently generate the same code as small for tiny,
10139 // which may be larger than needed.
10140
10141 SDValue TPOff;
10142 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
10143 SDLoc DL(Op);
10144 const GlobalValue *GV = GA->getGlobal();
10145
10146 SDValue ThreadBase = DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT);
10147
10148 if (Model == TLSModel::LocalExec) {
10149 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10150 } else if (Model == TLSModel::InitialExec) {
10151 TPOff = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
10152 TPOff = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TPOff);
10153 } else if (Model == TLSModel::LocalDynamic) {
10154 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
10155 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10156 // the beginning of the module's TLS region, followed by a DTPREL offset
10157 // calculation.
10158
10159 // These accesses will need deduplicating if there's more than one.
10160 MFI->incNumLocalDynamicTLSAccesses();
10161
10162 // The call needs a relocation too for linker relaxation. It doesn't make
10163 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10164 // the address.
10165 SDValue SymAddr = DAG.getTargetExternalSymbol(Sym: "_TLS_MODULE_BASE_", VT: PtrVT,
10166 TargetFlags: AArch64II::MO_TLS);
10167
10168 // Now we can calculate the offset from TPIDR_EL0 to this module's
10169 // thread-local area.
10170 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10171
10172 // Now use :dtprel_whatever: operations to calculate this variable's offset
10173 // in its thread-storage area.
10174 SDValue HiVar = DAG.getTargetGlobalAddress(
10175 GV, DL, VT: MVT::i64, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
10176 SDValue LoVar = DAG.getTargetGlobalAddress(
10177 GV, DL, VT: MVT::i64, offset: 0,
10178 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10179
10180 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: HiVar,
10181 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10182 0);
10183 TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar,
10184 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10185 0);
10186 } else if (Model == TLSModel::GeneralDynamic) {
10187 // The call needs a relocation too for linker relaxation. It doesn't make
10188 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10189 // the address.
10190 SDValue SymAddr =
10191 DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS);
10192
10193 // Finally we can make a call to calculate the offset from tpidr_el0.
10194 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10195 } else
10196 llvm_unreachable("Unsupported ELF TLS access model");
10197
10198 return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff);
10199}
10200
10201SDValue
10202AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10203 SelectionDAG &DAG) const {
10204 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10205
10206 SDValue Chain = DAG.getEntryNode();
10207 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
10208 SDLoc DL(Op);
10209
10210 SDValue TEB = DAG.getRegister(Reg: AArch64::X18, VT: MVT::i64);
10211
10212 // Load the ThreadLocalStoragePointer from the TEB
10213 // A pointer to the TLS array is located at offset 0x58 from the TEB.
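  // The overall shape of the access is roughly (illustrative, relocation
  // spellings omitted):
  //   TLSArray = *(TEB + 0x58)
  //   TLS      = TLSArray[_tls_index]
  //   Addr     = TLS + <offset of the variable from the .tls section base>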
10214 SDValue TLSArray =
10215 DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x58, DL));
10216 TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo());
10217 Chain = TLSArray.getValue(R: 1);
10218
10219 // Load the TLS index from the C runtime;
10220 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10221 // This also does the same as LOADgot, but using a generic i32 load,
10222 // while LOADgot only loads i64.
10223 SDValue TLSIndexHi =
10224 DAG.getTargetExternalSymbol(Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGE);
10225 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10226 Sym: "_tls_index", VT: PtrVT, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10227 SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: PtrVT, Operand: TLSIndexHi);
10228 SDValue TLSIndex =
10229 DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: ADRP, N2: TLSIndexLo);
10230 TLSIndex = DAG.getLoad(VT: MVT::i32, dl: DL, Chain, Ptr: TLSIndex, PtrInfo: MachinePointerInfo());
10231 Chain = TLSIndex.getValue(R: 1);
10232
10233  // The pointer to the thread's TLS data area is found by indexing the TLS
10234  // array with the TLS index scaled by 8 (one pointer per loaded module).
10235 TLSIndex = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: PtrVT, Operand: TLSIndex);
10236 SDValue Slot = DAG.getNode(Opcode: ISD::SHL, DL, VT: PtrVT, N1: TLSIndex,
10237 N2: DAG.getConstant(Val: 3, DL, VT: PtrVT));
10238 SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain,
10239 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot),
10240 PtrInfo: MachinePointerInfo());
10241 Chain = TLS.getValue(R: 1);
10242
10243 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
10244 const GlobalValue *GV = GA->getGlobal();
10245 SDValue TGAHi = DAG.getTargetGlobalAddress(
10246 GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12);
10247 SDValue TGALo = DAG.getTargetGlobalAddress(
10248 GV, DL, VT: PtrVT, offset: 0,
10249 TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10250
10251 // Add the offset from the start of the .tls section (section base).
10252 SDValue Addr =
10253 SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TLS, Op2: TGAHi,
10254 Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)),
10255 0);
10256 Addr = DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: Addr, N2: TGALo);
10257 return Addr;
10258}
10259
10260SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10261 SelectionDAG &DAG) const {
10262 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op);
10263 if (DAG.getTarget().useEmulatedTLS())
10264 return LowerToTLSEmulatedModel(GA, DAG);
10265
10266 if (Subtarget->isTargetDarwin())
10267 return LowerDarwinGlobalTLSAddress(Op, DAG);
10268 if (Subtarget->isTargetELF())
10269 return LowerELFGlobalTLSAddress(Op, DAG);
10270 if (Subtarget->isTargetWindows())
10271 return LowerWindowsGlobalTLSAddress(Op, DAG);
10272
10273 llvm_unreachable("Unexpected platform trying to use TLS");
10274}
10275
10276//===----------------------------------------------------------------------===//
10277// PtrAuthGlobalAddress lowering
10278//
10279// We have 3 lowering alternatives to choose from:
10280// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10281// If the GV doesn't need a GOT load (i.e., is locally defined)
10282// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10283//
10284// - LOADgotPAC: similar to LOADgot, with added PAC.
10285// If the GV needs a GOT load, materialize the pointer using the usual
10286 // GOT adrp+ldr, +pac. Pointers in GOT are assumed not to be signed; the GOT
10287// section is assumed to be read-only (for example, via relro mechanism). See
10288// LowerMOVaddrPAC.
10289//
10290// - LOADauthptrstatic: similar to LOADgot, but use a
10291// special stub slot instead of a GOT slot.
10292// Load a signed pointer for symbol 'sym' from a stub slot named
10293// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10294// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10295// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10296//
10297 // All 3 are pseudos that are expanded late to longer sequences: this lets us
10298// provide integrity guarantees on the to-be-signed intermediate values.
10299//
10300// LOADauthptrstatic is undesirable because it requires a large section filled
10301// with often similarly-signed pointers, making it a good harvesting target.
10302// Thus, it's only used for ptrauth references to extern_weak to avoid null
10303// checks.
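//
// As a rough sketch (the pseudos are expanded late), MOVaddrPAC for a locally
// defined symbol becomes an adrp+add to materialize the raw address, an
// optional movk to blend in the constant discriminator, and a PAC* signing
// instruction with the requested key; LOADgotPAC instead loads the raw
// pointer from the GOT before signing it in the same way.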
10304
10305static SDValue LowerPtrAuthGlobalAddressStatically(
10306 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10307 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10308 const auto *TGN = cast<GlobalAddressSDNode>(Val: TGA.getNode());
10309 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10310
10311 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10312 // offset alone as a pointer if the symbol wasn't available, which would
10313 // probably break null checks in users. Ptrauth complicates things further:
10314 // error out.
10315 if (TGN->getOffset() != 0)
10316 report_fatal_error(
10317 reason: "unsupported non-zero offset in weak ptrauth global reference");
10318
10319 if (!isNullConstant(V: AddrDiscriminator))
10320 report_fatal_error(reason: "unsupported weak addr-div ptrauth global");
10321
10322 SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32);
10323 return SDValue(DAG.getMachineNode(Opcode: AArch64::LOADauthptrstatic, dl: DL, VT: MVT::i64,
10324 Ops: {TGA, Key, Discriminator}),
10325 0);
10326}
10327
10328SDValue
10329AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10330 SelectionDAG &DAG) const {
10331 SDValue Ptr = Op.getOperand(i: 0);
10332 uint64_t KeyC = Op.getConstantOperandVal(i: 1);
10333 SDValue AddrDiscriminator = Op.getOperand(i: 2);
10334 uint64_t DiscriminatorC = Op.getConstantOperandVal(i: 3);
10335 EVT VT = Op.getValueType();
10336 SDLoc DL(Op);
10337
10338 if (KeyC > AArch64PACKey::LAST)
10339 report_fatal_error(reason: "key in ptrauth global out of range [0, " +
10340 Twine((int)AArch64PACKey::LAST) + "]");
10341
10342 // Blend only works if the integer discriminator is 16-bit wide.
10343 if (!isUInt<16>(x: DiscriminatorC))
10344 report_fatal_error(
10345 reason: "constant discriminator in ptrauth global out of range [0, 0xffff]");
10346
10347 // Choosing between 3 lowering alternatives is target-specific.
10348 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10349 report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF");
10350
10351 int64_t PtrOffsetC = 0;
10352 if (Ptr.getOpcode() == ISD::ADD) {
10353 PtrOffsetC = Ptr.getConstantOperandVal(i: 1);
10354 Ptr = Ptr.getOperand(i: 0);
10355 }
10356 const auto *PtrN = cast<GlobalAddressSDNode>(Val: Ptr.getNode());
10357 const GlobalValue *PtrGV = PtrN->getGlobal();
10358
10359 // Classify the reference to determine whether it needs a GOT load.
10360 const unsigned OpFlags =
10361 Subtarget->ClassifyGlobalReference(GV: PtrGV, TM: getTargetMachine());
10362 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10363 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10364 "unsupported non-GOT op flags on ptrauth global reference");
10365
10366 // Fold any offset into the GV; our pseudos expect it there.
10367 PtrOffsetC += PtrN->getOffset();
10368 SDValue TPtr = DAG.getTargetGlobalAddress(GV: PtrGV, DL, VT, offset: PtrOffsetC,
10369 /*TargetFlags=*/0);
10370 assert(PtrN->getTargetFlags() == 0 &&
10371 "unsupported target flags on ptrauth global");
10372
10373 SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32);
10374 SDValue Discriminator = DAG.getTargetConstant(Val: DiscriminatorC, DL, VT: MVT::i64);
10375 SDValue TAddrDiscriminator = !isNullConstant(V: AddrDiscriminator)
10376 ? AddrDiscriminator
10377 : DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
10378
10379 // No GOT load needed -> MOVaddrPAC
10380 if (!NeedsGOTLoad) {
10381 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10382 return SDValue(
10383 DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, VT: MVT::i64,
10384 Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}),
10385 0);
10386 }
10387
10388 // GOT load -> LOADgotPAC
10389 // Note that we disallow extern_weak refs to avoid null checks later.
10390 if (!PtrGV->hasExternalWeakLinkage())
10391 return SDValue(
10392 DAG.getMachineNode(Opcode: AArch64::LOADgotPAC, dl: DL, VT: MVT::i64,
10393 Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}),
10394 0);
10395
10396 // extern_weak ref -> LOADauthptrstatic
10397 return LowerPtrAuthGlobalAddressStatically(
10398 TGA: TPtr, DL, VT, KeyC: (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10399 DAG);
10400}
10401
10402// Looks through \param Val to determine the bit that can be used to
10403// check the sign of the value. It returns the unextended value and
10404// the sign bit position.
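// For example, (sign_extend_inreg x, i8) yields {x, 7}, while a plain i64
// value yields {value, 63}.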
10405std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10406 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10407 return {Val.getOperand(i: 0),
10408 cast<VTSDNode>(Val: Val.getOperand(i: 1))->getVT().getFixedSizeInBits() -
10409 1};
10410
10411 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10412 return {Val.getOperand(i: 0),
10413 Val.getOperand(i: 0)->getValueType(ResNo: 0).getFixedSizeInBits() - 1};
10414
10415 return {Val, Val.getValueSizeInBits() - 1};
10416}
10417
10418SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10419 SDValue Chain = Op.getOperand(i: 0);
10420 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get();
10421 SDValue LHS = Op.getOperand(i: 2);
10422 SDValue RHS = Op.getOperand(i: 3);
10423 SDValue Dest = Op.getOperand(i: 4);
10424 SDLoc DL(Op);
10425
10426 MachineFunction &MF = DAG.getMachineFunction();
10427 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10428 // will not be produced, as they are conditional branch instructions that do
10429 // not set flags.
10430 bool ProduceNonFlagSettingCondBr =
10431 !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening);
10432
10433 // Handle f128 first, since lowering it will result in comparing the return
10434 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10435 // is expecting to deal with.
10436 if (LHS.getValueType() == MVT::f128) {
10437 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL, OldLHS: LHS, OldRHS: RHS);
10438
10439 // If softenSetCCOperands returned a scalar, we need to compare the result
10440 // against zero to select between true and false values.
10441 if (!RHS.getNode()) {
10442 RHS = DAG.getConstant(Val: 0, DL, VT: LHS.getValueType());
10443 CC = ISD::SETNE;
10444 }
10445 }
10446
10447 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10448 // instruction.
10449 if (ISD::isOverflowIntrOpRes(Op: LHS) && isOneConstant(V: RHS) &&
10450 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10451 // Only lower legal XALUO ops.
10452 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0)))
10453 return SDValue();
10454
10455 // The actual operation with overflow check.
10456 AArch64CC::CondCode OFCC;
10457 SDValue Value, Overflow;
10458 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: LHS.getValue(R: 0), DAG);
10459
10460 if (CC == ISD::SETNE)
10461 OFCC = getInvertedCondCode(Code: OFCC);
10462 SDValue CCVal = DAG.getConstant(Val: OFCC, DL, VT: MVT::i32);
10463
10464 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
10465 N4: Overflow);
10466 }
10467
10468 if (LHS.getValueType().isInteger()) {
10469 assert((LHS.getValueType() == RHS.getValueType()) &&
10470 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10471
10472 // If the RHS of the comparison is zero, we can potentially fold this
10473 // to a specialized branch.
10474 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
10475 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
10476 if (CC == ISD::SETEQ) {
10477 // See if we can use a TBZ to fold in an AND as well.
10478 // TBZ has a smaller branch displacement than CBZ. If the offset is
10479 // out of bounds, a late MI-layer pass rewrites branches.
10480 // 403.gcc is an example that hits this case.
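// For example, (brcond (seteq (and x, 4), 0), dest) becomes "tbz x, #2, dest".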
10481 if (LHS.getOpcode() == ISD::AND &&
10482 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
10483 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
10484 SDValue Test = LHS.getOperand(i: 0);
10485 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
10486 return DAG.getNode(Opcode: AArch64ISD::TBZ, DL, VT: MVT::Other, N1: Chain, N2: Test,
10487 N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL, VT: MVT::i64),
10488 N4: Dest);
10489 }
10490
10491 return DAG.getNode(Opcode: AArch64ISD::CBZ, DL, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
10492 } else if (CC == ISD::SETNE) {
10493 // See if we can use a TBZ to fold in an AND as well.
10494 // TBZ has a smaller branch displacement than CBZ. If the offset is
10495 // out of bounds, a late MI-layer pass rewrites branches.
10496 // 403.gcc is an example that hits this case.
10497 if (LHS.getOpcode() == ISD::AND &&
10498 isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) &&
10499 isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) {
10500 SDValue Test = LHS.getOperand(i: 0);
10501 uint64_t Mask = LHS.getConstantOperandVal(i: 1);
10502 return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL, VT: MVT::Other, N1: Chain, N2: Test,
10503 N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL, VT: MVT::i64),
10504 N4: Dest);
10505 }
10506
10507 return DAG.getNode(Opcode: AArch64ISD::CBNZ, DL, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
10508 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
10509 // Don't combine AND since emitComparison converts the AND to an ANDS
10510 // (a.k.a. TST) and the test in the test bit and branch instruction
10511 // becomes redundant. This would also increase register pressure.
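// For example, (brcond (setlt x, 0), dest) with i64 x becomes
// "tbnz x, #63, dest".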
10512 uint64_t SignBitPos;
10513 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
10514 return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL, VT: MVT::Other, N1: Chain, N2: LHS,
10515 N3: DAG.getConstant(Val: SignBitPos, DL, VT: MVT::i64), N4: Dest);
10516 }
10517 }
10518 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
10519 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
10520 // Don't combine AND since emitComparison converts the AND to an ANDS
10521 // (a.k.a. TST) and the test in the test bit and branch instruction
10522 // becomes redundant. This would also increase register pressure.
10523 uint64_t SignBitPos;
10524 std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS);
10525 return DAG.getNode(Opcode: AArch64ISD::TBZ, DL, VT: MVT::Other, N1: Chain, N2: LHS,
10526 N3: DAG.getConstant(Val: SignBitPos, DL, VT: MVT::i64), N4: Dest);
10527 }
10528
10529 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
10530 // larger branch displacement but do prefer CB over cmp + br.
10531 if (Subtarget->hasCMPBR() &&
10532 AArch64CC::isValidCBCond(Code: changeIntCCToAArch64CC(CC)) &&
10533 ProduceNonFlagSettingCondBr) {
10534 SDValue Cond =
10535 DAG.getTargetConstant(Val: changeIntCCToAArch64CC(CC), DL, VT: MVT::i32);
10536 return DAG.getNode(Opcode: AArch64ISD::CB, DL, VT: MVT::Other, N1: Chain, N2: Cond, N3: LHS, N4: RHS,
10537 N5: Dest);
10538 }
10539
10540 SDValue CCVal;
10541 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, DL);
10542 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal,
10543 N4: Cmp);
10544 }
10545
10546 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
10547 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10548
10549 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10550 // clean. Some of them require two branches to implement.
10551 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
10552 AArch64CC::CondCode CC1, CC2;
10553 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
10554 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
10555 SDValue BR1 =
10556 DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: Chain, N2: Dest, N3: CC1Val, N4: Cmp);
10557 if (CC2 != AArch64CC::AL) {
10558 SDValue CC2Val = DAG.getConstant(Val: CC2, DL, VT: MVT::i32);
10559 return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL, VT: MVT::Other, N1: BR1, N2: Dest, N3: CC2Val,
10560 N4: Cmp);
10561 }
10562
10563 return BR1;
10564}
10565
10566SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
10567 SelectionDAG &DAG) const {
10568 if (!Subtarget->isNeonAvailable() &&
10569 !Subtarget->useSVEForFixedLengthVectors())
10570 return SDValue();
10571
10572 EVT VT = Op.getValueType();
10573 EVT IntVT = VT.changeTypeToInteger();
10574 SDLoc DL(Op);
10575
10576 SDValue In1 = Op.getOperand(i: 0);
10577 SDValue In2 = Op.getOperand(i: 1);
10578 EVT SrcVT = In2.getValueType();
10579
10580 if (!SrcVT.bitsEq(VT))
10581 In2 = DAG.getFPExtendOrRound(Op: In2, DL, VT);
10582
10583 if (VT.isScalableVector())
10584 IntVT =
10585 getPackedSVEVectorVT(VT: VT.getVectorElementType().changeTypeToInteger());
10586
10587 if (VT.isFixedLengthVector() &&
10588 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
10589 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
10590
10591 In1 = convertToScalableVector(DAG, VT: ContainerVT, V: In1);
10592 In2 = convertToScalableVector(DAG, VT: ContainerVT, V: In2);
10593
10594 SDValue Res = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: ContainerVT, N1: In1, N2: In2);
10595 return convertFromScalableVector(DAG, VT, V: Res);
10596 }
10597
10598 // With SVE, but without Neon, insert the scalars into scalable vectors and use
10599 // an SVE FCOPYSIGN.
10600 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
10601 Subtarget->isSVEorStreamingSVEAvailable()) {
10602 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
10603 return SDValue();
10604 EVT SVT = getPackedSVEVectorVT(VT);
10605
10606 SDValue Ins1 =
10607 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: SVT, N1: DAG.getUNDEF(VT: SVT), N2: In1,
10608 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10609 SDValue Ins2 =
10610 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: SVT, N1: DAG.getUNDEF(VT: SVT), N2: In2,
10611 N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10612 SDValue FCS = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: SVT, N1: Ins1, N2: Ins2);
10613 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: FCS,
10614 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10615 }
10616
10617 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
10618 if (VT.isScalableVector())
10619 return getSVESafeBitCast(VT, Op, DAG);
10620
10621 return DAG.getBitcast(VT, V: Op);
10622 };
10623
10624 SDValue VecVal1, VecVal2;
10625 EVT VecVT;
10626 auto SetVecVal = [&](int Idx = -1) {
10627 if (!VT.isVector()) {
10628 VecVal1 =
10629 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In1);
10630 VecVal2 =
10631 DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In2);
10632 } else {
10633 VecVal1 = BitCast(VecVT, In1, DAG);
10634 VecVal2 = BitCast(VecVT, In2, DAG);
10635 }
10636 };
10637 if (VT.isVector()) {
10638 VecVT = IntVT;
10639 SetVecVal();
10640 } else if (VT == MVT::f64) {
10641 VecVT = MVT::v2i64;
10642 SetVecVal(AArch64::dsub);
10643 } else if (VT == MVT::f32) {
10644 VecVT = MVT::v4i32;
10645 SetVecVal(AArch64::ssub);
10646 } else if (VT == MVT::f16 || VT == MVT::bf16) {
10647 VecVT = MVT::v8i16;
10648 SetVecVal(AArch64::hsub);
10649 } else {
10650 llvm_unreachable("Invalid type for copysign!");
10651 }
10652
10653 unsigned BitWidth = In1.getScalarValueSizeInBits();
10654 SDValue SignMaskV = DAG.getConstant(Val: ~APInt::getSignMask(BitWidth), DL, VT: VecVT);
10655
10656 // We want to materialize a mask with every bit but the high bit set, but the
10657 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10658 // 64-bit elements. Instead, materialize all bits set and then negate that.
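// For example, for v2i64 the per-lane 0x7fffffffffffffff mask is produced by
// applying FNEG (which only flips the sign bit) to an all-ones vector.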
10659 if (VT == MVT::f64 || VT == MVT::v2f64) {
10660 SignMaskV = DAG.getConstant(Val: APInt::getAllOnes(numBits: BitWidth), DL, VT: VecVT);
10661 SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f64, Operand: SignMaskV);
10662 SignMaskV = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::v2f64, Operand: SignMaskV);
10663 SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: SignMaskV);
10664 }
10665
10666 SDValue BSP =
10667 DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: VecVT, N1: SignMaskV, N2: VecVal1, N3: VecVal2);
10668 if (VT == MVT::f16 || VT == MVT::bf16)
10669 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT, Operand: BSP);
10670 if (VT == MVT::f32)
10671 return DAG.getTargetExtractSubreg(SRIdx: AArch64::ssub, DL, VT, Operand: BSP);
10672 if (VT == MVT::f64)
10673 return DAG.getTargetExtractSubreg(SRIdx: AArch64::dsub, DL, VT, Operand: BSP);
10674
10675 return BitCast(VT, BSP, DAG);
10676}
10677
10678SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10679 SelectionDAG &DAG) const {
10680 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10681 Kind: Attribute::NoImplicitFloat))
10682 return SDValue();
10683
10684 EVT VT = Op.getValueType();
10685 if (VT.isScalableVector() ||
10686 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10687 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTPOP_MERGE_PASSTHRU);
10688
10689 bool IsParity = Op.getOpcode() == ISD::PARITY;
10690 SDValue Val = Op.getOperand(i: 0);
10691 SDLoc DL(Op);
10692
10693 // For i32, the generic parity expansion using EORs is more efficient than
10694 // using floating point.
10695 if (VT == MVT::i32 && IsParity)
10696 return SDValue();
10697
10698 if (Subtarget->isSVEorStreamingSVEAvailable()) {
10699 if (VT == MVT::i32 || VT == MVT::i64) {
10700 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
10701 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
10702 N1: DAG.getUNDEF(VT: ContainerVT), N2: Val,
10703 N3: DAG.getVectorIdxConstant(Val: 0, DL));
10704 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: ContainerVT, Operand: Val);
10705 Val = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Val,
10706 N2: DAG.getVectorIdxConstant(Val: 0, DL));
10707 if (IsParity)
10708 Val = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Val, N2: DAG.getConstant(Val: 1, DL, VT));
10709 return Val;
10710 }
10711
10712 if (VT == MVT::i128) {
10713 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: Val);
10714 Val = convertToScalableVector(DAG, VT: MVT::nxv2i64, V: Val);
10715 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::nxv2i64, Operand: Val);
10716 Val = convertFromScalableVector(DAG, VT: MVT::v2i64, V: Val);
10717 Val = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i64, Operand: Val);
10718 Val = DAG.getZExtOrTrunc(Op: Val, DL, VT);
10719 if (IsParity)
10720 Val = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Val, N2: DAG.getConstant(Val: 1, DL, VT));
10721 return Val;
10722 }
10723 }
10724
10725 if (!Subtarget->isNeonAvailable())
10726 return SDValue();
10727
10728 // While there is no scalar popcount instruction, GPR popcount can
10729 // be more efficiently lowered to the following sequence that uses
10730 // AdvSIMD registers/instructions as long as the copies to/from
10731 // the AdvSIMD registers are cheap.
10732 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10733 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10734 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10735 // FMOV X0, D0 // copy result back to integer reg
10736 if (VT == MVT::i32 || VT == MVT::i64) {
10737 if (VT == MVT::i32)
10738 Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val);
10739 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Val);
10740
10741 SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v8i8, Operand: Val);
10742 SDValue AddV = DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: MVT::v8i8, Operand: CtPop);
10743 AddV = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL,
10744 VT: VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, Operand: AddV);
10745 AddV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: AddV,
10746 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10747 if (IsParity)
10748 AddV = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: AddV, N2: DAG.getConstant(Val: 1, DL, VT));
10749 return AddV;
10750 } else if (VT == MVT::i128) {
10751 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: Val);
10752
10753 SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v16i8, Operand: Val);
10754 SDValue AddV = DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: MVT::v16i8, Operand: CtPop);
10755 AddV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64,
10756 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MVT::v2i64, Operand: AddV),
10757 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
10758 AddV = DAG.getZExtOrTrunc(Op: AddV, DL, VT);
10759 if (IsParity)
10760 AddV = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: AddV, N2: DAG.getConstant(Val: 1, DL, VT));
10761 return AddV;
10762 }
10763
10764 assert(!IsParity && "ISD::PARITY of vector types not supported");
10765
10766 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10767 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10768 "Unexpected type for custom ctpop lowering");
10769
10770 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10771 Val = DAG.getBitcast(VT: VT8Bit, V: Val);
10772 Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Val);
10773
10774 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10775 VT.getVectorNumElements() >= 2) {
10776 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10777 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: DT);
10778 SDValue Ones = DAG.getConstant(Val: 1, DL, VT: VT8Bit);
10779
10780 if (VT == MVT::v2i64) {
10781 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10782 Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT, Operand: Val);
10783 } else if (VT == MVT::v2i32) {
10784 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10785 } else if (VT == MVT::v4i32) {
10786 Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val);
10787 } else {
10788 llvm_unreachable("Unexpected type for custom ctpop lowering");
10789 }
10790
10791 return Val;
10792 }
10793
10794 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10795 unsigned EltSize = 8;
10796 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10797 while (EltSize != VT.getScalarSizeInBits()) {
10798 EltSize *= 2;
10799 NumElts /= 2;
10800 MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts);
10801 Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: WidenVT, Operand: Val);
10802 }
10803
10804 return Val;
10805}
10806
10807SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10808 EVT VT = Op.getValueType();
10809 assert(VT.isScalableVector() ||
10810 useSVEForFixedLengthVectorVT(
10811 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10812
10813 SDLoc DL(Op);
10814 SDValue RBIT = DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT, Operand: Op.getOperand(i: 0));
10815 return DAG.getNode(Opcode: ISD::CTLZ, DL, VT, Operand: RBIT);
10816}
10817
10818SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10819 SelectionDAG &DAG) const {
10820
10821 EVT VT = Op.getValueType();
10822 SDLoc DL(Op);
10823 unsigned Opcode = Op.getOpcode();
10824 ISD::CondCode CC;
10825 switch (Opcode) {
10826 default:
10827 llvm_unreachable("Wrong instruction");
10828 case ISD::SMAX:
10829 CC = ISD::SETGT;
10830 break;
10831 case ISD::SMIN:
10832 CC = ISD::SETLT;
10833 break;
10834 case ISD::UMAX:
10835 CC = ISD::SETUGT;
10836 break;
10837 case ISD::UMIN:
10838 CC = ISD::SETULT;
10839 break;
10840 }
10841
10842 if (VT.isScalableVector() ||
10843 useSVEForFixedLengthVectorVT(
10844 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10845 switch (Opcode) {
10846 default:
10847 llvm_unreachable("Wrong instruction");
10848 case ISD::SMAX:
10849 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMAX_PRED);
10850 case ISD::SMIN:
10851 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMIN_PRED);
10852 case ISD::UMAX:
10853 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMAX_PRED);
10854 case ISD::UMIN:
10855 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMIN_PRED);
10856 }
10857 }
10858
10859 SDValue Op0 = Op.getOperand(i: 0);
10860 SDValue Op1 = Op.getOperand(i: 1);
10861 SDValue Cond = DAG.getSetCC(DL, VT, LHS: Op0, RHS: Op1, Cond: CC);
10862 return DAG.getSelect(DL, VT, Cond, LHS: Op0, RHS: Op1);
10863}
10864
10865SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10866 SelectionDAG &DAG) const {
10867 EVT VT = Op.getValueType();
10868
10869 if (VT.isScalableVector() ||
10870 useSVEForFixedLengthVectorVT(
10871 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10872 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10873
10874 SDLoc DL(Op);
10875 SDValue REVB;
10876 MVT VST;
10877
10878 switch (VT.getSimpleVT().SimpleTy) {
10879 default:
10880 llvm_unreachable("Invalid type for bitreverse!");
10881
10882 case MVT::v2i32: {
10883 VST = MVT::v8i8;
10884 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
10885
10886 break;
10887 }
10888
10889 case MVT::v4i32: {
10890 VST = MVT::v16i8;
10891 REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0));
10892
10893 break;
10894 }
10895
10896 case MVT::v1i64: {
10897 VST = MVT::v8i8;
10898 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
10899
10900 break;
10901 }
10902
10903 case MVT::v2i64: {
10904 VST = MVT::v16i8;
10905 REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0));
10906
10907 break;
10908 }
10909 }
10910
10911 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
10912 Operand: DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT: VST, Operand: REVB));
10913}
10914
10915 // Check whether N forms a chain of ORs over XOR comparisons.
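// For example, (or (or (xor a0, a1), (xor b0, b1)), (xor c0, c1)) collects
// the pairs {a0,a1}, {b0,b1} and {c0,c1} into WorkList.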
10916static bool
10917isOrXorChain(SDValue N, unsigned &Num,
10918 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10919 if (Num == MaxXors)
10920 return false;
10921
10922 // Skip the one-use zext
10923 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10924 N = N->getOperand(Num: 0);
10925
10926 // The leaf node must be XOR
10927 if (N->getOpcode() == ISD::XOR) {
10928 WorkList.push_back(Elt: std::make_pair(x: N->getOperand(Num: 0), y: N->getOperand(Num: 1)));
10929 Num++;
10930 return true;
10931 }
10932
10933 // All the non-leaf nodes must be OR.
10934 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10935 return false;
10936
10937 if (isOrXorChain(N: N->getOperand(Num: 0), Num, WorkList) &&
10938 isOrXorChain(N: N->getOperand(Num: 1), Num, WorkList))
10939 return true;
10940 return false;
10941}
10942
10943 // Transform chains of ORs and XORs, which are usually outlined from memcmp/bcmp.
10944static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10945 SDValue LHS = N->getOperand(Num: 0);
10946 SDValue RHS = N->getOperand(Num: 1);
10947 SDLoc DL(N);
10948 EVT VT = N->getValueType(ResNo: 0);
10949 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10950
10951 // Only handle integer compares.
10952 if (N->getOpcode() != ISD::SETCC)
10953 return SDValue();
10954
10955 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
10956 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10957 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10958 unsigned NumXors = 0;
10959 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(V: RHS) &&
10960 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10961 isOrXorChain(N: LHS, Num&: NumXors, WorkList)) {
10962 SDValue XOR0, XOR1;
10963 std::tie(args&: XOR0, args&: XOR1) = WorkList[0];
10964 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10965 SDValue Cmp = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
10966 for (unsigned I = 1; I < WorkList.size(); I++) {
10967 std::tie(args&: XOR0, args&: XOR1) = WorkList[I];
10968 SDValue CmpChain = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond);
10969 Cmp = DAG.getNode(Opcode: LogicOp, DL, VT, N1: Cmp, N2: CmpChain);
10970 }
10971
10972 // Exit early with the combined comparison; this helps reduce indentation.
10973 return Cmp;
10974 }
10975
10976 return SDValue();
10977}
10978
10979SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10980
10981 if (Op.getValueType().isVector())
10982 return LowerVSETCC(Op, DAG);
10983
10984 bool IsStrict = Op->isStrictFPOpcode();
10985 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10986 unsigned OpNo = IsStrict ? 1 : 0;
10987 SDValue Chain;
10988 if (IsStrict)
10989 Chain = Op.getOperand(i: 0);
10990 SDValue LHS = Op.getOperand(i: OpNo + 0);
10991 SDValue RHS = Op.getOperand(i: OpNo + 1);
10992 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: OpNo + 2))->get();
10993 SDLoc DL(Op);
10994
10995 // We chose ZeroOrOneBooleanContents, so use zero and one.
10996 EVT VT = Op.getValueType();
10997 SDValue TVal = DAG.getConstant(Val: 1, DL, VT);
10998 SDValue FVal = DAG.getConstant(Val: 0, DL, VT);
10999
11000 // Handle f128 first, since one possible outcome is a normal integer
11001 // comparison which gets picked up by the next if statement.
11002 if (LHS.getValueType() == MVT::f128) {
11003 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL, OldLHS: LHS, OldRHS: RHS, Chain,
11004 IsSignaling);
11005
11006 // If softenSetCCOperands returned a scalar, use it.
11007 if (!RHS.getNode()) {
11008 assert(LHS.getValueType() == Op.getValueType() &&
11009 "Unexpected setcc expansion!");
11010 return IsStrict ? DAG.getMergeValues(Ops: {LHS, Chain}, dl: DL) : LHS;
11011 }
11012 }
11013
11014 if (LHS.getValueType().isInteger()) {
11015
11016 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11017
11018 SDValue CCVal;
11019 SDValue Cmp = getAArch64Cmp(
11020 LHS, RHS, CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), AArch64cc&: CCVal, DAG, DL);
11021
11022 // Note that we inverted the condition above, so we reverse the order of
11023 // the true and false operands here. This will allow the setcc to be
11024 // matched to a single CSINC instruction.
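// For example, an i32 (setcc x, y, eq) then becomes "cmp x, y" followed by
// "cset w0, eq", which is a CSINC of wzr under the inverted condition.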
11025 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: FVal, N2: TVal, N3: CCVal, N4: Cmp);
11026 return IsStrict ? DAG.getMergeValues(Ops: {Res, Chain}, dl: DL) : Res;
11027 }
11028
11029 // Now we know we're dealing with FP values.
11030 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11031 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11032
11033 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11034 // and do the comparison.
11035 SDValue Cmp;
11036 if (IsStrict)
11037 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11038 else
11039 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11040
11041 AArch64CC::CondCode CC1, CC2;
11042 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
11043 SDValue Res;
11044 if (CC2 == AArch64CC::AL) {
11045 changeFPCCToAArch64CC(CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), CondCode&: CC1,
11046 CondCode2&: CC2);
11047 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
11048
11049 // Note that we inverted the condition above, so we reverse the order of
11050 // the true and false operands here. This will allow the setcc to be
11051 // matched to a single CSINC instruction.
11052 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: FVal, N2: TVal, N3: CC1Val, N4: Cmp);
11053 } else {
11054 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11055 // totally clean. Some of them require two CSELs to implement. As is in
11056 // this case, we emit the first CSEL and then emit a second using the output
11057 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11058
11059 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11060 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
11061 SDValue CS1 =
11062 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
11063
11064 SDValue CC2Val = DAG.getConstant(Val: CC2, DL, VT: MVT::i32);
11065 Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
11066 }
11067 return IsStrict ? DAG.getMergeValues(Ops: {Res, Cmp.getValue(R: 1)}, dl: DL) : Res;
11068}
11069
11070SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11071 SelectionDAG &DAG) const {
11072
11073 SDValue LHS = Op.getOperand(i: 0);
11074 SDValue RHS = Op.getOperand(i: 1);
11075 EVT VT = LHS.getValueType();
11076 if (VT != MVT::i32 && VT != MVT::i64)
11077 return SDValue();
11078
11079 SDLoc DL(Op);
11080 SDValue Carry = Op.getOperand(i: 2);
11081 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11082 SDValue InvCarry = valueToCarryFlag(Value: Carry, DAG, Invert: true);
11083 SDValue Cmp = DAG.getNode(Opcode: AArch64ISD::SBCS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue),
11084 N1: LHS, N2: RHS, N3: InvCarry);
11085
11086 EVT OpVT = Op.getValueType();
11087 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OpVT);
11088 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OpVT);
11089
11090 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get();
11091 ISD::CondCode CondInv = ISD::getSetCCInverse(Operation: Cond, Type: VT);
11092 SDValue CCVal =
11093 DAG.getConstant(Val: changeIntCCToAArch64CC(CC: CondInv), DL, VT: MVT::i32);
11094 // Inputs are swapped because the condition is inverted. This will allow
11095 // matching with a single CSINC instruction.
11096 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OpVT, N1: FVal, N2: TVal, N3: CCVal,
11097 N4: Cmp.getValue(R: 1));
11098}
11099
11100/// Emit vector comparison for floating-point values, producing a mask.
11101static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11102 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11103 const SDLoc &DL, SelectionDAG &DAG) {
11104 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11105 "function only supposed to emit natural comparisons");
11106
11107 switch (CC) {
11108 default:
11109 return SDValue();
11110 case AArch64CC::NE: {
11111 SDValue Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL, VT, N1: LHS, N2: RHS);
11112 // Use vector semantics for the inversion to potentially save a copy between
11113 // SIMD and regular registers.
11114 if (!LHS.getValueType().isVector()) {
11115 EVT VecVT =
11116 EVT::getVectorVT(Context&: *DAG.getContext(), VT, NumElements: 128 / VT.getSizeInBits());
11117 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
11118 SDValue MaskVec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VecVT,
11119 N1: DAG.getUNDEF(VT: VecVT), N2: Fcmeq, N3: Zero);
11120 SDValue InvertedMask = DAG.getNOT(DL, Val: MaskVec, VT: VecVT);
11121 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: InvertedMask, N2: Zero);
11122 }
11123 return DAG.getNOT(DL, Val: Fcmeq, VT);
11124 }
11125 case AArch64CC::EQ:
11126 return DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL, VT, N1: LHS, N2: RHS);
11127 case AArch64CC::GE:
11128 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL, VT, N1: LHS, N2: RHS);
11129 case AArch64CC::GT:
11130 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL, VT, N1: LHS, N2: RHS);
11131 case AArch64CC::LE:
11132 if (!NoNans)
11133 return SDValue();
11134 // If we ignore NaNs then we can use the LS implementation.
11135 [[fallthrough]];
11136 case AArch64CC::LS:
11137 return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL, VT, N1: RHS, N2: LHS);
11138 case AArch64CC::LT:
11139 if (!NoNans)
11140 return SDValue();
11141 // If we ignore NaNs then we can use the MI implementation.
11142 [[fallthrough]];
11143 case AArch64CC::MI:
11144 return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL, VT, N1: RHS, N2: LHS);
11145 }
11146}
11147
11148/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11149/// values are scalars, try to emit a mask generating vector instruction.
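/// For example, (select_cc (setogt x, y), i64 -1, i64 0) with f64 operands can
/// be emitted as a single "fcmgt d0, dx, dy", whose result is already the
/// desired all-ones/all-zeros mask.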
11150static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11151 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11152 const SDLoc &DL, SelectionDAG &DAG) {
11153 assert(!LHS.getValueType().isVector());
11154 assert(!RHS.getValueType().isVector());
11155
11156 auto *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
11157 auto *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
11158 if (!CTVal || !CFVal)
11159 return {};
11160 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11161 !(CTVal->isZero() && CFVal->isAllOnes()))
11162 return {};
11163
11164 if (CTVal->isZero())
11165 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11166
11167 EVT VT = TVal.getValueType();
11168 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11169 return {};
11170
11171 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11172 bool OneNaN = false;
11173 if (LHS == RHS) {
11174 OneNaN = true;
11175 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
11176 OneNaN = true;
11177 RHS = LHS;
11178 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
11179 OneNaN = true;
11180 LHS = RHS;
11181 }
11182 if (OneNaN)
11183 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11184 }
11185
11186 AArch64CC::CondCode CC1;
11187 AArch64CC::CondCode CC2;
11188 bool ShouldInvert = false;
11189 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
11190 SDValue Cmp = emitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT, DL, DAG);
11191 SDValue Cmp2;
11192 if (CC2 != AArch64CC::AL) {
11193 Cmp2 = emitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT, DL, DAG);
11194 if (!Cmp2)
11195 return {};
11196 }
11197 if (!Cmp2 && !ShouldInvert)
11198 return Cmp;
11199
11200 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT, NumElements: 128 / VT.getSizeInBits());
11201 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
11202 Cmp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VecVT, N1: DAG.getUNDEF(VT: VecVT), N2: Cmp,
11203 N3: Zero);
11204 if (Cmp2) {
11205 Cmp2 = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VecVT, N1: DAG.getUNDEF(VT: VecVT),
11206 N2: Cmp2, N3: Zero);
11207 Cmp = DAG.getNode(Opcode: ISD::OR, DL, VT: VecVT, N1: Cmp, N2: Cmp2);
11208 }
11209 if (ShouldInvert)
11210 Cmp = DAG.getNOT(DL, Val: Cmp, VT: VecVT);
11211 Cmp = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Cmp, N2: Zero);
11212 return Cmp;
11213}
11214
11215SDValue AArch64TargetLowering::LowerSELECT_CC(
11216 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11217 iterator_range<SDNode::user_iterator> Users, bool HasNoNaNs,
11218 const SDLoc &DL, SelectionDAG &DAG) const {
11219 // Handle f128 first, because it will result in a comparison of some RTLIB
11220 // call result against zero.
11221 if (LHS.getValueType() == MVT::f128) {
11222 softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL, OldLHS: LHS, OldRHS: RHS);
11223
11224 // If softenSetCCOperands returned a scalar, we need to compare the result
11225 // against zero to select between true and false values.
11226 if (!RHS.getNode()) {
11227 RHS = DAG.getConstant(Val: 0, DL, VT: LHS.getValueType());
11228 CC = ISD::SETNE;
11229 }
11230 }
11231
11232 // Also handle f16 (without full fp16) and bf16, for which we need an f32
11233 // comparison.
11233 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11234 LHS.getValueType() == MVT::bf16) {
11235 LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS);
11236 RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS);
11237 }
11238
11239 // Next, handle integers.
11240 if (LHS.getValueType().isInteger()) {
11241 assert((LHS.getValueType() == RHS.getValueType()) &&
11242 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11243
11244 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal);
11245 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal);
11246 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS);
11247 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
11248 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
11249 // supported types.
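// For example, for i32 "x > -1 ? 1 : -1" becomes
//   asr w8, w0, #31
//   orr w0, w8, #1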
11250 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
11251 CTVal->isOne() && CFVal->isAllOnes() &&
11252 LHS.getValueType() == TVal.getValueType()) {
11253 EVT VT = LHS.getValueType();
11254 SDValue Shift =
11255 DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: LHS,
11256 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL, VT));
11257 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Shift, N2: DAG.getConstant(Val: 1, DL, VT));
11258 }
11259
11260 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11261 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11262 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11263 // Both require fewer instructions than a compare and conditional select.
11264 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11265 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11266 LHS.getValueType() == RHS.getValueType()) {
11267 EVT VT = LHS.getValueType();
11268 SDValue Shift =
11269 DAG.getNode(Opcode: ISD::SRA, DL, VT, N1: LHS,
11270 N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL, VT));
11271
11272 if (CC == ISD::SETGT)
11273 Shift = DAG.getNOT(DL, Val: Shift, VT);
11274
11275 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: LHS, N2: Shift);
11276 }
11277
11278 unsigned Opcode = AArch64ISD::CSEL;
11279
11280 // If both the TVal and the FVal are constants, see if we can swap them in
11281 // order to form a CSINV or CSINC out of them.
11282 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11283 std::swap(a&: TVal, b&: FVal);
11284 std::swap(a&: CTVal, b&: CFVal);
11285 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11286 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11287 std::swap(a&: TVal, b&: FVal);
11288 std::swap(a&: CTVal, b&: CFVal);
11289 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11290 } else if (TVal.getOpcode() == ISD::XOR) {
11291 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11292 // with a CSINV rather than a CSEL.
11293 if (isAllOnesConstant(V: TVal.getOperand(i: 1))) {
11294 std::swap(a&: TVal, b&: FVal);
11295 std::swap(a&: CTVal, b&: CFVal);
11296 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11297 }
11298 } else if (TVal.getOpcode() == ISD::SUB) {
11299 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11300 // that we can match with a CSNEG rather than a CSEL.
11301 if (isNullConstant(V: TVal.getOperand(i: 0))) {
11302 std::swap(a&: TVal, b&: FVal);
11303 std::swap(a&: CTVal, b&: CFVal);
11304 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11305 }
11306 } else if (CTVal && CFVal) {
11307 const int64_t TrueVal = CTVal->getSExtValue();
11308 const int64_t FalseVal = CFVal->getSExtValue();
11309 bool Swap = false;
11310
11311 // If both TVal and FVal are constants, see if FVal is the
11312 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11313 // instead of a CSEL in that case.
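// For example, constants (7, -8) allow CSINV (7 == ~(-8)), (5, -5) allow
// CSNEG, and (4, 3) or (3, 4) allow CSINC.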
11314 if (TrueVal == ~FalseVal) {
11315 Opcode = AArch64ISD::CSINV;
11316 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11317 TrueVal == -FalseVal) {
11318 Opcode = AArch64ISD::CSNEG;
11319 } else if (TVal.getValueType() == MVT::i32) {
11320 // If our operands are only 32-bit wide, make sure we use 32-bit
11321 // arithmetic for the check whether we can use CSINC. This ensures that
11322 // the addition in the check will wrap around properly in case there is
11323 // an overflow (which would not be the case if we do the check with
11324 // 64-bit arithmetic).
11325 const uint32_t TrueVal32 = CTVal->getZExtValue();
11326 const uint32_t FalseVal32 = CFVal->getZExtValue();
11327
11328 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11329 Opcode = AArch64ISD::CSINC;
11330
11331 if (TrueVal32 > FalseVal32) {
11332 Swap = true;
11333 }
11334 }
11335 } else {
11336 // 64-bit check whether we can use CSINC.
11337 const uint64_t TrueVal64 = TrueVal;
11338 const uint64_t FalseVal64 = FalseVal;
11339
11340 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11341 Opcode = AArch64ISD::CSINC;
11342
11343 if (TrueVal > FalseVal) {
11344 Swap = true;
11345 }
11346 }
11347 }
11348
11349 // Swap TVal and FVal if necessary.
11350 if (Swap) {
11351 std::swap(a&: TVal, b&: FVal);
11352 std::swap(a&: CTVal, b&: CFVal);
11353 CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType());
11354 }
11355
11356 if (Opcode != AArch64ISD::CSEL) {
11357 // Drop FVal since we can get its value by simply inverting/negating
11358 // TVal.
11359 FVal = TVal;
11360 }
11361 }
11362
11363 // Avoid materializing a constant when possible by reusing a known value in
11364 // a register. However, don't perform this optimization if the known value
11365 // is one, zero or negative one in the case of a CSEL. We can always
11366 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11367 // FVal, respectively.
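// For example, "a == 7 ? 7 : x" can be selected as "cmp a, #7" followed by a
// CSEL that reuses the register holding a instead of materializing 7 again.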
11368 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(Val&: RHS);
11369 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11370 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11371 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11372 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11373 // "a != C ? x : a" to avoid materializing C.
11374 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11375 TVal = LHS;
11376 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11377 FVal = LHS;
11378 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11379 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11380 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11381 // avoid materializing C.
11382 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11383 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11384 Opcode = AArch64ISD::CSINV;
11385 TVal = LHS;
11386 FVal = DAG.getConstant(Val: 0, DL, VT: FVal.getValueType());
11387 }
11388 }
11389
11390 SDValue CCVal;
11391 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, DL);
11392 EVT VT = TVal.getValueType();
11393 return DAG.getNode(Opcode, DL, VT, N1: TVal, N2: FVal, N3: CCVal, N4: Cmp);
11394 }
11395
11396 // Now we know we're dealing with FP values.
11397 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11398 LHS.getValueType() == MVT::f64);
11399 assert(LHS.getValueType() == RHS.getValueType());
11400 EVT VT = TVal.getValueType();
11401
11402 // If the purpose of the comparison is to select between all ones
11403 // or all zeros, try to use a vector comparison because the operands are
11404 // already stored in SIMD registers.
11405 if (Subtarget->isNeonAvailable() && all_of(Range&: Users, P: [](const SDNode *U) {
11406 switch (U->getOpcode()) {
11407 default:
11408 return false;
11409 case ISD::INSERT_VECTOR_ELT:
11410 case ISD::SCALAR_TO_VECTOR:
11411 case AArch64ISD::DUP:
11412 return true;
11413 }
11414 })) {
11415 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || HasNoNaNs;
11416 SDValue VectorCmp =
11417 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11418 if (VectorCmp)
11419 return VectorCmp;
11420 }
11421
11422 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11423
11424 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11425 // clean. Some of them require two CSELs to implement.
11426 AArch64CC::CondCode CC1, CC2;
11427 changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2);
11428
11429 if (DAG.getTarget().Options.UnsafeFPMath) {
11430 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
11431 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
11432 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(Val&: RHS);
11433 if (RHSVal && RHSVal->isZero()) {
11434 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(Val&: FVal);
11435 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(Val&: TVal);
11436
11437 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
11438 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
11439 TVal = LHS;
11440 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
11441 CFVal && CFVal->isZero() &&
11442 FVal.getValueType() == LHS.getValueType())
11443 FVal = LHS;
11444 }
11445 }
11446
11447 // Emit first, and possibly only, CSEL.
11448 SDValue CC1Val = DAG.getConstant(Val: CC1, DL, VT: MVT::i32);
11449 SDValue CS1 = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp);
11450
11451 // If we need a second CSEL, emit it, using the output of the first as the
11452 // RHS. We're effectively OR'ing the two CC's together.
11453 if (CC2 != AArch64CC::AL) {
11454 SDValue CC2Val = DAG.getConstant(Val: CC2, DL, VT: MVT::i32);
11455 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp);
11456 }
11457
11458 // Otherwise, return the output of the first CSEL.
11459 return CS1;
11460}
11461
11462SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
11463 SelectionDAG &DAG) const {
11464 EVT Ty = Op.getValueType();
11465 auto Idx = Op.getConstantOperandAPInt(i: 2);
11466 int64_t IdxVal = Idx.getSExtValue();
11467 assert(Ty.isScalableVector() &&
11468 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
11469
11470 // We can use the splice instruction for certain index values where we are
11471 // able to efficiently generate the correct predicate. The index will be
11472 // inverted and used directly as the input to the ptrue instruction, i.e.
11473 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
11474 // splice predicate. However, we can only do this if we can guarantee that
11475 // there are enough elements in the vector, hence we check the index <= min
11476 // number of elements.
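// For example, for Idx == -2 this emits a "ptrue pN.<T>, vl2", reverses that
// predicate, and feeds it to an SVE SPLICE of the two operands.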
11477 std::optional<unsigned> PredPattern;
11478 if (Ty.isScalableVector() && IdxVal < 0 &&
11479 (PredPattern = getSVEPredPatternFromNumElements(MinNumElts: std::abs(i: IdxVal))) !=
11480 std::nullopt) {
11481 SDLoc DL(Op);
11482
11483 // Create a predicate where all but the last -IdxVal elements are false.
11484 EVT PredVT = Ty.changeVectorElementType(EltVT: MVT::i1);
11485 SDValue Pred = getPTrue(DAG, DL, VT: PredVT, Pattern: *PredPattern);
11486 Pred = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: PredVT, Operand: Pred);
11487
11488 // Now splice the two inputs together using the predicate.
11489 return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Ty, N1: Pred, N2: Op.getOperand(i: 0),
11490 N3: Op.getOperand(i: 1));
11491 }
11492
11493 // We can select to an EXT instruction when indexing the first 256 bytes.
11494 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
11495 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
11496 return Op;
11497
11498 return SDValue();
11499}
11500
11501SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
11502 SelectionDAG &DAG) const {
11503 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get();
11504 SDValue LHS = Op.getOperand(i: 0);
11505 SDValue RHS = Op.getOperand(i: 1);
11506 SDValue TVal = Op.getOperand(i: 2);
11507 SDValue FVal = Op.getOperand(i: 3);
11508 bool HasNoNans = Op->getFlags().hasNoNaNs();
11509 SDLoc DL(Op);
11510 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Users: Op->users(), HasNoNaNs: HasNoNans, DL,
11511 DAG);
11512}
11513
11514SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
11515 SelectionDAG &DAG) const {
11516 SDValue CCVal = Op->getOperand(Num: 0);
11517 SDValue TVal = Op->getOperand(Num: 1);
11518 SDValue FVal = Op->getOperand(Num: 2);
11519 bool HasNoNans = Op->getFlags().hasNoNaNs();
11520 SDLoc DL(Op);
11521
11522 EVT Ty = Op.getValueType();
11523 if (Ty == MVT::aarch64svcount) {
11524 TVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: TVal);
11525 FVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: FVal);
11526 SDValue Sel =
11527 DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::nxv16i1, N1: CCVal, N2: TVal, N3: FVal);
11528 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Ty, Operand: Sel);
11529 }
11530
11531 if (Ty.isScalableVector()) {
11532 MVT PredVT = MVT::getVectorVT(VT: MVT::i1, EC: Ty.getVectorElementCount());
11533 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: CCVal);
11534 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
11535 }
11536
11537 if (useSVEForFixedLengthVectorVT(VT: Ty, OverrideNEON: !Subtarget->isNeonAvailable())) {
11538 // FIXME: Ideally this would be the same as above using i1 types, however
11539 // for the moment we can't deal with fixed i1 vector types properly, so
11540 // instead extend the predicate to a result type sized integer vector.
11541 MVT SplatValVT = MVT::getIntegerVT(BitWidth: Ty.getScalarSizeInBits());
11542 MVT PredVT = MVT::getVectorVT(VT: SplatValVT, EC: Ty.getVectorElementCount());
11543 SDValue SplatVal = DAG.getSExtOrTrunc(Op: CCVal, DL, VT: SplatValVT);
11544 SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: SplatVal);
11545 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal);
11546 }
11547
11548 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
11549 // instruction.
11550 if (ISD::isOverflowIntrOpRes(Op: CCVal)) {
11551 // Only lower legal XALUO ops.
11552 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: CCVal->getValueType(ResNo: 0)))
11553 return SDValue();
11554
11555 AArch64CC::CondCode OFCC;
11556 SDValue Value, Overflow;
11557 std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: CCVal.getValue(R: 0), DAG);
11558 SDValue CCVal = DAG.getConstant(Val: OFCC, DL, VT: MVT::i32);
11559
11560 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal,
11561 N3: CCVal, N4: Overflow);
11562 }
11563
11564 // Lower it the same way as we would lower a SELECT_CC node.
11565 ISD::CondCode CC;
11566 SDValue LHS, RHS;
11567 if (CCVal.getOpcode() == ISD::SETCC) {
11568 LHS = CCVal.getOperand(i: 0);
11569 RHS = CCVal.getOperand(i: 1);
11570 CC = cast<CondCodeSDNode>(Val: CCVal.getOperand(i: 2))->get();
11571 } else {
11572 LHS = CCVal;
11573 RHS = DAG.getConstant(Val: 0, DL, VT: CCVal.getValueType());
11574 CC = ISD::SETNE;
11575 }
11576
11577 // If we are lowering an f16/bf16 and we do not have full FP16 support,
11578 // convert to an f32 in order to use FCSELSrrr.
11579 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11580 TVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
11581 Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: TVal);
11582 FVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
11583 Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: FVal);
11584 }
11585
11586 SDValue Res =
11587 LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Users: Op->users(), HasNoNaNs: HasNoNans, DL, DAG);
11588
11589 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
11590 return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: Ty, Operand: Res);
11591 }
11592
11593 return Res;
11594}
11595
11596SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
11597 SelectionDAG &DAG) const {
11598 // Jump table entries are PC-relative offsets. No additional tweaking
11599 // is necessary here; just get the address of the jump table.
11600 JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op);
11601
11602 CodeModel::Model CM = getTargetMachine().getCodeModel();
11603 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
11604 !Subtarget->isTargetMachO())
11605 return getAddrLarge(N: JT, DAG);
11606 if (CM == CodeModel::Tiny)
11607 return getAddrTiny(N: JT, DAG);
11608 return getAddr(N: JT, DAG);
11609}
11610
11611SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
11612 SelectionDAG &DAG) const {
11613 // Jump table entries are PC-relative offsets. No additional tweaking
11614 // is necessary here; just get the address of the jump table.
11615 SDLoc DL(Op);
11616 SDValue JT = Op.getOperand(i: 1);
11617 SDValue Entry = Op.getOperand(i: 2);
11618 int JTI = cast<JumpTableSDNode>(Val: JT.getNode())->getIndex();
11619
11620 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11621 AFI->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
11622
11623 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
11624 // sequence later, to guarantee the integrity of the intermediate values.
11625 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11626 Kind: "aarch64-jump-table-hardening")) {
11627 CodeModel::Model CM = getTargetMachine().getCodeModel();
11628 if (Subtarget->isTargetMachO()) {
11629 if (CM != CodeModel::Small && CM != CodeModel::Large)
11630 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
11631 } else {
11632 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
11633 assert(Subtarget->isTargetELF() &&
11634 "jump table hardening only supported on MachO/ELF");
11635 if (CM != CodeModel::Small)
11636 report_fatal_error(reason: "Unsupported code-model for hardened jump-table");
11637 }
11638
11639 SDValue X16Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::X16,
11640 N: Entry, Glue: SDValue());
11641 SDNode *B = DAG.getMachineNode(Opcode: AArch64::BR_JumpTable, dl: DL, VT: MVT::Other,
11642 Op1: DAG.getTargetJumpTable(JTI, VT: MVT::i32),
11643 Op2: X16Copy.getValue(R: 0), Op3: X16Copy.getValue(R: 1));
11644 return SDValue(B, 0);
11645 }
11646
11647 SDNode *Dest =
11648 DAG.getMachineNode(Opcode: AArch64::JumpTableDest32, dl: DL, VT1: MVT::i64, VT2: MVT::i64, Op1: JT,
11649 Op2: Entry, Op3: DAG.getTargetJumpTable(JTI, VT: MVT::i32));
11650 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Chain: Op.getOperand(i: 0), DL);
11651 return DAG.getNode(Opcode: ISD::BRIND, DL, VT: MVT::Other, N1: JTInfo, N2: SDValue(Dest, 0));
11652}
11653
11654SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
11655 SDValue Chain = Op.getOperand(i: 0);
11656 SDValue Dest = Op.getOperand(i: 1);
11657
11658 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
11659 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
11660 if (Dest->isMachineOpcode() &&
11661 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
11662 return SDValue();
11663
11664 const MachineFunction &MF = DAG.getMachineFunction();
11665 std::optional<uint16_t> BADisc =
11666 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: MF.getFunction());
11667 if (!BADisc)
11668 return SDValue();
11669
11670 SDLoc DL(Op);
11671
11672 SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64);
11673 SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32);
11674 SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
11675
11676 SDNode *BrA = DAG.getMachineNode(Opcode: AArch64::BRA, dl: DL, VT: MVT::Other,
11677 Ops: {Dest, Key, Disc, AddrDisc, Chain});
11678 return SDValue(BrA, 0);
11679}
11680
11681SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
11682 SelectionDAG &DAG) const {
11683 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op);
11684 CodeModel::Model CM = getTargetMachine().getCodeModel();
11685 if (CM == CodeModel::Large) {
11686 // Use the GOT for the large code model on iOS.
11687 if (Subtarget->isTargetMachO()) {
11688 return getGOT(N: CP, DAG);
11689 }
11690 if (!getTargetMachine().isPositionIndependent())
11691 return getAddrLarge(N: CP, DAG);
11692 } else if (CM == CodeModel::Tiny) {
11693 return getAddrTiny(N: CP, DAG);
11694 }
11695 return getAddr(N: CP, DAG);
11696}
11697
11698SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
11699 SelectionDAG &DAG) const {
11700 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Val&: Op);
11701 const BlockAddress *BA = BAN->getBlockAddress();
11702
11703 if (std::optional<uint16_t> BADisc =
11704 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
11705 ParentFn: *BA->getFunction())) {
11706 SDLoc DL(Op);
11707
11708 // This isn't cheap, but BRIND is rare.
11709 SDValue TargetBA = DAG.getTargetBlockAddress(BA, VT: BAN->getValueType(ResNo: 0));
11710
11711 SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64);
11712
11713 SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32);
11714 SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
11715
11716 SDNode *MOV =
11717 DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, ResultTys: {MVT::Other, MVT::Glue},
11718 Ops: {TargetBA, Key, AddrDisc, Disc});
11719 return DAG.getCopyFromReg(Chain: SDValue(MOV, 0), dl: DL, Reg: AArch64::X16, VT: MVT::i64,
11720 Glue: SDValue(MOV, 1));
11721 }
11722
11723 CodeModel::Model CM = getTargetMachine().getCodeModel();
11724 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
11725 if (!getTargetMachine().isPositionIndependent())
11726 return getAddrLarge(N: BAN, DAG);
11727 } else if (CM == CodeModel::Tiny) {
11728 return getAddrTiny(N: BAN, DAG);
11729 }
11730 return getAddr(N: BAN, DAG);
11731}
11732
11733SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
11734 SelectionDAG &DAG) const {
11735 AArch64FunctionInfo *FuncInfo =
11736 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11737
11738 SDLoc DL(Op);
11739 SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(),
11740 VT: getPointerTy(DL: DAG.getDataLayout()));
11741 FR = DAG.getZExtOrTrunc(Op: FR, DL, VT: getPointerMemTy(DL: DAG.getDataLayout()));
11742 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11743 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
11744 PtrInfo: MachinePointerInfo(SV));
11745}
11746
11747SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
11748 SelectionDAG &DAG) const {
11749 MachineFunction &MF = DAG.getMachineFunction();
11750 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11751
11752 SDLoc DL(Op);
11753 SDValue FR;
11754 if (Subtarget->isWindowsArm64EC()) {
11755 // With the Arm64EC ABI, we compute the address of the varargs save area
11756 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
11757 // but calls from an entry thunk can pass in a different address.
11758 Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass);
11759 SDValue Val = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: VReg, VT: MVT::i64);
11760 uint64_t StackOffset;
11761 if (FuncInfo->getVarArgsGPRSize() > 0)
11762 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
11763 else
11764 StackOffset = FuncInfo->getVarArgsStackOffset();
11765 FR = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val,
11766 N2: DAG.getConstant(Val: StackOffset, DL, VT: MVT::i64));
11767 } else {
11768 FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRSize() > 0
11769 ? FuncInfo->getVarArgsGPRIndex()
11770 : FuncInfo->getVarArgsStackIndex(),
11771 VT: getPointerTy(DL: DAG.getDataLayout()));
11772 }
11773 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11774 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1),
11775 PtrInfo: MachinePointerInfo(SV));
11776}
11777
11778SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
11779 SelectionDAG &DAG) const {
11780 // The layout of the va_list struct is specified in the AArch64 Procedure Call
11781 // Standard, section B.3.
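// Roughly:
//   struct va_list {
//     void *__stack;   // next stack argument
//     void *__gr_top;  // end of the GP register save area
//     void *__vr_top;  // end of the FP/SIMD register save area
//     int   __gr_offs; // negative offset from __gr_top to the next GP arg
//     int   __vr_offs; // negative offset from __vr_top to the next FP/SIMD arg
//   };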
11782 MachineFunction &MF = DAG.getMachineFunction();
11783 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
11784 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11785 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
11786 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
11787 SDLoc DL(Op);
11788
11789 SDValue Chain = Op.getOperand(i: 0);
11790 SDValue VAList = Op.getOperand(i: 1);
11791 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11792 SmallVector<SDValue, 4> MemOps;
11793
11794 // void *__stack at offset 0
11795 unsigned Offset = 0;
11796 SDValue Stack = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), VT: PtrVT);
11797 Stack = DAG.getZExtOrTrunc(Op: Stack, DL, VT: PtrMemVT);
11798 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: Stack, Ptr: VAList,
11799 PtrInfo: MachinePointerInfo(SV), Alignment: Align(PtrSize)));
11800
11801 // void *__gr_top at offset 8 (4 on ILP32)
11802 Offset += PtrSize;
11803 int GPRSize = FuncInfo->getVarArgsGPRSize();
11804 if (GPRSize > 0) {
11805 SDValue GRTop, GRTopAddr;
11806
11807 GRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11808 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11809
11810 GRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRIndex(), VT: PtrVT);
11811 GRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: GRTop,
11812 N2: DAG.getSignedConstant(Val: GPRSize, DL, VT: PtrVT));
11813 GRTop = DAG.getZExtOrTrunc(Op: GRTop, DL, VT: PtrMemVT);
11814
11815 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: GRTop, Ptr: GRTopAddr,
11816 PtrInfo: MachinePointerInfo(SV, Offset),
11817 Alignment: Align(PtrSize)));
11818 }
11819
11820 // void *__vr_top at offset 16 (8 on ILP32)
11821 Offset += PtrSize;
11822 int FPRSize = FuncInfo->getVarArgsFPRSize();
11823 if (FPRSize > 0) {
11824 SDValue VRTop, VRTopAddr;
11825 VRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11826 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11827
11828 VRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFPRIndex(), VT: PtrVT);
11829 VRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VRTop,
11830 N2: DAG.getSignedConstant(Val: FPRSize, DL, VT: PtrVT));
11831 VRTop = DAG.getZExtOrTrunc(Op: VRTop, DL, VT: PtrMemVT);
11832
11833 MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: VRTop, Ptr: VRTopAddr,
11834 PtrInfo: MachinePointerInfo(SV, Offset),
11835 Alignment: Align(PtrSize)));
11836 }
11837
11838 // int __gr_offs at offset 24 (12 on ILP32)
11839 Offset += PtrSize;
11840 SDValue GROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11841 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11842 MemOps.push_back(
11843 Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getSignedConstant(Val: -GPRSize, DL, VT: MVT::i32),
11844 Ptr: GROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4)));
11845
11846 // int __vr_offs at offset 28 (16 on ILP32)
11847 Offset += 4;
11848 SDValue VROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11849 N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT));
11850 MemOps.push_back(
11851 Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getSignedConstant(Val: -FPRSize, DL, VT: MVT::i32),
11852 Ptr: VROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4)));
11853
11854 return DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps);
11855}
11856
11857SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11858 SelectionDAG &DAG) const {
11859 MachineFunction &MF = DAG.getMachineFunction();
11860 Function &F = MF.getFunction();
11861
11862 if (Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg()))
11863 return LowerWin64_VASTART(Op, DAG);
11864 else if (Subtarget->isTargetDarwin())
11865 return LowerDarwin_VASTART(Op, DAG);
11866 else
11867 return LowerAAPCS_VASTART(Op, DAG);
11868}
11869
11870SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11871 SelectionDAG &DAG) const {
11872 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
11873 // pointer.
11874 SDLoc DL(Op);
11875 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11876 unsigned VaListSize =
11877 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11878 ? PtrSize
11879 : Subtarget->isTargetILP32() ? 20 : 32;
11880 const Value *DestSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 3))->getValue();
11881 const Value *SrcSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue();
11882
11883 return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: DL, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2),
11884 Size: DAG.getConstant(Val: VaListSize, DL, VT: MVT::i32),
11885 Alignment: Align(PtrSize), isVol: false, AlwaysInline: false, /*CI=*/nullptr,
11886 OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(DestSV),
11887 SrcPtrInfo: MachinePointerInfo(SrcSV));
11888}
11889
11890SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11891 assert(Subtarget->isTargetDarwin() &&
11892 "automatic va_arg instruction only works on Darwin");
11893
11894 const Value *V = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
11895 EVT VT = Op.getValueType();
11896 SDLoc DL(Op);
11897 SDValue Chain = Op.getOperand(i: 0);
11898 SDValue Addr = Op.getOperand(i: 1);
11899 MaybeAlign Align(Op.getConstantOperandVal(i: 3));
11900 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11901 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
11902 auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout());
11903 SDValue VAList =
11904 DAG.getLoad(VT: PtrMemVT, dl: DL, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
11905 Chain = VAList.getValue(R: 1);
11906 VAList = DAG.getZExtOrTrunc(Op: VAList, DL, VT: PtrVT);
11907
11908 if (VT.isScalableVector())
11909 report_fatal_error(reason: "Passing SVE types to variadic functions is "
11910 "currently not supported");
11911
11912 if (Align && *Align > MinSlotSize) {
11913 VAList = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11914 N2: DAG.getConstant(Val: Align->value() - 1, DL, VT: PtrVT));
11915 VAList =
11916 DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: VAList,
11917 N2: DAG.getSignedConstant(Val: -(int64_t)Align->value(), DL, VT: PtrVT));
11918 }
11919
11920 Type *ArgTy = VT.getTypeForEVT(Context&: *DAG.getContext());
11921 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(Ty: ArgTy);
11922
11923 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11924 // up to 64 bits. At the very least, we have to increase the striding of the
11925 // vaargs list to match this, and for FP values we need to introduce
11926 // FP_ROUND nodes as well.
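// For example, an i32 argument still advances the list by a full 8-byte slot
// (4 bytes on ILP32), and a float is loaded as an f64 and then narrowed with
// FP_ROUND.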
11927 if (VT.isInteger() && !VT.isVector())
11928 ArgSize = std::max(a: ArgSize, b: MinSlotSize);
11929 bool NeedFPTrunc = false;
11930 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11931 ArgSize = 8;
11932 NeedFPTrunc = true;
11933 }
11934
11935 // Increment the pointer, VAList, to the next vaarg
11936 SDValue VANext = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList,
11937 N2: DAG.getConstant(Val: ArgSize, DL, VT: PtrVT));
11938 VANext = DAG.getZExtOrTrunc(Op: VANext, DL, VT: PtrMemVT);
11939
11940 // Store the incremented VAList to the legalized pointer
11941 SDValue APStore =
11942 DAG.getStore(Chain, dl: DL, Val: VANext, Ptr: Addr, PtrInfo: MachinePointerInfo(V));
11943
11944 // Load the actual argument out of the pointer VAList
11945 if (NeedFPTrunc) {
11946 // Load the value as an f64.
11947 SDValue WideFP =
11948 DAG.getLoad(VT: MVT::f64, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
11949 // Round the value down to an f32.
11950 SDValue NarrowFP =
11951 DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: WideFP.getValue(R: 0),
11952 N2: DAG.getIntPtrConstant(Val: 1, DL, /*isTarget=*/true));
11953 SDValue Ops[] = { NarrowFP, WideFP.getValue(R: 1) };
11954 // Merge the rounded value with the chain output of the load.
11955 return DAG.getMergeValues(Ops, dl: DL);
11956 }
11957
11958 return DAG.getLoad(VT, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo());
11959}
11960
11961SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11962 SelectionDAG &DAG) const {
11963 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11964 MFI.setFrameAddressIsTaken(true);
11965
11966 EVT VT = Op.getValueType();
11967 SDLoc DL(Op);
11968 unsigned Depth = Op.getConstantOperandVal(i: 0);
11969 SDValue FrameAddr =
11970 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT: MVT::i64);
11971 while (Depth--)
11972 FrameAddr = DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr: FrameAddr,
11973 PtrInfo: MachinePointerInfo());
11974
11975 if (Subtarget->isTargetILP32())
11976 FrameAddr = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: MVT::i64, N1: FrameAddr,
11977 N2: DAG.getValueType(VT));
11978
11979 return FrameAddr;
11980}
11981
11982SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11983 SelectionDAG &DAG) const {
11984 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
11985
11986 EVT VT = getPointerTy(DL: DAG.getDataLayout());
11987 int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false);
11988 return DAG.getFrameIndex(FI, VT);
11989}
11990
11991#define GET_REGISTER_MATCHER
11992#include "AArch64GenAsmMatcher.inc"
11993
11994// FIXME? Maybe this could be a TableGen attribute on some registers and
11995// this table could be generated automatically from RegInfo.
11996Register AArch64TargetLowering::
11997getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11998 Register Reg = MatchRegisterName(Name: RegName);
11999 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12000 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12001 unsigned DwarfRegNum = MRI->getDwarfRegNum(RegNum: Reg, isEH: false);
12002 if (!Subtarget->isXRegisterReserved(i: DwarfRegNum) &&
12003 !MRI->isReservedReg(MF, Reg))
12004 Reg = Register();
12005 }
12006 return Reg;
12007}
12008
12009SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12010 SelectionDAG &DAG) const {
12011 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
12012
12013 EVT VT = Op.getValueType();
12014 SDLoc DL(Op);
12015
12016 SDValue FrameAddr =
12017 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT);
12018 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
12019
12020 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset);
12021}
12022
12023SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12024 SelectionDAG &DAG) const {
12025 MachineFunction &MF = DAG.getMachineFunction();
12026 MachineFrameInfo &MFI = MF.getFrameInfo();
12027 MFI.setReturnAddressIsTaken(true);
12028
12029 EVT VT = Op.getValueType();
12030 SDLoc DL(Op);
12031 unsigned Depth = Op.getConstantOperandVal(i: 0);
12032 SDValue ReturnAddress;
12033 if (Depth) {
12034 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12035 SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout()));
12036 ReturnAddress = DAG.getLoad(
12037 VT, dl: DL, Chain: DAG.getEntryNode(),
12038 Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset), PtrInfo: MachinePointerInfo());
12039 } else {
12040 // Return LR, which contains the return address. Mark it an implicit
12041 // live-in.
12042 Register Reg = MF.addLiveIn(PReg: AArch64::LR, RC: &AArch64::GPR64RegClass);
12043 ReturnAddress = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT);
12044 }
12045
12046 // The XPACLRI instruction assembles to a hint-space instruction before
12047 // Armv8.3-A, and can therefore be used safely on any pre-Armv8.3-A
12048 // architecture. On Armv8.3-A and onwards XPACI is available, so use that
12049 // instead.
12050 SDNode *St;
12051 if (Subtarget->hasPAuth()) {
12052 St = DAG.getMachineNode(Opcode: AArch64::XPACI, dl: DL, VT, Op1: ReturnAddress);
12053 } else {
12054 // XPACLRI operates on LR therefore we must move the operand accordingly.
12055 SDValue Chain =
12056 DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::LR, N: ReturnAddress);
12057 St = DAG.getMachineNode(Opcode: AArch64::XPACLRI, dl: DL, VT, Op1: Chain);
12058 }
12059 return SDValue(St, 0);
12060}
12061
12062 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
12063 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
12064SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12065 SelectionDAG &DAG) const {
12066 SDValue Lo, Hi;
12067 expandShiftParts(N: Op.getNode(), Lo, Hi, DAG);
12068 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc(Op));
12069}
12070
12071bool AArch64TargetLowering::isOffsetFoldingLegal(
12072 const GlobalAddressSDNode *GA) const {
12073 // Offsets are folded in the DAG combine rather than here so that we can
12074 // intelligently choose an offset based on the uses.
12075 return false;
12076}
12077
12078bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12079 bool OptForSize) const {
12080 bool IsLegal = false;
12081 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
12082 // and for the 16-bit case when the target has full fp16 support.
12083 // We encode bf16 bit patterns as if they were fp16. This results in very
12084 // strange looking assembly but should populate the register with appropriate
12085 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12086 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12087 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12088 // FIXME: We should be able to handle f128 as well with a clever lowering.
12089 const APInt ImmInt = Imm.bitcastToAPInt();
12090 if (VT == MVT::f64)
12091 IsLegal = AArch64_AM::getFP64Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
12092 else if (VT == MVT::f32)
12093 IsLegal = AArch64_AM::getFP32Imm(Imm: ImmInt) != -1 || Imm.isPosZero();
12094 else if (VT == MVT::f16 || VT == MVT::bf16)
12095 IsLegal =
12096 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(Imm: ImmInt) != -1) ||
12097 Imm.isPosZero();
12098
12099 // If we cannot materialize the value in the immediate field for fmov, check
12100 // if it can be encoded as the immediate operand of a logical instruction.
12101 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12102 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12103 // generate that fmov.
12104 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12105 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12106 // however the mov+fmov sequence is always better because of the reduced
12107 // cache pressure. The timings are still the same if you consider
12108 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12109 // movw+movk is fused). So we limit ourselves to at most 2 instructions.
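// For example, single-precision pi (bit pattern 0x40490FDB) is not a valid
// fmov immediate, but its bits expand to MOVZ+MOVK, which fits the default
// 2-instruction budget, so it is still considered legal here.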
12110 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12111 AArch64_IMM::expandMOVImm(Imm: ImmInt.getZExtValue(), BitSize: VT.getSizeInBits(), Insn);
12112 assert(Insn.size() <= 4 &&
12113 "Should be able to build any value with at most 4 moves");
12114 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12115 IsLegal = Insn.size() <= Limit;
12116 }
12117
12118 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12119 << " imm value: "; Imm.dump(););
12120 return IsLegal;
12121}
12122
12123//===----------------------------------------------------------------------===//
12124// AArch64 Optimization Hooks
12125//===----------------------------------------------------------------------===//
12126
12127static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12128 SDValue Operand, SelectionDAG &DAG,
12129 int &ExtraSteps) {
12130 EVT VT = Operand.getValueType();
12131 if ((ST->hasNEON() &&
12132 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12133 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12134 VT == MVT::v4f32)) ||
12135 (ST->hasSVE() &&
12136 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12137 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12138 // For the reciprocal estimates, convergence is quadratic, so the number
12139 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12140 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12141 // the result for float (23 mantissa bits) is 2 and for double (52
12142 // mantissa bits) is 3.
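// For example, float has 24 bits of precision (including the implicit bit):
// ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 extra steps; double has 53 bits:
// 6 - 3 = 3 extra steps.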
12143 constexpr unsigned AccurateBits = 8;
12144 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12145 ExtraSteps = DesiredBits <= AccurateBits
12146 ? 0
12147 : Log2_64_Ceil(Value: DesiredBits) - Log2_64_Ceil(Value: AccurateBits);
12148 }
12149
12150 return DAG.getNode(Opcode, DL: SDLoc(Operand), VT, Operand);
12151 }
12152
12153 return SDValue();
12154}
12155
12156SDValue
12157AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12158 const DenormalMode &Mode) const {
12159 SDLoc DL(Op);
12160 EVT VT = Op.getValueType();
12161 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT);
12162 SDValue FPZero = DAG.getConstantFP(Val: 0.0, DL, VT);
12163 return DAG.getSetCC(DL, VT: CCVT, LHS: Op, RHS: FPZero, Cond: ISD::SETEQ);
12164}
12165
12166SDValue
12167AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12168 SelectionDAG &DAG) const {
12169 return Op;
12170}
12171
12172SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12173 SelectionDAG &DAG, int Enabled,
12174 int &ExtraSteps,
12175 bool &UseOneConst,
12176 bool Reciprocal) const {
12177 if (Enabled == ReciprocalEstimate::Enabled ||
12178 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12179 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRSQRTE, Operand,
12180 DAG, ExtraSteps)) {
12181 SDLoc DL(Operand);
12182 EVT VT = Operand.getValueType();
12183
12184 SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12185
12186 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12187 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
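// Below, each trip computes Step = FRSQRTS(X, E*E) = 0.5 * (3 - X*E*E) and
// then refines the estimate as E = E * Step.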
12188 for (int i = ExtraSteps; i > 0; --i) {
12189 SDValue Step = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Estimate,
12190 Flags);
12191 Step = DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT, N1: Operand, N2: Step, Flags);
12192 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
12193 }
12194 if (!Reciprocal)
12195 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Operand, N2: Estimate, Flags);
12196
12197 ExtraSteps = 0;
12198 return Estimate;
12199 }
12200
12201 return SDValue();
12202}
12203
12204SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12205 SelectionDAG &DAG, int Enabled,
12206 int &ExtraSteps) const {
12207 if (Enabled == ReciprocalEstimate::Enabled)
12208 if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRECPE, Operand,
12209 DAG, ExtraSteps)) {
12210 SDLoc DL(Operand);
12211 EVT VT = Operand.getValueType();
12212
12213 SDNodeFlags Flags = SDNodeFlags::AllowReassociation;
12214
12215 // Newton reciprocal iteration: E * (2 - X * E)
12216 // AArch64 reciprocal iteration instruction: (2 - M * N)
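// Below, each trip computes Step = FRECPS(X, E) = 2 - X*E and then refines
// the estimate as E = E * Step.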
12217 for (int i = ExtraSteps; i > 0; --i) {
12218 SDValue Step = DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT, N1: Operand,
12219 N2: Estimate, Flags);
12220 Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags);
12221 }
12222
12223 ExtraSteps = 0;
12224 return Estimate;
12225 }
12226
12227 return SDValue();
12228}
12229
12230//===----------------------------------------------------------------------===//
12231// AArch64 Inline Assembly Support
12232//===----------------------------------------------------------------------===//
12233
12234// Table of Constraints
12235// TODO: This is the current set of constraints supported by ARM for the
12236 // compiler; not all of them may make sense.
12237//
12238// r - A general register
12239// w - An FP/SIMD register of some size in the range v0-v31
12240// x - An FP/SIMD register of some size in the range v0-v15
12241// I - Constant that can be used with an ADD instruction
12242// J - Constant that can be used with a SUB instruction
12243// K - Constant that can be used with a 32-bit logical instruction
12244// L - Constant that can be used with a 64-bit logical instruction
12245// M - Constant that can be used as a 32-bit MOV immediate
12246// N - Constant that can be used as a 64-bit MOV immediate
12247// Q - A memory reference with base register and no offset
12248// S - A symbolic address
12249// Y - Floating point constant zero
12250// Z - Integer constant zero
12251//
12252// Note that general register operands will be output using their 64-bit x
12253// register name, whatever the size of the variable, unless the asm operand
12254// is prefixed by the %w modifier. Floating-point and SIMD register operands
12255// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12256// %q modifier.
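// For example:
//   asm("add %w0, %w1, %2" : "=r"(Res) : "r"(In), "I"(42));
// uses the 'I' ADD-immediate constraint and the %w modifier to get 32-bit
// register names for the 'r' operands (Res and In being arbitrary locals).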
12257const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12258 // At this point, we have to lower this constraint to something else, so we
12259 // lower it to an "r" or "w". However, by doing this we will force the result
12260 // to be in a register, while the X constraint is much more permissive.
12261 //
12262 // Although we are correct (we are free to emit anything, without
12263 // constraints), we might break use cases that would expect us to be more
12264 // efficient and emit something else.
12265 if (!Subtarget->hasFPARMv8())
12266 return "r";
12267
12268 if (ConstraintVT.isFloatingPoint())
12269 return "w";
12270
12271 if (ConstraintVT.isVector() &&
12272 (ConstraintVT.getSizeInBits() == 64 ||
12273 ConstraintVT.getSizeInBits() == 128))
12274 return "w";
12275
12276 return "r";
12277}
12278
12279enum class PredicateConstraint { Uph, Upl, Upa };
12280
12281 // Returns a {Reg, RegisterClass} tuple if the constraint names a specific
12282 // SVE predicate, predicate-as-counter, or data register.
12283//
12284 // For a constraint like "{pn3}", the default path in
12285// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12286// suitable register class for this register is "PPRorPNR", after which it
12287// determines that nxv16i1 is an appropriate type for the constraint, which is
12288// not what we want. The code here pre-empts this by matching the register
12289// explicitly.
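// For example, "{z3}" maps to Z3 in ZPRRegClass, "{p0}" to P0 in PPRRegClass,
// and "{pn8}" to PN8 in PNRRegClass.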
12290static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12291parseSVERegAsConstraint(StringRef Constraint) {
12292 if (!Constraint.starts_with(Prefix: '{') || !Constraint.ends_with(Suffix: '}') ||
12293 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12294 return std::nullopt;
12295
12296 bool IsPredicate = Constraint[1] == 'p';
12297 Constraint = Constraint.substr(Start: 2, N: Constraint.size() - 3);
12298 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with(Prefix: "n");
12299 if (IsPredicateAsCount)
12300 Constraint = Constraint.drop_front(N: 1);
12301
12302 unsigned V;
12303 if (Constraint.getAsInteger(Radix: 10, Result&: V) || V > 31)
12304 return std::nullopt;
12305
12306 if (IsPredicateAsCount)
12307 return std::make_pair(x: AArch64::PN0 + V, y: &AArch64::PNRRegClass);
12308 if (IsPredicate)
12309 return std::make_pair(x: AArch64::P0 + V, y: &AArch64::PPRRegClass);
12310 return std::make_pair(x: AArch64::Z0 + V, y: &AArch64::ZPRRegClass);
12311}
12312
12313static std::optional<PredicateConstraint>
12314parsePredicateConstraint(StringRef Constraint) {
12315 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12316 .Case(S: "Uph", Value: PredicateConstraint::Uph)
12317 .Case(S: "Upl", Value: PredicateConstraint::Upl)
12318 .Case(S: "Upa", Value: PredicateConstraint::Upa)
12319 .Default(Value: std::nullopt);
12320}
12321
12322static const TargetRegisterClass *
12323getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12324 if (VT != MVT::aarch64svcount &&
12325 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12326 return nullptr;
12327
12328 switch (Constraint) {
12329 case PredicateConstraint::Uph:
12330 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12331 : &AArch64::PPR_p8to15RegClass;
12332 case PredicateConstraint::Upl:
12333 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12334 : &AArch64::PPR_3bRegClass;
12335 case PredicateConstraint::Upa:
12336 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12337 : &AArch64::PPRRegClass;
12338 }
12339
12340 llvm_unreachable("Missing PredicateConstraint!");
12341}
12342
12343enum class ReducedGprConstraint { Uci, Ucj };
12344
12345static std::optional<ReducedGprConstraint>
12346parseReducedGprConstraint(StringRef Constraint) {
12347 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12348 .Case(S: "Uci", Value: ReducedGprConstraint::Uci)
12349 .Case(S: "Ucj", Value: ReducedGprConstraint::Ucj)
12350 .Default(Value: std::nullopt);
12351}
12352
12353static const TargetRegisterClass *
12354getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12355 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12356 return nullptr;
12357
12358 switch (Constraint) {
12359 case ReducedGprConstraint::Uci:
12360 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12361 case ReducedGprConstraint::Ucj:
12362 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12363 }
12364
12365 llvm_unreachable("Missing ReducedGprConstraint!");
12366}
12367
12368 // The set of cc codes supported is from
12369// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12370static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
12371 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12372 .Case(S: "{@cchi}", Value: AArch64CC::HI)
12373 .Case(S: "{@cccs}", Value: AArch64CC::HS)
12374 .Case(S: "{@cclo}", Value: AArch64CC::LO)
12375 .Case(S: "{@ccls}", Value: AArch64CC::LS)
12376 .Case(S: "{@cccc}", Value: AArch64CC::LO)
12377 .Case(S: "{@cceq}", Value: AArch64CC::EQ)
12378 .Case(S: "{@ccgt}", Value: AArch64CC::GT)
12379 .Case(S: "{@ccge}", Value: AArch64CC::GE)
12380 .Case(S: "{@cclt}", Value: AArch64CC::LT)
12381 .Case(S: "{@ccle}", Value: AArch64CC::LE)
12382 .Case(S: "{@cchs}", Value: AArch64CC::HS)
12383 .Case(S: "{@ccne}", Value: AArch64CC::NE)
12384 .Case(S: "{@ccvc}", Value: AArch64CC::VC)
12385 .Case(S: "{@ccpl}", Value: AArch64CC::PL)
12386 .Case(S: "{@ccvs}", Value: AArch64CC::VS)
12387 .Case(S: "{@ccmi}", Value: AArch64CC::MI)
12388 .Default(Value: AArch64CC::Invalid);
12389 return Cond;
12390}
12391
12392/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12393/// WZR, invert(<cond>)'.
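/// For example, a CC of EQ yields 'CSINC <Wd>, WZR, WZR, NE', which is the
/// 'CSET <Wd>, EQ' alias.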
12394static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12395 SelectionDAG &DAG) {
12396 return DAG.getNode(
12397 Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
12398 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
12399 N3: DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32), N4: NZCV);
12400}
12401
12402// Lower @cc flag output via getSETCC.
12403SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12404 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12405 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12406 AArch64CC::CondCode Cond = parseConstraintCode(Constraint: OpInfo.ConstraintCode);
12407 if (Cond == AArch64CC::Invalid)
12408 return SDValue();
12409 // The output variable should be a scalar integer.
12410 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12411 OpInfo.ConstraintVT.getSizeInBits() < 8)
12412 report_fatal_error(reason: "Flag output operand is of invalid type");
12413
12414 // Get NZCV register. Only update chain when copyfrom is glued.
12415 if (Glue.getNode()) {
12416 Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32, Glue);
12417 Chain = Glue.getValue(R: 1);
12418 } else
12419 Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32);
12420 // Extract CC code.
12421 SDValue CC = getSETCC(CC: Cond, NZCV: Glue, DL, DAG);
12422
12423 SDValue Result;
12424
12425 // Truncate or ZERO_EXTEND based on value types.
12426 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12427 Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpInfo.ConstraintVT, Operand: CC);
12428 else
12429 Result = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: OpInfo.ConstraintVT, Operand: CC);
12430
12431 return Result;
12432}
12433
12434/// getConstraintType - Given a constraint letter, return the type of
12435/// constraint it is for this target.
12436AArch64TargetLowering::ConstraintType
12437AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
12438 if (Constraint.size() == 1) {
12439 switch (Constraint[0]) {
12440 default:
12441 break;
12442 case 'x':
12443 case 'w':
12444 case 'y':
12445 return C_RegisterClass;
12446 // An address with a single base register. Due to the way we
12447 // currently handle addresses it is the same as 'r'.
12448 case 'Q':
12449 return C_Memory;
12450 case 'I':
12451 case 'J':
12452 case 'K':
12453 case 'L':
12454 case 'M':
12455 case 'N':
12456 case 'Y':
12457 case 'Z':
12458 return C_Immediate;
12459 case 'z':
12460 case 'S': // A symbol or label reference with a constant offset
12461 return C_Other;
12462 }
12463 } else if (parsePredicateConstraint(Constraint))
12464 return C_RegisterClass;
12465 else if (parseReducedGprConstraint(Constraint))
12466 return C_RegisterClass;
12467 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
12468 return C_Other;
12469 return TargetLowering::getConstraintType(Constraint);
12470}
12471
12472/// Examine constraint type and operand type and determine a weight value.
12473/// This object must already have been set up with the operand type
12474/// and the current alternative constraint selected.
12475TargetLowering::ConstraintWeight
12476AArch64TargetLowering::getSingleConstraintMatchWeight(
12477 AsmOperandInfo &info, const char *constraint) const {
12478 ConstraintWeight weight = CW_Invalid;
12479 Value *CallOperandVal = info.CallOperandVal;
12480 // If we don't have a value, we can't do a match,
12481 // but allow it at the lowest weight.
12482 if (!CallOperandVal)
12483 return CW_Default;
12484 Type *type = CallOperandVal->getType();
12485 // Look at the constraint type.
12486 switch (*constraint) {
12487 default:
12488 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
12489 break;
12490 case 'x':
12491 case 'w':
12492 case 'y':
12493 if (type->isFloatingPointTy() || type->isVectorTy())
12494 weight = CW_Register;
12495 break;
12496 case 'z':
12497 weight = CW_Constant;
12498 break;
12499 case 'U':
12500 if (parsePredicateConstraint(Constraint: constraint) ||
12501 parseReducedGprConstraint(Constraint: constraint))
12502 weight = CW_Register;
12503 break;
12504 }
12505 return weight;
12506}
12507
12508std::pair<unsigned, const TargetRegisterClass *>
12509AArch64TargetLowering::getRegForInlineAsmConstraint(
12510 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
12511 if (Constraint.size() == 1) {
12512 switch (Constraint[0]) {
12513 case 'r':
12514 if (VT.isScalableVector())
12515 return std::make_pair(x: 0U, y: nullptr);
12516 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
12517 return std::make_pair(x: 0U, y: &AArch64::GPR64x8ClassRegClass);
12518 if (VT.getFixedSizeInBits() == 64)
12519 return std::make_pair(x: 0U, y: &AArch64::GPR64commonRegClass);
12520 return std::make_pair(x: 0U, y: &AArch64::GPR32commonRegClass);
12521 case 'w': {
12522 if (!Subtarget->hasFPARMv8())
12523 break;
12524 if (VT.isScalableVector()) {
12525 if (VT.getVectorElementType() != MVT::i1)
12526 return std::make_pair(x: 0U, y: &AArch64::ZPRRegClass);
12527 return std::make_pair(x: 0U, y: nullptr);
12528 }
12529 if (VT == MVT::Other)
12530 break;
12531 uint64_t VTSize = VT.getFixedSizeInBits();
12532 if (VTSize == 16)
12533 return std::make_pair(x: 0U, y: &AArch64::FPR16RegClass);
12534 if (VTSize == 32)
12535 return std::make_pair(x: 0U, y: &AArch64::FPR32RegClass);
12536 if (VTSize == 64)
12537 return std::make_pair(x: 0U, y: &AArch64::FPR64RegClass);
12538 if (VTSize == 128)
12539 return std::make_pair(x: 0U, y: &AArch64::FPR128RegClass);
12540 break;
12541 }
12542 // The instructions that this constraint is designed for can
12543 // only take 128-bit registers so just use that regclass.
12544 case 'x':
12545 if (!Subtarget->hasFPARMv8())
12546 break;
12547 if (VT.isScalableVector())
12548 return std::make_pair(x: 0U, y: &AArch64::ZPR_4bRegClass);
12549 if (VT.getSizeInBits() == 128)
12550 return std::make_pair(x: 0U, y: &AArch64::FPR128_loRegClass);
12551 break;
12552 case 'y':
12553 if (!Subtarget->hasFPARMv8())
12554 break;
12555 if (VT.isScalableVector())
12556 return std::make_pair(x: 0U, y: &AArch64::ZPR_3bRegClass);
12557 break;
12558 }
12559 } else {
12560 if (const auto P = parseSVERegAsConstraint(Constraint)) {
12561 // SME functions that are not in streaming mode should
12562 // still observe clobbers of Z-registers by clobbering
12563 // the lower 128 bits of those registers.
12564 if (AArch64::ZPRRegClass.hasSubClassEq(RC: P->second) &&
12565 !Subtarget->isSVEorStreamingSVEAvailable())
12566 return std::make_pair(x: TRI->getSubReg(Reg: P->first, Idx: AArch64::zsub),
12567 y: &AArch64::FPR128RegClass);
12568 return *P;
12569 }
12570 if (const auto PC = parsePredicateConstraint(Constraint))
12571 if (const auto *RegClass = getPredicateRegisterClass(Constraint: *PC, VT))
12572 return std::make_pair(x: 0U, y&: RegClass);
12573
12574 if (const auto RGC = parseReducedGprConstraint(Constraint))
12575 if (const auto *RegClass = getReducedGprRegisterClass(Constraint: *RGC, VT))
12576 return std::make_pair(x: 0U, y&: RegClass);
12577 }
12578 if (StringRef("{cc}").equals_insensitive(RHS: Constraint) ||
12579 parseConstraintCode(Constraint) != AArch64CC::Invalid)
12580 return std::make_pair(x: unsigned(AArch64::NZCV), y: &AArch64::CCRRegClass);
12581
12582 if (Constraint == "{za}") {
12583 return std::make_pair(x: unsigned(AArch64::ZA), y: &AArch64::MPRRegClass);
12584 }
12585
12586 if (Constraint == "{zt0}") {
12587 return std::make_pair(x: unsigned(AArch64::ZT0), y: &AArch64::ZTRRegClass);
12588 }
12589
12590 // Use the default implementation in TargetLowering to convert the register
12591 // constraint into a member of a register class.
12592 std::pair<unsigned, const TargetRegisterClass *> Res;
12593 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12594
12595 // Not found as a standard register?
12596 if (!Res.second) {
12597 unsigned Size = Constraint.size();
12598 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
12599 tolower(c: Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
12600 int RegNo;
12601 bool Failed = Constraint.slice(Start: 2, End: Size - 1).getAsInteger(Radix: 10, Result&: RegNo);
12602 if (!Failed && RegNo >= 0 && RegNo <= 31) {
12603 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
12604 // By default we'll emit v0-v31 for this unless there's a modifier, in which
12605 // case we'll emit the correct register instead.
12606 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
12607 Res.first = AArch64::FPR64RegClass.getRegister(i: RegNo);
12608 Res.second = &AArch64::FPR64RegClass;
12609 } else {
12610 Res.first = AArch64::FPR128RegClass.getRegister(i: RegNo);
12611 Res.second = &AArch64::FPR128RegClass;
12612 }
12613 }
12614 }
12615 }
12616
12617 if (Res.second && !Subtarget->hasFPARMv8() &&
12618 !AArch64::GPR32allRegClass.hasSubClassEq(RC: Res.second) &&
12619 !AArch64::GPR64allRegClass.hasSubClassEq(RC: Res.second))
12620 return std::make_pair(x: 0U, y: nullptr);
12621
12622 return Res;
12623}
12624
12625EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
12626 llvm::Type *Ty,
12627 bool AllowUnknown) const {
12628 if (Subtarget->hasLS64() && Ty->isIntegerTy(Bitwidth: 512))
12629 return EVT(MVT::i64x8);
12630
12631 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
12632}
12633
12634/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
12635/// vector. If it is invalid, don't add anything to Ops.
12636void AArch64TargetLowering::LowerAsmOperandForConstraint(
12637 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
12638 SelectionDAG &DAG) const {
12639 SDValue Result;
12640
12641 // Currently only support length 1 constraints.
12642 if (Constraint.size() != 1)
12643 return;
12644
12645 char ConstraintLetter = Constraint[0];
12646 switch (ConstraintLetter) {
12647 default:
12648 break;
12649
12650 // This set of constraints deals with valid constants for various instructions.
12651 // Validate and return a target constant for them if we can.
12652 case 'z': {
12653 // 'z' maps to xzr or wzr so it needs an input of 0.
12654 if (!isNullConstant(V: Op))
12655 return;
12656
12657 if (Op.getValueType() == MVT::i64)
12658 Result = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
12659 else
12660 Result = DAG.getRegister(Reg: AArch64::WZR, VT: MVT::i32);
12661 break;
12662 }
12663 case 'S':
12664 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
12665 // supported for PIC while "s" isn't, making "s" less useful. We implement
12666 // "S" but not "s".
12667 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint: "s", Ops, DAG);
12668 break;
12669
12670 case 'I':
12671 case 'J':
12672 case 'K':
12673 case 'L':
12674 case 'M':
12675 case 'N':
12676 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op);
12677 if (!C)
12678 return;
12679
12680 // Grab the value and do some validation.
12681 uint64_t CVal = C->getZExtValue();
12682 switch (ConstraintLetter) {
12683 // The I constraint applies only to simple ADD or SUB immediate operands:
12684 // i.e. 0 to 4095 with optional shift by 12
12685 // The J constraint applies only to ADD or SUB immediates that would be
12686 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
12687 // instruction [or vice versa], in other words -1 to -4095 with optional
12688 // left shift by 12.
12689 case 'I':
12690 if (isUInt<12>(x: CVal) || isShiftedUInt<12, 12>(x: CVal))
12691 break;
12692 return;
12693 case 'J': {
12694 uint64_t NVal = -C->getSExtValue();
12695 if (isUInt<12>(x: NVal) || isShiftedUInt<12, 12>(x: NVal)) {
12696 CVal = C->getSExtValue();
12697 break;
12698 }
12699 return;
12700 }
12701 // The K and L constraints apply *only* to logical immediates, including
12702 // what used to be the MOVI alias for ORR (though the MOVI alias has now
12703 // been removed and MOV should be used). So these constraints have to
12704 // distinguish between bit patterns that are valid 32-bit or 64-bit
12705 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
12706 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
12707 // versa.
12708 case 'K':
12709 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
12710 break;
12711 return;
12712 case 'L':
12713 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
12714 break;
12715 return;
12716 // The M and N constraints are a superset of K and L respectively, for use
12717 // with the MOV (immediate) alias. As well as the logical immediates they
12718 // also match 32 or 64-bit immediates that can be loaded either using a
12719 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
12720 // (M) or 64-bit 0x1234000000000000 (N) etc.
12721 // As a note, some of this code is liberally stolen from the asm parser.
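// For example, 0xffffedca passes the 'M' check because its 32-bit complement
// 0x00001235 fits in a single MOVN.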
12722 case 'M': {
12723 if (!isUInt<32>(x: CVal))
12724 return;
12725 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32))
12726 break;
12727 if ((CVal & 0xFFFF) == CVal)
12728 break;
12729 if ((CVal & 0xFFFF0000ULL) == CVal)
12730 break;
12731 uint64_t NCVal = ~(uint32_t)CVal;
12732 if ((NCVal & 0xFFFFULL) == NCVal)
12733 break;
12734 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12735 break;
12736 return;
12737 }
12738 case 'N': {
12739 if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64))
12740 break;
12741 if ((CVal & 0xFFFFULL) == CVal)
12742 break;
12743 if ((CVal & 0xFFFF0000ULL) == CVal)
12744 break;
12745 if ((CVal & 0xFFFF00000000ULL) == CVal)
12746 break;
12747 if ((CVal & 0xFFFF000000000000ULL) == CVal)
12748 break;
12749 uint64_t NCVal = ~CVal;
12750 if ((NCVal & 0xFFFFULL) == NCVal)
12751 break;
12752 if ((NCVal & 0xFFFF0000ULL) == NCVal)
12753 break;
12754 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
12755 break;
12756 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
12757 break;
12758 return;
12759 }
12760 default:
12761 return;
12762 }
12763
12764 // All assembler immediates are 64-bit integers.
12765 Result = DAG.getTargetConstant(Val: CVal, DL: SDLoc(Op), VT: MVT::i64);
12766 break;
12767 }
12768
12769 if (Result.getNode()) {
12770 Ops.push_back(x: Result);
12771 return;
12772 }
12773
12774 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12775}
12776
12777//===----------------------------------------------------------------------===//
12778// AArch64 Advanced SIMD Support
12779//===----------------------------------------------------------------------===//
12780
12781/// WidenVector - Given a value in the V64 register class, produce the
12782/// equivalent value in the V128 register class.
12783static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
12784 EVT VT = V64Reg.getValueType();
12785 unsigned NarrowSize = VT.getVectorNumElements();
12786 MVT EltTy = VT.getVectorElementType().getSimpleVT();
12787 MVT WideTy = MVT::getVectorVT(VT: EltTy, NumElements: 2 * NarrowSize);
12788 SDLoc DL(V64Reg);
12789
12790 return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideTy, N1: DAG.getUNDEF(VT: WideTy),
12791 N2: V64Reg, N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
12792}
12793
12794/// getExtFactor - Determine the adjustment factor for the position when
12795/// generating an "extract from vector registers" instruction.
12796static unsigned getExtFactor(SDValue &V) {
12797 EVT EltType = V.getValueType().getVectorElementType();
12798 return EltType.getSizeInBits() / 8;
12799}
12800
12801// Check if a vector is built from one vector via extracted elements of
12802// another together with an AND mask, ensuring that all elements fit
12803// within range. This can be reconstructed using AND and NEON's TBL1.
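// For example, a v16i8 BUILD_VECTOR whose lanes are src[mask[i] & 0xf] can be
// rebuilt as an AND of the mask vector with 0xf followed by a single tbl1
// using src as the table.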
12804SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) {
12805 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12806 SDLoc DL(Op);
12807 EVT VT = Op.getValueType();
12808 assert(!VT.isScalableVector() &&
12809 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12810
12811 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
12812 // directly to TBL1.
12813 if (VT != MVT::v16i8 && VT != MVT::v8i8)
12814 return SDValue();
12815
12816 unsigned NumElts = VT.getVectorNumElements();
12817 assert((NumElts == 8 || NumElts == 16) &&
12818 "Need to have exactly 8 or 16 elements in vector.");
12819
12820 SDValue SourceVec;
12821 SDValue MaskSourceVec;
12822 SmallVector<SDValue, 16> AndMaskConstants;
12823
12824 for (unsigned i = 0; i < NumElts; ++i) {
12825 SDValue V = Op.getOperand(i);
12826 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12827 return SDValue();
12828
12829 SDValue OperandSourceVec = V.getOperand(i: 0);
12830 if (!SourceVec)
12831 SourceVec = OperandSourceVec;
12832 else if (SourceVec != OperandSourceVec)
12833 return SDValue();
12834
12835 // This only looks at shuffles with elements that are
12836 // a) truncated by a constant AND mask extracted from a mask vector, or
12837 // b) extracted directly from a mask vector.
12838 SDValue MaskSource = V.getOperand(i: 1);
12839 if (MaskSource.getOpcode() == ISD::AND) {
12840 if (!isa<ConstantSDNode>(Val: MaskSource.getOperand(i: 1)))
12841 return SDValue();
12842
12843 AndMaskConstants.push_back(Elt: MaskSource.getOperand(i: 1));
12844 MaskSource = MaskSource->getOperand(Num: 0);
12845 } else if (!AndMaskConstants.empty()) {
12846 // Either all or no operands should have an AND mask.
12847 return SDValue();
12848 }
12849
12850 // An ANY_EXTEND may be inserted between the AND and the source vector
12851 // extraction. We don't care about that, so we can just skip it.
12852 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
12853 MaskSource = MaskSource.getOperand(i: 0);
12854
12855 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12856 return SDValue();
12857
12858 SDValue MaskIdx = MaskSource.getOperand(i: 1);
12859 if (!isa<ConstantSDNode>(Val: MaskIdx) ||
12860 !cast<ConstantSDNode>(Val&: MaskIdx)->getConstantIntValue()->equalsInt(V: i))
12861 return SDValue();
12862
12863 // We only apply this if all elements come from the same vector with the
12864 // same vector type.
12865 if (!MaskSourceVec) {
12866 MaskSourceVec = MaskSource->getOperand(Num: 0);
12867 if (MaskSourceVec.getValueType() != VT)
12868 return SDValue();
12869 } else if (MaskSourceVec != MaskSource->getOperand(Num: 0)) {
12870 return SDValue();
12871 }
12872 }
12873
12874 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12875 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12876 // insert, we know that the index in the mask must be smaller than the number
12877 // of elements in the source, or we would have an out-of-bounds access.
12878 if (NumElts == 8)
12879 SourceVec = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: SourceVec,
12880 N2: DAG.getUNDEF(VT));
12881
12882 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12883 if (!AndMaskConstants.empty())
12884 MaskSourceVec = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: MaskSourceVec,
12885 N2: DAG.getBuildVector(VT, DL, Ops: AndMaskConstants));
12886
12887 return DAG.getNode(
12888 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
12889 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: SourceVec,
12890 N3: MaskSourceVec);
12891}
12892
12893// Gather data to see if the operation can be modelled as a
12894// shuffle in combination with VEXTs.
12895SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
12896 SelectionDAG &DAG) const {
12897 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12898 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12899 SDLoc DL(Op);
12900 EVT VT = Op.getValueType();
12901 assert(!VT.isScalableVector() &&
12902 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12903 unsigned NumElts = VT.getVectorNumElements();
12904
12905 struct ShuffleSourceInfo {
12906 SDValue Vec;
12907 unsigned MinElt;
12908 unsigned MaxElt;
12909
12910 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12911 // be compatible with the shuffle we intend to construct. As a result
12912 // ShuffleVec will be some sliding window into the original Vec.
12913 SDValue ShuffleVec;
12914
    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
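    // For example, if ShuffleVec is the high half of Vec (extracted at index
    // N), WindowBase is -N; if Vec's i32 elements are later viewed as i16
    // shuffle lanes, WindowScale becomes 2 and WindowBase is scaled to match.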
12917 int WindowBase;
12918 int WindowScale;
12919
12920 ShuffleSourceInfo(SDValue Vec)
12921 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12922 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12923
12924 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12925 };
12926
12927 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12928 // node.
12929 SmallVector<ShuffleSourceInfo, 2> Sources;
12930 for (unsigned i = 0; i < NumElts; ++i) {
12931 SDValue V = Op.getOperand(i);
12932 if (V.isUndef())
12933 continue;
12934 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12935 !isa<ConstantSDNode>(Val: V.getOperand(i: 1)) ||
12936 V.getOperand(i: 0).getValueType().isScalableVector()) {
12937 LLVM_DEBUG(
12938 dbgs() << "Reshuffle failed: "
12939 "a shuffle can only come from building a vector from "
12940 "various elements of other fixed-width vectors, provided "
12941 "their indices are constant\n");
12942 return SDValue();
12943 }
12944
12945 // Add this element source to the list if it's not already there.
12946 SDValue SourceVec = V.getOperand(i: 0);
12947 auto Source = find(Range&: Sources, Val: SourceVec);
12948 if (Source == Sources.end())
12949 Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec));
12950
12951 // Update the minimum and maximum lane number seen.
12952 unsigned EltNo = V.getConstantOperandVal(i: 1);
12953 Source->MinElt = std::min(a: Source->MinElt, b: EltNo);
12954 Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo);
12955 }
12956
12957 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12958 // better than moving to/from gpr registers for larger vectors.
12959 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12960 // Construct a mask for the tbl. We may need to adjust the index for types
12961 // larger than i8.
12962 SmallVector<unsigned, 16> Mask;
12963 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12964 for (unsigned I = 0; I < NumElts; ++I) {
12965 SDValue V = Op.getOperand(i: I);
12966 if (V.isUndef()) {
12967 for (unsigned OF = 0; OF < OutputFactor; OF++)
12968 Mask.push_back(Elt: -1);
12969 continue;
12970 }
12971 // Set the Mask lanes adjusted for the size of the input and output
12972 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12973 // output element, adjusted in their positions per input and output types.
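      // For example, with i16 elements on both sides, the element taken from
      // lane L of source S contributes byte indices 16 * S + 2 * L and
      // 16 * S + 2 * L + 1 to the mask.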
12974 unsigned Lane = V.getConstantOperandVal(i: 1);
12975 for (unsigned S = 0; S < Sources.size(); S++) {
12976 if (V.getOperand(i: 0) == Sources[S].Vec) {
12977 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12978 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12979 for (unsigned OF = 0; OF < OutputFactor; OF++)
12980 Mask.push_back(Elt: InputBase + OF);
12981 break;
12982 }
12983 }
12984 }
12985
12986 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12987 // v16i8, and the TBLMask
12988 SmallVector<SDValue, 16> TBLOperands;
12989 TBLOperands.push_back(Elt: DAG.getConstant(Val: Sources.size() == 3
12990 ? Intrinsic::aarch64_neon_tbl3
12991 : Intrinsic::aarch64_neon_tbl4,
12992 DL, VT: MVT::i32));
12993 for (unsigned i = 0; i < Sources.size(); i++) {
12994 SDValue Src = Sources[i].Vec;
12995 EVT SrcVT = Src.getValueType();
12996 Src = DAG.getBitcast(VT: SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, V: Src);
12997 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12998 "Expected a legally typed vector");
12999 if (SrcVT.is64BitVector())
13000 Src = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: Src,
13001 N2: DAG.getUNDEF(VT: MVT::v8i8));
13002 TBLOperands.push_back(Elt: Src);
13003 }
13004
13005 SmallVector<SDValue, 16> TBLMask;
13006 for (unsigned i = 0; i < Mask.size(); i++)
13007 TBLMask.push_back(Elt: DAG.getConstant(Val: Mask[i], DL, VT: MVT::i32));
13008 assert((Mask.size() == 8 || Mask.size() == 16) &&
13009 "Expected a v8i8 or v16i8 Mask");
13010 TBLOperands.push_back(Elt: DAG.getBuildVector(
13011 VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, Ops: TBLMask));
13012
13013 SDValue Shuffle =
13014 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL,
13015 VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, Ops: TBLOperands);
13016 return DAG.getBitcast(VT, V: Shuffle);
13017 }
13018
13019 if (Sources.size() > 2) {
13020 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13021 << "sensible when at most two source vectors are "
13022 << "involved\n");
13023 return SDValue();
13024 }
13025
13026 // Find out the smallest element size among result and two sources, and use
13027 // it as element size to build the shuffle_vector.
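  // For example, a v4i32 result built from a v8i16 source and a v4i32 source
  // uses i16 shuffle lanes, so ResMultiplier is 2 and ShuffleVT is v8i16.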
13028 EVT SmallestEltTy = VT.getVectorElementType();
13029 for (auto &Source : Sources) {
13030 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13031 if (SrcEltTy.bitsLT(VT: SmallestEltTy)) {
13032 SmallestEltTy = SrcEltTy;
13033 }
13034 }
13035 unsigned ResMultiplier =
13036 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13037 uint64_t VTSize = VT.getFixedSizeInBits();
13038 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13039 EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts);
13040
13041 // If the source vector is too wide or too narrow, we may nevertheless be able
13042 // to construct a compatible shuffle either by concatenating it with UNDEF or
13043 // extracting a suitable range of elements.
13044 for (auto &Src : Sources) {
13045 EVT SrcVT = Src.ShuffleVec.getValueType();
13046
13047 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13048 if (SrcVTSize == TypeSize::getFixed(ExactSize: VTSize))
13049 continue;
13050
13051 // This stage of the search produces a source with the same element type as
13052 // the original, but with a total width matching the BUILD_VECTOR output.
13053 EVT EltVT = SrcVT.getVectorElementType();
13054 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13055 EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts);
13056
13057 if (SrcVTSize.getFixedValue() < VTSize) {
13058 assert(2 * SrcVTSize == VTSize);
13059 // We can pad out the smaller vector for free, so if it's part of a
13060 // shuffle...
13061 Src.ShuffleVec =
13062 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: DestVT, N1: Src.ShuffleVec,
13063 N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType()));
13064 continue;
13065 }
13066
13067 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13068 LLVM_DEBUG(
13069 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13070 return SDValue();
13071 }
13072
13073 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13074 LLVM_DEBUG(
13075 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13076 return SDValue();
13077 }
13078
13079 if (Src.MinElt >= NumSrcElts) {
13080 // The extraction can just take the second half
13081 Src.ShuffleVec =
13082 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13083 N2: DAG.getConstant(Val: NumSrcElts, DL, VT: MVT::i64));
13084 Src.WindowBase = -NumSrcElts;
13085 } else if (Src.MaxElt < NumSrcElts) {
13086 // The extraction can just take the first half
13087 Src.ShuffleVec =
13088 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13089 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13090 } else {
13091 // An actual VEXT is needed
13092 SDValue VEXTSrc1 =
13093 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13094 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13095 SDValue VEXTSrc2 =
13096 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: DestVT, N1: Src.ShuffleVec,
13097 N2: DAG.getConstant(Val: NumSrcElts, DL, VT: MVT::i64));
13098 unsigned Imm = Src.MinElt * getExtFactor(V&: VEXTSrc1);
13099
13100 if (!SrcVT.is64BitVector()) {
13101 LLVM_DEBUG(
13102 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13103 "for SVE vectors.");
13104 return SDValue();
13105 }
13106
13107 Src.ShuffleVec =
13108 DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: DestVT, N1: VEXTSrc1, N2: VEXTSrc2,
13109 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
13110 Src.WindowBase = -Src.MinElt;
13111 }
13112 }
13113
13114 // Another possible incompatibility occurs from the vector element types. We
13115 // can fix this by bitcasting the source vectors to the same type we intend
13116 // for the shuffle.
13117 for (auto &Src : Sources) {
13118 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13119 if (SrcEltTy == SmallestEltTy)
13120 continue;
13121 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13122 if (DAG.getDataLayout().isBigEndian()) {
13123 Src.ShuffleVec =
13124 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: ShuffleVT, Operand: Src.ShuffleVec);
13125 } else {
13126 Src.ShuffleVec = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ShuffleVT, Operand: Src.ShuffleVec);
13127 }
13128 Src.WindowScale =
13129 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13130 Src.WindowBase *= Src.WindowScale;
13131 }
13132
13133 // Final check before we try to actually produce a shuffle.
13134 LLVM_DEBUG({
13135 for (auto Src : Sources)
13136 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13137 });
13138
13139 // The stars all align, our next step is to produce the mask for the shuffle.
13140 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13141 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13142 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13143 SDValue Entry = Op.getOperand(i);
13144 if (Entry.isUndef())
13145 continue;
13146
13147 auto Src = find(Range&: Sources, Val: Entry.getOperand(i: 0));
13148 int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue();
13149
13150 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13151 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13152 // segment.
13153 EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType();
13154 int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(),
13155 b: VT.getScalarSizeInBits());
13156 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13157
13158 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13159 // starting at the appropriate offset.
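    // Lanes taken from the second source are offset by NumElts here, matching
    // the operand order passed to getVectorShuffle below.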
13160 int *LaneMask = &Mask[i * ResMultiplier];
13161
13162 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13163 ExtractBase += NumElts * (Src - Sources.begin());
13164 for (int j = 0; j < LanesDefined; ++j)
13165 LaneMask[j] = ExtractBase + j;
13166 }
13167
13168 // Final check before we try to produce nonsense...
13169 if (!isShuffleMaskLegal(M: Mask, VT: ShuffleVT)) {
13170 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13171 return SDValue();
13172 }
13173
13174 SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) };
13175 for (unsigned i = 0; i < Sources.size(); ++i)
13176 ShuffleOps[i] = Sources[i].ShuffleVec;
13177
13178 SDValue Shuffle =
13179 DAG.getVectorShuffle(VT: ShuffleVT, dl: DL, N1: ShuffleOps[0], N2: ShuffleOps[1], Mask);
13180 SDValue V;
13181 if (DAG.getDataLayout().isBigEndian()) {
13182 V = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Shuffle);
13183 } else {
13184 V = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Shuffle);
13185 }
13186
13187 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13188 dbgs() << "Reshuffle, creating node: "; V.dump(););
13189
13190 return V;
13191}
13192
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
13195static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13196 unsigned NumElts = VT.getVectorNumElements();
13197
13198 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13199 if (M[0] < 0)
13200 return false;
13201
13202 Imm = M[0];
13203
13204 // If this is a VEXT shuffle, the immediate value is the index of the first
13205 // element. The other shuffle indices must be the successive elements after
13206 // the first one.
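  // For example, for a single-source v8i8 shuffle the mask
  // <3, 4, 5, 6, 7, 0, 1, 2> is an EXT with Imm == 3.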
13207 unsigned ExpectedElt = Imm;
13208 for (unsigned i = 1; i < NumElts; ++i) {
13209 // Increment the expected index. If it wraps around, just follow it
13210 // back to index zero and keep going.
13211 ++ExpectedElt;
13212 if (ExpectedElt == NumElts)
13213 ExpectedElt = 0;
13214
13215 if (M[i] < 0)
13216 continue; // ignore UNDEF indices
13217 if (ExpectedElt != static_cast<unsigned>(M[i]))
13218 return false;
13219 }
13220
13221 return true;
13222}
13223
// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
// v4i16 or v4i32 sources. This is really a truncate, which we can construct
// out of (legal) concats and truncate nodes.
static SDValue ReconstructTruncateFromBuildVector(SDValue V,
                                                  SelectionDAG &DAG) {
13228 if (V.getValueType() != MVT::v16i8)
13229 return SDValue();
13230 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13231
13232 for (unsigned X = 0; X < 4; X++) {
13233 // Check the first item in each group is an extract from lane 0 of a v4i32
13234 // or v4i16.
13235 SDValue BaseExt = V.getOperand(i: X * 4);
13236 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13237 (BaseExt.getOperand(i: 0).getValueType() != MVT::v4i16 &&
13238 BaseExt.getOperand(i: 0).getValueType() != MVT::v4i32) ||
13239 !isa<ConstantSDNode>(Val: BaseExt.getOperand(i: 1)) ||
13240 BaseExt.getConstantOperandVal(i: 1) != 0)
13241 return SDValue();
13242 SDValue Base = BaseExt.getOperand(i: 0);
13243 // And check the other items are extracts from the same vector.
13244 for (unsigned Y = 1; Y < 4; Y++) {
13245 SDValue Ext = V.getOperand(i: X * 4 + Y);
13246 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13247 Ext.getOperand(i: 0) != Base ||
13248 !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) ||
13249 Ext.getConstantOperandVal(i: 1) != Y)
13250 return SDValue();
13251 }
13252 }
13253
  // Turn the buildvector into a series of truncates and concats, which will
  // become uzip1s. Any v4i32s we found get truncated to v4i16, then pairs are
  // concatenated to produce two v8i16s. Both are truncated to v8i8 and
  // concatenated together into the final v16i8.
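  // I.e. the result is built as
  //   concat(trunc(concat(a, b)), trunc(concat(c, d)))
  // where a..d are the (possibly pre-truncated) v4i16 sources.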
13258 SDLoc DL(V);
13259 SDValue Trunc[4] = {
13260 V.getOperand(i: 0).getOperand(i: 0), V.getOperand(i: 4).getOperand(i: 0),
13261 V.getOperand(i: 8).getOperand(i: 0), V.getOperand(i: 12).getOperand(i: 0)};
13262 for (SDValue &V : Trunc)
13263 if (V.getValueType() == MVT::v4i32)
13264 V = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v4i16, Operand: V);
13265 SDValue Concat0 =
13266 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[0], N2: Trunc[1]);
13267 SDValue Concat1 =
13268 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[2], N2: Trunc[3]);
13269 SDValue Trunc0 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat0);
13270 SDValue Trunc1 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat1);
13271 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: Trunc0, N2: Trunc1);
13272}
13273
/// Check if a vector shuffle corresponds to a DUP instruction with a larger
/// element width than the vector lane type. If that is the case, the function
/// returns true and writes the value of the DUP instruction lane operand into
/// DupLaneOp.
13278static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13279 unsigned &DupLaneOp) {
13280 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13281 "Only possible block sizes for wide DUP are: 16, 32, 64");
13282
13283 if (BlockSize <= VT.getScalarSizeInBits())
13284 return false;
13285 if (BlockSize % VT.getScalarSizeInBits() != 0)
13286 return false;
13287 if (VT.getSizeInBits() % BlockSize != 0)
13288 return false;
13289
13290 size_t SingleVecNumElements = VT.getVectorNumElements();
13291 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13292 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13293
13294 // We are looking for masks like
13295 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13296 // might be replaced by 'undefined'. BlockIndices will eventually contain
13297 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13298 // for the above examples)
13299 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13300 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13301 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13302 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13303 if (Elt < 0)
13304 continue;
13305 // For now we don't support shuffles that use the second operand
13306 if ((unsigned)Elt >= SingleVecNumElements)
13307 return false;
13308 if (BlockElts[I] < 0)
13309 BlockElts[I] = Elt;
13310 else if (BlockElts[I] != Elt)
13311 return false;
13312 }
13313
13314 // We found a candidate block (possibly with some undefs). It must be a
13315 // sequence of consecutive integers starting with a value divisible by
13316 // NumEltsPerBlock with some values possibly replaced by undef-s.
13317
13318 // Find first non-undef element
13319 auto FirstRealEltIter = find_if(Range&: BlockElts, P: [](int Elt) { return Elt >= 0; });
13320 assert(FirstRealEltIter != BlockElts.end() &&
13321 "Shuffle with all-undefs must have been caught by previous cases, "
13322 "e.g. isSplat()");
13323 if (FirstRealEltIter == BlockElts.end()) {
13324 DupLaneOp = 0;
13325 return true;
13326 }
13327
13328 // Index of FirstRealElt in BlockElts
13329 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13330
13331 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13332 return false;
13333 // BlockElts[0] must have the following value if it isn't undef:
13334 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13335
13336 // Check the first element
13337 if (Elt0 % NumEltsPerBlock != 0)
13338 return false;
13339 // Check that the sequence indeed consists of consecutive integers (modulo
13340 // undefs)
13341 for (size_t I = 0; I < NumEltsPerBlock; I++)
13342 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13343 return false;
13344
13345 DupLaneOp = Elt0 / NumEltsPerBlock;
13346 return true;
13347}
13348
// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
13351static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13352 unsigned &Imm) {
13353 // Look for the first non-undef element.
13354 const int *FirstRealElt = find_if(Range&: M, P: [](int Elt) { return Elt >= 0; });
13355
  // Benefit from APInt to handle overflow when calculating expected element.
13357 unsigned NumElts = VT.getVectorNumElements();
13358 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13359 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13360 /*implicitTrunc=*/true);
13361 // The following shuffle indices must be the successive elements after the
13362 // first real element.
13363 bool FoundWrongElt = std::any_of(first: FirstRealElt + 1, last: M.end(), pred: [&](int Elt) {
13364 return Elt != ExpectedElt++ && Elt != -1;
13365 });
13366 if (FoundWrongElt)
13367 return false;
13368
13369 // The index of an EXT is the first element if it is not UNDEF.
13370 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13371 // value of the first element. E.g.
13372 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13373 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13374 // ExpectedElt is the last mask index plus 1.
13375 Imm = ExpectedElt.getZExtValue();
13376
  // There are two different cases that require reversing the input vectors.
  // For example, for vector <4 x i32> we have the following cases:
  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // reversing the two input vectors.
13383 if (Imm < NumElts)
13384 ReverseEXT = true;
13385 else
13386 Imm -= NumElts;
13387
13388 return true;
13389}
13390
13391/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13392/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13393/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13394static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13395 unsigned NumElts = VT.getVectorNumElements();
13396 if (NumElts % 2 != 0)
13397 return false;
13398 WhichResult = (M[0] == 0 ? 0 : 1);
13399 unsigned Idx = WhichResult * NumElts / 2;
13400 for (unsigned i = 0; i != NumElts; i += 2) {
13401 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13402 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13403 return false;
13404 Idx += 1;
13405 }
13406
13407 return true;
13408}
13409
13410/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13411/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
13413static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13414 unsigned Half = VT.getVectorNumElements() / 2;
13415 WhichResult = (M[0] == 0 ? 0 : 1);
13416 for (unsigned j = 0; j != 2; ++j) {
13417 unsigned Idx = WhichResult;
13418 for (unsigned i = 0; i != Half; ++i) {
13419 int MIdx = M[i + j * Half];
13420 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13421 return false;
13422 Idx += 2;
13423 }
13424 }
13425
13426 return true;
13427}
13428
13429/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
13430/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13431/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
13432static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13433 unsigned NumElts = VT.getVectorNumElements();
13434 if (NumElts % 2 != 0)
13435 return false;
13436 WhichResult = (M[0] == 0 ? 0 : 1);
13437 for (unsigned i = 0; i < NumElts; i += 2) {
13438 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
13439 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
13440 return false;
13441 }
13442 return true;
13443}
13444
13445static bool isINSMask(ArrayRef<int> M, int NumInputElements,
13446 bool &DstIsLeft, int &Anomaly) {
13447 if (M.size() != static_cast<size_t>(NumInputElements))
13448 return false;
13449
13450 int NumLHSMatch = 0, NumRHSMatch = 0;
13451 int LastLHSMismatch = -1, LastRHSMismatch = -1;
13452
13453 for (int i = 0; i < NumInputElements; ++i) {
13454 if (M[i] == -1) {
13455 ++NumLHSMatch;
13456 ++NumRHSMatch;
13457 continue;
13458 }
13459
13460 if (M[i] == i)
13461 ++NumLHSMatch;
13462 else
13463 LastLHSMismatch = i;
13464
13465 if (M[i] == i + NumInputElements)
13466 ++NumRHSMatch;
13467 else
13468 LastRHSMismatch = i;
13469 }
13470
13471 if (NumLHSMatch == NumInputElements - 1) {
13472 DstIsLeft = true;
13473 Anomaly = LastLHSMismatch;
13474 return true;
13475 } else if (NumRHSMatch == NumInputElements - 1) {
13476 DstIsLeft = false;
13477 Anomaly = LastRHSMismatch;
13478 return true;
13479 }
13480
13481 return false;
13482}
13483
13484static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
13485 if (VT.getSizeInBits() != 128)
13486 return false;
13487
13488 unsigned NumElts = VT.getVectorNumElements();
13489
13490 for (int I = 0, E = NumElts / 2; I != E; I++) {
13491 if (Mask[I] != I)
13492 return false;
13493 }
13494
13495 int Offset = NumElts / 2;
13496 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
13497 if (Mask[I] != I + SplitLHS * Offset)
13498 return false;
13499 }
13500
13501 return true;
13502}
13503
13504static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
13505 SDLoc DL(Op);
13506 EVT VT = Op.getValueType();
13507 SDValue V0 = Op.getOperand(i: 0);
13508 SDValue V1 = Op.getOperand(i: 1);
13509 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
13510
13511 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
13512 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
13513 return SDValue();
13514
13515 bool SplitV0 = V0.getValueSizeInBits() == 128;
13516
13517 if (!isConcatMask(Mask, VT, SplitLHS: SplitV0))
13518 return SDValue();
13519
13520 EVT CastVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
13521 if (SplitV0) {
13522 V0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V0,
13523 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13524 }
13525 if (V1.getValueSizeInBits() == 128) {
13526 V1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V1,
13527 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
13528 }
13529 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: V0, N2: V1);
13530}
13531
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs, PFEntry is the perfect
/// shuffle table entry, and LHS/RHS are the immediate inputs for this stage
/// of the shuffle.
13537static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
13538 unsigned PFEntry, SDValue LHS,
13539 SDValue RHS, SelectionDAG &DAG,
13540 const SDLoc &DL) {
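  // Each perfect-shuffle table entry packs a 4-bit opcode in bits [29:26] and
  // two 13-bit operand IDs in bits [25:13] and [12:0]. The operand IDs are
  // themselves perfect-shuffle IDs, except that for OP_MOVLANE the RHS ID
  // encodes the lane to move into.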
13541 unsigned OpNum = (PFEntry >> 26) & 0x0F;
13542 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
13543 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
13544
13545 enum {
13546 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
13547 OP_VREV,
13548 OP_VDUP0,
13549 OP_VDUP1,
13550 OP_VDUP2,
13551 OP_VDUP3,
13552 OP_VEXT1,
13553 OP_VEXT2,
13554 OP_VEXT3,
13555 OP_VUZPL, // VUZP, left result
13556 OP_VUZPR, // VUZP, right result
13557 OP_VZIPL, // VZIP, left result
13558 OP_VZIPR, // VZIP, right result
13559 OP_VTRNL, // VTRN, left result
13560 OP_VTRNR, // VTRN, right result
13561 OP_MOVLANE // Move lane. RHSID is the lane to move into
13562 };
13563
13564 if (OpNum == OP_COPY) {
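    // Lane IDs are encoded base 9, one digit per result lane (0-7 for source
    // lanes, 8 for undef); 0123 is the identity copy of LHS and 4567 of RHS.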
13565 if (LHSID == (1 * 9 + 2) * 9 + 3)
13566 return LHS;
13567 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
13568 return RHS;
13569 }
13570
13571 if (OpNum == OP_MOVLANE) {
13572 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
13573 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
13574 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
13575 Elt = 3 - Elt;
13576 while (Elt > 0) {
13577 ID /= 9;
13578 Elt--;
13579 }
13580 return (ID % 9 == 8) ? -1 : ID % 9;
13581 };
13582
13583 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
13584 // get the lane to move from the PFID, which is always from the
13585 // original vectors (V1 or V2).
13586 SDValue OpLHS = GeneratePerfectShuffle(
13587 ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
13588 EVT VT = OpLHS.getValueType();
13589 assert(RHSID < 8 && "Expected a lane index for RHSID!");
13590 unsigned ExtLane = 0;
13591 SDValue Input;
13592
    // OP_MOVLANE shuffles are either D movs (if bit 0x4 is set) or S movs.
    // D movs operate at twice the element width, so the inputs are bitcast to
    // the wider type below.
13595 if (RHSID & 0x4) {
13596 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
13597 if (MaskElt == -1)
13598 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
13599 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13600 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
13601 Input = MaskElt < 2 ? V1 : V2;
13602 if (VT.getScalarSizeInBits() == 16) {
13603 Input = DAG.getBitcast(VT: MVT::v2f32, V: Input);
13604 OpLHS = DAG.getBitcast(VT: MVT::v2f32, V: OpLHS);
13605 } else {
13606 assert(VT.getScalarSizeInBits() == 32 &&
13607 "Expected 16 or 32 bit shuffle elements");
13608 Input = DAG.getBitcast(VT: MVT::v2f64, V: Input);
13609 OpLHS = DAG.getBitcast(VT: MVT::v2f64, V: OpLHS);
13610 }
13611 } else {
13612 int MaskElt = getPFIDLane(ID, RHSID);
13613 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
13614 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
13615 Input = MaskElt < 4 ? V1 : V2;
13616 // Be careful about creating illegal types. Use f16 instead of i16.
13617 if (VT == MVT::v4i16) {
13618 Input = DAG.getBitcast(VT: MVT::v4f16, V: Input);
13619 OpLHS = DAG.getBitcast(VT: MVT::v4f16, V: OpLHS);
13620 }
13621 }
13622 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL,
13623 VT: Input.getValueType().getVectorElementType(),
13624 N1: Input, N2: DAG.getVectorIdxConstant(Val: ExtLane, DL));
13625 SDValue Ins =
13626 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: Input.getValueType(), N1: OpLHS,
13627 N2: Ext, N3: DAG.getVectorIdxConstant(Val: RHSID & 0x3, DL));
13628 return DAG.getBitcast(VT, V: Ins);
13629 }
13630
13631 SDValue OpLHS, OpRHS;
13632 OpLHS = GeneratePerfectShuffle(ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS,
13633 RHS, DAG, DL);
13634 OpRHS = GeneratePerfectShuffle(ID: RHSID, V1, V2, PFEntry: PerfectShuffleTable[RHSID], LHS,
13635 RHS, DAG, DL);
13636 EVT VT = OpLHS.getValueType();
13637
13638 switch (OpNum) {
13639 default:
13640 llvm_unreachable("Unknown shuffle opcode!");
13641 case OP_VREV:
13642 // VREV divides the vector in half and swaps within the half.
13643 if (VT.getVectorElementType() == MVT::i32 ||
13644 VT.getVectorElementType() == MVT::f32)
13645 return DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT, Operand: OpLHS);
13646 // vrev <4 x i16> -> REV32
13647 if (VT.getVectorElementType() == MVT::i16 ||
13648 VT.getVectorElementType() == MVT::f16 ||
13649 VT.getVectorElementType() == MVT::bf16)
13650 return DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT, Operand: OpLHS);
13651 // vrev <4 x i8> -> REV16
13652 assert(VT.getVectorElementType() == MVT::i8);
13653 return DAG.getNode(Opcode: AArch64ISD::REV16, DL, VT, Operand: OpLHS);
13654 case OP_VDUP0:
13655 case OP_VDUP1:
13656 case OP_VDUP2:
13657 case OP_VDUP3: {
13658 EVT EltTy = VT.getVectorElementType();
13659 unsigned Opcode;
13660 if (EltTy == MVT::i8)
13661 Opcode = AArch64ISD::DUPLANE8;
13662 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
13663 Opcode = AArch64ISD::DUPLANE16;
13664 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
13665 Opcode = AArch64ISD::DUPLANE32;
13666 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
13667 Opcode = AArch64ISD::DUPLANE64;
13668 else
13669 llvm_unreachable("Invalid vector element type?");
13670
13671 if (VT.getSizeInBits() == 64)
13672 OpLHS = WidenVector(V64Reg: OpLHS, DAG);
13673 SDValue Lane = DAG.getConstant(Val: OpNum - OP_VDUP0, DL, VT: MVT::i64);
13674 return DAG.getNode(Opcode, DL, VT, N1: OpLHS, N2: Lane);
13675 }
13676 case OP_VEXT1:
13677 case OP_VEXT2:
13678 case OP_VEXT3: {
13679 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(V&: OpLHS);
13680 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT, N1: OpLHS, N2: OpRHS,
13681 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
13682 }
13683 case OP_VUZPL:
13684 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT, N1: OpLHS, N2: OpRHS);
13685 case OP_VUZPR:
13686 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT, N1: OpLHS, N2: OpRHS);
13687 case OP_VZIPL:
13688 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT, N1: OpLHS, N2: OpRHS);
13689 case OP_VZIPR:
13690 return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT, N1: OpLHS, N2: OpRHS);
13691 case OP_VTRNL:
13692 return DAG.getNode(Opcode: AArch64ISD::TRN1, DL, VT, N1: OpLHS, N2: OpRHS);
13693 case OP_VTRNR:
13694 return DAG.getNode(Opcode: AArch64ISD::TRN2, DL, VT, N1: OpLHS, N2: OpRHS);
13695 }
13696}
13697
13698static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
13699 SelectionDAG &DAG) {
13700 // Check to see if we can use the TBL instruction.
13701 SDValue V1 = Op.getOperand(i: 0);
13702 SDValue V2 = Op.getOperand(i: 1);
13703 SDLoc DL(Op);
13704
13705 EVT EltVT = Op.getValueType().getVectorElementType();
13706 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
13707
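  // If V1 is undef or zero, swap the operands so that the (potentially) live
  // vector becomes the TBL table; the byte indices are rotated by IndexLen
  // below to compensate.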
13708 bool Swap = false;
13709 if (V1.isUndef() || isZerosVector(N: V1.getNode())) {
13710 std::swap(a&: V1, b&: V2);
13711 Swap = true;
13712 }
13713
13714 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
13715 // out of range values with 0s. We do need to make sure that any out-of-range
13716 // values are really out-of-range for a v16i8 vector.
13717 bool IsUndefOrZero = V2.isUndef() || isZerosVector(N: V2.getNode());
13718 MVT IndexVT = MVT::v8i8;
13719 unsigned IndexLen = 8;
13720 if (Op.getValueSizeInBits() == 128) {
13721 IndexVT = MVT::v16i8;
13722 IndexLen = 16;
13723 }
13724
13725 SmallVector<SDValue, 8> TBLMask;
13726 for (int Val : ShuffleMask) {
13727 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
13728 unsigned Offset = Byte + Val * BytesPerElt;
13729 if (Swap)
13730 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
13731 if (IsUndefOrZero && Offset >= IndexLen)
13732 Offset = 255;
13733 TBLMask.push_back(Elt: DAG.getConstant(Val: Offset, DL, VT: MVT::i32));
13734 }
13735 }
13736
13737 SDValue V1Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V1);
13738 SDValue V2Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V2);
13739
13740 SDValue Shuffle;
13741 if (IsUndefOrZero) {
13742 if (IndexLen == 8)
13743 V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V1Cst);
13744 Shuffle = DAG.getNode(
13745 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
13746 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst,
13747 N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
13748 } else {
13749 if (IndexLen == 8) {
13750 V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V2Cst);
13751 Shuffle = DAG.getNode(
13752 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
13753 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst,
13754 N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
13755 } else {
13756 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
13757 // cannot currently represent the register constraints on the input
13758 // table registers.
13759 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
13760 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
13761 // IndexLen));
13762 Shuffle = DAG.getNode(
13763 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT,
13764 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl2, DL, VT: MVT::i32), N2: V1Cst,
13765 N3: V2Cst,
13766 N4: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)));
13767 }
13768 }
13769 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
13770}
13771
13772static unsigned getDUPLANEOp(EVT EltType) {
13773 if (EltType == MVT::i8)
13774 return AArch64ISD::DUPLANE8;
13775 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
13776 return AArch64ISD::DUPLANE16;
13777 if (EltType == MVT::i32 || EltType == MVT::f32)
13778 return AArch64ISD::DUPLANE32;
13779 if (EltType == MVT::i64 || EltType == MVT::f64)
13780 return AArch64ISD::DUPLANE64;
13781
13782 llvm_unreachable("Invalid vector element type?");
13783}
13784
13785static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
13786 unsigned Opcode, SelectionDAG &DAG) {
13787 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
13788 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
13789 // Match: dup (bitcast (extract_subv X, C)), LaneC
13790 if (BitCast.getOpcode() != ISD::BITCAST ||
13791 BitCast.getOperand(i: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
13792 return false;
13793
13794 // The extract index must align in the destination type. That may not
13795 // happen if the bitcast is from narrow to wide type.
13796 SDValue Extract = BitCast.getOperand(i: 0);
13797 unsigned ExtIdx = Extract.getConstantOperandVal(i: 1);
13798 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
13799 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
13800 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
13801 if (ExtIdxInBits % CastedEltBitWidth != 0)
13802 return false;
13803
13804 // Can't handle cases where vector size is not 128-bit
13805 if (!Extract.getOperand(i: 0).getValueType().is128BitVector())
13806 return false;
13807
13808 // Update the lane value by offsetting with the scaled extract index.
13809 LaneC += ExtIdxInBits / CastedEltBitWidth;
13810
13811 // Determine the casted vector type of the wide vector input.
13812 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
13813 // Examples:
13814 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
13815 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
13816 unsigned SrcVecNumElts =
13817 Extract.getOperand(i: 0).getValueSizeInBits() / CastedEltBitWidth;
13818 CastVT = MVT::getVectorVT(VT: BitCast.getSimpleValueType().getScalarType(),
13819 NumElements: SrcVecNumElts);
13820 return true;
13821 };
13822 MVT CastVT;
13823 if (getScaledOffsetDup(V, Lane, CastVT)) {
13824 V = DAG.getBitcast(VT: CastVT, V: V.getOperand(i: 0).getOperand(i: 0));
13825 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13826 V.getOperand(i: 0).getValueType().is128BitVector()) {
13827 // The lane is incremented by the index of the extract.
13828 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
13829 Lane += V.getConstantOperandVal(i: 1);
13830 V = V.getOperand(i: 0);
13831 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
13832 // The lane is decremented if we are splatting from the 2nd operand.
13833 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
13834 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
13835 Lane -= Idx * VT.getVectorNumElements() / 2;
13836 V = WidenVector(V64Reg: V.getOperand(i: Idx), DAG);
13837 } else if (VT.getSizeInBits() == 64) {
13838 // Widen the operand to 128-bit register with undef.
13839 V = WidenVector(V64Reg: V, DAG);
13840 }
13841 return DAG.getNode(Opcode, DL, VT, N1: V, N2: DAG.getConstant(Val: Lane, DL, VT: MVT::i64));
13842}
13843
13844// Try to widen element type to get a new mask value for a better permutation
13845// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13846// UZP1/2, TRN1/2, REV, INS, etc.
13847// For example:
13848// shufflevector <4 x i32> %a, <4 x i32> %b,
13849// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13850// is equivalent to:
13851// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13852// Finally, we can get:
13853// mov v0.d[0], v1.d[1]
13854static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13855 SDLoc DL(Op);
13856 EVT VT = Op.getValueType();
13857 EVT ScalarVT = VT.getVectorElementType();
13858 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13859 SDValue V0 = Op.getOperand(i: 0);
13860 SDValue V1 = Op.getOperand(i: 1);
13861 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask();
13862
  // We combine adjacent elements, like two i16's -> i32 or two i32's -> i64.
  // We need to make sure the wider element type is legal: ElementSize should
  // not be larger than 32 bits, and the i1 type should also be excluded.
13866 if (ElementSize > 32 || ElementSize == 1)
13867 return SDValue();
13868
13869 SmallVector<int, 8> NewMask;
13870 if (widenShuffleMaskElts(M: Mask, NewMask)) {
13871 MVT NewEltVT = VT.isFloatingPoint()
13872 ? MVT::getFloatingPointVT(BitWidth: ElementSize * 2)
13873 : MVT::getIntegerVT(BitWidth: ElementSize * 2);
13874 MVT NewVT = MVT::getVectorVT(VT: NewEltVT, NumElements: VT.getVectorNumElements() / 2);
13875 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: NewVT)) {
13876 V0 = DAG.getBitcast(VT: NewVT, V: V0);
13877 V1 = DAG.getBitcast(VT: NewVT, V: V1);
13878 return DAG.getBitcast(VT,
13879 V: DAG.getVectorShuffle(VT: NewVT, dl: DL, N1: V0, N2: V1, Mask: NewMask));
13880 }
13881 }
13882
13883 return SDValue();
13884}
13885
13886// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
13887static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13888 ArrayRef<int> ShuffleMask,
13889 SelectionDAG &DAG) {
13890 SDValue Tbl1 = Op->getOperand(Num: 0);
13891 SDValue Tbl2 = Op->getOperand(Num: 1);
13892 SDLoc DL(Op);
13893 SDValue Tbl2ID =
13894 DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl2, DL, VT: MVT::i64);
13895
13896 EVT VT = Op.getValueType();
13897 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13898 Tbl1.getOperand(i: 0) != Tbl2ID ||
13899 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13900 Tbl2.getOperand(i: 0) != Tbl2ID)
13901 return SDValue();
13902
13903 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
13904 return SDValue();
13905
13906 SDValue Mask1 = Tbl1.getOperand(i: 3);
13907 SDValue Mask2 = Tbl2.getOperand(i: 3);
13908 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
13909 Mask2.getOpcode() != ISD::BUILD_VECTOR)
13910 return SDValue();
13911
13912 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13913 for (unsigned I = 0; I < 16; I++) {
13914 if (ShuffleMask[I] < 16)
13915 TBLMaskParts[I] = Mask1.getOperand(i: ShuffleMask[I]);
13916 else {
13917 auto *C = dyn_cast<ConstantSDNode>(Val: Mask2.getOperand(i: ShuffleMask[I] - 16));
13918 if (!C)
13919 return SDValue();
13920 TBLMaskParts[I] = DAG.getConstant(Val: C->getSExtValue() + 32, DL, VT: MVT::i32);
13921 }
13922 }
13923
13924 SDValue TBLMask = DAG.getBuildVector(VT, DL, Ops: TBLMaskParts);
13925 SDValue ID =
13926 DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl4, DL, VT: MVT::i64);
13927
13928 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::v16i8,
13929 Ops: {ID, Tbl1->getOperand(Num: 1), Tbl1->getOperand(Num: 2),
13930 Tbl2->getOperand(Num: 1), Tbl2->getOperand(Num: 2), TBLMask});
13931}
13932
13933// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13934// but we don't have an appropriate instruction,
13935// so custom-lower it as ZIP1-with-zeros.
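// For example, (zero_extend_vector_inreg v8i16 (v16i8 X)) becomes
// (bitcast v8i16 (ZIP1 v16i8 X, zeroinitializer)), interleaving each source
// byte with a zero byte.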
13936SDValue
13937AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13938 SelectionDAG &DAG) const {
13939 SDLoc DL(Op);
13940 EVT VT = Op.getValueType();
13941 SDValue SrcOp = Op.getOperand(i: 0);
13942 EVT SrcVT = SrcOp.getValueType();
13943 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13944 "Unexpected extension factor.");
13945 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13946 // FIXME: support multi-step zipping?
13947 if (Scale != 2)
13948 return SDValue();
13949 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: SrcVT);
13950 return DAG.getBitcast(VT,
13951 V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: SrcVT, N1: SrcOp, N2: Zeros));
13952}
13953
13954SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13955 SelectionDAG &DAG) const {
13956 SDLoc DL(Op);
13957 EVT VT = Op.getValueType();
13958
13959 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
13960
13961 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
13962 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13963
13964 // Convert shuffles that are directly supported on NEON to target-specific
13965 // DAG nodes, instead of keeping them as shuffles and matching them again
13966 // during code selection. This is more efficient and avoids the possibility
13967 // of inconsistencies between legalization and selection.
13968 ArrayRef<int> ShuffleMask = SVN->getMask();
13969
13970 SDValue V1 = Op.getOperand(i: 0);
13971 SDValue V2 = Op.getOperand(i: 1);
13972
13973 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13974 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13975 "Unexpected VECTOR_SHUFFLE mask size!");
13976
13977 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13978 return Res;
13979
13980 if (SVN->isSplat()) {
13981 int Lane = SVN->getSplatIndex();
    // If this is an undef splat, treat it as a splat of lane 0 so that it can
    // still be generated via a plain DUP, if possible.
13983 if (Lane == -1)
13984 Lane = 0;
13985
13986 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13987 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT: V1.getValueType(),
13988 Operand: V1.getOperand(i: 0));
13989 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13990 // constant. If so, we can just reference the lane's definition directly.
13991 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13992 !isa<ConstantSDNode>(Val: V1.getOperand(i: Lane)))
13993 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT, Operand: V1.getOperand(i: Lane));
13994
13995 // Otherwise, duplicate from the lane of the input vector.
13996 unsigned Opcode = getDUPLANEOp(EltType: V1.getValueType().getVectorElementType());
13997 return constructDup(V: V1, Lane, DL, VT, Opcode, DAG);
13998 }
13999
14000 // Check if the mask matches a DUP for a wider element
14001 for (unsigned LaneSize : {64U, 32U, 16U}) {
14002 unsigned Lane = 0;
14003 if (isWideDUPMask(M: ShuffleMask, VT, BlockSize: LaneSize, DupLaneOp&: Lane)) {
14004 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14005 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14006 : AArch64ISD::DUPLANE16;
14007 // Cast V1 to an integer vector with required lane size
14008 MVT NewEltTy = MVT::getIntegerVT(BitWidth: LaneSize);
14009 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14010 MVT NewVecTy = MVT::getVectorVT(VT: NewEltTy, NumElements: NewEltCount);
14011 V1 = DAG.getBitcast(VT: NewVecTy, V: V1);
14012 // Construct the DUP instruction
14013 V1 = constructDup(V: V1, Lane, DL, VT: NewVecTy, Opcode, DAG);
14014 // Cast back to the original type
14015 return DAG.getBitcast(VT, V: V1);
14016 }
14017 }
14018
14019 unsigned NumElts = VT.getVectorNumElements();
14020 unsigned EltSize = VT.getScalarSizeInBits();
14021 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 64))
14022 return DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: V1.getValueType(), Operand: V1);
14023 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 32))
14024 return DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: V1.getValueType(), Operand: V1);
14025 if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 16))
14026 return DAG.getNode(Opcode: AArch64ISD::REV16, DL, VT: V1.getValueType(), Operand: V1);
14027
14028 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14029 ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size())) {
14030 SDValue Rev = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT, Operand: V1);
14031 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT, N1: Rev, N2: Rev,
14032 N3: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
14033 }
14034
14035 bool ReverseEXT = false;
14036 unsigned Imm;
14037 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm)) {
14038 if (ReverseEXT)
14039 std::swap(a&: V1, b&: V2);
14040 Imm *= getExtFactor(V&: V1);
14041 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: V1.getValueType(), N1: V1, N2: V2,
14042 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
14043 } else if (V2->isUndef() && isSingletonEXTMask(M: ShuffleMask, VT, Imm)) {
14044 Imm *= getExtFactor(V&: V1);
14045 return DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: V1.getValueType(), N1: V1, N2: V1,
14046 N3: DAG.getConstant(Val: Imm, DL, VT: MVT::i32));
14047 }
14048
14049 unsigned WhichResult;
14050 if (isZIPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) {
14051 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14052 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V2);
14053 }
14054 if (isUZPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) {
14055 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14056 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V2);
14057 }
14058 if (isTRNMask(M: ShuffleMask, NumElts, WhichResult)) {
14059 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14060 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V2);
14061 }
14062
14063 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
14064 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14065 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V1);
14066 }
14067 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
14068 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14069 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V1);
14070 }
14071 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
14072 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14073 return DAG.getNode(Opcode: Opc, DL, VT: V1.getValueType(), N1: V1, N2: V1);
14074 }
14075
14076 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14077 return Concat;
14078
14079 bool DstIsLeft;
14080 int Anomaly;
14081 int NumInputElements = V1.getValueType().getVectorNumElements();
14082 if (isINSMask(M: ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14083 SDValue DstVec = DstIsLeft ? V1 : V2;
14084 SDValue DstLaneV = DAG.getConstant(Val: Anomaly, DL, VT: MVT::i64);
14085
14086 SDValue SrcVec = V1;
14087 int SrcLane = ShuffleMask[Anomaly];
14088 if (SrcLane >= NumInputElements) {
14089 SrcVec = V2;
14090 SrcLane -= NumElts;
14091 }
14092 SDValue SrcLaneV = DAG.getConstant(Val: SrcLane, DL, VT: MVT::i64);
14093
14094 EVT ScalarVT = VT.getVectorElementType();
14095
14096 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14097 ScalarVT = MVT::i32;
14098
14099 return DAG.getNode(
14100 Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: DstVec,
14101 N2: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarVT, N1: SrcVec, N2: SrcLaneV),
14102 N3: DstLaneV);
14103 }
14104
14105 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14106 return NewSD;
14107
14108 // If the shuffle is not directly supported and it has 4 elements, use
14109 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14110 if (NumElts == 4) {
14111 unsigned PFIndexes[4];
14112 for (unsigned i = 0; i != 4; ++i) {
14113 if (ShuffleMask[i] < 0)
14114 PFIndexes[i] = 8;
14115 else
14116 PFIndexes[i] = ShuffleMask[i];
14117 }
14118
14119 // Compute the index in the perfect shuffle table.
14120 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14121 PFIndexes[2] * 9 + PFIndexes[3];
14122 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14123 return GeneratePerfectShuffle(ID: PFTableIndex, V1, V2, PFEntry, LHS: V1, RHS: V2, DAG,
14124 DL);
14125 }
14126
14127 // Check for a "select shuffle", generating a BSL to pick between lanes in
14128 // V1/V2.
14129 if (ShuffleVectorInst::isSelectMask(Mask: ShuffleMask, NumSrcElts: NumElts)) {
14130 assert(VT.getScalarSizeInBits() <= 32 &&
14131 "Expected larger vector element sizes to be handled already");
14132 SmallVector<SDValue> MaskElts;
14133 for (int M : ShuffleMask)
14134 MaskElts.push_back(Elt: DAG.getConstant(
14135 Val: M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, VT: MVT::i32));
14136 EVT IVT = VT.changeVectorElementTypeToInteger();
14137 SDValue MaskConst = DAG.getBuildVector(VT: IVT, DL, Ops: MaskElts);
14138 return DAG.getBitcast(VT, V: DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: IVT, N1: MaskConst,
14139 N2: DAG.getBitcast(VT: IVT, V: V1),
14140 N3: DAG.getBitcast(VT: IVT, V: V2)));
14141 }
14142
14143 // Fall back to generating a TBL
14144 return GenerateTBL(Op, ShuffleMask, DAG);
14145}
14146
14147SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14148 SelectionDAG &DAG) const {
14149 EVT VT = Op.getValueType();
14150
14151 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
14152 return LowerToScalableOp(Op, DAG);
14153
14154 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14155 "Unexpected vector type!");
14156
14157 // We can handle the constant cases during isel.
14158 if (isa<ConstantSDNode>(Val: Op.getOperand(i: 0)))
14159 return Op;
14160
14161 // There isn't a natural way to handle the general i1 case, so we use some
14162 // trickery with whilelo.
14163 SDLoc DL(Op);
14164 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: MVT::i64);
14165 SplatVal = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i64, N1: SplatVal,
14166 N2: DAG.getValueType(MVT::i1));
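  // After the sign-extension the splat value is either 0 or all-ones, so
  // whilelo(0, SplatVal) produces either an all-false or an all-true
  // predicate.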
14167 SDValue ID =
14168 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo, DL, VT: MVT::i64);
14169 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
14170 if (VT == MVT::nxv1i1)
14171 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::nxv1i1,
14172 N1: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::nxv2i1, N1: ID,
14173 N2: Zero, N3: SplatVal),
14174 N2: Zero);
14175 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: ID, N2: Zero, N3: SplatVal);
14176}
14177
14178SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14179 SelectionDAG &DAG) const {
14180 SDLoc DL(Op);
14181
14182 EVT VT = Op.getValueType();
14183 if (!isTypeLegal(VT) || !VT.isScalableVector())
14184 return SDValue();
14185
14186 // Current lowering only supports the SVE-ACLE types.
14187 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14188 return SDValue();
14189
14190 // The DUPQ operation is independent of element type so normalise to i64s.
14191 SDValue Idx128 = Op.getOperand(i: 2);
14192
14193 // DUPQ can be used when idx is in range.
14194 auto *CIdx = dyn_cast<ConstantSDNode>(Val&: Idx128);
14195 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14196 SDValue CI = DAG.getTargetConstant(Val: CIdx->getZExtValue(), DL, VT: MVT::i64);
14197 return DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT, N1: Op.getOperand(i: 1), N2: CI);
14198 }
14199
14200 SDValue V = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv2i64, Operand: Op.getOperand(i: 1));
14201
14202 // The ACLE says this must produce the same result as:
14203 // svtbl(data, svadd_x(svptrue_b64(),
14204 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14205 // index * 2))
14206 SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i64);
14207 SDValue SplatOne = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: One);
14208
14209 // create the vector 0,1,0,1,...
14210 SDValue SV = DAG.getStepVector(DL, ResVT: MVT::nxv2i64);
14211 SV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatOne);
14212
14213 // create the vector idx64,idx64+1,idx64,idx64+1,...
14214 SDValue Idx64 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Idx128, N2: Idx128);
14215 SDValue SplatIdx64 = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Idx64);
14216 SDValue ShuffleMask = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatIdx64);
14217
14218 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14219 SDValue TBL = DAG.getNode(Opcode: AArch64ISD::TBL, DL, VT: MVT::nxv2i64, N1: V, N2: ShuffleMask);
14220 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: TBL);
14221}
14222
14223
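// Extract the splatted constant bits of BVN, replicated across the full vector
// width, into CnstBits; UndefBits is the same pattern with the bits that come
// from undef lanes flipped. Returns false if BVN is not a constant splat.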
14224static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14225 APInt &UndefBits) {
14226 EVT VT = BVN->getValueType(ResNo: 0);
14227 APInt SplatBits, SplatUndef;
14228 unsigned SplatBitSize;
14229 bool HasAnyUndefs;
14230 if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14231 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14232
14233 for (unsigned i = 0; i < NumSplats; ++i) {
14234 CnstBits <<= SplatBitSize;
14235 UndefBits <<= SplatBitSize;
14236 CnstBits |= SplatBits.zextOrTrunc(width: VT.getSizeInBits());
14237 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(width: VT.getSizeInBits());
14238 }
14239
14240 return true;
14241 }
14242
14243 return false;
14244}
14245
14246// Try 64-bit splatted SIMD immediate.
14247static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14248 const APInt &Bits) {
14249 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14250 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14251 EVT VT = Op.getValueType();
14252 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14253
14254 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Value)) {
14255 Value = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Value);
14256
14257 SDLoc DL(Op);
14258 SDValue Mov =
14259 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, Operand: DAG.getConstant(Val: Value, DL, VT: MVT::i32));
14260 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14261 }
14262 }
14263
14264 return SDValue();
14265}
14266
14267// Try 32-bit splatted SIMD immediate.
14268static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14269 const APInt &Bits,
14270 const SDValue *LHS = nullptr) {
14271 EVT VT = Op.getValueType();
14272 if (VT.isFixedLengthVector() &&
14273 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14274 return SDValue();
14275
14276 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14277 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14278 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14279 bool isAdvSIMDModImm = false;
14280 uint64_t Shift;
14281
14282 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Imm: Value))) {
14283 Value = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Value);
14284 Shift = 0;
14285 }
14286 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Imm: Value))) {
14287 Value = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Value);
14288 Shift = 8;
14289 }
14290 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Imm: Value))) {
14291 Value = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Value);
14292 Shift = 16;
14293 }
14294 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Imm: Value))) {
14295 Value = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Value);
14296 Shift = 24;
14297 }
14298
14299 if (isAdvSIMDModImm) {
14300 SDLoc DL(Op);
14301 SDValue Mov;
14302
14303 if (LHS)
14304 Mov = DAG.getNode(Opcode: NewOp, DL, VT: MovTy,
14305 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MovTy, Operand: *LHS),
14306 N2: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14307 N3: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14308 else
14309 Mov =
14310 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, N1: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14311 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14312
14313 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14314 }
14315 }
14316
14317 return SDValue();
14318}
14319
14320// Try 16-bit splatted SIMD immediate.
14321static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14322 const APInt &Bits,
14323 const SDValue *LHS = nullptr) {
14324 EVT VT = Op.getValueType();
14325 if (VT.isFixedLengthVector() &&
14326 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14327 return SDValue();
14328
14329 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14330 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14331 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14332 bool isAdvSIMDModImm = false;
14333 uint64_t Shift;
14334
14335 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Imm: Value))) {
14336 Value = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Value);
14337 Shift = 0;
14338 }
14339 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Imm: Value))) {
14340 Value = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Value);
14341 Shift = 8;
14342 }
14343
14344 if (isAdvSIMDModImm) {
14345 SDLoc DL(Op);
14346 SDValue Mov;
14347
14348 if (LHS)
14349 Mov = DAG.getNode(Opcode: NewOp, DL, VT: MovTy,
14350 N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MovTy, Operand: *LHS),
14351 N2: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14352 N3: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14353 else
14354 Mov =
14355 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, N1: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14356 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14357
14358 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14359 }
14360 }
14361
14362 return SDValue();
14363}
14364
14365// Try 32-bit splatted SIMD immediate with shifted ones.
14366static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14367 SelectionDAG &DAG, const APInt &Bits) {
14368 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14369 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14370 EVT VT = Op.getValueType();
14371 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14372 bool isAdvSIMDModImm = false;
14373 uint64_t Shift;
14374
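    // Note: the shift values below are not plain LSL amounts; 264 and 272
    // correspond to the MSL #8 and MSL #16 "shifted ones" immediates
    // (AdvSIMDModImmType7/8).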
14375 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Imm: Value))) {
14376 Value = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Value);
14377 Shift = 264;
14378 }
14379 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Imm: Value))) {
14380 Value = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Value);
14381 Shift = 272;
14382 }
14383
14384 if (isAdvSIMDModImm) {
14385 SDLoc DL(Op);
14386 SDValue Mov =
14387 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, N1: DAG.getConstant(Val: Value, DL, VT: MVT::i32),
14388 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
14389 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14390 }
14391 }
14392
14393 return SDValue();
14394}
14395
14396// Try 8-bit splatted SIMD immediate.
14397static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14398 const APInt &Bits) {
14399 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14400 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14401 EVT VT = Op.getValueType();
14402 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14403
14404 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Value)) {
14405 Value = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Value);
14406
14407 SDLoc DL(Op);
14408 SDValue Mov =
14409 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, Operand: DAG.getConstant(Val: Value, DL, VT: MVT::i32));
14410 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14411 }
14412 }
14413
14414 return SDValue();
14415}
14416
14417// Try FP splatted SIMD immediate.
14418static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14419 const APInt &Bits) {
14420 if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) {
14421 uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue();
14422 EVT VT = Op.getValueType();
14423 bool isWide = (VT.getSizeInBits() == 128);
14424 MVT MovTy;
14425 bool isAdvSIMDModImm = false;
14426
14427 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Imm: Value))) {
14428 Value = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Value);
14429 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
14430 }
14431 else if (isWide &&
14432 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Imm: Value))) {
14433 Value = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Value);
14434 MovTy = MVT::v2f64;
14435 }
14436
14437 if (isAdvSIMDModImm) {
14438 SDLoc DL(Op);
14439 SDValue Mov =
14440 DAG.getNode(Opcode: NewOp, DL, VT: MovTy, Operand: DAG.getConstant(Val: Value, DL, VT: MVT::i32));
14441 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Mov);
14442 }
14443 }
14444
14445 return SDValue();
14446}
14447
14448// Specialized code to quickly find if PotentialBVec is a BuildVector that
14449// consists only of the same constant int value, which is returned in the
14450// reference arg ConstVal.
14451static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
14452 uint64_t &ConstVal) {
14453 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(Val: PotentialBVec);
14454 if (!Bvec)
14455 return false;
14456 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: 0));
14457 if (!FirstElt)
14458 return false;
14459 EVT VT = Bvec->getValueType(ResNo: 0);
14460 unsigned NumElts = VT.getVectorNumElements();
14461 for (unsigned i = 1; i < NumElts; ++i)
14462 if (dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: i)) != FirstElt)
14463 return false;
14464 ConstVal = FirstElt->getZExtValue();
14465 return true;
14466}
14467
14468static bool isAllInactivePredicate(SDValue N) {
14469 // Look through cast.
14470 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
14471 N = N.getOperand(i: 0);
14472
14473 return ISD::isConstantSplatVectorAllZeros(N: N.getNode());
14474}
14475
14476static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
14477 unsigned NumElts = N.getValueType().getVectorMinNumElements();
14478
14479 // Look through cast.
14480 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
14481 N = N.getOperand(i: 0);
14482 // When reinterpreting from a type with fewer elements the "new" elements
14483 // are not active, so bail if they're likely to be used.
14484 if (N.getValueType().getVectorMinNumElements() < NumElts)
14485 return false;
14486 }
14487
14488 if (ISD::isConstantSplatVectorAllOnes(N: N.getNode()))
14489 return true;
14490
14491 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
14492 // or smaller than the implicit element type represented by N.
14493 // NOTE: A larger element count implies a smaller element type.
14494 if (N.getOpcode() == AArch64ISD::PTRUE &&
14495 N.getConstantOperandVal(i: 0) == AArch64SVEPredPattern::all)
14496 return N.getValueType().getVectorMinNumElements() >= NumElts;
14497
14498 // If we're compiling for a specific vector-length, we can check if the
14499 // pattern's VL equals that of the scalable vector at runtime.
14500 if (N.getOpcode() == AArch64ISD::PTRUE) {
14501 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
14502 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
14503 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
14504 if (MaxSVESize && MinSVESize == MaxSVESize) {
14505 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
14506 unsigned PatNumElts =
14507 getNumElementsFromSVEPredPattern(Pattern: N.getConstantOperandVal(i: 0));
14508 return PatNumElts == (NumElts * VScale);
14509 }
14510 }
14511
14512 return false;
14513}
14514
14515// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
14516// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
14517// BUILD_VECTORs with constant element C1, C2 is a constant, and:
14518// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
14519// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
14520// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
14521static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
14522 EVT VT = N->getValueType(ResNo: 0);
14523
14524 if (!VT.isVector())
14525 return SDValue();
14526
14527 SDLoc DL(N);
14528
14529 SDValue And;
14530 SDValue Shift;
14531
14532 SDValue FirstOp = N->getOperand(Num: 0);
14533 unsigned FirstOpc = FirstOp.getOpcode();
14534 SDValue SecondOp = N->getOperand(Num: 1);
14535 unsigned SecondOpc = SecondOp.getOpcode();
14536
14537 // Is one of the operands an AND or a BICi? The AND may have been optimised to
14538 // a BICi in order to use an immediate instead of a register.
14539  // Is the other operand a shl or lshr? This will have been turned into:
14540 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
14541 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
14542 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
14543 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
14544 SecondOpc == AArch64ISD::SHL_PRED ||
14545 SecondOpc == AArch64ISD::SRL_PRED)) {
14546 And = FirstOp;
14547 Shift = SecondOp;
14548
14549 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
14550 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
14551 FirstOpc == AArch64ISD::SHL_PRED ||
14552 FirstOpc == AArch64ISD::SRL_PRED)) {
14553 And = SecondOp;
14554 Shift = FirstOp;
14555 } else
14556 return SDValue();
14557
14558 bool IsAnd = And.getOpcode() == ISD::AND;
14559 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
14560 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14561 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
14562 Shift.getOpcode() == AArch64ISD::SRL_PRED;
14563
14564 // Is the shift amount constant and are all lanes active?
14565 uint64_t C2;
14566 if (ShiftHasPredOp) {
14567 if (!isAllActivePredicate(DAG, N: Shift.getOperand(i: 0)))
14568 return SDValue();
14569 APInt C;
14570 if (!ISD::isConstantSplatVector(N: Shift.getOperand(i: 2).getNode(), SplatValue&: C))
14571 return SDValue();
14572 C2 = C.getZExtValue();
14573 } else if (ConstantSDNode *C2node =
14574 dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)))
14575 C2 = C2node->getZExtValue();
14576 else
14577 return SDValue();
14578
14579 APInt C1AsAPInt;
14580 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
14581 if (IsAnd) {
14582 // Is the and mask vector all constant?
14583 if (!ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: C1AsAPInt))
14584 return SDValue();
14585 } else {
14586 // Reconstruct the corresponding AND immediate from the two BICi immediates.
14587 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 1));
14588 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 2));
14589 assert(C1nodeImm && C1nodeShift);
14590 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
14591 C1AsAPInt = C1AsAPInt.zextOrTrunc(width: ElemSizeInBits);
14592 }
14593
14594 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
14595 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
14596 // how much one can shift elements of a particular size?
14597 if (C2 > ElemSizeInBits)
14598 return SDValue();
14599
14600 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(numBits: ElemSizeInBits, hiBitsSet: C2)
14601 : APInt::getLowBitsSet(numBits: ElemSizeInBits, loBitsSet: C2);
14602 if (C1AsAPInt != RequiredC1)
14603 return SDValue();
14604
14605 SDValue X = And.getOperand(i: 0);
14606 SDValue Y = ShiftHasPredOp ? Shift.getOperand(i: 1) : Shift.getOperand(i: 0);
14607 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(Val: C2, DL, VT: MVT::i32)
14608 : Shift.getOperand(i: 1);
14609
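  // SLI keeps the low C2 bits of each element of X and inserts (Y << C2) into
  // the rest; SRI keeps the high C2 bits of X and inserts (Y >> C2) below them.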
14610 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
14611 SDValue ResultSLI = DAG.getNode(Opcode: Inst, DL, VT, N1: X, N2: Y, N3: Imm);
14612
14613 return ResultSLI;
14614}
14615
14616SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
14617 SelectionDAG &DAG) const {
14618 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
14619 OverrideNEON: !Subtarget->isNeonAvailable()))
14620 return LowerToScalableOp(Op, DAG);
14621
14622 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
14623 if (SDValue Res = tryLowerToSLI(N: Op.getNode(), DAG))
14624 return Res;
14625
14626 EVT VT = Op.getValueType();
14627 if (VT.isScalableVector())
14628 return Op;
14629
14630 SDValue LHS = Op.getOperand(i: 0);
14631 BuildVectorSDNode *BVN =
14632 dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 1).getNode());
14633 if (!BVN) {
14634 // OR commutes, so try swapping the operands.
14635 LHS = Op.getOperand(i: 1);
14636 BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 0).getNode());
14637 }
14638 if (!BVN)
14639 return Op;
14640
14641 APInt DefBits(VT.getSizeInBits(), 0);
14642 APInt UndefBits(VT.getSizeInBits(), 0);
14643 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
14644 SDValue NewOp;
14645
14646 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
14647 Bits: DefBits, LHS: &LHS)) ||
14648 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
14649 Bits: DefBits, LHS: &LHS)))
14650 return NewOp;
14651
14652 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG,
14653 Bits: UndefBits, LHS: &LHS)) ||
14654 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG,
14655 Bits: UndefBits, LHS: &LHS)))
14656 return NewOp;
14657 }
14658
14659 // We can always fall back to a non-immediate OR.
14660 return Op;
14661}
14662
14663// Normalize the operands of BUILD_VECTOR. The value of constant operands will
14664// be truncated to fit element width.
14665static SDValue NormalizeBuildVector(SDValue Op,
14666 SelectionDAG &DAG) {
14667 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
14668 SDLoc DL(Op);
14669 EVT VT = Op.getValueType();
14670 EVT EltTy= VT.getVectorElementType();
14671
14672 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
14673 return Op;
14674
14675 SmallVector<SDValue, 16> Ops;
14676 for (SDValue Lane : Op->ops()) {
14677 // For integer vectors, type legalization would have promoted the
14678 // operands already. Otherwise, if Op is a floating-point splat
14679 // (with operands cast to integers), then the only possibilities
14680 // are constants and UNDEFs.
14681 if (auto *CstLane = dyn_cast<ConstantSDNode>(Val&: Lane)) {
14682 Lane = DAG.getConstant(
14683 Val: CstLane->getAPIntValue().trunc(width: EltTy.getSizeInBits()).getZExtValue(),
14684 DL, VT: MVT::i32);
14685 } else if (Lane.getNode()->isUndef()) {
14686 Lane = DAG.getUNDEF(VT: MVT::i32);
14687 } else {
14688 assert(Lane.getValueType() == MVT::i32 &&
14689 "Unexpected BUILD_VECTOR operand type");
14690 }
14691 Ops.push_back(Elt: Lane);
14692 }
14693 return DAG.getBuildVector(VT, DL, Ops);
14694}
14695
14696static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
14697 const AArch64Subtarget *ST) {
14698 EVT VT = Op.getValueType();
14699 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
14700 "Expected a legal NEON vector");
14701
14702 APInt DefBits(VT.getSizeInBits(), 0);
14703 APInt UndefBits(VT.getSizeInBits(), 0);
14704 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
14705 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
14706 auto TryMOVIWithBits = [&](APInt DefBits) {
14707 SDValue NewOp;
14708 if ((NewOp =
14709 tryAdvSIMDModImm64(NewOp: AArch64ISD::MOVIedit, Op, DAG, Bits: DefBits)) ||
14710 (NewOp =
14711 tryAdvSIMDModImm32(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
14712 (NewOp =
14713 tryAdvSIMDModImm321s(NewOp: AArch64ISD::MOVImsl, Op, DAG, Bits: DefBits)) ||
14714 (NewOp =
14715 tryAdvSIMDModImm16(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) ||
14716 (NewOp = tryAdvSIMDModImm8(NewOp: AArch64ISD::MOVI, Op, DAG, Bits: DefBits)) ||
14717 (NewOp = tryAdvSIMDModImmFP(NewOp: AArch64ISD::FMOV, Op, DAG, Bits: DefBits)))
14718 return NewOp;
14719
14720 APInt NotDefBits = ~DefBits;
14721 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::MVNIshift, Op, DAG,
14722 Bits: NotDefBits)) ||
14723 (NewOp = tryAdvSIMDModImm321s(NewOp: AArch64ISD::MVNImsl, Op, DAG,
14724 Bits: NotDefBits)) ||
14725 (NewOp =
14726 tryAdvSIMDModImm16(NewOp: AArch64ISD::MVNIshift, Op, DAG, Bits: NotDefBits)))
14727 return NewOp;
14728 return SDValue();
14729 };
14730 if (SDValue R = TryMOVIWithBits(DefBits))
14731 return R;
14732 if (SDValue R = TryMOVIWithBits(UndefBits))
14733 return R;
14734
14735 // See if a fneg of the constant can be materialized with a MOVI, etc
14736 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
14737 // FNegate each sub-element of the constant
14738 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
14739 APInt Neg = APInt::getHighBitsSet(numBits: FVT.getSizeInBits(), hiBitsSet: 1)
14740 .zext(width: VT.getSizeInBits());
14741 APInt NegBits(VT.getSizeInBits(), 0);
14742 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
14743 for (unsigned i = 0; i < NumElts; i++)
14744 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
14745 NegBits = DefBits ^ NegBits;
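      // NegBits is DefBits with the sign bit of each FVT-sized element flipped.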
14746
14747 // Try to create the new constants with MOVI, and if so generate a fneg
14748 // for it.
14749 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
14750 SDLoc DL(Op);
14751 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(VT: FVT, NumElements: NumElts);
14752 return DAG.getNode(
14753 Opcode: AArch64ISD::NVCAST, DL, VT,
14754 Operand: DAG.getNode(Opcode: ISD::FNEG, DL, VT: VFVT,
14755 Operand: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: VFVT, Operand: NewOp)));
14756 }
14757 return SDValue();
14758 };
14759 SDValue R;
14760 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
14761 (R = TryWithFNeg(DefBits, MVT::f64)) ||
14762 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
14763 return R;
14764 }
14765
14766 return SDValue();
14767}
14768
14769SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
14770 SDValue Op, SelectionDAG &DAG) const {
14771 EVT VT = Op.getValueType();
14772 SDLoc DL(Op);
14773 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
14774 auto *BVN = cast<BuildVectorSDNode>(Val&: Op);
14775
14776 if (auto SeqInfo = BVN->isConstantSequence()) {
14777 SDValue Start = DAG.getConstant(Val: SeqInfo->first, DL, VT: ContainerVT);
14778 SDValue Steps = DAG.getStepVector(DL, ResVT: ContainerVT, StepVal: SeqInfo->second);
14779 SDValue Seq = DAG.getNode(Opcode: ISD::ADD, DL, VT: ContainerVT, N1: Start, N2: Steps);
14780 return convertFromScalableVector(DAG, VT, V: Seq);
14781 }
14782
14783 unsigned NumElems = VT.getVectorNumElements();
14784 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
14785 NumElems <= 1 || BVN->isConstant())
14786 return SDValue();
14787
14788 auto IsExtractElt = [](SDValue Op) {
14789 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
14790 };
14791
14792  // For integer types that are not already in vectors, limit to at most four
14793 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
14794 if (VT.getScalarType().isInteger() &&
14795 NumElems - count_if(Range: Op->op_values(), P: IsExtractElt) > 4)
14796 return SDValue();
14797
14798 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
14799 SDValue ZeroI64 = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
14800 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
14801 C: Op->op_values(), F: [&, Undef = DAG.getUNDEF(VT: ContainerVT)](SDValue Op) {
14802 return Op.isUndef() ? Undef
14803 : DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL,
14804 VT: ContainerVT, N1: Undef, N2: Op, N3: ZeroI64);
14805 });
14806
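  // Each value now occupies lane 0 of its own vector. Repeatedly ZIP1 adjacent
  // pairs, doubling the element granularity each round, until a single vector
  // holds all of the elements in order.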
14807 ElementCount ZipEC = ContainerVT.getVectorElementCount();
14808 while (Intermediates.size() > 1) {
14809 EVT ZipVT = getPackedSVEVectorVT(EC: ZipEC);
14810
14811 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
14812 SDValue Op0 = DAG.getBitcast(VT: ZipVT, V: Intermediates[I + 0]);
14813 SDValue Op1 = DAG.getBitcast(VT: ZipVT, V: Intermediates[I + 1]);
14814 Intermediates[I / 2] =
14815 Op1.isUndef() ? Op0
14816 : DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ZipVT, N1: Op0, N2: Op1);
14817 }
14818
14819 Intermediates.resize(N: Intermediates.size() / 2);
14820 ZipEC = ZipEC.divideCoefficientBy(RHS: 2);
14821 }
14822
14823 assert(Intermediates.size() == 1);
14824 SDValue Vec = DAG.getBitcast(VT: ContainerVT, V: Intermediates[0]);
14825 return convertFromScalableVector(DAG, VT, V: Vec);
14826}
14827
14828SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
14829 SelectionDAG &DAG) const {
14830 EVT VT = Op.getValueType();
14831
14832 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
14833 cast<BuildVectorSDNode>(Val&: Op)->isConstantSequence();
14834 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
14835 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
14836
14837 // Try to build a simple constant vector.
14838 Op = NormalizeBuildVector(Op, DAG);
14839  // This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
14840  // abort.
14841 if (Op.getOpcode() != ISD::BUILD_VECTOR)
14842 return SDValue();
14843
14844 // Certain vector constants, used to express things like logical NOT and
14845 // arithmetic NEG, are passed through unmodified. This allows special
14846 // patterns for these operations to match, which will lower these constants
14847 // to whatever is proven necessary.
14848 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode());
14849 if (BVN->isConstant()) {
14850 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
14851 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
14852 APInt Val(BitSize,
14853 Const->getAPIntValue().zextOrTrunc(width: BitSize).getZExtValue());
14854 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
14855 return Op;
14856 }
14857 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
14858 if (Const->isZero() && !Const->isNegative())
14859 return Op;
14860 }
14861
14862 if (SDValue V = ConstantBuildVector(Op, DAG, ST: Subtarget))
14863 return V;
14864
14865 // Scan through the operands to find some interesting properties we can
14866 // exploit:
14867 // 1) If only one value is used, we can use a DUP, or
14868 // 2) if only the low element is not undef, we can just insert that, or
14869 // 3) if only one constant value is used (w/ some non-constant lanes),
14870 // we can splat the constant value into the whole vector then fill
14871 // in the non-constant lanes.
14872 // 4) FIXME: If different constant values are used, but we can intelligently
14873 // select the values we'll be overwriting for the non-constant
14874 // lanes such that we can directly materialize the vector
14875 // some other way (MOVI, e.g.), we can be sneaky.
14876 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
14877 SDLoc DL(Op);
14878 unsigned NumElts = VT.getVectorNumElements();
14879 bool isOnlyLowElement = true;
14880 bool usesOnlyOneValue = true;
14881 bool usesOnlyOneConstantValue = true;
14882 bool isConstant = true;
14883 bool AllLanesExtractElt = true;
14884 unsigned NumConstantLanes = 0;
14885 unsigned NumDifferentLanes = 0;
14886 unsigned NumUndefLanes = 0;
14887 SDValue Value;
14888 SDValue ConstantValue;
14889 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14890 unsigned ConsecutiveValCount = 0;
14891 SDValue PrevVal;
14892 for (unsigned i = 0; i < NumElts; ++i) {
14893 SDValue V = Op.getOperand(i);
14894 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14895 AllLanesExtractElt = false;
14896 if (V.isUndef()) {
14897 ++NumUndefLanes;
14898 continue;
14899 }
14900 if (i > 0)
14901 isOnlyLowElement = false;
14902 if (!isIntOrFPConstant(V))
14903 isConstant = false;
14904
14905 if (isIntOrFPConstant(V)) {
14906 ++NumConstantLanes;
14907 if (!ConstantValue.getNode())
14908 ConstantValue = V;
14909 else if (ConstantValue != V)
14910 usesOnlyOneConstantValue = false;
14911 }
14912
14913 if (!Value.getNode())
14914 Value = V;
14915 else if (V != Value) {
14916 usesOnlyOneValue = false;
14917 ++NumDifferentLanes;
14918 }
14919
14920 if (PrevVal != V) {
14921 ConsecutiveValCount = 0;
14922 PrevVal = V;
14923 }
14924
14925    // Keep the different values and their last consecutive counts. For example,
14926 //
14927 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14928 // t24, t24, t24, t24, t24, t24, t24, t24
14929 // t23 = consecutive count 8
14930 // t24 = consecutive count 8
14931 // ------------------------------------------------------------------
14932 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14933 // t24, t24, t24, t24, t24, t24, t24, t24
14934 // t23 = consecutive count 5
14935 // t24 = consecutive count 9
14936 DifferentValueMap[V] = ++ConsecutiveValCount;
14937 }
14938
14939 if (!Value.getNode()) {
14940 LLVM_DEBUG(
14941 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14942 return DAG.getUNDEF(VT);
14943 }
14944
14945 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14946  // SCALAR_TO_VECTOR, except when we have a single-element constant vector,
14947  // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14948 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(V: Value))) {
14949 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14950 "SCALAR_TO_VECTOR node\n");
14951 return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT, Operand: Value);
14952 }
14953
14954 if (AllLanesExtractElt) {
14955 SDNode *Vector = nullptr;
14956 bool Even = false;
14957 bool Odd = false;
14958 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14959 // the Odd pattern <1,3,5,...>.
14960 for (unsigned i = 0; i < NumElts; ++i) {
14961 SDValue V = Op.getOperand(i);
14962 const SDNode *N = V.getNode();
14963 if (!isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
14964 Even = false;
14965 Odd = false;
14966 break;
14967 }
14968 SDValue N0 = N->getOperand(Num: 0);
14969
14970 // All elements are extracted from the same vector.
14971 if (!Vector) {
14972 Vector = N0.getNode();
14973 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14974 // BUILD_VECTOR.
14975 if (VT.getVectorElementType() !=
14976 N0.getValueType().getVectorElementType())
14977 break;
14978 } else if (Vector != N0.getNode()) {
14979 Odd = false;
14980 Even = false;
14981 break;
14982 }
14983
14984 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14985 // indices <1,3,5,...>.
14986 uint64_t Val = N->getConstantOperandVal(Num: 1);
14987 if (Val == 2 * i) {
14988 Even = true;
14989 continue;
14990 }
14991 if (Val - 1 == 2 * i) {
14992 Odd = true;
14993 continue;
14994 }
14995
14996 // Something does not match: abort.
14997 Odd = false;
14998 Even = false;
14999 break;
15000 }
15001 if (Even || Odd) {
15002 SDValue LHS =
15003 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(Vector, 0),
15004 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
15005 SDValue RHS =
15006 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(Vector, 0),
15007 N2: DAG.getConstant(Val: NumElts, DL, VT: MVT::i64));
15008
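      // LHS and RHS are the low and high halves of the source vector; UZP1
      // then selects the even lanes of the original vector and UZP2 the odd
      // lanes.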
15009 if (Even && !Odd)
15010 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT, N1: LHS, N2: RHS);
15011 if (Odd && !Even)
15012 return DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT, N1: LHS, N2: RHS);
15013 }
15014 }
15015
15016 // Use DUP for non-constant splats. For f32 constant splats, reduce to
15017 // i32 and try again.
15018 if (usesOnlyOneValue) {
15019 if (!isConstant) {
15020 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15021 Value.getValueType() != VT) {
15022 LLVM_DEBUG(
15023 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15024 return DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT, Operand: Value);
15025 }
15026
15027      // This is actually a DUPLANExx operation, which keeps everything in vectors.
15028
15029 SDValue Lane = Value.getOperand(i: 1);
15030 Value = Value.getOperand(i: 0);
15031 if (Value.getValueSizeInBits() == 64) {
15032 LLVM_DEBUG(
15033 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15034 "widening it\n");
15035 Value = WidenVector(V64Reg: Value, DAG);
15036 }
15037
15038 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
15039 return DAG.getNode(Opcode, DL, VT, N1: Value, N2: Lane);
15040 }
15041
15042 if (VT.getVectorElementType().isFloatingPoint()) {
15043 SmallVector<SDValue, 8> Ops;
15044 EVT EltTy = VT.getVectorElementType();
15045 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15046 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15047 LLVM_DEBUG(
15048 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15049 "BITCASTS, and try again\n");
15050 MVT NewType = MVT::getIntegerVT(BitWidth: EltTy.getSizeInBits());
15051 for (unsigned i = 0; i < NumElts; ++i)
15052 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewType, Operand: Op.getOperand(i)));
15053 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NewType, NumElements: NumElts);
15054 SDValue Val = DAG.getBuildVector(VT: VecVT, DL, Ops);
15055 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15056 Val.dump(););
15057 Val = LowerBUILD_VECTOR(Op: Val, DAG);
15058 if (Val.getNode())
15059 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
15060 }
15061 }
15062
15063 // If we need to insert a small number of different non-constant elements and
15064 // the vector width is sufficiently large, prefer using DUP with the common
15065 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15066 // skip the constant lane handling below.
15067 bool PreferDUPAndInsert =
15068 !isConstant && NumDifferentLanes >= 1 &&
15069 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15070 NumDifferentLanes >= NumConstantLanes;
15071
15072 // If there was only one constant value used and for more than one lane,
15073 // start by splatting that value, then replace the non-constant lanes. This
15074 // is better than the default, which will perform a separate initialization
15075 // for each lane.
15076 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15077 // Firstly, try to materialize the splat constant.
15078 SDValue Val = DAG.getSplatBuildVector(VT, DL, Op: ConstantValue);
15079 unsigned BitSize = VT.getScalarSizeInBits();
15080 APInt ConstantValueAPInt(1, 0);
15081 if (auto *C = dyn_cast<ConstantSDNode>(Val&: ConstantValue))
15082 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(width: BitSize);
15083 if (!isNullConstant(V: ConstantValue) && !isNullFPConstant(V: ConstantValue) &&
15084 !ConstantValueAPInt.isAllOnes()) {
15085 Val = ConstantBuildVector(Op: Val, DAG, ST: Subtarget);
15086 if (!Val)
15087 // Otherwise, materialize the constant and splat it.
15088 Val = DAG.getNode(Opcode: AArch64ISD::DUP, DL, VT, Operand: ConstantValue);
15089 }
15090
15091 // Now insert the non-constant lanes.
15092 for (unsigned i = 0; i < NumElts; ++i) {
15093 SDValue V = Op.getOperand(i);
15094 SDValue LaneIdx = DAG.getConstant(Val: i, DL, VT: MVT::i64);
15095 if (!isIntOrFPConstant(V))
15096 // Note that type legalization likely mucked about with the VT of the
15097 // source operand, so we may have to convert it here before inserting.
15098 Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: Val, N2: V, N3: LaneIdx);
15099 }
15100 return Val;
15101 }
15102
15103 // This will generate a load from the constant pool.
15104 if (isConstant) {
15105 LLVM_DEBUG(
15106 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15107 "expansion\n");
15108 return SDValue();
15109 }
15110
15111 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15112 // v4i32s. This is really a truncate, which we can construct out of (legal)
15113 // concats and truncate nodes.
15114 if (SDValue M = ReconstructTruncateFromBuildVector(V: Op, DAG))
15115 return M;
15116
15117 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15118 if (NumElts >= 4) {
15119 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15120 return Shuffle;
15121
15122 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15123 return Shuffle;
15124 }
15125
15126 if (PreferDUPAndInsert) {
15127 // First, build a constant vector with the common element.
15128 SmallVector<SDValue, 8> Ops(NumElts, Value);
15129 SDValue NewVector = LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT, DL, Ops), DAG);
15130 // Next, insert the elements that do not match the common value.
15131 for (unsigned I = 0; I < NumElts; ++I)
15132 if (Op.getOperand(i: I) != Value)
15133 NewVector =
15134 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: NewVector,
15135 N2: Op.getOperand(i: I), N3: DAG.getConstant(Val: I, DL, VT: MVT::i64));
15136
15137 return NewVector;
15138 }
15139
15140 // If vector consists of two different values, try to generate two DUPs and
15141 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15142 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15143 SmallVector<SDValue, 2> Vals;
15144    // Check whether the consecutive count of each value is half the number of
15145    // vector elements. In that case, we can use CONCAT_VECTORS. For example,
15146 //
15147 // canUseVECTOR_CONCAT = true;
15148 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15149 // t24, t24, t24, t24, t24, t24, t24, t24
15150 //
15151 // canUseVECTOR_CONCAT = false;
15152 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15153 // t24, t24, t24, t24, t24, t24, t24, t24
15154 bool canUseVECTOR_CONCAT = true;
15155 for (auto Pair : DifferentValueMap) {
15156      // Check that each value's consecutive count is NumElts / 2.
15157 if (Pair.second != NumElts / 2)
15158 canUseVECTOR_CONCAT = false;
15159 Vals.push_back(Elt: Pair.first);
15160 }
15161
15162 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15163 // CONCAT_VECTORs. For example,
15164 //
15165 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15166 // t24, t24, t24, t24, t24, t24, t24, t24
15167 // ==>
15168 // t26: v8i8 = AArch64ISD::DUP t23
15169 // t28: v8i8 = AArch64ISD::DUP t24
15170 // t29: v16i8 = concat_vectors t26, t28
15171 if (canUseVECTOR_CONCAT) {
15172 EVT SubVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
15173 if (isTypeLegal(VT: SubVT) && SubVT.isVector() &&
15174 SubVT.getVectorNumElements() >= 2) {
15175 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15176 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15177 SDValue DUP1 =
15178 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL, Ops: Ops1), DAG);
15179 SDValue DUP2 =
15180 LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL, Ops: Ops2), DAG);
15181 SDValue CONCAT_VECTORS =
15182 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: DUP1, N2: DUP2);
15183 return CONCAT_VECTORS;
15184 }
15185 }
15186
15187 // Let's try to generate VECTOR_SHUFFLE. For example,
15188 //
15189 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15190 // ==>
15191 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15192 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15193 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15194 if (NumElts >= 8) {
15195 SmallVector<int, 16> MaskVec;
15196      // Build the mask for VECTOR_SHUFFLE.
15197 SDValue FirstLaneVal = Op.getOperand(i: 0);
15198 for (unsigned i = 0; i < NumElts; ++i) {
15199 SDValue Val = Op.getOperand(i);
15200 if (FirstLaneVal == Val)
15201 MaskVec.push_back(Elt: i);
15202 else
15203 MaskVec.push_back(Elt: i + NumElts);
15204 }
15205
15206 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15207 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15208 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops: Ops1);
15209 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops: Ops2);
15210 SDValue VECTOR_SHUFFLE =
15211 DAG.getVectorShuffle(VT, dl: DL, N1: VEC1, N2: VEC2, Mask: MaskVec);
15212 return VECTOR_SHUFFLE;
15213 }
15214 }
15215
15216 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15217 // know the default expansion would otherwise fall back on something even
15218  // worse. For a vector with one or two non-undef values, that's
15219  // scalar_to_vector for the elements followed by a shuffle (provided the
15220  // shuffle is valid for the target); for everything else it's
15221  // materialization element by element on the stack followed by a load.
15222 if (!isConstant && !usesOnlyOneValue) {
15223 LLVM_DEBUG(
15224 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15225 "of INSERT_VECTOR_ELT\n");
15226
15227 SDValue Vec = DAG.getUNDEF(VT);
15228 SDValue Op0 = Op.getOperand(i: 0);
15229 unsigned i = 0;
15230
15231 // Use SCALAR_TO_VECTOR for lane zero to
15232 // a) Avoid a RMW dependency on the full vector register, and
15233 // b) Allow the register coalescer to fold away the copy if the
15234 // value is already in an S or D register, and we're forced to emit an
15235 // INSERT_SUBREG that we can't fold anywhere.
15236 //
15237 // We also allow types like i8 and i16 which are illegal scalar but legal
15238 // vector element types. After type-legalization the inserted value is
15239  // extended (i32) and it is safe to cast it to the vector type by ignoring
15240 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15241 if (!Op0.isUndef()) {
15242 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15243 Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT, Operand: Op0);
15244 ++i;
15245 }
15246 LLVM_DEBUG({
15247 if (i < NumElts)
15248 dbgs() << "Creating nodes for the other vector elements:\n";
15249 });
15250 for (; i < NumElts; ++i) {
15251 SDValue V = Op.getOperand(i);
15252 if (V.isUndef())
15253 continue;
15254 SDValue LaneIdx = DAG.getConstant(Val: i, DL, VT: MVT::i64);
15255 Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT, N1: Vec, N2: V, N3: LaneIdx);
15256 }
15257 return Vec;
15258 }
15259
15260 LLVM_DEBUG(
15261 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15262 "better alternative\n");
15263 return SDValue();
15264}
15265
15266SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15267 SelectionDAG &DAG) const {
15268 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
15269 OverrideNEON: !Subtarget->isNeonAvailable()))
15270 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15271
15272 assert(Op.getValueType().isScalableVector() &&
15273 isTypeLegal(Op.getValueType()) &&
15274 "Expected legal scalable vector type!");
15275
15276 if (isTypeLegal(VT: Op.getOperand(i: 0).getValueType())) {
15277 unsigned NumOperands = Op->getNumOperands();
15278 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15279 "Unexpected number of operands in CONCAT_VECTORS");
15280
15281 if (NumOperands == 2)
15282 return Op;
15283
15284 // Concat each pair of subvectors and pack into the lower half of the array.
15285 SmallVector<SDValue> ConcatOps(Op->ops());
15286 while (ConcatOps.size() > 1) {
15287 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15288 SDValue V1 = ConcatOps[I];
15289 SDValue V2 = ConcatOps[I + 1];
15290 EVT SubVT = V1.getValueType();
15291 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
15292 ConcatOps[I / 2] =
15293 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT: PairVT, N1: V1, N2: V2);
15294 }
15295 ConcatOps.resize(N: ConcatOps.size() / 2);
15296 }
15297 return ConcatOps[0];
15298 }
15299
15300 return SDValue();
15301}
15302
15303SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15304 SelectionDAG &DAG) const {
15305 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15306
15307 if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(),
15308 OverrideNEON: !Subtarget->isNeonAvailable()))
15309 return LowerFixedLengthInsertVectorElt(Op, DAG);
15310
15311 EVT VT = Op.getOperand(i: 0).getValueType();
15312
15313 if (VT.getScalarType() == MVT::i1) {
15314 EVT VectorVT = getPromotedVTForPredicate(VT);
15315 SDLoc DL(Op);
15316 SDValue ExtendedVector =
15317 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: VectorVT);
15318 SDValue ExtendedValue =
15319 DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 1), DL,
15320 VT: VectorVT.getScalarType().getSizeInBits() < 32
15321 ? MVT::i32
15322 : VectorVT.getScalarType());
15323 ExtendedVector =
15324 DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT, N1: ExtendedVector,
15325 N2: ExtendedValue, N3: Op.getOperand(i: 2));
15326 return DAG.getAnyExtOrTrunc(Op: ExtendedVector, DL, VT);
15327 }
15328
15329 // Check for non-constant or out of range lane.
15330 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2));
15331 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15332 return SDValue();
15333
15334 return Op;
15335}
15336
15337SDValue
15338AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15339 SelectionDAG &DAG) const {
15340 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
15341 EVT VT = Op.getOperand(i: 0).getValueType();
15342
15343 if (VT.getScalarType() == MVT::i1) {
15344 // We can't directly extract from an SVE predicate; extend it first.
15345 // (This isn't the only possible lowering, but it's straightforward.)
15346 EVT VectorVT = getPromotedVTForPredicate(VT);
15347 SDLoc DL(Op);
15348 SDValue Extend =
15349 DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VectorVT, Operand: Op.getOperand(i: 0));
15350 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
15351 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtractTy,
15352 N1: Extend, N2: Op.getOperand(i: 1));
15353 return DAG.getAnyExtOrTrunc(Op: Extract, DL, VT: Op.getValueType());
15354 }
15355
15356 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
15357 return LowerFixedLengthExtractVectorElt(Op, DAG);
15358
15359 // Check for non-constant or out of range lane.
15360 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
15361 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
15362 return SDValue();
15363
15364 // Insertion/extraction are legal for V128 types.
15365 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
15366 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
15367 VT == MVT::v8f16 || VT == MVT::v8bf16)
15368 return Op;
15369
15370 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
15371 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
15372 VT != MVT::v4bf16)
15373 return SDValue();
15374
15375  // For V64 types, we perform extraction by expanding the value
15376  // to a V128 type and performing the extraction on that.
15377 SDLoc DL(Op);
15378 SDValue WideVec = WidenVector(V64Reg: Op.getOperand(i: 0), DAG);
15379 EVT WideTy = WideVec.getValueType();
15380
15381 EVT ExtrTy = WideTy.getVectorElementType();
15382 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
15383 ExtrTy = MVT::i32;
15384
15385 // For extractions, we just return the result directly.
15386 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtrTy, N1: WideVec,
15387 N2: Op.getOperand(i: 1));
15388}
15389
15390SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
15391 SelectionDAG &DAG) const {
15392 EVT VT = Op.getValueType();
15393 assert(VT.isFixedLengthVector() &&
15394 "Only cases that extract a fixed length vector are supported!");
15395 EVT InVT = Op.getOperand(i: 0).getValueType();
15396
15397  // If we don't have legal types yet, do nothing.
15398 if (!isTypeLegal(VT: InVT))
15399 return SDValue();
15400
15401 if (InVT.is128BitVector()) {
15402 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
15403 unsigned Idx = Op.getConstantOperandVal(i: 1);
15404
15405 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
15406 if (Idx == 0)
15407 return Op;
15408
15409 // If this is extracting the upper 64-bits of a 128-bit vector, we match
15410 // that directly.
15411 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
15412 return Op;
15413 }
15414
15415 if (InVT.isScalableVector() ||
15416 useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) {
15417 SDLoc DL(Op);
15418 SDValue Vec = Op.getOperand(i: 0);
15419 SDValue Idx = Op.getOperand(i: 1);
15420
15421 EVT PackedVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
15422 if (PackedVT != InVT) {
15423 // Pack input into the bottom part of an SVE register and try again.
15424 SDValue Container = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: PackedVT,
15425 N1: DAG.getUNDEF(VT: PackedVT), N2: Vec,
15426 N3: DAG.getVectorIdxConstant(Val: 0, DL));
15427 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Container, N2: Idx);
15428 }
15429
15430 // This will get matched by custom code during ISelDAGToDAG.
15431 if (isNullConstant(V: Idx))
15432 return Op;
15433
15434 assert(InVT.isScalableVector() && "Unexpected vector type!");
15435 // Move requested subvector to the start of the vector and try again.
15436 SDValue Splice = DAG.getNode(Opcode: ISD::VECTOR_SPLICE, DL, VT: InVT, N1: Vec, N2: Vec, N3: Idx);
15437 return convertFromScalableVector(DAG, VT, V: Splice);
15438 }
15439
15440 return SDValue();
15441}
15442
15443SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
15444 SelectionDAG &DAG) const {
15445 assert(Op.getValueType().isScalableVector() &&
15446 "Only expect to lower inserts into scalable vectors!");
15447
15448 EVT InVT = Op.getOperand(i: 1).getValueType();
15449 unsigned Idx = Op.getConstantOperandVal(i: 2);
15450
15451 SDValue Vec0 = Op.getOperand(i: 0);
15452 SDValue Vec1 = Op.getOperand(i: 1);
15453 SDLoc DL(Op);
15454 EVT VT = Op.getValueType();
15455
15456 if (InVT.isScalableVector()) {
15457 if (!isTypeLegal(VT))
15458 return SDValue();
15459
15460 // Break down insert_subvector into simpler parts.
15461 if (VT.getVectorElementType() == MVT::i1) {
15462 unsigned NumElts = VT.getVectorMinNumElements();
15463 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
15464
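      // Split the predicate into two halves, insert Vec1 into whichever half
      // contains the index, then concatenate the halves back together.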
15465 SDValue Lo, Hi;
15466 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
15467 N2: DAG.getVectorIdxConstant(Val: 0, DL));
15468 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0,
15469 N2: DAG.getVectorIdxConstant(Val: NumElts / 2, DL));
15470 if (Idx < (NumElts / 2))
15471 Lo = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Lo, N2: Vec1,
15472 N3: DAG.getVectorIdxConstant(Val: Idx, DL));
15473 else
15474 Hi = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Hi, N2: Vec1,
15475 N3: DAG.getVectorIdxConstant(Val: Idx - (NumElts / 2), DL));
15476
15477 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Lo, N2: Hi);
15478 }
15479
15480 // We can select these directly.
15481 if (isTypeLegal(VT: InVT) && Vec0.isUndef())
15482 return Op;
15483
15484 // Ensure the subvector is half the size of the main vector.
15485 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
15486 return SDValue();
15487
15488    // Here narrow and wide refer to the vector element types. After "casting",
15489    // both vectors must have the same bit length, so because the subvector has
15490    // fewer elements, those elements need to be bigger.
15491 EVT NarrowVT = getPackedSVEVectorVT(EC: VT.getVectorElementCount());
15492 EVT WideVT = getPackedSVEVectorVT(EC: InVT.getVectorElementCount());
15493
15494 // NOP cast operands to the largest legal vector of the same element count.
15495 if (VT.isFloatingPoint()) {
15496 Vec0 = getSVESafeBitCast(VT: NarrowVT, Op: Vec0, DAG);
15497 Vec1 = getSVESafeBitCast(VT: NarrowVT, Op: Vec1, DAG);
15498 } else {
15499      // Legal integer vectors are already their largest, so Vec0 is fine as is.
15500 Vec1 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: WideVT, Operand: Vec1);
15501 Vec1 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: NarrowVT, Operand: Vec1);
15502 }
15503
15504 // To replace the top/bottom half of vector V with vector SubV we widen the
15505 // preserved half of V, concatenate this to SubV (the order depending on the
15506 // half being replaced) and then narrow the result.
15507 SDValue Narrow;
15508 if (Idx == 0) {
15509 SDValue HiVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: WideVT, Operand: Vec0);
15510 HiVec0 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: NarrowVT, Operand: HiVec0);
15511 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: Vec1, N2: HiVec0);
15512 } else {
15513 assert(Idx == InVT.getVectorMinNumElements() &&
15514 "Invalid subvector index!");
15515 SDValue LoVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: WideVT, Operand: Vec0);
15516 LoVec0 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: NarrowVT, Operand: LoVec0);
15517 Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: LoVec0, N2: Vec1);
15518 }
15519
15520 return getSVESafeBitCast(VT, Op: Narrow, DAG);
15521 }
15522
15523 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
15524 // This will be matched by custom code during ISelDAGToDAG.
15525 if (Vec0.isUndef())
15526 return Op;
15527
15528 std::optional<unsigned> PredPattern =
15529 getSVEPredPatternFromNumElements(MinNumElts: InVT.getVectorNumElements());
15530 auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1);
15531 SDValue PTrue = getPTrue(DAG, DL, VT: PredTy, Pattern: *PredPattern);
15532 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, V: Vec1);
15533 return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: PTrue, N2: ScalableVec1, N3: Vec0);
15534 }
15535
15536 return SDValue();
15537}
15538
15539static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
15540 if (Op.getOpcode() != AArch64ISD::DUP &&
15541 Op.getOpcode() != ISD::SPLAT_VECTOR &&
15542 Op.getOpcode() != ISD::BUILD_VECTOR)
15543 return false;
15544
15545 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
15546 !isAllConstantBuildVector(PotentialBVec: Op, ConstVal&: SplatVal))
15547 return false;
15548
15549 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
15550 !isa<ConstantSDNode>(Val: Op->getOperand(Num: 0)))
15551 return false;
15552
15553 SplatVal = Op->getConstantOperandVal(Num: 0);
15554 if (Op.getValueType().getVectorElementType() != MVT::i64)
15555 SplatVal = (int32_t)SplatVal;
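  // getConstantOperandVal zero-extends; for non-i64 element types, sign-extend
  // the constant from 32 bits so negative powers of two are recognised below.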
15556
15557 Negated = false;
15558 if (isPowerOf2_64(Value: SplatVal))
15559 return true;
15560
15561 Negated = true;
15562 if (isPowerOf2_64(Value: -SplatVal)) {
15563 SplatVal = -SplatVal;
15564 return true;
15565 }
15566
15567 return false;
15568}
15569
15570SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
15571 EVT VT = Op.getValueType();
15572 SDLoc DL(Op);
15573
15574 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
15575 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
15576
15577 assert(VT.isScalableVector() && "Expected a scalable vector.");
15578
15579 bool Signed = Op.getOpcode() == ISD::SDIV;
15580 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
15581
15582 bool Negated;
15583 uint64_t SplatVal;
15584 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
15585 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
15586 SDValue Res =
15587 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL, VT, N1: Pg, N2: Op->getOperand(Num: 0),
15588 N3: DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL, VT: MVT::i32));
15589 if (Negated)
15590 Res = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), N2: Res);
15591
15592 return Res;
15593 }
15594
15595 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
15596 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
15597
15598 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
15599 // operations, and truncate the result.
15600 EVT WidenedVT;
15601 if (VT == MVT::nxv16i8)
15602 WidenedVT = MVT::nxv8i16;
15603 else if (VT == MVT::nxv8i16)
15604 WidenedVT = MVT::nxv4i32;
15605 else
15606 llvm_unreachable("Unexpected Custom DIV operation");
15607
15608 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15609 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
15610 SDValue Op0Lo = DAG.getNode(Opcode: UnpkLo, DL, VT: WidenedVT, Operand: Op.getOperand(i: 0));
15611 SDValue Op1Lo = DAG.getNode(Opcode: UnpkLo, DL, VT: WidenedVT, Operand: Op.getOperand(i: 1));
15612 SDValue Op0Hi = DAG.getNode(Opcode: UnpkHi, DL, VT: WidenedVT, Operand: Op.getOperand(i: 0));
15613 SDValue Op1Hi = DAG.getNode(Opcode: UnpkHi, DL, VT: WidenedVT, Operand: Op.getOperand(i: 1));
15614 SDValue ResultLo = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: WidenedVT, N1: Op0Lo, N2: Op1Lo);
15615 SDValue ResultHi = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: WidenedVT, N1: Op0Hi, N2: Op1Hi);
15616 SDValue ResultLoCast = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: ResultLo);
15617 SDValue ResultHiCast = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: ResultHi);
15618 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT, N1: ResultLoCast, N2: ResultHiCast);
15619}
15620
15621bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
15622 EVT VT, unsigned DefinedValues) const {
15623 if (!Subtarget->isNeonAvailable())
15624 return false;
15625 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
15626}
15627
15628bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
15629 // Currently no fixed length shuffles that require SVE are legal.
15630 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
15631 return false;
15632
15633 if (VT.getVectorNumElements() == 4 &&
15634 (VT.is128BitVector() || VT.is64BitVector())) {
15635 unsigned Cost = getPerfectShuffleCost(M);
15636 if (Cost <= 1)
15637 return true;
15638 }
15639
15640 bool DummyBool;
15641 int DummyInt;
15642 unsigned DummyUnsigned;
15643
15644 unsigned EltSize = VT.getScalarSizeInBits();
15645 unsigned NumElts = VT.getVectorNumElements();
15646 return (ShuffleVectorSDNode::isSplatMask(Mask: M) ||
15647 isREVMask(M, EltSize, NumElts, BlockSize: 64) ||
15648 isREVMask(M, EltSize, NumElts, BlockSize: 32) ||
15649 isREVMask(M, EltSize, NumElts, BlockSize: 16) ||
15650 isEXTMask(M, VT, ReverseEXT&: DummyBool, Imm&: DummyUnsigned) ||
15651 isTRNMask(M, NumElts, WhichResult&: DummyUnsigned) ||
15652 isUZPMask(M, NumElts, WhichResultOut&: DummyUnsigned) ||
15653 isZIPMask(M, NumElts, WhichResultOut&: DummyUnsigned) ||
15654 isTRN_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
15655 isUZP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
15656 isZIP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) ||
15657 isINSMask(M, NumInputElements: NumElts, DstIsLeft&: DummyBool, Anomaly&: DummyInt) ||
15658 isConcatMask(Mask: M, VT, SplitLHS: VT.getSizeInBits() == 128));
15659}
15660
15661bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
15662 EVT VT) const {
15663 // Just delegate to the generic legality, clear masks aren't special.
15664 return isShuffleMaskLegal(M, VT);
15665}
15666
15667/// getVShiftImm - Check if this is a valid build_vector for the immediate
15668/// operand of a vector shift operation, where all the elements of the
15669/// build_vector must have the same constant integer value.
15670static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
15671 // Ignore bit_converts.
15672 while (Op.getOpcode() == ISD::BITCAST)
15673 Op = Op.getOperand(i: 0);
15674 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode());
15675 APInt SplatBits, SplatUndef;
15676 unsigned SplatBitSize;
15677 bool HasAnyUndefs;
15678 if (!BVN || !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize,
15679 HasAnyUndefs, MinSplatBits: ElementBits) ||
15680 SplatBitSize > ElementBits)
15681 return false;
15682 Cnt = SplatBits.getSExtValue();
15683 return true;
15684}
15685
15686/// isVShiftLImm - Check if this is a valid build_vector for the immediate
15687/// operand of a vector shift left operation. That value must be in the range:
15688/// 0 <= Value < ElementBits for a left shift; or
15689/// 0 <= Value <= ElementBits for a long left shift.
15690static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
15691 assert(VT.isVector() && "vector shift count is not a vector type");
15692 int64_t ElementBits = VT.getScalarSizeInBits();
15693 if (!getVShiftImm(Op, ElementBits, Cnt))
15694 return false;
15695 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
15696}
15697
15698/// isVShiftRImm - Check if this is a valid build_vector for the immediate
15699/// operand of a vector shift right operation. The value must be in the range:
15700/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrow right shift.
15701static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
15702 assert(VT.isVector() && "vector shift count is not a vector type");
15703 int64_t ElementBits = VT.getScalarSizeInBits();
15704 if (!getVShiftImm(Op, ElementBits, Cnt))
15705 return false;
15706 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
15707}
15708
15709SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
15710 SelectionDAG &DAG) const {
15711 EVT VT = Op.getValueType();
15712
15713 if (VT.getScalarType() == MVT::i1) {
15714 // Lower i1 truncate to `(x & 1) != 0`.
15715 SDLoc DL(Op);
15716 EVT OpVT = Op.getOperand(i: 0).getValueType();
15717 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: OpVT);
15718 SDValue One = DAG.getConstant(Val: 1, DL, VT: OpVT);
15719 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT: OpVT, N1: Op.getOperand(i: 0), N2: One);
15720 return DAG.getSetCC(DL, VT, LHS: And, RHS: Zero, Cond: ISD::SETNE);
15721 }
15722
15723 if (!VT.isVector() || VT.isScalableVector())
15724 return SDValue();
15725
15726 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
15727 OverrideNEON: !Subtarget->isNeonAvailable()))
15728 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
15729
15730 return SDValue();
15731}
15732
15733// Check if we can lower this SRL to a rounding shift instruction. ResVT is
15734// possibly a truncated type; it tells how many bits of the value are to be
15735// used.
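// The pattern matched is:
//   srl (add X, (1 << (ShiftValue - 1))), ShiftValue
// i.e. X plus half of 2^ShiftValue followed by the shift, which is exactly
// what a rounding shift right (e.g. SVE2's URSHR) computes in one operation.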
15736static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
15737 SelectionDAG &DAG,
15738 unsigned &ShiftValue,
15739 SDValue &RShOperand) {
15740 if (Shift->getOpcode() != ISD::SRL)
15741 return false;
15742
15743 EVT VT = Shift.getValueType();
15744 assert(VT.isScalableVT());
15745
15746 auto ShiftOp1 =
15747 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Shift->getOperand(Num: 1)));
15748 if (!ShiftOp1)
15749 return false;
15750
15751 ShiftValue = ShiftOp1->getZExtValue();
15752 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
15753 return false;
15754
15755 SDValue Add = Shift->getOperand(Num: 0);
15756 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
15757 return false;
15758
15759 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
15760 "ResVT must be truncated or same type as the shift.");
15761 // Check if an overflow can lead to incorrect results.
15762 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
15763 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
15764 return false;
15765
15766 auto AddOp1 =
15767 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Add->getOperand(Num: 1)));
15768 if (!AddOp1)
15769 return false;
15770 uint64_t AddValue = AddOp1->getZExtValue();
15771 if (AddValue != 1ULL << (ShiftValue - 1))
15772 return false;
15773
15774 RShOperand = Add->getOperand(Num: 0);
15775 return true;
15776}
15777
15778SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
15779 SelectionDAG &DAG) const {
15780 EVT VT = Op.getValueType();
15781 SDLoc DL(Op);
15782 int64_t Cnt;
15783
15784 if (!Op.getOperand(i: 1).getValueType().isVector())
15785 return Op;
15786 unsigned EltSize = VT.getScalarSizeInBits();
15787
15788 switch (Op.getOpcode()) {
15789 case ISD::SHL:
15790 if (VT.isScalableVector() ||
15791 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()))
15792 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SHL_PRED);
15793
15794 if (isVShiftLImm(Op: Op.getOperand(i: 1), VT, isLong: false, Cnt) && Cnt < EltSize)
15795 return DAG.getNode(Opcode: AArch64ISD::VSHL, DL, VT, N1: Op.getOperand(i: 0),
15796 N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32));
15797 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
15798 N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_ushl, DL,
15799 VT: MVT::i32),
15800 N2: Op.getOperand(i: 0), N3: Op.getOperand(i: 1));
15801 case ISD::SRA:
15802 case ISD::SRL:
15803 if (VT.isScalableVector() &&
15804 (Subtarget->hasSVE2() ||
15805 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
15806 SDValue RShOperand;
15807 unsigned ShiftValue;
15808 if (canLowerSRLToRoundingShiftForVT(Shift: Op, ResVT: VT, DAG, ShiftValue, RShOperand))
15809 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT,
15810 N1: getPredicateForVector(DAG, DL, VT), N2: RShOperand,
15811 N3: DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32));
15812 }
15813
15814 if (VT.isScalableVector() ||
15815 useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) {
15816 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
15817 : AArch64ISD::SRL_PRED;
15818 return LowerToPredicatedOp(Op, DAG, NewOp: Opc);
15819 }
15820
15821 // Right shift immediate
15822 if (isVShiftRImm(Op: Op.getOperand(i: 1), VT, isNarrow: false, Cnt) && Cnt < EltSize) {
15823 unsigned Opc =
15824 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
15825 return DAG.getNode(Opcode: Opc, DL, VT, N1: Op.getOperand(i: 0),
15826 N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32), Flags: Op->getFlags());
15827 }
15828
15829    // Right shift register. Note that there is no shift-right-by-register
15830    // instruction; instead, the shift-left-by-register instruction takes a
15831    // signed value, where negative amounts specify a right shift.
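    // For example, 'x >> y' becomes '[s|u]shl x, (sub 0, y)'.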
15832 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
15833 : Intrinsic::aarch64_neon_ushl;
15834    // Negate the shift amount.
15835 SDValue NegShift = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
15836 N2: Op.getOperand(i: 1));
15837 SDValue NegShiftLeft =
15838 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT,
15839 N1: DAG.getConstant(Val: Opc, DL, VT: MVT::i32), N2: Op.getOperand(i: 0),
15840 N3: NegShift);
15841 return NegShiftLeft;
15842 }
15843
15844 llvm_unreachable("unexpected shift opcode");
15845}
15846
15847SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15848 SelectionDAG &DAG) const {
15849 if (Op.getValueType().isScalableVector())
15850 return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SETCC_MERGE_ZERO);
15851
15852 if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(),
15853 OverrideNEON: !Subtarget->isNeonAvailable()))
15854 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15855
15856 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
15857 SDValue LHS = Op.getOperand(i: 0);
15858 SDValue RHS = Op.getOperand(i: 1);
15859 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15860 SDLoc DL(Op);
15861
15862 if (LHS.getValueType().getVectorElementType().isInteger())
15863 return Op;
15864
15865 assert(((!Subtarget->hasFullFP16() &&
15866 LHS.getValueType().getVectorElementType() != MVT::f16) ||
15867 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15868 LHS.getValueType().getVectorElementType() != MVT::f128) &&
15869 "Unexpected type!");
15870
15871 // Lower isnan(x) | isnan(never-nan) to x != x.
15872 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15873 if (CC == ISD::SETUO || CC == ISD::SETO) {
15874 bool OneNaN = false;
15875 if (LHS == RHS) {
15876 OneNaN = true;
15877 } else if (DAG.isKnownNeverNaN(Op: RHS)) {
15878 OneNaN = true;
15879 RHS = LHS;
15880 } else if (DAG.isKnownNeverNaN(Op: LHS)) {
15881 OneNaN = true;
15882 LHS = RHS;
15883 }
15884 if (OneNaN) {
15885 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15886 }
15887 }
15888
15889 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15890 // clean. Some of them require two branches to implement.
15891 AArch64CC::CondCode CC1, CC2;
15892 bool ShouldInvert;
15893 changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert);
15894
15895 bool NoNaNs =
15896 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15897 SDValue Cmp = emitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT: CmpVT, DL, DAG);
15898 if (!Cmp.getNode())
15899 return SDValue();
15900
15901 if (CC2 != AArch64CC::AL) {
15902 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT: CmpVT, DL, DAG);
15903 if (!Cmp2.getNode())
15904 return SDValue();
15905
15906 Cmp = DAG.getNode(Opcode: ISD::OR, DL, VT: CmpVT, N1: Cmp, N2: Cmp2);
15907 }
15908
15909 Cmp = DAG.getSExtOrTrunc(Op: Cmp, DL, VT: Op.getValueType());
15910
15911 if (ShouldInvert)
15912 Cmp = DAG.getNOT(DL, Val: Cmp, VT: Cmp.getValueType());
15913
15914 return Cmp;
15915}
15916
15917static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15918 SelectionDAG &DAG) {
15919 SDValue VecOp = ScalarOp.getOperand(i: 0);
15920 auto Rdx = DAG.getNode(Opcode: Op, DL, VT: VecOp.getSimpleValueType(), Operand: VecOp);
15921 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarOp.getValueType(), N1: Rdx,
15922 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
15923}
15924
15925static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15926 SDLoc DL, SelectionDAG &DAG) {
15927 unsigned ScalarOpcode;
15928 switch (Opcode) {
15929 case ISD::VECREDUCE_AND:
15930 ScalarOpcode = ISD::AND;
15931 break;
15932 case ISD::VECREDUCE_OR:
15933 ScalarOpcode = ISD::OR;
15934 break;
15935 case ISD::VECREDUCE_XOR:
15936 ScalarOpcode = ISD::XOR;
15937 break;
15938 default:
15939 llvm_unreachable("Expected bitwise vector reduction");
15940 return SDValue();
15941 }
15942
15943 EVT VecVT = Vec.getValueType();
15944 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15945 "Expected power-of-2 length vector");
15946
15947 EVT ElemVT = VecVT.getVectorElementType();
15948
15949 SDValue Result;
15950 unsigned NumElems = VecVT.getVectorNumElements();
15951
15952 // Special case for boolean reductions
15953 if (ElemVT == MVT::i1) {
15954 // Split large vectors into smaller ones
15955 if (NumElems > 16) {
15956 SDValue Lo, Hi;
15957 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
15958 EVT HalfVT = Lo.getValueType();
15959 SDValue HalfVec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: HalfVT, N1: Lo, N2: Hi);
15960 return getVectorBitwiseReduce(Opcode, Vec: HalfVec, VT, DL, DAG);
15961 }
15962
15963 // Results of setcc operations get widened to 128 bits if their input
15964 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
15965 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
15966 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
15967 // size leads to the best codegen, since e.g. setcc results might need to be
15968 // truncated otherwise.
15969 unsigned ExtendedWidth = 64;
15970 if (Vec.getOpcode() == ISD::SETCC &&
15971 Vec.getOperand(i: 0).getValueSizeInBits() >= 128) {
15972 ExtendedWidth = 128;
15973 }
15974 EVT ExtendedVT = MVT::getIntegerVT(BitWidth: std::max(a: ExtendedWidth / NumElems, b: 8u));
15975
15976 // any_ext doesn't work with umin/umax, so only use it for uadd.
15977 unsigned ExtendOp =
15978 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
15979 SDValue Extended = DAG.getNode(
15980 Opcode: ExtendOp, DL, VT: VecVT.changeVectorElementType(EltVT: ExtendedVT), Operand: Vec);
15981 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
15982 // in that case we bitcast the sign extended values from v2i64 to v4i32
15983 // before reduction for optimal code generation.
15984 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
15985 NumElems == 2 && ExtendedWidth == 128) {
15986 Extended = DAG.getBitcast(VT: MVT::v4i32, V: Extended);
15987 ExtendedVT = MVT::i32;
15988 }
15989 switch (ScalarOpcode) {
15990 case ISD::AND:
15991 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMIN, DL, VT: ExtendedVT, Operand: Extended);
15992 break;
15993 case ISD::OR:
15994 Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMAX, DL, VT: ExtendedVT, Operand: Extended);
15995 break;
15996 case ISD::XOR:
15997 Result = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ExtendedVT, Operand: Extended);
15998 break;
15999 default:
16000 llvm_unreachable("Unexpected Opcode");
16001 }
16002
16003 Result = DAG.getAnyExtOrTrunc(Op: Result, DL, VT: MVT::i1);
16004 } else {
16005 // Iteratively split the vector in half and combine using the bitwise
16006 // operation until it fits in a 64 bit register.
16007 while (VecVT.getSizeInBits() > 64) {
16008 SDValue Lo, Hi;
16009 std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL);
16010 VecVT = Lo.getValueType();
16011 NumElems = VecVT.getVectorNumElements();
16012 Vec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: VecVT, N1: Lo, N2: Hi);
16013 }
16014
16015 EVT ScalarVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VecVT.getSizeInBits());
16016
16017 // Do the remaining work on a scalar since it allows the code generator to
16018 // combine the shift and bitwise operation into one instruction and since
16019 // integer instructions can have higher throughput than vector instructions.
16020 SDValue Scalar = DAG.getBitcast(VT: ScalarVT, V: Vec);
16021
16022 // Iteratively combine the lower and upper halves of the scalar using the
16023 // bitwise operation, halving the relevant region of the scalar in each
16024 // iteration, until the relevant region is just one element of the original
16025 // vector.
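    // For example, reducing v8i8 with AND over a 64-bit scalar X performs
    //   X &= (X >> 32); X &= (X >> 16); X &= (X >> 8);
    // leaving the reduced value in the low 8 bits.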
16026 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16027 SDValue ShiftAmount =
16028 DAG.getConstant(Val: Shift * ElemVT.getSizeInBits(), DL, VT: MVT::i64);
16029 SDValue Shifted =
16030 DAG.getNode(Opcode: ISD::SRL, DL, VT: ScalarVT, N1: Scalar, N2: ShiftAmount);
16031 Scalar = DAG.getNode(Opcode: ScalarOpcode, DL, VT: ScalarVT, N1: Scalar, N2: Shifted);
16032 }
16033
16034 Result = DAG.getAnyExtOrTrunc(Op: Scalar, DL, VT: ElemVT);
16035 }
16036
16037 return DAG.getAnyExtOrTrunc(Op: Result, DL, VT);
16038}
16039
16040SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16041 SelectionDAG &DAG) const {
16042 SDValue Src = Op.getOperand(i: 0);
16043
16044 // Try to lower fixed length reductions to SVE.
16045 EVT SrcVT = Src.getValueType();
16046 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16047 Op.getOpcode() == ISD::VECREDUCE_AND ||
16048 Op.getOpcode() == ISD::VECREDUCE_OR ||
16049 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16050 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16051 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16052 SrcVT.getVectorElementType() == MVT::i64);
16053 if (SrcVT.isScalableVector() ||
16054 useSVEForFixedLengthVectorVT(
16055 VT: SrcVT, OverrideNEON: OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16056
16057 if (SrcVT.getVectorElementType() == MVT::i1)
16058 return LowerPredReductionToSVE(ScalarOp: Op, DAG);
16059
16060 switch (Op.getOpcode()) {
16061 case ISD::VECREDUCE_ADD:
16062 return LowerReductionToSVE(Opcode: AArch64ISD::UADDV_PRED, ScalarOp: Op, DAG);
16063 case ISD::VECREDUCE_AND:
16064 return LowerReductionToSVE(Opcode: AArch64ISD::ANDV_PRED, ScalarOp: Op, DAG);
16065 case ISD::VECREDUCE_OR:
16066 return LowerReductionToSVE(Opcode: AArch64ISD::ORV_PRED, ScalarOp: Op, DAG);
16067 case ISD::VECREDUCE_SMAX:
16068 return LowerReductionToSVE(Opcode: AArch64ISD::SMAXV_PRED, ScalarOp: Op, DAG);
16069 case ISD::VECREDUCE_SMIN:
16070 return LowerReductionToSVE(Opcode: AArch64ISD::SMINV_PRED, ScalarOp: Op, DAG);
16071 case ISD::VECREDUCE_UMAX:
16072 return LowerReductionToSVE(Opcode: AArch64ISD::UMAXV_PRED, ScalarOp: Op, DAG);
16073 case ISD::VECREDUCE_UMIN:
16074 return LowerReductionToSVE(Opcode: AArch64ISD::UMINV_PRED, ScalarOp: Op, DAG);
16075 case ISD::VECREDUCE_XOR:
16076 return LowerReductionToSVE(Opcode: AArch64ISD::EORV_PRED, ScalarOp: Op, DAG);
16077 case ISD::VECREDUCE_FADD:
16078 return LowerReductionToSVE(Opcode: AArch64ISD::FADDV_PRED, ScalarOp: Op, DAG);
16079 case ISD::VECREDUCE_FMAX:
16080 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXNMV_PRED, ScalarOp: Op, DAG);
16081 case ISD::VECREDUCE_FMIN:
16082 return LowerReductionToSVE(Opcode: AArch64ISD::FMINNMV_PRED, ScalarOp: Op, DAG);
16083 case ISD::VECREDUCE_FMAXIMUM:
16084 return LowerReductionToSVE(Opcode: AArch64ISD::FMAXV_PRED, ScalarOp: Op, DAG);
16085 case ISD::VECREDUCE_FMINIMUM:
16086 return LowerReductionToSVE(Opcode: AArch64ISD::FMINV_PRED, ScalarOp: Op, DAG);
16087 default:
16088 llvm_unreachable("Unhandled fixed length reduction");
16089 }
16090 }
16091
16092 // Lower NEON reductions.
16093 SDLoc DL(Op);
16094 switch (Op.getOpcode()) {
16095 case ISD::VECREDUCE_AND:
16096 case ISD::VECREDUCE_OR:
16097 case ISD::VECREDUCE_XOR:
16098 return getVectorBitwiseReduce(Opcode: Op.getOpcode(), Vec: Op.getOperand(i: 0),
16099 VT: Op.getValueType(), DL, DAG);
16100 case ISD::VECREDUCE_ADD:
16101 return getReductionSDNode(Op: AArch64ISD::UADDV, DL, ScalarOp: Op, DAG);
16102 case ISD::VECREDUCE_SMAX:
16103 return getReductionSDNode(Op: AArch64ISD::SMAXV, DL, ScalarOp: Op, DAG);
16104 case ISD::VECREDUCE_SMIN:
16105 return getReductionSDNode(Op: AArch64ISD::SMINV, DL, ScalarOp: Op, DAG);
16106 case ISD::VECREDUCE_UMAX:
16107 return getReductionSDNode(Op: AArch64ISD::UMAXV, DL, ScalarOp: Op, DAG);
16108 case ISD::VECREDUCE_UMIN:
16109 return getReductionSDNode(Op: AArch64ISD::UMINV, DL, ScalarOp: Op, DAG);
16110 default:
16111 llvm_unreachable("Unhandled reduction");
16112 }
16113}
16114
16115SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16116 SelectionDAG &DAG) const {
16117 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16118 // No point replacing if we don't have the relevant instruction/libcall anyway
16119 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16120 return SDValue();
16121
16122 // LSE has an atomic load-clear instruction, but not a load-and.
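  // Rewrite 'atomicrmw and addr, M' as an atomic load-clear of ~M (LDCLR).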
16123 SDLoc DL(Op);
16124 MVT VT = Op.getSimpleValueType();
16125 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16126 SDValue RHS = Op.getOperand(i: 2);
16127 AtomicSDNode *AN = cast<AtomicSDNode>(Val: Op.getNode());
16128 RHS = DAG.getNode(Opcode: ISD::XOR, DL, VT, N1: DAG.getAllOnesConstant(DL, VT), N2: RHS);
16129 return DAG.getAtomic(Opcode: ISD::ATOMIC_LOAD_CLR, dl: DL, MemVT: AN->getMemoryVT(),
16130 Chain: Op.getOperand(i: 0), Ptr: Op.getOperand(i: 1), Val: RHS,
16131 MMO: AN->getMemOperand());
16132}
16133
16134SDValue
16135AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16136 SelectionDAG &DAG) const {
16137
16138 SDLoc DL(Op);
16139 // Get the inputs.
16140 SDNode *Node = Op.getNode();
16141 SDValue Chain = Op.getOperand(i: 0);
16142 SDValue Size = Op.getOperand(i: 1);
16143 MaybeAlign Align =
16144 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
16145 EVT VT = Node->getValueType(ResNo: 0);
16146
16147 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16148 Kind: "no-stack-arg-probe")) {
16149 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, VT: MVT::i64);
16150 Chain = SP.getValue(R: 1);
16151 SP = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: SP, N2: Size);
16152 if (Align)
16153 SP =
16154 DAG.getNode(Opcode: ISD::AND, DL, VT, N1: SP.getValue(R: 0),
16155 N2: DAG.getSignedConstant(Val: -(uint64_t)Align->value(), DL, VT));
16156 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::SP, N: SP);
16157 SDValue Ops[2] = {SP, Chain};
16158 return DAG.getMergeValues(Ops, dl: DL);
16159 }
16160
16161 Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL);
16162
16163 EVT PtrVT = getPointerTy(DL: DAG.getDataLayout());
16164 SDValue Callee = DAG.getTargetExternalSymbol(Sym: Subtarget->getChkStkName(),
16165 VT: PtrVT, TargetFlags: 0);
16166
16167 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16168 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16169 if (Subtarget->hasCustomCallingConv())
16170 TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask);
16171
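  // The Windows stack probe helper expects the allocation size in 16-byte
  // units, passed in X15.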
16172 Size = DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: Size,
16173 N2: DAG.getConstant(Val: 4, DL, VT: MVT::i64));
16174 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::X15, N: Size, Glue: SDValue());
16175 Chain =
16176 DAG.getNode(Opcode: AArch64ISD::CALL, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue),
16177 N1: Chain, N2: Callee, N3: DAG.getRegister(Reg: AArch64::X15, VT: MVT::i64),
16178 N4: DAG.getRegisterMask(RegMask: Mask), N5: Chain.getValue(R: 1));
16179 // To match the actual intent better, we should read the output from X15 here
16180 // again (instead of potentially spilling it to the stack), but rereading Size
16181 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16182 // here.
16183
16184 Size = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i64, N1: Size,
16185 N2: DAG.getConstant(Val: 4, DL, VT: MVT::i64));
16186
16187 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, VT: MVT::i64);
16188 Chain = SP.getValue(R: 1);
16189 SP = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: SP, N2: Size);
16190 if (Align)
16191 SP = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: SP.getValue(R: 0),
16192 N2: DAG.getSignedConstant(Val: -(uint64_t)Align->value(), DL, VT));
16193 Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::SP, N: SP);
16194
16195 Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL);
16196
16197 SDValue Ops[2] = {SP, Chain};
16198 return DAG.getMergeValues(Ops, dl: DL);
16199}
16200
16201SDValue
16202AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16203 SelectionDAG &DAG) const {
16204 // Get the inputs.
16205 SDNode *Node = Op.getNode();
16206 SDValue Chain = Op.getOperand(i: 0);
16207 SDValue Size = Op.getOperand(i: 1);
16208
16209 MaybeAlign Align =
16210 cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue();
16211 SDLoc DL(Op);
16212 EVT VT = Node->getValueType(ResNo: 0);
16213
16214 // Construct the new SP value in a GPR.
16215 SDValue SP = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, VT: MVT::i64);
16216 Chain = SP.getValue(R: 1);
16217 SP = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: SP, N2: Size);
16218 if (Align)
16219 SP = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: SP.getValue(R: 0),
16220 N2: DAG.getSignedConstant(Val: -(uint64_t)Align->value(), DL, VT));
16221
16222 // Set the real SP to the new value with a probing loop.
16223 Chain = DAG.getNode(Opcode: AArch64ISD::PROBED_ALLOCA, DL, VT: MVT::Other, N1: Chain, N2: SP);
16224 SDValue Ops[2] = {SP, Chain};
16225 return DAG.getMergeValues(Ops, dl: DL);
16226}
16227
16228SDValue
16229AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16230 SelectionDAG &DAG) const {
16231 MachineFunction &MF = DAG.getMachineFunction();
16232
16233 if (Subtarget->isTargetWindows())
16234 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16235 else if (hasInlineStackProbe(MF))
16236 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16237 else
16238 return SDValue();
16239}
16240
16241SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16242 unsigned NewOp) const {
16243 if (Subtarget->hasSVE2())
16244 return LowerToPredicatedOp(Op, DAG, NewOp);
16245
16246 // Default to expand.
16247 return SDValue();
16248}
16249
16250SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16251 SelectionDAG &DAG) const {
16252 EVT VT = Op.getValueType();
16253 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16254
16255 SDLoc DL(Op);
16256 APInt MulImm = Op.getConstantOperandAPInt(i: 0);
16257 return DAG.getZExtOrTrunc(Op: DAG.getVScale(DL, VT: MVT::i64, MulImm: MulImm.sext(width: 64)), DL,
16258 VT);
16259}
16260
16261/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16262template <unsigned NumVecs>
16263static bool
16264setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16265 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16266 Info.opc = ISD::INTRINSIC_VOID;
16267 // Retrieve EC from first vector argument.
16268 const EVT VT = TLI.getMemValueType(DL, Ty: CI.getArgOperand(i: 0)->getType());
16269 ElementCount EC = VT.getVectorElementCount();
16270#ifndef NDEBUG
16271 // Check the assumption that all input vectors are the same type.
16272 for (unsigned I = 0; I < NumVecs; ++I)
16273 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
16274 "Invalid type.");
16275#endif
16276 // memVT is `NumVecs * VT`.
16277 Info.memVT = EVT::getVectorVT(Context&: CI.getType()->getContext(), VT: VT.getScalarType(),
16278 EC: EC * NumVecs);
16279 Info.ptrVal = CI.getArgOperand(i: CI.arg_size() - 1);
16280 Info.offset = 0;
16281 Info.align.reset();
16282 Info.flags = MachineMemOperand::MOStore;
16283 return true;
16284}
16285
16286/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
16287/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
16288/// specified in the intrinsic calls.
16289bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
16290 const CallInst &I,
16291 MachineFunction &MF,
16292 unsigned Intrinsic) const {
16293 auto &DL = I.getDataLayout();
16294 switch (Intrinsic) {
16295 case Intrinsic::aarch64_sve_st2:
16296 return setInfoSVEStN<2>(TLI: *this, DL, Info, CI: I);
16297 case Intrinsic::aarch64_sve_st3:
16298 return setInfoSVEStN<3>(TLI: *this, DL, Info, CI: I);
16299 case Intrinsic::aarch64_sve_st4:
16300 return setInfoSVEStN<4>(TLI: *this, DL, Info, CI: I);
16301 case Intrinsic::aarch64_neon_ld2:
16302 case Intrinsic::aarch64_neon_ld3:
16303 case Intrinsic::aarch64_neon_ld4:
16304 case Intrinsic::aarch64_neon_ld1x2:
16305 case Intrinsic::aarch64_neon_ld1x3:
16306 case Intrinsic::aarch64_neon_ld1x4: {
16307 Info.opc = ISD::INTRINSIC_W_CHAIN;
16308 uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64;
16309 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts);
16310 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16311 Info.offset = 0;
16312 Info.align.reset();
16313    // Volatile loads with NEON intrinsics are not supported.
16314 Info.flags = MachineMemOperand::MOLoad;
16315 return true;
16316 }
16317 case Intrinsic::aarch64_neon_ld2lane:
16318 case Intrinsic::aarch64_neon_ld3lane:
16319 case Intrinsic::aarch64_neon_ld4lane:
16320 case Intrinsic::aarch64_neon_ld2r:
16321 case Intrinsic::aarch64_neon_ld3r:
16322 case Intrinsic::aarch64_neon_ld4r: {
16323 Info.opc = ISD::INTRINSIC_W_CHAIN;
16324    // These intrinsics return a struct of vectors with a common vector type.
16325 Type *RetTy = I.getType();
16326 auto *StructTy = cast<StructType>(Val: RetTy);
16327 unsigned NumElts = StructTy->getNumElements();
16328 Type *VecTy = StructTy->getElementType(N: 0);
16329 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
16330 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
16331 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16332 Info.offset = 0;
16333 Info.align.reset();
16334    // Volatile loads with NEON intrinsics are not supported.
16335 Info.flags = MachineMemOperand::MOLoad;
16336 return true;
16337 }
16338 case Intrinsic::aarch64_neon_st2:
16339 case Intrinsic::aarch64_neon_st3:
16340 case Intrinsic::aarch64_neon_st4:
16341 case Intrinsic::aarch64_neon_st1x2:
16342 case Intrinsic::aarch64_neon_st1x3:
16343 case Intrinsic::aarch64_neon_st1x4: {
16344 Info.opc = ISD::INTRINSIC_VOID;
16345 unsigned NumElts = 0;
16346 for (const Value *Arg : I.args()) {
16347 Type *ArgTy = Arg->getType();
16348 if (!ArgTy->isVectorTy())
16349 break;
16350 NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64;
16351 }
16352 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts);
16353 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16354 Info.offset = 0;
16355 Info.align.reset();
16356    // Volatile stores with NEON intrinsics are not supported.
16357 Info.flags = MachineMemOperand::MOStore;
16358 return true;
16359 }
16360 case Intrinsic::aarch64_neon_st2lane:
16361 case Intrinsic::aarch64_neon_st3lane:
16362 case Intrinsic::aarch64_neon_st4lane: {
16363 Info.opc = ISD::INTRINSIC_VOID;
16364 unsigned NumElts = 0;
16365    // All the vector arguments have the same type.
16366 Type *VecTy = I.getArgOperand(i: 0)->getType();
16367 MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType();
16368
16369 for (const Value *Arg : I.args()) {
16370 Type *ArgTy = Arg->getType();
16371 if (!ArgTy->isVectorTy())
16372 break;
16373 NumElts += 1;
16374 }
16375
16376 Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts);
16377 Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1);
16378 Info.offset = 0;
16379 Info.align.reset();
16380    // Volatile stores with NEON intrinsics are not supported.
16381 Info.flags = MachineMemOperand::MOStore;
16382 return true;
16383 }
16384 case Intrinsic::aarch64_ldaxr:
16385 case Intrinsic::aarch64_ldxr: {
16386 Type *ValTy = I.getParamElementType(ArgNo: 0);
16387 Info.opc = ISD::INTRINSIC_W_CHAIN;
16388 Info.memVT = MVT::getVT(Ty: ValTy);
16389 Info.ptrVal = I.getArgOperand(i: 0);
16390 Info.offset = 0;
16391 Info.align = DL.getABITypeAlign(Ty: ValTy);
16392 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16393 return true;
16394 }
16395 case Intrinsic::aarch64_stlxr:
16396 case Intrinsic::aarch64_stxr: {
16397 Type *ValTy = I.getParamElementType(ArgNo: 1);
16398 Info.opc = ISD::INTRINSIC_W_CHAIN;
16399 Info.memVT = MVT::getVT(Ty: ValTy);
16400 Info.ptrVal = I.getArgOperand(i: 1);
16401 Info.offset = 0;
16402 Info.align = DL.getABITypeAlign(Ty: ValTy);
16403 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16404 return true;
16405 }
16406 case Intrinsic::aarch64_ldaxp:
16407 case Intrinsic::aarch64_ldxp:
16408 Info.opc = ISD::INTRINSIC_W_CHAIN;
16409 Info.memVT = MVT::i128;
16410 Info.ptrVal = I.getArgOperand(i: 0);
16411 Info.offset = 0;
16412 Info.align = Align(16);
16413 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
16414 return true;
16415 case Intrinsic::aarch64_stlxp:
16416 case Intrinsic::aarch64_stxp:
16417 Info.opc = ISD::INTRINSIC_W_CHAIN;
16418 Info.memVT = MVT::i128;
16419 Info.ptrVal = I.getArgOperand(i: 2);
16420 Info.offset = 0;
16421 Info.align = Align(16);
16422 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
16423 return true;
16424 case Intrinsic::aarch64_sve_ldnt1: {
16425 Type *ElTy = cast<VectorType>(Val: I.getType())->getElementType();
16426 Info.opc = ISD::INTRINSIC_W_CHAIN;
16427 Info.memVT = MVT::getVT(Ty: I.getType());
16428 Info.ptrVal = I.getArgOperand(i: 1);
16429 Info.offset = 0;
16430 Info.align = DL.getABITypeAlign(Ty: ElTy);
16431 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
16432 return true;
16433 }
16434 case Intrinsic::aarch64_sve_stnt1: {
16435 Type *ElTy =
16436 cast<VectorType>(Val: I.getArgOperand(i: 0)->getType())->getElementType();
16437 Info.opc = ISD::INTRINSIC_W_CHAIN;
16438 Info.memVT = MVT::getVT(Ty: I.getOperand(i_nocapture: 0)->getType());
16439 Info.ptrVal = I.getArgOperand(i: 2);
16440 Info.offset = 0;
16441 Info.align = DL.getABITypeAlign(Ty: ElTy);
16442 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
16443 return true;
16444 }
16445 case Intrinsic::aarch64_mops_memset_tag: {
16446 Value *Dst = I.getArgOperand(i: 0);
16447 Value *Val = I.getArgOperand(i: 1);
16448 Info.opc = ISD::INTRINSIC_W_CHAIN;
16449 Info.memVT = MVT::getVT(Ty: Val->getType());
16450 Info.ptrVal = Dst;
16451 Info.offset = 0;
16452 Info.align = I.getParamAlign(ArgNo: 0).valueOrOne();
16453 Info.flags = MachineMemOperand::MOStore;
16454 // The size of the memory being operated on is unknown at this point
16455 Info.size = MemoryLocation::UnknownSize;
16456 return true;
16457 }
16458 default:
16459 break;
16460 }
16461
16462 return false;
16463}
16464
16465bool AArch64TargetLowering::shouldReduceLoadWidth(
16466 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
16467 std::optional<unsigned> ByteOffset) const {
16468 // TODO: This may be worth removing. Check regression tests for diffs.
16469 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
16470 ByteOffset))
16471 return false;
16472
16473 // If we're reducing the load width in order to avoid having to use an extra
16474 // instruction to do extension then it's probably a good idea.
16475 if (ExtTy != ISD::NON_EXTLOAD)
16476 return true;
16477 // Don't reduce load width if it would prevent us from combining a shift into
16478 // the offset.
16479 MemSDNode *Mem = dyn_cast<MemSDNode>(Val: Load);
16480 assert(Mem);
16481 const SDValue &Base = Mem->getBasePtr();
16482 if (Base.getOpcode() == ISD::ADD &&
16483 Base.getOperand(i: 1).getOpcode() == ISD::SHL &&
16484 Base.getOperand(i: 1).hasOneUse() &&
16485 Base.getOperand(i: 1).getOperand(i: 1).getOpcode() == ISD::Constant) {
16486 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
16487 if (Mem->getMemoryVT().isScalableVector())
16488 return false;
16489 // The shift can be combined if it matches the size of the value being
16490 // loaded (and so reducing the width would make it not match).
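    // For example, an i64 load from (add base, (shl idx, 3)) folds the shift
    // into the addressing mode ('ldr xN, [base, idx, lsl #3]'); narrowing the
    // load would leave the shift as a separate instruction.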
16491 uint64_t ShiftAmount = Base.getOperand(i: 1).getConstantOperandVal(i: 1);
16492 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
16493 if (ShiftAmount == Log2_32(Value: LoadBytes))
16494 return false;
16495 }
16496 // We have no reason to disallow reducing the load width, so allow it.
16497 return true;
16498}
16499
16500// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
16501bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
16502 EVT VT = Extend.getValueType();
16503 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
16504 SDValue Extract = Extend.getOperand(i: 0);
16505 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
16506 Extract = Extract.getOperand(i: 0);
16507 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
16508 EVT VecVT = Extract.getOperand(i: 0).getValueType();
16509 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
16510 return false;
16511 }
16512 }
16513 return true;
16514}
16515
16516// Truncations from 64-bit GPR to 32-bit GPR are free.
16517bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
16518 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16519 return false;
16520 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
16521 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
16522 return NumBits1 > NumBits2;
16523}
16524bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
16525 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16526 return false;
16527 uint64_t NumBits1 = VT1.getFixedSizeInBits();
16528 uint64_t NumBits2 = VT2.getFixedSizeInBits();
16529 return NumBits1 > NumBits2;
16530}
16531
16532/// Check if it is profitable to hoist an instruction in then/else to if.
16533/// Not profitable if I and its user can form an FMA instruction
16534/// because we prefer FMSUB/FMADD.
16535bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
16536 if (I->getOpcode() != Instruction::FMul)
16537 return true;
16538
16539 if (!I->hasOneUse())
16540 return true;
16541
16542 Instruction *User = I->user_back();
16543
16544 if (!(User->getOpcode() == Instruction::FSub ||
16545 User->getOpcode() == Instruction::FAdd))
16546 return true;
16547
16548 const TargetOptions &Options = getTargetMachine().Options;
16549 const Function *F = I->getFunction();
16550 const DataLayout &DL = F->getDataLayout();
16551 Type *Ty = User->getOperand(i: 0)->getType();
16552
16553 return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) &&
16554 isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) &&
16555 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16556 Options.UnsafeFPMath));
16557}
16558
16559// All 32-bit GPR operations implicitly zero the high-half of the corresponding
16560// 64-bit GPR.
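// For example, 'mov w0, w1' also clears bits [63:32] of x0, so a zext from
// i32 to i64 requires no extra instruction.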
16561bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
16562 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
16563 return false;
16564 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
16565 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
16566 return NumBits1 == 32 && NumBits2 == 64;
16567}
16568bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
16569 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
16570 return false;
16571 unsigned NumBits1 = VT1.getSizeInBits();
16572 unsigned NumBits2 = VT2.getSizeInBits();
16573 return NumBits1 == 32 && NumBits2 == 64;
16574}
16575
16576bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
16577 EVT VT1 = Val.getValueType();
16578 if (isZExtFree(VT1, VT2)) {
16579 return true;
16580 }
16581
16582 if (Val.getOpcode() != ISD::LOAD)
16583 return false;
16584
16585 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
16586 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
16587 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
16588 VT1.getSizeInBits() <= 32);
16589}
16590
16591bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
16592 if (isa<FPExtInst>(Val: Ext))
16593 return false;
16594
16595 // Vector types are not free.
16596 if (Ext->getType()->isVectorTy())
16597 return false;
16598
16599 for (const Use &U : Ext->uses()) {
16600 // The extension is free if we can fold it with a left shift in an
16601 // addressing mode or an arithmetic operation: add, sub, and cmp.
16602
16603 // Is there a shift?
16604 const Instruction *Instr = cast<Instruction>(Val: U.getUser());
16605
16606 // Is this a constant shift?
16607 switch (Instr->getOpcode()) {
16608 case Instruction::Shl:
16609 if (!isa<ConstantInt>(Val: Instr->getOperand(i: 1)))
16610 return false;
16611 break;
16612 case Instruction::GetElementPtr: {
16613 gep_type_iterator GTI = gep_type_begin(GEP: Instr);
16614 auto &DL = Ext->getDataLayout();
16615 std::advance(i&: GTI, n: U.getOperandNo()-1);
16616 Type *IdxTy = GTI.getIndexedType();
16617 // This extension will end up with a shift because of the scaling factor.
16618 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
16619 // Get the shift amount based on the scaling factor:
16620 // log2(sizeof(IdxTy)) - log2(8).
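      // For example, an i32 index has a store size of 4 bytes, giving
      // ShiftAmt = log2(32) - log2(8) = 2.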
16621 if (IdxTy->isScalableTy())
16622 return false;
16623 uint64_t ShiftAmt =
16624 llvm::countr_zero(Val: DL.getTypeStoreSizeInBits(Ty: IdxTy).getFixedValue()) -
16625 3;
16626 // Is the constant foldable in the shift of the addressing mode?
16627 // I.e., shift amount is between 1 and 4 inclusive.
16628 if (ShiftAmt == 0 || ShiftAmt > 4)
16629 return false;
16630 break;
16631 }
16632 case Instruction::Trunc:
16633 // Check if this is a noop.
16634 // trunc(sext ty1 to ty2) to ty1.
16635 if (Instr->getType() == Ext->getOperand(i: 0)->getType())
16636 continue;
16637 [[fallthrough]];
16638 default:
16639 return false;
16640 }
16641
16642 // At this point we can use the bfm family, so this extension is free
16643 // for that use.
16644 }
16645 return true;
16646}
16647
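// Build a shuffle mask that places each source element in the low (little
// endian) or high (big endian) part of a DstWidth-bit lane, with all other
// mask slots selecting index NumElts, i.e. the first (known zero) element of
// the second shuffle operand. For example, SrcWidth = 8, DstWidth = 32,
// NumElts = 4 on little endian gives:
//   [0, 4, 4, 4, 1, 4, 4, 4, 2, 4, 4, 4, 3, 4, 4, 4]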
16648static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16649 unsigned NumElts, bool IsLittleEndian,
16650 SmallVectorImpl<int> &Mask) {
16651 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
16652 return false;
16653
16654 assert(DstWidth % SrcWidth == 0 &&
16655 "TBL lowering is not supported for a conversion instruction with this "
16656 "source and destination element type.");
16657
16658 unsigned Factor = DstWidth / SrcWidth;
16659 unsigned MaskLen = NumElts * Factor;
16660
16661 Mask.clear();
16662 Mask.resize(N: MaskLen, NV: NumElts);
16663
16664 unsigned SrcIndex = 0;
16665 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16666 Mask[I] = SrcIndex++;
16667
16668 return true;
16669}
16670
16671static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16672 FixedVectorType *ZExtTy,
16673 FixedVectorType *DstTy,
16674 bool IsLittleEndian) {
16675 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
16676 unsigned NumElts = SrcTy->getNumElements();
16677 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16678 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16679
16680 SmallVector<int> Mask;
16681 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16682 return nullptr;
16683
16684 auto *FirstEltZero = Builder.CreateInsertElement(
16685 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getIntN(N: SrcWidth, C: 0), Idx: uint64_t(0));
16686 Value *Result = Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
16687 Result = Builder.CreateBitCast(V: Result, DestTy: DstTy);
16688 if (DstTy != ZExtTy)
16689 Result = Builder.CreateZExt(V: Result, DestTy: ZExtTy);
16690 return Result;
16691}
16692
16693static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16694 FixedVectorType *DstTy,
16695 bool IsLittleEndian) {
16696 auto *SrcTy = cast<FixedVectorType>(Val: Op->getType());
16697 auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16698 auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16699
16700 SmallVector<int> Mask;
16701 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts: SrcTy->getNumElements(),
16702 IsLittleEndian: !IsLittleEndian, Mask))
16703 return nullptr;
16704
16705 auto *FirstEltZero = Builder.CreateInsertElement(
16706 Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getIntN(N: SrcWidth, C: 0), Idx: uint64_t(0));
16707
16708 return Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask);
16709}
16710
16711static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16712 IRBuilder<> Builder(TI);
16713 SmallVector<Value *> Parts;
16714 int NumElements = cast<FixedVectorType>(Val: TI->getType())->getNumElements();
16715 auto *SrcTy = cast<FixedVectorType>(Val: TI->getOperand(i_nocapture: 0)->getType());
16716 auto *DstTy = cast<FixedVectorType>(Val: TI->getType());
16717 assert(SrcTy->getElementType()->isIntegerTy() &&
16718 "Non-integer type source vector element is not supported");
16719 assert(DstTy->getElementType()->isIntegerTy(8) &&
16720 "Unsupported destination vector element type");
16721 unsigned SrcElemTySz =
16722 cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth();
16723 unsigned DstElemTySz =
16724 cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth();
16725 assert((SrcElemTySz % DstElemTySz == 0) &&
16726 "Cannot lower truncate to tbl instructions for a source element size "
16727 "that is not divisible by the destination element size");
16728 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16729 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16730 "Unsupported source vector element type size");
16731 Type *VecTy = FixedVectorType::get(ElementType: Builder.getInt8Ty(), NumElts: 16);
16732
16733 // Create a mask to choose every nth byte from the source vector table of
16734 // bytes to create the truncated destination vector, where 'n' is the truncate
16735 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16736 // 0,8,16,..Y*8th bytes for the little-endian format
16737 SmallVector<Constant *, 16> MaskConst;
16738 for (int Itr = 0; Itr < 16; Itr++) {
16739 if (Itr < NumElements)
16740 MaskConst.push_back(Elt: Builder.getInt8(
16741 C: IsLittleEndian ? Itr * TruncFactor
16742 : Itr * TruncFactor + (TruncFactor - 1)));
16743 else
16744 MaskConst.push_back(Elt: Builder.getInt8(C: 255));
16745 }
16746
16747 int MaxTblSz = 128 * 4;
16748 int MaxSrcSz = SrcElemTySz * NumElements;
16749 int ElemsPerTbl =
16750 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16751 assert(ElemsPerTbl <= 16 &&
16752 "Maximum elements selected using TBL instruction cannot exceed 16!");
16753
16754 int ShuffleCount = 128 / SrcElemTySz;
16755 SmallVector<int> ShuffleLanes;
16756 for (int i = 0; i < ShuffleCount; ++i)
16757 ShuffleLanes.push_back(Elt: i);
16758
16759 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16760 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16761 // call TBL & save the result in a vector of TBL results for combining later.
16762 SmallVector<Value *> Results;
16763 while (ShuffleLanes.back() < NumElements) {
16764 Parts.push_back(Elt: Builder.CreateBitCast(
16765 V: Builder.CreateShuffleVector(V: TI->getOperand(i_nocapture: 0), Mask: ShuffleLanes), DestTy: VecTy));
16766
16767 if (Parts.size() == 4) {
16768 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
16769 Results.push_back(
16770 Elt: Builder.CreateIntrinsic(ID: Intrinsic::aarch64_neon_tbl4, Types: VecTy, Args: Parts));
16771 Parts.clear();
16772 }
16773
16774 for (int i = 0; i < ShuffleCount; ++i)
16775 ShuffleLanes[i] += ShuffleCount;
16776 }
16777
16778 assert((Parts.empty() || Results.empty()) &&
16779 "Lowering trunc for vectors requiring different TBL instructions is "
16780 "not supported!");
16781 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16782 // registers
16783 if (!Parts.empty()) {
16784 Intrinsic::ID TblID;
16785 switch (Parts.size()) {
16786 case 1:
16787 TblID = Intrinsic::aarch64_neon_tbl1;
16788 break;
16789 case 2:
16790 TblID = Intrinsic::aarch64_neon_tbl2;
16791 break;
16792 case 3:
16793 TblID = Intrinsic::aarch64_neon_tbl3;
16794 break;
16795 }
16796
16797 Parts.push_back(Elt: ConstantVector::get(V: MaskConst));
16798 Results.push_back(Elt: Builder.CreateIntrinsic(ID: TblID, Types: VecTy, Args: Parts));
16799 }
16800
16801 // Extract the destination vector from TBL result(s) after combining them
16802 // where applicable. Currently, at most two TBLs are supported.
16803 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16804 "more than 2 tbl instructions!");
16805 Value *FinalResult = Results[0];
16806 if (Results.size() == 1) {
16807 if (ElemsPerTbl < 16) {
16808 SmallVector<int> FinalMask(ElemsPerTbl);
16809 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
16810 FinalResult = Builder.CreateShuffleVector(V: Results[0], Mask: FinalMask);
16811 }
16812 } else {
16813 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16814 if (ElemsPerTbl < 16) {
16815 std::iota(first: FinalMask.begin(), last: FinalMask.begin() + ElemsPerTbl, value: 0);
16816 std::iota(first: FinalMask.begin() + ElemsPerTbl, last: FinalMask.end(), value: 16);
16817 } else {
16818 std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0);
16819 }
16820 FinalResult =
16821 Builder.CreateShuffleVector(V1: Results[0], V2: Results[1], Mask: FinalMask);
16822 }
16823
16824 TI->replaceAllUsesWith(V: FinalResult);
16825 TI->eraseFromParent();
16826}
16827
16828bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16829 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16830 // shuffle_vector instructions are serialized when targeting SVE,
16831 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16832 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16833 return false;
16834
16835 // Try to optimize conversions using tbl. This requires materializing constant
16836 // index vectors, which can increase code size and add loads. Skip the
16837 // transform unless the conversion is in a loop block guaranteed to execute
16838 // and we are not optimizing for size.
16839 Function *F = I->getParent()->getParent();
16840 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
16841 return false;
16842
16843 auto *SrcTy = dyn_cast<FixedVectorType>(Val: I->getOperand(i: 0)->getType());
16844 auto *DstTy = dyn_cast<FixedVectorType>(Val: I->getType());
16845 if (!SrcTy || !DstTy)
16846 return false;
16847
16848 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16849 // lowered to tbl instructions to insert the original i8 elements
16850 // into i8x lanes. This is enabled for cases where it is beneficial.
16851 auto *ZExt = dyn_cast<ZExtInst>(Val: I);
16852 if (ZExt && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
16853 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16854 if (DstWidth % 8 != 0)
16855 return false;
16856
16857 auto *TruncDstType =
16858 cast<FixedVectorType>(Val: VectorType::getTruncatedElementVectorType(VTy: DstTy));
16859 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16860 // the remaining ZExt folded into the user, don't use tbl lowering.
16861 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16862 if (TTI.getCastInstrCost(Opcode: I->getOpcode(), Dst: DstTy, Src: TruncDstType,
16863 CCH: TargetTransformInfo::getCastContextHint(I),
16864 CostKind: TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) {
16865 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16866 return false;
16867
16868 DstTy = TruncDstType;
16869 }
16870
16871 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
16872 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
16873 // most one extra extend step is needed and using tbl is not profitable.
16874 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
16875 // udot instruction.
16876 if (SrcWidth * 4 <= DstWidth) {
16877 if (all_of(Range: I->users(), P: [&](auto *U) {
16878 auto *SingleUser = cast<Instruction>(&*U);
16879 if (match(SingleUser, m_c_Mul(L: m_Specific(V: I), R: m_SExt(Op: m_Value()))))
16880 return true;
16881 if (match(SingleUser,
16882 m_Intrinsic<
16883 Intrinsic::experimental_vector_partial_reduce_add>(
16884 Op0: m_Value(), Op1: m_Specific(V: I))))
16885 return true;
16886 return false;
16887 }))
16888 return false;
16889 }
16890
16891 if (DstTy->getScalarSizeInBits() >= 64)
16892 return false;
16893
16894 IRBuilder<> Builder(ZExt);
16895 Value *Result = createTblShuffleForZExt(
16896 Builder, Op: ZExt->getOperand(i_nocapture: 0), ZExtTy: cast<FixedVectorType>(Val: ZExt->getType()),
16897 DstTy, IsLittleEndian: Subtarget->isLittleEndian());
16898 if (!Result)
16899 return false;
16900 ZExt->replaceAllUsesWith(V: Result);
16901 ZExt->eraseFromParent();
16902 return true;
16903 }
16904
16905 auto *UIToFP = dyn_cast<UIToFPInst>(Val: I);
16906 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16907 DstTy->getElementType()->isFloatTy()) ||
16908 (SrcTy->getElementType()->isIntegerTy(Bitwidth: 16) &&
16909 DstTy->getElementType()->isDoubleTy()))) {
16910 IRBuilder<> Builder(I);
16911 Value *ZExt = createTblShuffleForZExt(
16912 Builder, Op: I->getOperand(i: 0), ZExtTy: FixedVectorType::getInteger(VTy: DstTy),
16913 DstTy: FixedVectorType::getInteger(VTy: DstTy), IsLittleEndian: Subtarget->isLittleEndian());
16914 assert(ZExt && "Cannot fail for the i8 to float conversion");
16915 auto *UI = Builder.CreateUIToFP(V: ZExt, DestTy: DstTy);
16916 I->replaceAllUsesWith(V: UI);
16917 I->eraseFromParent();
16918 return true;
16919 }
16920
16921 auto *SIToFP = dyn_cast<SIToFPInst>(Val: I);
16922 if (SIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16923 DstTy->getElementType()->isFloatTy()) {
16924 IRBuilder<> Builder(I);
16925 auto *Shuffle = createTblShuffleForSExt(Builder, Op: I->getOperand(i: 0),
16926 DstTy: FixedVectorType::getInteger(VTy: DstTy),
16927 IsLittleEndian: Subtarget->isLittleEndian());
16928 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16929 auto *Cast = Builder.CreateBitCast(V: Shuffle, DestTy: VectorType::getInteger(VTy: DstTy));
16930 auto *AShr = Builder.CreateAShr(LHS: Cast, RHS: 24, Name: "", isExact: true);
16931 auto *SI = Builder.CreateSIToFP(V: AShr, DestTy: DstTy);
16932 I->replaceAllUsesWith(V: SI);
16933 I->eraseFromParent();
16934 return true;
16935 }
16936
16937 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16938 // followed by a truncate lowered to using tbl.4.
16939 auto *FPToUI = dyn_cast<FPToUIInst>(Val: I);
16940 if (FPToUI &&
16941 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16942 SrcTy->getElementType()->isFloatTy() &&
16943 DstTy->getElementType()->isIntegerTy(Bitwidth: 8)) {
16944 IRBuilder<> Builder(I);
16945 auto *WideConv = Builder.CreateFPToUI(V: FPToUI->getOperand(i_nocapture: 0),
16946 DestTy: VectorType::getInteger(VTy: SrcTy));
16947 auto *TruncI = Builder.CreateTrunc(V: WideConv, DestTy: DstTy);
16948 I->replaceAllUsesWith(V: TruncI);
16949 I->eraseFromParent();
16950 createTblForTrunc(TI: cast<TruncInst>(Val: TruncI), IsLittleEndian: Subtarget->isLittleEndian());
16951 return true;
16952 }
16953
16954 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16955 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16956 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16957 // registers
16958 auto *TI = dyn_cast<TruncInst>(Val: I);
16959 if (TI && DstTy->getElementType()->isIntegerTy(Bitwidth: 8) &&
16960 ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 32) ||
16961 SrcTy->getElementType()->isIntegerTy(Bitwidth: 64)) &&
16962 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16963 createTblForTrunc(TI, IsLittleEndian: Subtarget->isLittleEndian());
16964 return true;
16965 }
16966
16967 return false;
16968}
16969
16970bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16971 Align &RequiredAlignment) const {
16972 if (!LoadedType.isSimple() ||
16973 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16974 return false;
16975 // Cyclone supports unaligned accesses.
16976 RequiredAlignment = Align(1);
16977 unsigned NumBits = LoadedType.getSizeInBits();
16978 return NumBits == 32 || NumBits == 64;
16979}
16980
16981/// A helper function for determining the number of interleaved accesses we
16982/// will generate when lowering accesses of the given type.
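///
/// For example (illustrative arithmetic only): a fixed <16 x i32> access with
/// 128-bit NEON vectors needs (16 * 32 + 127) / 128 = 4 accesses, whereas with
/// SVE fixed-length lowering and a 256-bit minimum vector size it needs 2.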
16983unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16984 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16985 unsigned VecSize = 128;
16986 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
16987 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16988 if (UseScalable && isa<FixedVectorType>(Val: VecTy))
16989 VecSize = std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
16990 return std::max<unsigned>(a: 1, b: (MinElts * ElSize + 127) / VecSize);
16991}
16992
16993MachineMemOperand::Flags
16994AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
16995 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
16996 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
16997 return MOStridedAccess;
16998 return MachineMemOperand::MONone;
16999}
17000
17001bool AArch64TargetLowering::isLegalInterleavedAccessType(
17002 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17003 unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType());
17004 auto EC = VecTy->getElementCount();
17005 unsigned MinElts = EC.getKnownMinValue();
17006
17007 UseScalable = false;
17008
17009 if (isa<FixedVectorType>(Val: VecTy) && !Subtarget->isNeonAvailable() &&
17010 (!Subtarget->useSVEForFixedLengthVectors() ||
17011 !getSVEPredPatternFromNumElements(MinNumElts: MinElts)))
17012 return false;
17013
17014 if (isa<ScalableVectorType>(Val: VecTy) &&
17015 !Subtarget->isSVEorStreamingSVEAvailable())
17016 return false;
17017
17018 // Ensure the number of vector elements is greater than 1.
17019 if (MinElts < 2)
17020 return false;
17021
17022 // Ensure the element type is legal.
17023 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17024 return false;
17025
17026 if (EC.isScalable()) {
17027 UseScalable = true;
17028 return isPowerOf2_32(Value: MinElts) && (MinElts * ElSize) % 128 == 0;
17029 }
17030
17031 unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy);
17032 if (Subtarget->useSVEForFixedLengthVectors()) {
17033 unsigned MinSVEVectorSize =
17034 std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u);
17035 if (VecSize % MinSVEVectorSize == 0 ||
17036 (VecSize < MinSVEVectorSize && isPowerOf2_32(Value: MinElts) &&
17037 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17038 UseScalable = true;
17039 return true;
17040 }
17041 }
17042
17043 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17044 // 128 will be split into multiple interleaved accesses.
17045 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17046}
17047
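// Map the element type of a fixed-length vector to the 128-bit-granule
// scalable container type used by the SVE ldN/stN intrinsics. Illustrative
// examples (one 128-bit granule each): float -> <vscale x 4 x float>,
// i16 -> <vscale x 8 x i16>, i8 -> <vscale x 16 x i8>.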
17048static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17049 if (VTy->getElementType() == Type::getDoubleTy(C&: VTy->getContext()))
17050 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
17051
17052 if (VTy->getElementType() == Type::getFloatTy(C&: VTy->getContext()))
17053 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
17054
17055 if (VTy->getElementType() == Type::getBFloatTy(C&: VTy->getContext()))
17056 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
17057
17058 if (VTy->getElementType() == Type::getHalfTy(C&: VTy->getContext()))
17059 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
17060
17061 if (VTy->getElementType() == Type::getInt64Ty(C&: VTy->getContext()))
17062 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2);
17063
17064 if (VTy->getElementType() == Type::getInt32Ty(C&: VTy->getContext()))
17065 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4);
17066
17067 if (VTy->getElementType() == Type::getInt16Ty(C&: VTy->getContext()))
17068 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8);
17069
17070 if (VTy->getElementType() == Type::getInt8Ty(C&: VTy->getContext()))
17071 return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 16);
17072
17073 llvm_unreachable("Cannot handle input vector type");
17074}
17075
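// Return the declaration of the ldN intrinsic to use for a structured load.
// As an illustrative sketch (mangled names approximate): Factor = 2 over
// <vscale x 4 x i32> selects @llvm.aarch64.sve.ld2.sret.nxv4i32, while
// Factor = 2 over a fixed <4 x i32> with NEON selects
// @llvm.aarch64.neon.ld2.v4i32.p0.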
17076static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17077 bool Scalable, Type *LDVTy,
17078 Type *PtrTy) {
17079 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17080 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17081 Intrinsic::aarch64_sve_ld3_sret,
17082 Intrinsic::aarch64_sve_ld4_sret};
17083 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17084 Intrinsic::aarch64_neon_ld3,
17085 Intrinsic::aarch64_neon_ld4};
17086 if (Scalable)
17087 return Intrinsic::getOrInsertDeclaration(M, id: SVELoads[Factor - 2], Tys: {LDVTy});
17088
17089 return Intrinsic::getOrInsertDeclaration(M, id: NEONLoads[Factor - 2],
17090 Tys: {LDVTy, PtrTy});
17091}
17092
17093static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17094 bool Scalable, Type *STVTy,
17095 Type *PtrTy) {
17096 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17097 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17098 Intrinsic::aarch64_sve_st3,
17099 Intrinsic::aarch64_sve_st4};
17100 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17101 Intrinsic::aarch64_neon_st3,
17102 Intrinsic::aarch64_neon_st4};
17103 if (Scalable)
17104 return Intrinsic::getOrInsertDeclaration(M, id: SVEStores[Factor - 2], Tys: {STVTy});
17105
17106 return Intrinsic::getOrInsertDeclaration(M, id: NEONStores[Factor - 2],
17107 Tys: {STVTy, PtrTy});
17108}
17109
17110/// Lower an interleaved load into a ldN intrinsic.
17111///
17112/// E.g. Lower an interleaved load (Factor = 2):
17113/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17114/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17115/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17116///
17117/// Into:
17118/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17119/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17120/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
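///
/// Illegally wide types are split first; e.g. (an illustrative sketch, not
/// verbatim output) extracting the even/odd <8 x i32> halves of a <16 x i32>
/// load becomes two @llvm.aarch64.neon.ld2.v4i32.p0 calls, the second offset
/// by 8 i32 elements, whose results are concatenated back into <8 x i32>
/// values that replace the shufflevectors.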
17121bool AArch64TargetLowering::lowerInterleavedLoad(
17122 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
17123 ArrayRef<unsigned> Indices, unsigned Factor) const {
17124 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17125 "Invalid interleave factor");
17126 assert(!Shuffles.empty() && "Empty shufflevector input");
17127 assert(Shuffles.size() == Indices.size() &&
17128 "Unmatched number of shufflevectors and indices");
17129
17130 const DataLayout &DL = LI->getDataLayout();
17131
17132 VectorType *VTy = Shuffles[0]->getType();
17133
17134 // Skip if we do not have NEON and skip illegal vector types. We can
17135 // "legalize" wide vector types into multiple interleaved accesses as long as
17136 // the vector types are divisible by 128.
17137 bool UseScalable;
17138 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17139 return false;
17140
17141 // Check if the interleave is a zext(shuffle), that can be better optimized
17142 // into shift / and masks. For the moment we do this just for uitofp (not
17143 // zext) to avoid issues with widening instructions.
17144 if (Shuffles.size() == 4 && all_of(Range&: Shuffles, P: [](ShuffleVectorInst *SI) {
17145 return SI->hasOneUse() && match(V: SI->user_back(), P: m_UIToFP(Op: m_Value())) &&
17146 SI->getType()->getScalarSizeInBits() * 4 ==
17147 SI->user_back()->getType()->getScalarSizeInBits();
17148 }))
17149 return false;
17150
17151 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17152
17153 auto *FVTy = cast<FixedVectorType>(Val: VTy);
17154
  // A pointer vector cannot be the return type of the ldN intrinsics, so load
  // integer vectors first and then convert them to pointer vectors.
17157 Type *EltTy = FVTy->getElementType();
17158 if (EltTy->isPointerTy())
17159 FVTy =
17160 FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), NumElts: FVTy->getNumElements());
17161
17162 // If we're going to generate more than one load, reset the sub-vector type
17163 // to something legal.
17164 FVTy = FixedVectorType::get(ElementType: FVTy->getElementType(),
17165 NumElts: FVTy->getNumElements() / NumLoads);
17166
17167 auto *LDVTy =
17168 UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: FVTy)) : FVTy;
17169
17170 IRBuilder<> Builder(LI);
17171
17172 // The base address of the load.
17173 Value *BaseAddr = LI->getPointerOperand();
17174
17175 Type *PtrTy = LI->getPointerOperandType();
17176 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: LDVTy->getContext()),
17177 EC: LDVTy->getElementCount());
17178
17179 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
17180 Scalable: UseScalable, LDVTy, PtrTy);
17181
17182 // Holds sub-vectors extracted from the load intrinsic return values. The
17183 // sub-vectors are associated with the shufflevector instructions they will
17184 // replace.
17185 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17186
17187 Value *PTrue = nullptr;
17188 if (UseScalable) {
17189 std::optional<unsigned> PgPattern =
17190 getSVEPredPatternFromNumElements(MinNumElts: FVTy->getNumElements());
17191 if (Subtarget->getMinSVEVectorSizeInBits() ==
17192 Subtarget->getMaxSVEVectorSizeInBits() &&
17193 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(Ty: FVTy))
17194 PgPattern = AArch64SVEPredPattern::all;
17195
17196 auto *PTruePat =
17197 ConstantInt::get(Ty: Type::getInt32Ty(C&: LDVTy->getContext()), V: *PgPattern);
17198 PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy},
17199 Args: {PTruePat});
17200 }
17201
17202 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17203
17204 // If we're generating more than one load, compute the base address of
17205 // subsequent loads as an offset from the previous.
17206 if (LoadCount > 0)
17207 BaseAddr = Builder.CreateConstGEP1_32(Ty: LDVTy->getElementType(), Ptr: BaseAddr,
17208 Idx0: FVTy->getNumElements() * Factor);
17209
17210 CallInst *LdN;
17211 if (UseScalable)
17212 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {PTrue, BaseAddr}, Name: "ldN");
17213 else
17214 LdN = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
17215
17216 // Extract and store the sub-vectors returned by the load intrinsic.
17217 for (unsigned i = 0; i < Shuffles.size(); i++) {
17218 ShuffleVectorInst *SVI = Shuffles[i];
17219 unsigned Index = Indices[i];
17220
17221 Value *SubVec = Builder.CreateExtractValue(Agg: LdN, Idxs: Index);
17222
17223 if (UseScalable)
17224 SubVec = Builder.CreateExtractVector(DstType: FVTy, SrcVec: SubVec, Idx: uint64_t(0));
17225
17226 // Convert the integer vector to pointer vector if the element is pointer.
17227 if (EltTy->isPointerTy())
17228 SubVec = Builder.CreateIntToPtr(
17229 V: SubVec, DestTy: FixedVectorType::get(ElementType: SVI->getType()->getElementType(),
17230 NumElts: FVTy->getNumElements()));
17231
17232 SubVecs[SVI].push_back(Elt: SubVec);
17233 }
17234 }
17235
17236 // Replace uses of the shufflevector instructions with the sub-vectors
17237 // returned by the load intrinsic. If a shufflevector instruction is
17238 // associated with more than one sub-vector, those sub-vectors will be
17239 // concatenated into a single wide vector.
17240 for (ShuffleVectorInst *SVI : Shuffles) {
17241 auto &SubVec = SubVecs[SVI];
17242 auto *WideVec =
17243 SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0];
17244 SVI->replaceAllUsesWith(V: WideVec);
17245 }
17246
17247 return true;
17248}
17249
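// Return true if, within a small window of instructions around \p It, there is
// a store whose pointer resolves to the same base as \p Ptr at a constant
// offset of exactly +/-16 bytes. A minimal IR sketch (hypothetical, not taken
// from a test):
//
//   store <2 x i32> %a, ptr %p
//   %q = getelementptr inbounds i8, ptr %p, i64 16
//   store <2 x i32> %b, ptr %q   ; paired store 16 bytes away from %p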
17250template <typename Iter>
17251bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17252 int MaxLookupDist = 20;
17253 unsigned IdxWidth = DL.getIndexSizeInBits(AS: 0);
17254 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17255 const Value *PtrA1 =
17256 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
17257
17258 while (++It != End) {
17259 if (It->isDebugOrPseudoInst())
17260 continue;
17261 if (MaxLookupDist-- == 0)
17262 break;
17263 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
17264 const Value *PtrB1 =
17265 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
17266 DL, OffsetB);
17267 if (PtrA1 == PtrB1 &&
17268 (OffsetA.sextOrTrunc(width: IdxWidth) - OffsetB.sextOrTrunc(width: IdxWidth))
17269 .abs() == 16)
17270 return true;
17271 }
17272 }
17273
17274 return false;
17275}
17276
17277/// Lower an interleaved store into a stN intrinsic.
17278///
17279/// E.g. Lower an interleaved store (Factor = 3):
17280/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
17281/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
17282/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17283///
17284/// Into:
17285/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
17286/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
17287/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
17288/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17289///
17290/// Note that the new shufflevectors will be removed and we'll only generate one
17291/// st3 instruction in CodeGen.
17292///
17293/// Example for a more general valid mask (Factor 3). Lower:
17294/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
17295/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
17296/// store <12 x i32> %i.vec, <12 x i32>* %ptr
17297///
17298/// Into:
17299/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
17300/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
17301/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
17302/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
17303bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
17304 ShuffleVectorInst *SVI,
17305 unsigned Factor) const {
17306
17307 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17308 "Invalid interleave factor");
17309
17310 auto *VecTy = cast<FixedVectorType>(Val: SVI->getType());
17311 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
17312
17313 unsigned LaneLen = VecTy->getNumElements() / Factor;
17314 Type *EltTy = VecTy->getElementType();
17315 auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen);
17316
17317 const DataLayout &DL = SI->getDataLayout();
17318 bool UseScalable;
17319
17320 // Skip if we do not have NEON and skip illegal vector types. We can
17321 // "legalize" wide vector types into multiple interleaved accesses as long as
17322 // the vector types are divisible by 128.
17323 if (!isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable))
17324 return false;
17325
17326 unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable);
17327
17328 Value *Op0 = SVI->getOperand(i_nocapture: 0);
17329 Value *Op1 = SVI->getOperand(i_nocapture: 1);
17330 IRBuilder<> Builder(SI);
17331
17332 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
17333 // vectors to integer vectors.
17334 if (EltTy->isPointerTy()) {
17335 Type *IntTy = DL.getIntPtrType(EltTy);
17336 unsigned NumOpElts =
17337 cast<FixedVectorType>(Val: Op0->getType())->getNumElements();
17338
17339 // Convert to the corresponding integer vector.
17340 auto *IntVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: NumOpElts);
17341 Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy);
17342 Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy);
17343
17344 SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen);
17345 }
17346
17347 // If we're going to generate more than one store, reset the lane length
17348 // and sub-vector type to something legal.
17349 LaneLen /= NumStores;
17350 SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen);
17351
17352 auto *STVTy = UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: SubVecTy))
17353 : SubVecTy;
17354
17355 // The base address of the store.
17356 Value *BaseAddr = SI->getPointerOperand();
17357
17358 auto Mask = SVI->getShuffleMask();
17359
  // Bail out if none of the mask indices are in range: if the whole mask is
  // poison, `Mask` is a vector of -1s, and continuing would lead to an
  // out-of-bounds read later.
  if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; }))
    return false;
  // A 64-bit st2 that does not start at element 0 will involve adding extra
  // ext instructions, making the st2 unprofitable. If there is a nearby store
  // that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
  // zip;ldp pair, which has higher throughput.
17370 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
17371 (Mask[0] != 0 ||
17372 hasNearbyPairedStore(It: SI->getIterator(), End: SI->getParent()->end(), Ptr: BaseAddr,
17373 DL) ||
17374 hasNearbyPairedStore(It: SI->getReverseIterator(), End: SI->getParent()->rend(),
17375 Ptr: BaseAddr, DL)))
17376 return false;
17377
17378 Type *PtrTy = SI->getPointerOperandType();
17379 Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: STVTy->getContext()),
17380 EC: STVTy->getElementCount());
17381
17382 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
17383 Scalable: UseScalable, STVTy, PtrTy);
17384
17385 Value *PTrue = nullptr;
17386 if (UseScalable) {
17387 std::optional<unsigned> PgPattern =
17388 getSVEPredPatternFromNumElements(MinNumElts: SubVecTy->getNumElements());
17389 if (Subtarget->getMinSVEVectorSizeInBits() ==
17390 Subtarget->getMaxSVEVectorSizeInBits() &&
17391 Subtarget->getMinSVEVectorSizeInBits() ==
17392 DL.getTypeSizeInBits(Ty: SubVecTy))
17393 PgPattern = AArch64SVEPredPattern::all;
17394
17395 auto *PTruePat =
17396 ConstantInt::get(Ty: Type::getInt32Ty(C&: STVTy->getContext()), V: *PgPattern);
17397 PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy},
17398 Args: {PTruePat});
17399 }
17400
17401 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17402
17403 SmallVector<Value *, 5> Ops;
17404
17405 // Split the shufflevector operands into sub vectors for the new stN call.
17406 for (unsigned i = 0; i < Factor; i++) {
17407 Value *Shuffle;
17408 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17409 if (Mask[IdxI] >= 0) {
17410 Shuffle = Builder.CreateShuffleVector(
17411 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0));
17412 } else {
17413 unsigned StartMask = 0;
17414 for (unsigned j = 1; j < LaneLen; j++) {
17415 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17416 if (Mask[IdxJ] >= 0) {
17417 StartMask = Mask[IdxJ] - j;
17418 break;
17419 }
17420 }
        // Note: Filling undef gaps with random elements is ok, since those
        // elements were being written anyway (with undefs). In the case of all
        // undefs we default to using elements starting at 0.
        // Note: StartMask cannot be negative; it is checked in
        // isReInterleaveMask.
17426 Shuffle = Builder.CreateShuffleVector(
17427 V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0));
17428 }
17429
17430 if (UseScalable)
17431 Shuffle = Builder.CreateInsertVector(DstType: STVTy, SrcVec: PoisonValue::get(T: STVTy),
17432 SubVec: Shuffle, Idx: uint64_t(0));
17433
17434 Ops.push_back(Elt: Shuffle);
17435 }
17436
17437 if (UseScalable)
17438 Ops.push_back(Elt: PTrue);
17439
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
17442 if (StoreCount > 0)
17443 BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(),
17444 Ptr: BaseAddr, Idx0: LaneLen * Factor);
17445
17446 Ops.push_back(Elt: BaseAddr);
17447 Builder.CreateCall(Callee: StNFunc, Args: Ops);
17448 }
17449 return true;
17450}
17451
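// Lower the results of a vector deinterleave intrinsic, fed by a single wide
// load, into an ldN structured load. Illustrative sketch (Factor = 2,
// scalable types, names hypothetical):
//
//   %wide = load <vscale x 8 x i32>, ptr %ptr
//   ; %even, %odd : <vscale x 4 x i32> deinterleaved from %wide
// becomes
//   %ldN = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//              @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %ptrue,
//                                                 ptr %ptr)
// and %even/%odd are replaced by the two extracted results.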
17452bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17453 LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const {
17454 unsigned Factor = DeinterleavedValues.size();
17455 if (Factor != 2 && Factor != 4) {
17456 LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
17457 return false;
17458 }
17459
17460 VectorType *VTy = cast<VectorType>(Val: DeinterleavedValues[0]->getType());
17461
17462 const DataLayout &DL = LI->getModule()->getDataLayout();
17463 bool UseScalable;
17464 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17465 return false;
17466
17467 // TODO: Add support for using SVE instructions with fixed types later, using
17468 // the code from lowerInterleavedLoad to obtain the correct container type.
17469 if (UseScalable && !VTy->isScalableTy())
17470 return false;
17471
17472 unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17473 VectorType *LdTy =
17474 VectorType::get(ElementType: VTy->getElementType(),
17475 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumLoads));
17476
17477 Type *PtrTy = LI->getPointerOperandType();
17478 Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor,
17479 Scalable: UseScalable, LDVTy: LdTy, PtrTy);
17480
17481 IRBuilder<> Builder(LI);
17482 Value *Pred = nullptr;
17483 if (UseScalable)
17484 Pred =
17485 Builder.CreateVectorSplat(EC: LdTy->getElementCount(), V: Builder.getTrue());
17486
17487 Value *BaseAddr = LI->getPointerOperand();
17488 if (NumLoads > 1) {
17489 // Create multiple legal small ldN.
17490 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(T: VTy));
17491 for (unsigned I = 0; I < NumLoads; ++I) {
17492 Value *Offset = Builder.getInt64(C: I * Factor);
17493
17494 Value *Address = Builder.CreateGEP(Ty: LdTy, Ptr: BaseAddr, IdxList: {Offset});
17495 Value *LdN = nullptr;
17496 if (UseScalable)
17497 LdN = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, Address}, Name: "ldN");
17498 else
17499 LdN = Builder.CreateCall(Callee: LdNFunc, Args: Address, Name: "ldN");
17500 Value *Idx =
17501 Builder.getInt64(C: I * LdTy->getElementCount().getKnownMinValue());
17502 for (unsigned J = 0; J < Factor; ++J) {
17503 ExtractedLdValues[J] = Builder.CreateInsertVector(
17504 DstType: VTy, SrcVec: ExtractedLdValues[J], SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: J), Idx);
17505 }
17506 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
17507 }
    // Replace the outputs of the deinterleave2 intrinsic with the outputs of
    // ldN2/ldN4.
17509 for (unsigned J = 0; J < Factor; ++J)
17510 DeinterleavedValues[J]->replaceAllUsesWith(V: ExtractedLdValues[J]);
17511 } else {
17512 Value *Result;
17513 if (UseScalable)
17514 Result = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, BaseAddr}, Name: "ldN");
17515 else
17516 Result = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN");
    // Replace the outputs of the deinterleave2 intrinsic with the outputs of
    // ldN2/ldN4.
17518 for (unsigned I = 0; I < Factor; I++) {
17519 Value *NewExtract = Builder.CreateExtractValue(Agg: Result, Idxs: I);
17520 DeinterleavedValues[I]->replaceAllUsesWith(V: NewExtract);
17521 }
17522 }
17523 return true;
17524}
17525
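// Lower a store of interleaved values into an stN structured store.
// Illustrative sketch (Factor = 2, scalable types, names hypothetical):
//
//   ; %a, %b : <vscale x 4 x i32>, interleaved and stored as <vscale x 8 x i32>
// becomes
//   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %a,
//                                           <vscale x 4 x i32> %b,
//                                           <vscale x 4 x i1> %ptrue, ptr %ptr)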
17526bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17527 StoreInst *SI, ArrayRef<Value *> InterleavedValues) const {
17528 unsigned Factor = InterleavedValues.size();
17529 if (Factor != 2 && Factor != 4) {
17530 LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
17531 return false;
17532 }
17533
17534 VectorType *VTy = cast<VectorType>(Val: InterleavedValues[0]->getType());
17535 const DataLayout &DL = SI->getModule()->getDataLayout();
17536
17537 bool UseScalable;
17538 if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable))
17539 return false;
17540
17541 // TODO: Add support for using SVE instructions with fixed types later, using
17542 // the code from lowerInterleavedStore to obtain the correct container type.
17543 if (UseScalable && !VTy->isScalableTy())
17544 return false;
17545
17546 unsigned NumStores = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable);
17547
17548 VectorType *StTy =
17549 VectorType::get(ElementType: VTy->getElementType(),
17550 EC: VTy->getElementCount().divideCoefficientBy(RHS: NumStores));
17551
17552 Type *PtrTy = SI->getPointerOperandType();
17553 Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor,
17554 Scalable: UseScalable, STVTy: StTy, PtrTy);
17555
17556 IRBuilder<> Builder(SI);
17557
17558 Value *BaseAddr = SI->getPointerOperand();
17559 Value *Pred = nullptr;
17560
17561 if (UseScalable)
17562 Pred =
17563 Builder.CreateVectorSplat(EC: StTy->getElementCount(), V: Builder.getTrue());
17564
17565 auto ExtractedValues = InterleavedValues;
17566 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
17567 if (UseScalable)
17568 StoreOperands.push_back(Elt: Pred);
17569 StoreOperands.push_back(Elt: BaseAddr);
17570 for (unsigned I = 0; I < NumStores; ++I) {
17571 Value *Address = BaseAddr;
17572 if (NumStores > 1) {
17573 Value *Offset = Builder.getInt64(C: I * Factor);
17574 Address = Builder.CreateGEP(Ty: StTy, Ptr: BaseAddr, IdxList: {Offset});
17575 Value *Idx =
17576 Builder.getInt64(C: I * StTy->getElementCount().getKnownMinValue());
17577 for (unsigned J = 0; J < Factor; J++) {
17578 StoreOperands[J] =
17579 Builder.CreateExtractVector(DstType: StTy, SrcVec: ExtractedValues[J], Idx);
17580 }
      // Update the address operand for this store.
17582 StoreOperands[StoreOperands.size() - 1] = Address;
17583 }
17584 Builder.CreateCall(Callee: StNFunc, Args: StoreOperands);
17585 }
17586 return true;
17587}
17588
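// For example (illustrative): a 64-byte memset with NEON available and 16-byte
// alignment is lowered with MVT::v16i8 stores, while a 16-byte memset falls
// under the 32-byte AdvSIMD threshold below and is done with MVT::i64.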
17589EVT AArch64TargetLowering::getOptimalMemOpType(
17590 const MemOp &Op, const AttributeList &FuncAttributes) const {
17591 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat);
17592 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17593 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above: below that it
  // would take one instruction to materialize the v2i64 zero and one store
  // (with a restrictive addressing mode), so plain i64 stores are used instead.
17597 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17598 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17599 if (Op.isAligned(AlignCheck))
17600 return true;
17601 unsigned Fast;
17602 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
17603 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
17604 Fast;
17605 };
17606
17607 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17608 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17609 return MVT::v16i8;
17610 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17611 return MVT::f128;
17612 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17613 return MVT::i64;
17614 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17615 return MVT::i32;
17616 return MVT::Other;
17617}
17618
17619LLT AArch64TargetLowering::getOptimalMemOpLLT(
17620 const MemOp &Op, const AttributeList &FuncAttributes) const {
17621 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat);
17622 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17623 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
  // Only use AdvSIMD to implement memsets of 32 bytes and above: below that it
  // would take one instruction to materialize the v2i64 zero and one store
  // (with a restrictive addressing mode), so plain i64 stores are used instead.
17627 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17628 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17629 if (Op.isAligned(AlignCheck))
17630 return true;
17631 unsigned Fast;
17632 return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1),
17633 Flags: MachineMemOperand::MONone, Fast: &Fast) &&
17634 Fast;
17635 };
17636
17637 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17638 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17639 return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64);
17640 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17641 return LLT::scalar(SizeInBits: 128);
17642 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17643 return LLT::scalar(SizeInBits: 64);
17644 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17645 return LLT::scalar(SizeInBits: 32);
17646 return LLT();
17647}
17648
17649// 12-bit optionally shifted immediates are legal for adds.
17650bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17651 if (Immed == std::numeric_limits<int64_t>::min()) {
17652 return false;
17653 }
17654 // Same encoding for add/sub, just flip the sign.
17655 return isLegalArithImmed(C: (uint64_t)std::abs(i: Immed));
17656}
17657
17658bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17659 // We will only emit addvl/inc* instructions for SVE2
17660 if (!Subtarget->hasSVE2())
17661 return false;
17662
17663 // addvl's immediates are in terms of the number of bytes in a register.
17664 // Since there are 16 in the base supported size (128bits), we need to
17665 // divide the immediate by that much to give us a useful immediate to
17666 // multiply by vscale. We can't have a remainder as a result of this.
17667 if (Imm % 16 == 0)
17668 return isInt<6>(x: Imm / 16);
17669
17670 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17671 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17672 // of addvl as a result, so only take h|w|d into account.
17673 // Dec[h|w|d] will cover subtractions.
17674 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17675 // FIXME: Can we make use of other patterns to cover other immediates?
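  // As a sketch of the encoding checks below (illustrative values): Imm == 64
  // is 64 / 16 == 4 and is encodable as addvl #4; Imm == 40 is not a multiple
  // of 16 but is 8 * 5, so it can be materialized as inch with the 'all'
  // pattern and multiplier 5.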
17676
17677 // inch|dech
17678 if (Imm % 8 == 0)
17679 return std::abs(i: Imm / 8) <= 16;
17680 // incw|decw
17681 if (Imm % 4 == 0)
17682 return std::abs(i: Imm / 4) <= 16;
17683 // incd|decd
17684 if (Imm % 2 == 0)
17685 return std::abs(i: Imm / 2) <= 16;
17686
17687 return false;
17688}
17689
17690// Return false to prevent folding
17691// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17692// if the folding leads to worse code.
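// For example (illustrative): with c1 == 1 (a legal add immediate) and
// c2 == 0x12345678, the product c1*c2 needs a MOVZ plus a MOVK to materialize,
// so the fold is rejected and the add is kept outside the multiply.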
17693bool AArch64TargetLowering::isMulAddWithConstProfitable(
17694 SDValue AddNode, SDValue ConstNode) const {
17695 // Let the DAGCombiner decide for vector types and large types.
17696 const EVT VT = AddNode.getValueType();
17697 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17698 return true;
17699
  // It is worse if c1 is a legal add immediate while c1*c2 is not, and c1*c2
  // has to be composed of at least two instructions.
17702 const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1));
17703 const ConstantSDNode *C2Node = cast<ConstantSDNode>(Val&: ConstNode);
17704 const int64_t C1 = C1Node->getSExtValue();
17705 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17706 if (!isLegalAddImmediate(Immed: C1) || isLegalAddImmediate(Immed: C1C2.getSExtValue()))
17707 return true;
17708 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17709 // Adapt to the width of a register.
17710 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17711 AArch64_IMM::expandMOVImm(Imm: C1C2.getZExtValue(), BitSize, Insn);
17712 if (Insn.size() > 1)
17713 return false;
17714
17715 // Default to true and let the DAGCombiner decide.
17716 return true;
17717}
17718
17719// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17720// immediates is the same as for an add or a sub.
17721bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17722 return isLegalAddImmediate(Immed);
17723}
17724
17725/// isLegalAddressingMode - Return true if the addressing mode represented
17726/// by AM is legal for this target, for a load/store of the specified type.
17727bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17728 const AddrMode &AMode, Type *Ty,
17729 unsigned AS, Instruction *I) const {
17730 // AArch64 has five basic addressing modes:
17731 // reg
17732 // reg + 9-bit signed offset
17733 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17734 // reg1 + reg2
17735 // reg + SIZE_IN_BYTES * reg
17736
17737 // No global is ever allowed as a base.
17738 if (AMode.BaseGV)
17739 return false;
17740
17741 // No reg+reg+imm addressing.
17742 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17743 return false;
17744
17745 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17746 // `2*ScaledReg` into `BaseReg + ScaledReg`
17747 AddrMode AM = AMode;
17748 if (AM.Scale && !AM.HasBaseReg) {
17749 if (AM.Scale == 1) {
17750 AM.HasBaseReg = true;
17751 AM.Scale = 0;
17752 } else if (AM.Scale == 2) {
17753 AM.HasBaseReg = true;
17754 AM.Scale = 1;
17755 } else {
17756 return false;
17757 }
17758 }
17759
17760 // A base register is required in all addressing modes.
17761 if (!AM.HasBaseReg)
17762 return false;
17763
17764 if (Ty->isScalableTy()) {
17765 if (isa<ScalableVectorType>(Val: Ty)) {
17766 // See if we have a foldable vscale-based offset, for vector types which
17767 // are either legal or smaller than the minimum; more work will be
17768 // required if we need to consider addressing for types which need
17769 // legalization by splitting.
17770 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17771 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17772 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17773 isPowerOf2_64(Value: VecNumBytes))
17774 return isInt<4>(x: AM.ScalableOffset / (int64_t)VecNumBytes);
17775
17776 uint64_t VecElemNumBytes =
17777 DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: Ty)->getElementType()) / 8;
17778 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17779 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17780 }
17781
17782 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17783 }
17784
17785 // No scalable offsets allowed for non-scalable types.
17786 if (AM.ScalableOffset)
17787 return false;
17788
17789 // check reg + imm case:
17790 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17791 uint64_t NumBytes = 0;
17792 if (Ty->isSized()) {
17793 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17794 NumBytes = NumBits / 8;
17795 if (!isPowerOf2_64(Value: NumBits))
17796 NumBytes = 0;
17797 }
17798
17799 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, Offset: AM.BaseOffs,
17800 Scale: AM.Scale);
17801}
17802
// Check whether the two offsets belong to the same imm24 range and share the
// same high 12 bits; if so, their common high part can be materialized with
// the immediate of an add.
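// For example (illustrative): MinOffset == 0x1234 and MaxOffset == 0x1470 have
// the same high 12 bits, so 0x1000 is returned; the accesses are then rebased
// to [base + 0x1000] with 12-bit immediates 0x234 and 0x470.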
17805int64_t
17806AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17807 int64_t MaxOffset) const {
17808 int64_t HighPart = MinOffset & ~0xfffULL;
17809 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(Immed: HighPart)) {
17810 // Rebase the value to an integer multiple of imm12.
17811 return HighPart;
17812 }
17813
17814 return 0;
17815}
17816
17817bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17818 // Consider splitting large offset of struct or array.
17819 return true;
17820}
17821
17822bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17823 const MachineFunction &MF, EVT VT) const {
17824 VT = VT.getScalarType();
17825
17826 if (!VT.isSimple())
17827 return false;
17828
17829 switch (VT.getSimpleVT().SimpleTy) {
17830 case MVT::f16:
17831 return Subtarget->hasFullFP16();
17832 case MVT::f32:
17833 case MVT::f64:
17834 return true;
17835 default:
17836 break;
17837 }
17838
17839 return false;
17840}
17841
17842bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17843 Type *Ty) const {
17844 switch (Ty->getScalarType()->getTypeID()) {
17845 case Type::FloatTyID:
17846 case Type::DoubleTyID:
17847 return true;
17848 default:
17849 return false;
17850 }
17851}
17852
17853bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17854 EVT VT, CodeGenOptLevel OptLevel) const {
17855 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17856 !useSVEForFixedLengthVectorVT(VT);
17857}
17858
17859const MCPhysReg *
17860AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17861 // LR is a callee-save register, but we must treat it as clobbered by any call
17862 // site. Hence we include LR in the scratch registers, which are in turn added
17863 // as implicit-defs for stackmaps and patchpoints.
17864 static const MCPhysReg ScratchRegs[] = {
17865 AArch64::X16, AArch64::X17, AArch64::LR, 0
17866 };
17867 return ScratchRegs;
17868}
17869
17870ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17871 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17872 return RCRegs;
17873}
17874
17875bool
17876AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17877 CombineLevel Level) const {
17878 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17879 N->getOpcode() == ISD::SRL) &&
17880 "Expected shift op");
17881
17882 SDValue ShiftLHS = N->getOperand(Num: 0);
17883 EVT VT = N->getValueType(ResNo: 0);
17884
17885 if (!ShiftLHS->hasOneUse())
17886 return false;
17887
17888 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
17889 !ShiftLHS.getOperand(i: 0)->hasOneUse())
17890 return false;
17891
  // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not
  // combine it with shift 'N', so that it can be lowered to UBFX, except for
  // ((x >> C) & mask) << C.
17895 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17896 isa<ConstantSDNode>(Val: ShiftLHS.getOperand(i: 1))) {
17897 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(i: 1);
17898 if (isMask_64(Value: TruncMask)) {
17899 SDValue AndLHS = ShiftLHS.getOperand(i: 0);
17900 if (AndLHS.getOpcode() == ISD::SRL) {
17901 if (auto *SRLC = dyn_cast<ConstantSDNode>(Val: AndLHS.getOperand(i: 1))) {
17902 if (N->getOpcode() == ISD::SHL)
17903 if (auto *SHLC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)))
17904 return SRLC->getZExtValue() == SHLC->getZExtValue();
17905 return false;
17906 }
17907 }
17908 }
17909 }
17910 return true;
17911}
17912
17913bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17914 const SDNode *N) const {
17915 assert(N->getOpcode() == ISD::XOR &&
17916 (N->getOperand(0).getOpcode() == ISD::SHL ||
17917 N->getOperand(0).getOpcode() == ISD::SRL) &&
17918 "Expected XOR(SHIFT) pattern");
17919
17920 // Only commute if the entire NOT mask is a hidden shifted mask.
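  // For example (illustrative, i32): xor (shl X, 8), 0xffffff00 has a shifted
  // mask with MaskIdx == 8 and MaskLen == 24 == BitWidth - ShiftAmt, so the
  // xor is commuted with the shift; xor (shl X, 8), 0x00ffff00 (MaskLen == 16)
  // is not.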
17921 auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17922 auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
17923 if (XorC && ShiftC) {
17924 unsigned MaskIdx, MaskLen;
17925 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17926 unsigned ShiftAmt = ShiftC->getZExtValue();
17927 unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits();
17928 if (N->getOperand(Num: 0).getOpcode() == ISD::SHL)
17929 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17930 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17931 }
17932 }
17933
17934 return false;
17935}
17936
17937bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17938 const SDNode *N, CombineLevel Level) const {
17939 assert(((N->getOpcode() == ISD::SHL &&
17940 N->getOperand(0).getOpcode() == ISD::SRL) ||
17941 (N->getOpcode() == ISD::SRL &&
17942 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17943 "Expected shift-shift mask");
17944 // Don't allow multiuse shift folding with the same shift amount.
17945 if (!N->getOperand(Num: 0)->hasOneUse())
17946 return false;
17947
17948 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17949 EVT VT = N->getValueType(ResNo: 0);
17950 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17951 auto *C1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1));
17952 auto *C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
17953 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17954 }
17955
  // We do not need to fold when this shift is used in the specific load case:
  // (ldr x, (add x, (shl (srl x, c1) 2)))
17958 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
17959 if (auto C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1))) {
17960 unsigned ShlAmt = C2->getZExtValue();
17961 if (auto ShouldADD = *N->user_begin();
17962 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
17963 if (auto ShouldLOAD = dyn_cast<LoadSDNode>(Val: *ShouldADD->user_begin())) {
17964 unsigned ByteVT = ShouldLOAD->getMemoryVT().getSizeInBits() / 8;
17965 if ((1ULL << ShlAmt) == ByteVT &&
17966 isIndexedLoadLegal(IdxMode: ISD::PRE_INC, VT: ShouldLOAD->getMemoryVT()))
17967 return false;
17968 }
17969 }
17970 }
17971 }
17972
17973 return true;
17974}
17975
17976bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
17977 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
17978 SDValue Y) const {
17979 return VT.isScalableVector() && isTypeLegal(VT) &&
17980 SelectOpcode == ISD::VSELECT;
17981}
17982
17983bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17984 Type *Ty) const {
17985 assert(Ty->isIntegerTy());
17986
17987 unsigned BitSize = Ty->getPrimitiveSizeInBits();
17988 if (BitSize == 0)
17989 return false;
17990
17991 int64_t Val = Imm.getSExtValue();
17992 if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: BitSize))
17993 return true;
17994
17995 if ((int64_t)Val < 0)
17996 Val = ~Val;
17997 if (BitSize == 32)
17998 Val &= (1LL << 32) - 1;
17999
18000 unsigned Shift = llvm::Log2_64(Value: (uint64_t)Val) / 16;
18001 // MOVZ is free so return true for one or fewer MOVK.
18002 return Shift < 3;
18003}
18004
18005bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18006 unsigned Index) const {
18007 if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT))
18008 return false;
18009
18010 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18011}
18012
18013/// Turn vector tests of the signbit in the form of:
18014/// xor (sra X, elt_size(X)-1), -1
18015/// into:
18016/// cmge X, X, #0
18017static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18018 const AArch64Subtarget *Subtarget) {
18019 EVT VT = N->getValueType(ResNo: 0);
18020 if (!Subtarget->hasNEON() || !VT.isVector())
18021 return SDValue();
18022
18023 // There must be a shift right algebraic before the xor, and the xor must be a
18024 // 'not' operation.
18025 SDValue Shift = N->getOperand(Num: 0);
18026 SDValue Ones = N->getOperand(Num: 1);
18027 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18028 !ISD::isBuildVectorAllOnes(N: Ones.getNode()))
18029 return SDValue();
18030
18031 // The shift should be smearing the sign bit across each vector element.
18032 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1));
18033 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18034 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18035 return SDValue();
18036
18037 SDLoc DL(N);
18038 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: Shift.getValueType());
18039 return DAG.getSetCC(DL, VT, LHS: Shift.getOperand(i: 0), RHS: Zero, Cond: ISD::SETGE);
18040}
18041
// Given a vecreduce_add node, detect the below pattern and convert it to a
// node sequence with UABDL, [S|U]ABD and UADDLP.
18044//
18045// i32 vecreduce_add(
18046// v16i32 abs(
18047// v16i32 sub(
18048// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18049// =================>
18050// i32 vecreduce_add(
18051// v4i32 UADDLP(
18052// v8i16 add(
18053// v8i16 zext(
18054// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18055// v8i16 zext(
18056// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
18057static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18058 SelectionDAG &DAG) {
18059 // Assumed i32 vecreduce_add
18060 if (N->getValueType(ResNo: 0) != MVT::i32)
18061 return SDValue();
18062
18063 SDValue VecReduceOp0 = N->getOperand(Num: 0);
18064 unsigned Opcode = VecReduceOp0.getOpcode();
18065 // Assumed v16i32 abs
18066 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(ResNo: 0) != MVT::v16i32)
18067 return SDValue();
18068
18069 SDValue ABS = VecReduceOp0;
18070 // Assumed v16i32 sub
18071 if (ABS->getOperand(Num: 0)->getOpcode() != ISD::SUB ||
18072 ABS->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32)
18073 return SDValue();
18074
18075 SDValue SUB = ABS->getOperand(Num: 0);
18076 unsigned Opcode0 = SUB->getOperand(Num: 0).getOpcode();
18077 unsigned Opcode1 = SUB->getOperand(Num: 1).getOpcode();
18078 // Assumed v16i32 type
18079 if (SUB->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32 ||
18080 SUB->getOperand(Num: 1)->getValueType(ResNo: 0) != MVT::v16i32)
18081 return SDValue();
18082
18083 // Assumed zext or sext
18084 bool IsZExt = false;
18085 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18086 IsZExt = true;
18087 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18088 IsZExt = false;
18089 } else
18090 return SDValue();
18091
18092 SDValue EXT0 = SUB->getOperand(Num: 0);
18093 SDValue EXT1 = SUB->getOperand(Num: 1);
18094 // Assumed zext's operand has v16i8 type
18095 if (EXT0->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8 ||
18096 EXT1->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8)
18097 return SDValue();
18098
18099 // Pattern is detected. Let's convert it to sequence of nodes.
18100 SDLoc DL(N);
18101
18102 // First, create the node pattern of UABD/SABD.
18103 SDValue UABDHigh8Op0 =
18104 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0),
18105 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
18106 SDValue UABDHigh8Op1 =
18107 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0),
18108 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
18109 SDValue UABDHigh8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8,
18110 N1: UABDHigh8Op0, N2: UABDHigh8Op1);
18111 SDValue UABDL = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDHigh8);
18112
18113 // Second, create the node pattern of UABAL.
18114 SDValue UABDLo8Op0 =
18115 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0),
18116 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
18117 SDValue UABDLo8Op1 =
18118 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0),
18119 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
18120 SDValue UABDLo8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8,
18121 N1: UABDLo8Op0, N2: UABDLo8Op1);
18122 SDValue ZExtUABD = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDLo8);
18123 SDValue UABAL = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::v8i16, N1: UABDL, N2: ZExtUABD);
18124
18125 // Third, create the node of UADDLP.
18126 SDValue UADDLP = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: MVT::v4i32, Operand: UABAL);
18127
18128 // Fourth, create the node of VECREDUCE_ADD.
18129 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i32, Operand: UADDLP);
18130}
18131
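// Combine an active-lane-mask node whose only two uses extract its low and
// high halves into a single two-result whilelo intrinsic (SVE2p1, or SME2 in
// streaming mode). A sketch of the shape (illustrative): a <vscale x 16 x i1>
// mask extracted at indices 0 and 8 becomes the two <vscale x 8 x i1> results
// of llvm.aarch64.sve.whilelo.x2.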
18132static SDValue
18133performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18134 const AArch64Subtarget *ST) {
18135 if (DCI.isBeforeLegalize())
18136 return SDValue();
18137
18138 if (SDValue While = optimizeIncrementingWhile(N, DAG&: DCI.DAG, /*IsSigned=*/false,
18139 /*IsEqual=*/false))
18140 return While;
18141
18142 if (!N->getValueType(ResNo: 0).isScalableVector() ||
18143 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18144 return SDValue();
18145
18146 if (!N->hasNUsesOfValue(NUses: 2, Value: 0))
18147 return SDValue();
18148
18149 const uint64_t HalfSize = N->getValueType(ResNo: 0).getVectorMinNumElements() / 2;
18150 if (HalfSize < 2)
18151 return SDValue();
18152
18153 auto It = N->user_begin();
18154 SDNode *Lo = *It++;
18155 SDNode *Hi = *It;
18156
18157 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18158 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18159 return SDValue();
18160
18161 uint64_t OffLo = Lo->getConstantOperandVal(Num: 1);
18162 uint64_t OffHi = Hi->getConstantOperandVal(Num: 1);
18163
18164 if (OffLo > OffHi) {
18165 std::swap(a&: Lo, b&: Hi);
18166 std::swap(a&: OffLo, b&: OffHi);
18167 }
18168
18169 if (OffLo != 0 || OffHi != HalfSize)
18170 return SDValue();
18171
18172 EVT HalfVec = Lo->getValueType(ResNo: 0);
18173 if (HalfVec != Hi->getValueType(ResNo: 0) ||
18174 HalfVec.getVectorElementCount() != ElementCount::getScalable(MinVal: HalfSize))
18175 return SDValue();
18176
18177 SelectionDAG &DAG = DCI.DAG;
18178 SDLoc DL(N);
18179 SDValue ID =
18180 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo_x2, DL, VT: MVT::i64);
18181 SDValue Idx = N->getOperand(Num: 0);
18182 SDValue TC = N->getOperand(Num: 1);
18183 if (Idx.getValueType() != MVT::i64) {
18184 Idx = DAG.getZExtOrTrunc(Op: Idx, DL, VT: MVT::i64);
18185 TC = DAG.getZExtOrTrunc(Op: TC, DL, VT: MVT::i64);
18186 }
18187 auto R =
18188 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL,
18189 ResultTys: {Lo->getValueType(ResNo: 0), Hi->getValueType(ResNo: 0)}, Ops: {ID, Idx, TC});
18190
18191 DCI.CombineTo(N: Lo, Res: R.getValue(R: 0));
18192 DCI.CombineTo(N: Hi, Res: R.getValue(R: 1));
18193
18194 return SDValue(N, 0);
18195}
18196
// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce:
//   vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
//   vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
// If we have vectors larger than v16i8 we extract v16i8 subvectors, follow the
// same steps as above to get DOT instructions, concatenate them, and generate
// vecreduce.add(concat_vector(DOT, DOT2, ..)).
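// For example (illustrative): with <32 x i8> inputs, two v16i8 DOTs are built,
// their v4i32 accumulators are concatenated into a v8i32, and a single
// vecreduce.add of the concatenation produces the i32 result.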
18203static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
18204 const AArch64Subtarget *ST) {
18205 if (!ST->isNeonAvailable())
18206 return SDValue();
18207
18208 if (!ST->hasDotProd())
18209 return performVecReduceAddCombineWithUADDLP(N, DAG);
18210
18211 SDValue Op0 = N->getOperand(Num: 0);
18212 if (N->getValueType(ResNo: 0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
18213 Op0.getValueType().getVectorElementType() != MVT::i32)
18214 return SDValue();
18215
18216 unsigned ExtOpcode = Op0.getOpcode();
18217 SDValue A = Op0;
18218 SDValue B;
18219 unsigned DotOpcode;
18220 if (ExtOpcode == ISD::MUL) {
18221 A = Op0.getOperand(i: 0);
18222 B = Op0.getOperand(i: 1);
18223 if (A.getOperand(i: 0).getValueType() != B.getOperand(i: 0).getValueType())
18224 return SDValue();
18225 auto OpCodeA = A.getOpcode();
18226 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
18227 return SDValue();
18228
18229 auto OpCodeB = B.getOpcode();
18230 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
18231 return SDValue();
18232
18233 if (OpCodeA == OpCodeB) {
18234 DotOpcode =
18235 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
18236 } else {
      // Check USDOT support.
18238 if (!ST->hasMatMulInt8())
18239 return SDValue();
18240 DotOpcode = AArch64ISD::USDOT;
18241 if (OpCodeA == ISD::SIGN_EXTEND)
18242 std::swap(a&: A, b&: B);
18243 }
18244 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
18245 DotOpcode = AArch64ISD::UDOT;
18246 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
18247 DotOpcode = AArch64ISD::SDOT;
18248 } else {
18249 return SDValue();
18250 }
18251
18252 EVT Op0VT = A.getOperand(i: 0).getValueType();
18253 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
18254 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
18255 if (!IsValidElementCount || !IsValidSize)
18256 return SDValue();
18257
18258 SDLoc DL(Op0);
18259 // For non-mla reductions B can be set to 1. For MLA we take the operand of
18260 // the extend B.
18261 if (!B)
18262 B = DAG.getConstant(Val: 1, DL, VT: Op0VT);
18263 else
18264 B = B.getOperand(i: 0);
18265
18266 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
18267 unsigned NumOfVecReduce;
18268 EVT TargetType;
18269 if (IsMultipleOf16) {
18270 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
18271 TargetType = MVT::v4i32;
18272 } else {
18273 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
18274 TargetType = MVT::v2i32;
18275 }
18276 // Handle the case where we need to generate only one Dot operation.
18277 if (NumOfVecReduce == 1) {
18278 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: TargetType);
18279 SDValue Dot = DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros,
18280 N2: A.getOperand(i: 0), N3: B);
18281 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
18282 }
18283 // Generate Dot instructions that are multiple of 16.
18284 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
18285 SmallVector<SDValue, 4> SDotVec16;
18286 unsigned I = 0;
18287 for (; I < VecReduce16Num; I += 1) {
18288 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v4i32);
18289 SDValue Op0 =
18290 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: A.getOperand(i: 0),
18291 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18292 SDValue Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: B,
18293 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18294 SDValue Dot =
18295 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Op0, N3: Op1);
18296 SDotVec16.push_back(Elt: Dot);
18297 }
18298 // Concatenate dot operations.
18299 EVT SDot16EVT =
18300 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: 4 * VecReduce16Num);
18301 SDValue ConcatSDot16 =
18302 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: SDot16EVT, Ops: SDotVec16);
18303 SDValue VecReduceAdd16 =
18304 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: ConcatSDot16);
18305 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
18306 if (VecReduce8Num == 0)
18307 return VecReduceAdd16;
18308
18309 // Generate the remainder Dot operation that is multiple of 8.
18310 SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v2i32);
18311 SDValue Vec8Op0 =
18312 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: A.getOperand(i: 0),
18313 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18314 SDValue Vec8Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: B,
18315 N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64));
18316 SDValue Dot =
18317 DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Vec8Op0, N3: Vec8Op1);
18318 SDValue VecReduceAdd8 =
18319 DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
18320 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: VecReduceAdd16,
18321 N2: VecReduceAdd8);
18322}
18323
18324// Given an (integer) vecreduce, we know the order of the inputs does not
18325// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
18326// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
18327// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
18328static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
18329 auto DetectAddExtract = [&](SDValue A) {
18330 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
18331 // UADDLP(x) if found.
18332 assert(A.getOpcode() == ISD::ADD);
18333 EVT VT = A.getValueType();
18334 SDValue Op0 = A.getOperand(i: 0);
18335 SDValue Op1 = A.getOperand(i: 1);
18336 if (Op0.getOpcode() != Op1.getOpcode() ||
18337 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
18338 Op0.getOpcode() != ISD::SIGN_EXTEND))
18339 return SDValue();
18340 SDValue Ext0 = Op0.getOperand(i: 0);
18341 SDValue Ext1 = Op1.getOperand(i: 0);
18342 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18343 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18344 Ext0.getOperand(i: 0) != Ext1.getOperand(i: 0))
18345 return SDValue();
    // Check that the type is twice the add's type, and that the extracts are
    // from the upper/lower halves of the same source.
18348 if (Ext0.getOperand(i: 0).getValueType().getVectorNumElements() !=
18349 VT.getVectorNumElements() * 2)
18350 return SDValue();
18351 if ((Ext0.getConstantOperandVal(i: 1) != 0 ||
18352 Ext1.getConstantOperandVal(i: 1) != VT.getVectorNumElements()) &&
18353 (Ext1.getConstantOperandVal(i: 1) != 0 ||
18354 Ext0.getConstantOperandVal(i: 1) != VT.getVectorNumElements()))
18355 return SDValue();
18356 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
18357 : AArch64ISD::SADDLP;
18358 return DAG.getNode(Opcode, DL: SDLoc(A), VT, Operand: Ext0.getOperand(i: 0));
18359 };
18360
18361 if (SDValue R = DetectAddExtract(A))
18362 return R;
18363
18364 if (A.getOperand(i: 0).getOpcode() == ISD::ADD && A.getOperand(i: 0).hasOneUse())
18365 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 0), DAG))
18366 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
18367 N2: A.getOperand(i: 1));
18368 if (A.getOperand(i: 1).getOpcode() == ISD::ADD && A.getOperand(i: 1).hasOneUse())
18369 if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 1), DAG))
18370 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R,
18371 N2: A.getOperand(i: 0));
18372 return SDValue();
18373}
18374
18375// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
18376// UADDLV(concat), where the concat represents the 64-bit zext sources.
18377static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
18378 // Look for add(zext(64-bit source), zext(64-bit source)), returning
18379 // UADDLV(concat(zext, zext)) if found.
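  // For example, UADDV(add(zext(a : v8i8 to v8i16), zext(b : v8i8 to v8i16)))
  // becomes NVCAST(UADDLV(concat(a, b) : v16i8)), with the widened sum of all
  // sixteen bytes left in the low lane.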
18380 assert(A.getOpcode() == ISD::ADD);
18381 EVT VT = A.getValueType();
18382 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18383 return SDValue();
18384 SDValue Op0 = A.getOperand(i: 0);
18385 SDValue Op1 = A.getOperand(i: 1);
18386 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
18387 return SDValue();
18388 SDValue Ext0 = Op0.getOperand(i: 0);
18389 SDValue Ext1 = Op1.getOperand(i: 0);
18390 EVT ExtVT0 = Ext0.getValueType();
18391 EVT ExtVT1 = Ext1.getValueType();
18392 // Check zext VTs are the same and 64-bit length.
18393 if (ExtVT0 != ExtVT1 ||
18394 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
18395 return SDValue();
18396 // Get VT for concat of zext sources.
18397 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
18398 SDValue Concat =
18399 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(A), VT: PairVT, N1: Ext0, N2: Ext1);
18400
18401 switch (VT.getSimpleVT().SimpleTy) {
18402 case MVT::v2i64:
18403 case MVT::v4i32:
18404 return DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT, Operand: Concat);
18405 case MVT::v8i16: {
18406 SDValue Uaddlv =
18407 DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT: MVT::v4i32, Operand: Concat);
18408 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(A), VT: MVT::v8i16, Operand: Uaddlv);
18409 }
18410 default:
18411 llvm_unreachable("Unhandled vector type");
18412 }
18413}
18414
18415static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
18416 SDValue A = N->getOperand(Num: 0);
18417 if (A.getOpcode() == ISD::ADD) {
18418 if (SDValue R = performUADDVAddCombine(A, DAG))
18419 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: R);
18420 else if (SDValue R = performUADDVZextCombine(A, DAG))
18421 return R;
18422 }
18423 return SDValue();
18424}
18425
18426static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
18427 TargetLowering::DAGCombinerInfo &DCI,
18428 const AArch64Subtarget *Subtarget) {
18429 if (DCI.isBeforeLegalizeOps())
18430 return SDValue();
18431
18432 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
18433}
18434
18435SDValue
18436AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
18437 SelectionDAG &DAG,
18438 SmallVectorImpl<SDNode *> &Created) const {
18439 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18440 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
18441 return SDValue(N, 0); // Lower SDIV as SDIV
18442
18443 EVT VT = N->getValueType(ResNo: 0);
18444
18445 // If SVE is available, we can generate
  // sdiv(x, y) -> ptrue + asrd,        where 'y' is a positive pow-2 divisor.
  // sdiv(x, y) -> ptrue + asrd + subr, where 'y' is a negative pow-2 divisor.
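  // For example, sdiv(x, 8) becomes an ASRD by 3 under an all-true predicate,
  // and sdiv(x, -8) additionally negates that result with a subr.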
18448 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
18449 return SDValue(N, 0);
18450
18451 // fold (sdiv X, pow2)
18452 if ((VT != MVT::i32 && VT != MVT::i64) ||
18453 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18454 return SDValue();
18455
  // If the divisor is 2 or -2, the default expansion is better. It will add
  // (N->getOperand(0) >> (BitWidth - 1)) to it before shifting right.
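  // For example, for an i32 divide by 2 the generic expansion is roughly
  //   add w8, w0, w0, lsr #31
  //   asr w0, w8, #1
  // (illustrative registers), which is already optimal.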
18458 if (Divisor == 2 ||
18459 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
18460 return SDValue();
18461
18462 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
18463}
18464
18465SDValue
18466AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
18467 SelectionDAG &DAG,
18468 SmallVectorImpl<SDNode *> &Created) const {
18469 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
18470 if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr))
18471 return SDValue(N, 0); // Lower SREM as SREM
18472
18473 EVT VT = N->getValueType(ResNo: 0);
18474
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
18477 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
18478 return SDValue(N, 0);
18479
18480 // fold (srem X, pow2)
18481 if ((VT != MVT::i32 && VT != MVT::i64) ||
18482 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
18483 return SDValue();
18484
18485 unsigned Lg2 = Divisor.countr_zero();
18486 if (Lg2 == 0)
18487 return SDValue();
18488
18489 SDLoc DL(N);
18490 SDValue N0 = N->getOperand(Num: 0);
18491 SDValue Pow2MinusOne = DAG.getConstant(Val: (1ULL << Lg2) - 1, DL, VT);
18492 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
18493 SDValue CCVal, CSNeg;
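  // Rough sketch of what is built below (conditions/registers illustrative):
  //   Lg2 == 1: srem(x, 2) -> csneg(x & 1, x & 1, cc), where cc tests x >= 0.
  //   Lg2  > 1: srem(x, 8) -> csneg(x & 7, (0 - x) & 7, mi), reusing the flags
  //             from the subs that computes (0 - x).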
18494 if (Lg2 == 1) {
18495 SDValue Cmp = getAArch64Cmp(LHS: N0, RHS: Zero, CC: ISD::SETGE, AArch64cc&: CCVal, DAG, DL);
18496 SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
18497 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: And, N2: And, N3: CCVal, N4: Cmp);
18498
18499 Created.push_back(Elt: Cmp.getNode());
18500 Created.push_back(Elt: And.getNode());
18501 } else {
18502 SDValue CCVal = DAG.getConstant(Val: AArch64CC::MI, DL, VT: MVT_CC);
18503 SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::i32);
18504
18505 SDValue Negs = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Zero, N2: N0);
18506 SDValue AndPos = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne);
18507 SDValue AndNeg = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Negs, N2: Pow2MinusOne);
18508 CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: AndPos, N2: AndNeg, N3: CCVal,
18509 N4: Negs.getValue(R: 1));
18510
18511 Created.push_back(Elt: Negs.getNode());
18512 Created.push_back(Elt: AndPos.getNode());
18513 Created.push_back(Elt: AndNeg.getNode());
18514 }
18515
18516 return CSNeg;
18517}
18518
18519static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18520 switch(getIntrinsicID(N: S.getNode())) {
18521 default:
18522 break;
18523 case Intrinsic::aarch64_sve_cntb:
18524 return 8;
18525 case Intrinsic::aarch64_sve_cnth:
18526 return 16;
18527 case Intrinsic::aarch64_sve_cntw:
18528 return 32;
18529 case Intrinsic::aarch64_sve_cntd:
18530 return 64;
18531 }
18532 return {};
18533}
18534
18535/// Calculates what the pre-extend type is, based on the extension
18536/// operation node provided by \p Extend.
18537///
18538/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18539/// pre-extend type is pulled directly from the operand, while other extend
18540/// operations need a bit more inspection to get this information.
18541///
18542/// \param Extend The SDNode from the DAG that represents the extend operation
18543///
18544/// \returns The type representing the \p Extend source type, or \p MVT::Other
18545/// if no valid type can be determined
18546static EVT calculatePreExtendType(SDValue Extend) {
18547 switch (Extend.getOpcode()) {
18548 case ISD::SIGN_EXTEND:
18549 case ISD::ZERO_EXTEND:
18550 case ISD::ANY_EXTEND:
18551 return Extend.getOperand(i: 0).getValueType();
18552 case ISD::AssertSext:
18553 case ISD::AssertZext:
18554 case ISD::SIGN_EXTEND_INREG: {
18555 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Val: Extend.getOperand(i: 1));
18556 if (!TypeNode)
18557 return MVT::Other;
18558 return TypeNode->getVT();
18559 }
18560 case ISD::AND: {
18561 ConstantSDNode *Constant =
18562 dyn_cast<ConstantSDNode>(Val: Extend.getOperand(i: 1).getNode());
18563 if (!Constant)
18564 return MVT::Other;
18565
18566 uint32_t Mask = Constant->getZExtValue();
18567
18568 if (Mask == UCHAR_MAX)
18569 return MVT::i8;
18570 else if (Mask == USHRT_MAX)
18571 return MVT::i16;
18572 else if (Mask == UINT_MAX)
18573 return MVT::i32;
18574
18575 return MVT::Other;
18576 }
18577 default:
18578 return MVT::Other;
18579 }
18580}
18581
18582/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18583/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18584/// SExt/ZExt rather than the scalar SExt/ZExt
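/// For example, a v4i32 build_vector whose operands are all sext's from i16,
///   build_vector(sext(a), sext(b), sext(c), sext(d)),
/// is rewritten as sext(build_vector(a, b, c, d) : v4i16) to v4i32.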
18585static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18586 EVT VT = BV.getValueType();
18587 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18588 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18589 return SDValue();
18590
18591 // Use the first item in the buildvector/shuffle to get the size of the
18592 // extend, and make sure it looks valid.
18593 SDValue Extend = BV->getOperand(Num: 0);
18594 unsigned ExtendOpcode = Extend.getOpcode();
18595 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
18596 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18597 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18598 ExtendOpcode == ISD::AssertSext;
18599 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18600 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18601 return SDValue();
  // Shuffle inputs are vectors, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
  // calculatePreExtendType will work without issue.
18604 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18605 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18606 return SDValue();
18607
18608 // Restrict valid pre-extend data type
18609 EVT PreExtendType = calculatePreExtendType(Extend);
18610 if (PreExtendType == MVT::Other ||
18611 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18612 return SDValue();
18613
18614 // Make sure all other operands are equally extended.
18615 bool SeenZExtOrSExt = !IsAnyExt;
18616 for (SDValue Op : drop_begin(RangeOrContainer: BV->ops())) {
18617 if (Op.isUndef())
18618 continue;
18619
18620 if (calculatePreExtendType(Extend: Op) != PreExtendType)
18621 return SDValue();
18622
18623 unsigned Opc = Op.getOpcode();
18624 if (Opc == ISD::ANY_EXTEND)
18625 continue;
18626
18627 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18628 Opc == ISD::AssertSext;
18629
18630 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
18631 return SDValue();
18632
18633 IsSExt = OpcIsSExt;
18634 SeenZExtOrSExt = true;
18635 }
18636
18637 SDValue NBV;
18638 SDLoc DL(BV);
18639 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18640 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType);
18641 EVT PreExtendLegalType =
18642 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18643 SmallVector<SDValue, 8> NewOps;
18644 for (SDValue Op : BV->ops())
18645 NewOps.push_back(Elt: Op.isUndef() ? DAG.getUNDEF(VT: PreExtendLegalType)
18646 : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL,
18647 VT: PreExtendLegalType));
18648 NBV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: PreExtendVT, Ops: NewOps);
18649 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18650 EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType.getScalarType());
18651 NBV = DAG.getVectorShuffle(VT: PreExtendVT, dl: DL, N1: BV.getOperand(i: 0).getOperand(i: 0),
18652 N2: BV.getOperand(i: 1).isUndef()
18653 ? DAG.getUNDEF(VT: PreExtendVT)
18654 : BV.getOperand(i: 1).getOperand(i: 0),
18655 Mask: cast<ShuffleVectorSDNode>(Val&: BV)->getMask());
18656 }
18657 unsigned ExtOpc = !SeenZExtOrSExt
18658 ? ISD::ANY_EXTEND
18659 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
18660 return DAG.getNode(Opcode: ExtOpc, DL, VT, Operand: NBV);
18661}
18662
18663/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18664/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18665static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18666 // If the value type isn't a vector, none of the operands are going to be dups
18667 EVT VT = Mul->getValueType(ResNo: 0);
18668 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18669 return SDValue();
18670
18671 SDValue Op0 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 0), DAG);
18672 SDValue Op1 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 1), DAG);
18673
  // Neither operand has been changed; don't make any further changes.
18675 if (!Op0 && !Op1)
18676 return SDValue();
18677
18678 SDLoc DL(Mul);
18679 return DAG.getNode(Opcode: Mul->getOpcode(), DL, VT, N1: Op0 ? Op0 : Mul->getOperand(Num: 0),
18680 N2: Op1 ? Op1 : Mul->getOperand(Num: 1));
18681}
18682
18683// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18684// Same for other types with equivalent constants.
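// Each wide lane holds two narrow lanes; the srl/and isolates the sign bit of
// every narrow lane, and multiplying by the all-ones half-mask broadcasts that
// bit across its half, which is exactly a compare-less-than-zero (CMLTz) on
// the narrower element type.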
18685static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18686 EVT VT = N->getValueType(ResNo: 0);
18687 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18688 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18689 return SDValue();
18690 if (N->getOperand(Num: 0).getOpcode() != ISD::AND ||
18691 N->getOperand(Num: 0).getOperand(i: 0).getOpcode() != ISD::SRL)
18692 return SDValue();
18693
18694 SDValue And = N->getOperand(Num: 0);
18695 SDValue Srl = And.getOperand(i: 0);
18696
18697 APInt V1, V2, V3;
18698 if (!ISD::isConstantSplatVector(N: N->getOperand(Num: 1).getNode(), SplatValue&: V1) ||
18699 !ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: V2) ||
18700 !ISD::isConstantSplatVector(N: Srl.getOperand(i: 1).getNode(), SplatValue&: V3))
18701 return SDValue();
18702
18703 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18704 if (!V1.isMask(numBits: HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18705 V3 != (HalfSize - 1))
18706 return SDValue();
18707
18708 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
18709 VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: HalfSize),
18710 EC: VT.getVectorElementCount() * 2);
18711
18712 SDLoc DL(N);
18713 SDValue In = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: HalfVT, Operand: Srl.getOperand(i: 0));
18714 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: In.getValueType());
18715 SDValue CM = DAG.getSetCC(DL, VT: HalfVT, LHS: Zero, RHS: In, Cond: ISD::SETGT);
18716 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: CM);
18717}
18718
18719// Transform vector add(zext i8 to i32, zext i8 to i32)
18720// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
// extends.
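// Doing the add at the narrower width is safe: the sum of two values extended
// from i8 always fits in i16 (likewise i16 sums fit in i32), so sign-extending
// the narrow result reproduces the wide result. For mul, the operands' own
// extend opcode is reused for the final extend instead.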
18723static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18724 EVT VT = N->getValueType(ResNo: 0);
18725 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18726 (N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND &&
18727 N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND) ||
18728 (N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND &&
18729 N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND) ||
18730 N->getOperand(Num: 0).getOperand(i: 0).getValueType() !=
18731 N->getOperand(Num: 1).getOperand(i: 0).getValueType())
18732 return SDValue();
18733
18734 if (N->getOpcode() == ISD::MUL &&
18735 N->getOperand(Num: 0).getOpcode() != N->getOperand(Num: 1).getOpcode())
18736 return SDValue();
18737
18738 SDValue N0 = N->getOperand(Num: 0).getOperand(i: 0);
18739 SDValue N1 = N->getOperand(Num: 1).getOperand(i: 0);
18740 EVT InVT = N0.getValueType();
18741
18742 EVT S1 = InVT.getScalarType();
18743 EVT S2 = VT.getScalarType();
18744 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18745 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18746 SDLoc DL(N);
18747 EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(),
18748 VT: S2.getHalfSizedIntegerVT(Context&: *DAG.getContext()),
18749 EC: VT.getVectorElementCount());
18750 SDValue NewN0 = DAG.getNode(Opcode: N->getOperand(Num: 0).getOpcode(), DL, VT: HalfVT, Operand: N0);
18751 SDValue NewN1 = DAG.getNode(Opcode: N->getOperand(Num: 1).getOpcode(), DL, VT: HalfVT, Operand: N1);
18752 SDValue NewOp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: HalfVT, N1: NewN0, N2: NewN1);
18753 return DAG.getNode(Opcode: N->getOpcode() == ISD::MUL ? N->getOperand(Num: 0).getOpcode()
18754 : (unsigned)ISD::SIGN_EXTEND,
18755 DL, VT, Operand: NewOp);
18756 }
18757 return SDValue();
18758}
18759
18760static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18761 TargetLowering::DAGCombinerInfo &DCI,
18762 const AArch64Subtarget *Subtarget) {
18763
18764 if (SDValue Ext = performMulVectorExtendCombine(Mul: N, DAG))
18765 return Ext;
18766 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18767 return Ext;
18768 if (SDValue Ext = performVectorExtCombine(N, DAG))
18769 return Ext;
18770
18771 if (DCI.isBeforeLegalizeOps())
18772 return SDValue();
18773
18774 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18775 // and in MachineCombiner pass, add+mul will be combined into madd.
18776 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18777 SDLoc DL(N);
18778 EVT VT = N->getValueType(ResNo: 0);
18779 SDValue N0 = N->getOperand(Num: 0);
18780 SDValue N1 = N->getOperand(Num: 1);
18781 SDValue MulOper;
18782 unsigned AddSubOpc;
18783
18784 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18785 AddSubOpc = V->getOpcode();
18786 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18787 SDValue Opnd = V->getOperand(Num: 1);
18788 MulOper = V->getOperand(Num: 0);
18789 if (AddSubOpc == ISD::SUB)
18790 std::swap(a&: Opnd, b&: MulOper);
18791 if (auto C = dyn_cast<ConstantSDNode>(Val&: Opnd))
18792 return C->isOne();
18793 }
18794 return false;
18795 };
18796
18797 if (IsAddSubWith1(N0)) {
18798 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1, N2: MulOper);
18799 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1, N2: MulVal);
18800 }
18801
18802 if (IsAddSubWith1(N1)) {
18803 SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N0, N2: MulOper);
18804 return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1: N0, N2: MulVal);
18805 }
18806
18807 // The below optimizations require a constant RHS.
18808 if (!isa<ConstantSDNode>(Val: N1))
18809 return SDValue();
18810
18811 ConstantSDNode *C = cast<ConstantSDNode>(Val&: N1);
18812 const APInt &ConstValue = C->getAPIntValue();
18813
  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern
  // match.
18816 if (IsSVECntIntrinsic(S: N0) ||
18817 (N0->getOpcode() == ISD::TRUNCATE &&
18818 (IsSVECntIntrinsic(S: N0->getOperand(Num: 0)))))
18819 if (ConstValue.sge(RHS: 1) && ConstValue.sle(RHS: 16))
18820 return SDValue();
18821
18822 // Multiplication of a power of two plus/minus one can be done more
  // cheaply as shift+add/sub. For now, this is done unconditionally. If
18824 // future CPUs have a cheaper MADD instruction, this may need to be
18825 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18826 // 64-bit is 5 cycles, so this is always a win.
18827 // More aggressively, some multiplications N0 * C can be lowered to
18828 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18829 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18830 // TODO: lower more cases.
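  // For example, when shifted operands are cheap (ALULSLFast), x * 45 with
  // 45 = (1+4)*(1+8) becomes
  //   MV = (x << 2) + x;  result = (MV << 3) + MV
  // i.e. two shifted adds instead of materializing 45 and using mul/madd.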
18831
18832 // TrailingZeroes is used to test if the mul can be lowered to
18833 // shift+add+shift.
18834 unsigned TrailingZeroes = ConstValue.countr_zero();
18835 if (TrailingZeroes) {
    // Conservatively do not lower to shift+add+shift if the mul might be
    // folded into smull or umull.
18838 if (N0->hasOneUse() && (isSignExtended(N: N0, DAG) ||
18839 isZeroExtended(N: N0, DAG)))
18840 return SDValue();
18841 // Conservatively do not lower to shift+add+shift if the mul might be
18842 // folded into madd or msub.
18843 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
18844 N->user_begin()->getOpcode() == ISD::SUB))
18845 return SDValue();
18846 }
18847 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18848 // and shift+add+shift.
18849 APInt ShiftedConstValue = ConstValue.ashr(ShiftAmt: TrailingZeroes);
18850 unsigned ShiftAmt;
18851
18852 auto Shl = [&](SDValue N0, unsigned N1) {
18853 if (!N0.getNode())
18854 return SDValue();
18855 // If shift causes overflow, ignore this combine.
18856 if (N1 >= N0.getValueSizeInBits())
18857 return SDValue();
18858 SDValue RHS = DAG.getConstant(Val: N1, DL, VT: MVT::i64);
18859 return DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N0, N2: RHS);
18860 };
18861 auto Add = [&](SDValue N0, SDValue N1) {
18862 if (!N0.getNode() || !N1.getNode())
18863 return SDValue();
18864 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: N0, N2: N1);
18865 };
18866 auto Sub = [&](SDValue N0, SDValue N1) {
18867 if (!N0.getNode() || !N1.getNode())
18868 return SDValue();
18869 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: N1);
18870 };
18871 auto Negate = [&](SDValue N) {
18872 if (!N0.getNode())
18873 return SDValue();
18874 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
18875 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: N);
18876 };
18877
  // Can the const C be decomposed into (1 + 2^M) * (1 + 2^N)? E.g.,
  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1)
  // because the (2^N - 1) can't be executed via a single instruction.
18881 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18882 unsigned BitWidth = C.getBitWidth();
18883 for (unsigned i = 1; i < BitWidth / 2; i++) {
18884 APInt Rem;
18885 APInt X(BitWidth, (1 << i) + 1);
18886 APInt::sdivrem(LHS: C, RHS: X, Quotient&: N, Remainder&: Rem);
18887 APInt NVMinus1 = N - 1;
18888 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18889 M = X;
18890 return true;
18891 }
18892 }
18893 return false;
18894 };
18895
  // Can the const C be decomposed into (2^M + 1) * 2^N + 1? E.g.,
  // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1
  // because the (2^N - 1) can't be executed via a single instruction.
18899 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18900 APInt CVMinus1 = C - 1;
18901 if (CVMinus1.isNegative())
18902 return false;
18903 unsigned TrailingZeroes = CVMinus1.countr_zero();
18904 APInt SCVMinus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) - 1;
18905 if (SCVMinus1.isPowerOf2()) {
18906 unsigned BitWidth = SCVMinus1.getBitWidth();
18907 M = APInt(BitWidth, SCVMinus1.logBase2());
18908 N = APInt(BitWidth, TrailingZeroes);
18909 return true;
18910 }
18911 return false;
18912 };
18913
18914 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18915 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18916 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18917 APInt CVMinus1 = C - 1;
18918 if (CVMinus1.isNegative())
18919 return false;
18920 unsigned TrailingZeroes = CVMinus1.countr_zero();
18921 APInt CVPlus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) + 1;
18922 if (CVPlus1.isPowerOf2()) {
18923 unsigned BitWidth = CVPlus1.getBitWidth();
18924 M = APInt(BitWidth, CVPlus1.logBase2());
18925 N = APInt(BitWidth, TrailingZeroes);
18926 return true;
18927 }
18928 return false;
18929 };
18930
18931 if (ConstValue.isNonNegative()) {
    // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
    // (mul x, (2^M + 1) * (2^N + 1))
    //     => MV = (add (shl x, M), x); (add (shl MV, N), MV)
    // (mul x, (2^M + 1) * 2^N + 1)
    //     => MV = (add (shl x, M), x); (add (shl MV, N), x)
    // (mul x, 1 - (1 - 2^M) * 2^N)
    //     => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
18941 APInt SCVMinus1 = ShiftedConstValue - 1;
18942 APInt SCVPlus1 = ShiftedConstValue + 1;
18943 APInt CVPlus1 = ConstValue + 1;
18944 APInt CVM, CVN;
18945 if (SCVMinus1.isPowerOf2()) {
18946 ShiftAmt = SCVMinus1.logBase2();
18947 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18948 } else if (CVPlus1.isPowerOf2()) {
18949 ShiftAmt = CVPlus1.logBase2();
18950 return Sub(Shl(N0, ShiftAmt), N0);
18951 } else if (SCVPlus1.isPowerOf2()) {
18952 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18953 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18954 }
18955 if (Subtarget->hasALULSLFast() &&
18956 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18957 APInt CVMMinus1 = CVM - 1;
18958 APInt CVNMinus1 = CVN - 1;
18959 unsigned ShiftM1 = CVMMinus1.logBase2();
18960 unsigned ShiftN1 = CVNMinus1.logBase2();
      // ALULSLFast implies that shifts of up to 4 places are fast.
18962 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18963 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18964 return Add(Shl(MVal, ShiftN1), MVal);
18965 }
18966 }
18967 if (Subtarget->hasALULSLFast() &&
18968 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18969 unsigned ShiftM = CVM.getZExtValue();
18970 unsigned ShiftN = CVN.getZExtValue();
      // ALULSLFast implies that shifts of up to 4 places are fast.
18972 if (ShiftM <= 4 && ShiftN <= 4) {
        SDValue MVal = Add(Shl(N0, ShiftM), N0);
        return Add(Shl(MVal, ShiftN), N0);
18975 }
18976 }
18977
18978 if (Subtarget->hasALULSLFast() &&
18979 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18980 unsigned ShiftM = CVM.getZExtValue();
18981 unsigned ShiftN = CVN.getZExtValue();
      // ALULSLFast implies that shifts of up to 4 places are fast.
18983 if (ShiftM <= 4 && ShiftN <= 4) {
        SDValue MVal = Sub(N0, Shl(N0, ShiftM));
        return Sub(N0, Shl(MVal, ShiftN));
18986 }
18987 }
18988 } else {
18989 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18990 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18991 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18992 APInt SCVPlus1 = -ShiftedConstValue + 1;
18993 APInt CVNegPlus1 = -ConstValue + 1;
18994 APInt CVNegMinus1 = -ConstValue - 1;
18995 if (CVNegPlus1.isPowerOf2()) {
18996 ShiftAmt = CVNegPlus1.logBase2();
18997 return Sub(N0, Shl(N0, ShiftAmt));
18998 } else if (CVNegMinus1.isPowerOf2()) {
18999 ShiftAmt = CVNegMinus1.logBase2();
19000 return Negate(Add(Shl(N0, ShiftAmt), N0));
19001 } else if (SCVPlus1.isPowerOf2()) {
19002 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19003 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19004 }
19005 }
19006
19007 return SDValue();
19008}
19009
19010static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19011 SelectionDAG &DAG) {
19012 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19013 // optimize away operation when it's from a constant.
19014 //
19015 // The general transformation is:
19016 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19017 // AND(VECTOR_CMP(x,y), constant2)
19018 // constant2 = UNARYOP(constant)
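  // For example, sint_to_fp(and(setcc(x, y), splat(1))) can be rewritten as
  // bitcast(and(setcc(x, y), bitcast(splat(1.0)))), because each lane of the
  // compare is either all-zeros or all-ones.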
19019
19020 // Early exit if this isn't a vector operation, the operand of the
19021 // unary operation isn't a bitwise AND, or if the sizes of the operations
19022 // aren't the same.
19023 EVT VT = N->getValueType(ResNo: 0);
19024 if (!VT.isVector() || N->getOperand(Num: 0)->getOpcode() != ISD::AND ||
19025 N->getOperand(Num: 0)->getOperand(Num: 0)->getOpcode() != ISD::SETCC ||
19026 VT.getSizeInBits() != N->getOperand(Num: 0)->getValueType(ResNo: 0).getSizeInBits())
19027 return SDValue();
19028
19029 // Now check that the other operand of the AND is a constant. We could
19030 // make the transformation for non-constant splats as well, but it's unclear
19031 // that would be a benefit as it would not eliminate any operations, just
19032 // perform one more step in scalar code before moving to the vector unit.
19033 if (BuildVectorSDNode *BV =
19034 dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 0)->getOperand(Num: 1))) {
19035 // Bail out if the vector isn't a constant.
19036 if (!BV->isConstant())
19037 return SDValue();
19038
19039 // Everything checks out. Build up the new and improved node.
19040 SDLoc DL(N);
19041 EVT IntVT = BV->getValueType(ResNo: 0);
19042 // Create a new constant of the appropriate type for the transformed
19043 // DAG.
19044 SDValue SourceConst = DAG.getNode(Opcode: N->getOpcode(), DL, VT, Operand: SDValue(BV, 0));
19045 // The AND node needs bitcasts to/from an integer vector type around it.
19046 SDValue MaskConst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntVT, Operand: SourceConst);
19047 SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: IntVT,
19048 N1: N->getOperand(Num: 0)->getOperand(Num: 0), N2: MaskConst);
19049 SDValue Res = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewAnd);
19050 return Res;
19051 }
19052
19053 return SDValue();
19054}
19055
/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
/// functions; this can help to reduce the number of fmovs to/from GPRs.
19058static SDValue
19059tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19060 TargetLowering::DAGCombinerInfo &DCI,
19061 const AArch64Subtarget *Subtarget) {
19062 if (N->isStrictFPOpcode())
19063 return SDValue();
19064
19065 if (DCI.isBeforeLegalizeOps())
19066 return SDValue();
19067
19068 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19069 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19070 return SDValue();
19071
19072 auto isSupportedType = [](EVT VT) {
19073 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19074 };
19075
19076 SDValue SrcVal = N->getOperand(Num: 0);
19077 EVT SrcTy = SrcVal.getValueType();
19078 EVT DestTy = N->getValueType(ResNo: 0);
19079
19080 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19081 return SDValue();
19082
19083 EVT SrcVecTy;
19084 EVT DestVecTy;
19085 if (DestTy.bitsGT(VT: SrcTy)) {
19086 DestVecTy = getPackedSVEVectorVT(VT: DestTy);
19087 SrcVecTy = DestVecTy.changeVectorElementType(EltVT: SrcTy);
19088 } else {
19089 SrcVecTy = getPackedSVEVectorVT(VT: SrcTy);
19090 DestVecTy = SrcVecTy.changeVectorElementType(EltVT: DestTy);
19091 }
19092
19093 // Ensure the resulting src/dest vector type is legal.
19094 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19095 return SDValue();
19096
19097 SDLoc DL(N);
19098 SDValue ZeroIdx = DAG.getVectorIdxConstant(Val: 0, DL);
19099 SDValue Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: SrcVecTy,
19100 N1: DAG.getUNDEF(VT: SrcVecTy), N2: SrcVal, N3: ZeroIdx);
19101 SDValue Convert = DAG.getNode(Opcode: N->getOpcode(), DL, VT: DestVecTy, Operand: Vec);
19102 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: DestTy, N1: Convert, N2: ZeroIdx);
19103}
19104
19105static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19106 TargetLowering::DAGCombinerInfo &DCI,
19107 const AArch64Subtarget *Subtarget) {
19108 // First try to optimize away the conversion when it's conditionally from
19109 // a constant. Vectors only.
19110 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19111 return Res;
19112
19113 if (SDValue Res =
19114 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19115 return Res;
19116
19117 EVT VT = N->getValueType(ResNo: 0);
19118 if (VT != MVT::f32 && VT != MVT::f64)
19119 return SDValue();
19120
19121 // Only optimize when the source and destination types have the same width.
19122 if (VT.getSizeInBits() != N->getOperand(Num: 0).getValueSizeInBits())
19123 return SDValue();
19124
  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
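  // For example (illustrative registers):
  //   ldr w8, [x0]; scvtf s0, w8    // int load + GPR->FPR convert
  // becomes
  //   ldr s0, [x0]; scvtf s0, s0    // fp load + in-register convert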
19128 SDValue N0 = N->getOperand(Num: 0);
19129 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N: N0.getNode()) &&
19130 N0.hasOneUse() &&
19131 // Do not change the width of a volatile load.
19132 !cast<LoadSDNode>(Val&: N0)->isVolatile()) {
19133 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
19134 SDValue Load = DAG.getLoad(VT, dl: SDLoc(N), Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
19135 PtrInfo: LN0->getPointerInfo(), Alignment: LN0->getAlign(),
19136 MMOFlags: LN0->getMemOperand()->getFlags());
19137
19138 // Make sure successors of the original load stay after it by updating them
19139 // to use the new Chain.
19140 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LN0, 1), To: Load.getValue(R: 1));
19141
    unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
                                                          : AArch64ISD::UITOF;
19144 return DAG.getNode(Opcode, DL: SDLoc(N), VT, Operand: Load);
19145 }
19146
19147 return SDValue();
19148}
19149
19150/// Fold a floating-point multiply by power of two into floating-point to
19151/// fixed-point conversion.
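/// For example, fp_to_sint(fmul(x, splat(8.0))) on v4f32 can be lowered to the
/// aarch64_neon_vcvtfp2fxs intrinsic with 3 fractional bits (roughly an
/// "fcvtzs ..., #3"), since scaling by 2^3 before truncation is exactly a
/// fixed-point conversion with 3 fractional bits.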
19152static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
19153 TargetLowering::DAGCombinerInfo &DCI,
19154 const AArch64Subtarget *Subtarget) {
19155 if (SDValue Res =
19156 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19157 return Res;
19158
19159 if (!Subtarget->isNeonAvailable())
19160 return SDValue();
19161
19162 if (!N->getValueType(ResNo: 0).isSimple())
19163 return SDValue();
19164
19165 SDValue Op = N->getOperand(Num: 0);
19166 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
19167 return SDValue();
19168
19169 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
19170 return SDValue();
19171
19172 SDValue ConstVec = Op->getOperand(Num: 1);
19173 if (!isa<BuildVectorSDNode>(Val: ConstVec))
19174 return SDValue();
19175
19176 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
19177 uint32_t FloatBits = FloatTy.getSizeInBits();
19178 if (FloatBits != 32 && FloatBits != 64 &&
19179 (FloatBits != 16 || !Subtarget->hasFullFP16()))
19180 return SDValue();
19181
19182 MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType();
19183 uint32_t IntBits = IntTy.getSizeInBits();
19184 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
19185 return SDValue();
19186
19187 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
19188 if (IntBits > FloatBits)
19189 return SDValue();
19190
19191 BitVector UndefElements;
19192 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec);
19193 int32_t Bits = IntBits == 64 ? 64 : 32;
19194 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: Bits + 1);
19195 if (C == -1 || C == 0 || C > Bits)
19196 return SDValue();
19197
19198 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
19199 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ResTy))
19200 return SDValue();
19201
19202 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
19203 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
19204 EVT SatVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
19205 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
19206 return SDValue();
19207 }
19208
19209 SDLoc DL(N);
19210 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
19211 N->getOpcode() == ISD::FP_TO_SINT_SAT);
19212 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
19213 : Intrinsic::aarch64_neon_vcvtfp2fxu;
19214 SDValue FixConv =
19215 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ResTy,
19216 N1: DAG.getConstant(Val: IntrinsicOpcode, DL, VT: MVT::i32),
19217 N2: Op->getOperand(Num: 0), N3: DAG.getConstant(Val: C, DL, VT: MVT::i32));
19218 // We can handle smaller integers by generating an extra trunc.
19219 if (IntBits < FloatBits)
19220 FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N->getValueType(ResNo: 0), Operand: FixConv);
19221
19222 return FixConv;
19223}
19224
19225static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19226 const AArch64TargetLowering &TLI) {
19227 EVT VT = N->getValueType(ResNo: 0);
19228 SelectionDAG &DAG = DCI.DAG;
19229 SDLoc DL(N);
19230 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
19231
19232 if (!VT.isVector())
19233 return SDValue();
19234
19235 if (VT.isScalableVector() && !Subtarget.hasSVE2())
19236 return SDValue();
19237
19238 if (VT.isFixedLengthVector() &&
19239 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
19240 return SDValue();
19241
19242 SDValue N0 = N->getOperand(Num: 0);
19243 if (N0.getOpcode() != ISD::AND)
19244 return SDValue();
19245
19246 SDValue N1 = N->getOperand(Num: 1);
19247 if (N1.getOpcode() != ISD::AND)
19248 return SDValue();
19249
19250 // InstCombine does (not (neg a)) => (add a -1).
19251 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
19252 // Loop over all combinations of AND operands.
19253 for (int i = 1; i >= 0; --i) {
19254 for (int j = 1; j >= 0; --j) {
19255 SDValue O0 = N0->getOperand(Num: i);
19256 SDValue O1 = N1->getOperand(Num: j);
19257 SDValue Sub, Add, SubSibling, AddSibling;
19258
19259 // Find a SUB and an ADD operand, one from each AND.
19260 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
19261 Sub = O0;
19262 Add = O1;
19263 SubSibling = N0->getOperand(Num: 1 - i);
19264 AddSibling = N1->getOperand(Num: 1 - j);
19265 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
19266 Add = O0;
19267 Sub = O1;
19268 AddSibling = N0->getOperand(Num: 1 - i);
19269 SubSibling = N1->getOperand(Num: 1 - j);
19270 } else
19271 continue;
19272
19273 if (!ISD::isConstantSplatVectorAllZeros(N: Sub.getOperand(i: 0).getNode()))
19274 continue;
19275
      // The all-ones constant is always the right-hand operand of the Add.
19277 if (!ISD::isConstantSplatVectorAllOnes(N: Add.getOperand(i: 1).getNode()))
19278 continue;
19279
19280 if (Sub.getOperand(i: 1) != Add.getOperand(i: 0))
19281 continue;
19282
19283 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Sub, N2: SubSibling, N3: AddSibling);
19284 }
19285 }
19286
19287 // (or (and a b) (and (not a) c)) => (bsl a b c)
19288 // We only have to look for constant vectors here since the general, variable
19289 // case can be handled in TableGen.
19290 unsigned Bits = VT.getScalarSizeInBits();
19291 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
19292 for (int i = 1; i >= 0; --i)
19293 for (int j = 1; j >= 0; --j) {
19294 APInt Val1, Val2;
19295
19296 if (ISD::isConstantSplatVector(N: N0->getOperand(Num: i).getNode(), SplatValue&: Val1) &&
19297 ISD::isConstantSplatVector(N: N1->getOperand(Num: j).getNode(), SplatValue&: Val2) &&
19298 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
19299 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
19300 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
19301 }
19302 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: i));
19303 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: j));
19304 if (!BVN0 || !BVN1)
19305 continue;
19306
19307 bool FoundMatch = true;
19308 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
19309 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(Val: BVN0->getOperand(Num: k));
19310 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: BVN1->getOperand(Num: k));
19311 if (!CN0 || !CN1 ||
19312 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
19313 FoundMatch = false;
19314 break;
19315 }
19316 }
19317 if (FoundMatch)
19318 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i),
19319 N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j));
19320 }
19321
19322 return SDValue();
19323}
19324
19325// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
19326// convert to csel(ccmp(.., cc0)), depending on cc1:
19327
19328// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19329// =>
19330// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
19331//
19332// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
19333// =>
19334// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
19335static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
19336 EVT VT = N->getValueType(ResNo: 0);
19337 SDValue CSel0 = N->getOperand(Num: 0);
19338 SDValue CSel1 = N->getOperand(Num: 1);
19339
19340 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
19341 CSel1.getOpcode() != AArch64ISD::CSEL)
19342 return SDValue();
19343
19344 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
19345 return SDValue();
19346
19347 if (!isNullConstant(V: CSel0.getOperand(i: 0)) ||
19348 !isOneConstant(V: CSel0.getOperand(i: 1)) ||
19349 !isNullConstant(V: CSel1.getOperand(i: 0)) ||
19350 !isOneConstant(V: CSel1.getOperand(i: 1)))
19351 return SDValue();
19352
19353 SDValue Cmp0 = CSel0.getOperand(i: 3);
19354 SDValue Cmp1 = CSel1.getOperand(i: 3);
19355 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(i: 2);
19356 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(i: 2);
19357 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
19358 return SDValue();
19359 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
19360 Cmp0.getOpcode() == AArch64ISD::SUBS) {
19361 std::swap(a&: Cmp0, b&: Cmp1);
19362 std::swap(a&: CC0, b&: CC1);
19363 }
19364
19365 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
19366 return SDValue();
19367
19368 SDLoc DL(N);
19369 SDValue CCmp, Condition;
19370 unsigned NZCV;
19371
19372 if (N->getOpcode() == ISD::AND) {
19373 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(Code: CC0);
19374 Condition = DAG.getConstant(Val: InvCC0, DL, VT: MVT_CC);
19375 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CC1);
19376 } else {
19377 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
19378 Condition = DAG.getConstant(Val: CC0, DL, VT: MVT_CC);
19379 NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvCC1);
19380 }
19381
19382 SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32);
19383
19384 auto *Op1 = dyn_cast<ConstantSDNode>(Val: Cmp1.getOperand(i: 1));
19385 if (Op1 && Op1->getAPIntValue().isNegative() &&
19386 Op1->getAPIntValue().sgt(RHS: -32)) {
    // CCMP only accepts an immediate in the range [0, 31], so if Op1 is a
    // constant in the range [-31, -1] we can select a CCMN with its absolute
    // value instead and avoid the extra mov.
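    // For example, a compare of x against -5 becomes "ccmn x, #5, ..." rather
    // than materializing -5 in a register just to feed a ccmp.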
19390 SDValue AbsOp1 =
19391 DAG.getConstant(Val: Op1->getAPIntValue().abs(), DL, VT: Op1->getValueType(ResNo: 0));
19392 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMN, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), N2: AbsOp1,
19393 N3: NZCVOp, N4: Condition, N5: Cmp0);
19394 } else {
19395 CCmp = DAG.getNode(Opcode: AArch64ISD::CCMP, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0),
19396 N2: Cmp1.getOperand(i: 1), N3: NZCVOp, N4: Condition, N5: Cmp0);
19397 }
19398 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: CSel0.getOperand(i: 0),
19399 N2: CSel0.getOperand(i: 1), N3: DAG.getConstant(Val: CC1, DL, VT: MVT::i32),
19400 N4: CCmp);
19401}
19402
19403static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19404 const AArch64Subtarget *Subtarget,
19405 const AArch64TargetLowering &TLI) {
19406 SelectionDAG &DAG = DCI.DAG;
19407 EVT VT = N->getValueType(ResNo: 0);
19408
19409 if (SDValue R = performANDORCSELCombine(N, DAG))
19410 return R;
19411
19412 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19413 return SDValue();
19414
19415 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
19416 return Res;
19417
19418 return SDValue();
19419}
19420
19421static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
19422 if (!MemVT.getVectorElementType().isSimple())
19423 return false;
19424
19425 uint64_t MaskForTy = 0ull;
19426 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
19427 case MVT::i8:
19428 MaskForTy = 0xffull;
19429 break;
19430 case MVT::i16:
19431 MaskForTy = 0xffffull;
19432 break;
19433 case MVT::i32:
19434 MaskForTy = 0xffffffffull;
19435 break;
19436 default:
19437 return false;
19438 break;
19439 }
19440
19441 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
19442 if (auto *Op0 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0)))
19443 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
19444
19445 return false;
19446}
19447
19448static SDValue performReinterpretCastCombine(SDNode *N) {
19449 SDValue LeafOp = SDValue(N, 0);
19450 SDValue Op = N->getOperand(Num: 0);
19451 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
19452 LeafOp.getValueType() != Op.getValueType())
19453 Op = Op->getOperand(Num: 0);
19454 if (LeafOp.getValueType() == Op.getValueType())
19455 return Op;
19456 return SDValue();
19457}
19458
19459static SDValue performSVEAndCombine(SDNode *N,
19460 TargetLowering::DAGCombinerInfo &DCI) {
19461 SelectionDAG &DAG = DCI.DAG;
19462 SDValue Src = N->getOperand(Num: 0);
19463 unsigned Opc = Src->getOpcode();
19464
19465 // Zero/any extend of an unsigned unpack
19466 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19467 SDValue UnpkOp = Src->getOperand(Num: 0);
19468 SDValue Dup = N->getOperand(Num: 1);
19469
19470 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
19471 return SDValue();
19472
19473 SDLoc DL(N);
19474 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Dup->getOperand(Num: 0));
19475 if (!C)
19476 return SDValue();
19477
19478 uint64_t ExtVal = C->getZExtValue();
19479
19480 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
19481 return ((ExtVal == 0xFF && VT == MVT::i8) ||
19482 (ExtVal == 0xFFFF && VT == MVT::i16) ||
19483 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
19484 };
19485
19486 // If the mask is fully covered by the unpack, we don't need to push
19487 // a new AND onto the operand
19488 EVT EltTy = UnpkOp->getValueType(ResNo: 0).getVectorElementType();
19489 if (MaskAndTypeMatch(EltTy))
19490 return Src;
19491
19492 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
19493 // to see if the mask is all-ones of size MemTy.
19494 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(Val&: UnpkOp);
19495 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
19496 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
19497 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
19498 if (MaskAndTypeMatch(EltTy))
19499 return Src;
19500 }
19501
    // Truncate to prevent a DUP with an over-wide constant.
19503 APInt Mask = C->getAPIntValue().trunc(width: EltTy.getSizeInBits());
19504
19505 // Otherwise, make sure we propagate the AND to the operand
19506 // of the unpack
19507 Dup = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: UnpkOp->getValueType(ResNo: 0),
19508 Operand: DAG.getConstant(Val: Mask.zextOrTrunc(width: 32), DL, VT: MVT::i32));
19509
19510 SDValue And = DAG.getNode(Opcode: ISD::AND, DL,
19511 VT: UnpkOp->getValueType(ResNo: 0), N1: UnpkOp, N2: Dup);
19512
19513 return DAG.getNode(Opcode: Opc, DL, VT: N->getValueType(ResNo: 0), Operand: And);
19514 }
19515
19516 if (DCI.isBeforeLegalizeOps())
19517 return SDValue();
19518
19519 // If both sides of AND operations are i1 splat_vectors then
19520 // we can produce just i1 splat_vector as the result.
19521 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 0)))
19522 return N->getOperand(Num: 1);
19523 if (isAllActivePredicate(DAG, N: N->getOperand(Num: 1)))
19524 return N->getOperand(Num: 0);
19525
19526 if (!EnableCombineMGatherIntrinsics)
19527 return SDValue();
19528
19529 SDValue Mask = N->getOperand(Num: 1);
19530
19531 if (!Src.hasOneUse())
19532 return SDValue();
19533
19534 EVT MemVT;
19535
19536 // SVE load instructions perform an implicit zero-extend, which makes them
19537 // perfect candidates for combining.
19538 switch (Opc) {
19539 case AArch64ISD::LD1_MERGE_ZERO:
19540 case AArch64ISD::LDNF1_MERGE_ZERO:
19541 case AArch64ISD::LDFF1_MERGE_ZERO:
19542 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 3))->getVT();
19543 break;
19544 case AArch64ISD::GLD1_MERGE_ZERO:
19545 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19546 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19547 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19548 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19549 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19550 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19551 case AArch64ISD::GLDFF1_MERGE_ZERO:
19552 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19553 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19554 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19555 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19556 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19557 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19558 case AArch64ISD::GLDNT1_MERGE_ZERO:
19559 MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 4))->getVT();
19560 break;
19561 default:
19562 return SDValue();
19563 }
19564
19565 if (isConstantSplatVectorMaskForType(N: Mask.getNode(), MemVT))
19566 return Src;
19567
19568 return SDValue();
19569}
19570
19571// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
19572static SDValue performANDSETCCCombine(SDNode *N,
19573 TargetLowering::DAGCombinerInfo &DCI) {
19574
19575 // This function performs an optimization on a specific pattern involving
19576 // an AND operation and SETCC (Set Condition Code) node.
19577
19578 SDValue SetCC = N->getOperand(Num: 0);
19579 EVT VT = N->getValueType(ResNo: 0);
19580 SelectionDAG &DAG = DCI.DAG;
19581
  // If the current node (N) is used by any SELECT instruction, bail out
  // (return an empty SDValue); applying the optimization there could produce
  // incorrect results.
19585 for (auto U : N->users())
19586 if (U->getOpcode() == ISD::SELECT)
19587 return SDValue();
19588
19589 // Check if the operand is a SETCC node with floating-point comparison
19590 if (SetCC.getOpcode() == ISD::SETCC &&
19591 SetCC.getOperand(i: 0).getValueType() == MVT::f32) {
19592
19593 SDValue Cmp;
19594 AArch64CC::CondCode CC;
19595
19596 // Check if the DAG is after legalization and if we can emit the conjunction
19597 if (!DCI.isBeforeLegalize() &&
19598 (Cmp = emitConjunction(DAG, Val: SDValue(N, 0), OutCC&: CC))) {
19599
19600 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(Code: CC);
19601
19602 SDLoc DL(N);
19603 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT),
19604 N2: DAG.getConstant(Val: 0, DL, VT),
19605 N3: DAG.getConstant(Val: InvertedCC, DL, VT: MVT::i32), N4: Cmp);
19606 }
19607 }
19608 return SDValue();
19609}
19610
19611static SDValue performANDCombine(SDNode *N,
19612 TargetLowering::DAGCombinerInfo &DCI) {
19613 SelectionDAG &DAG = DCI.DAG;
19614 SDValue LHS = N->getOperand(Num: 0);
19615 SDValue RHS = N->getOperand(Num: 1);
19616 EVT VT = N->getValueType(ResNo: 0);
19617
19618 if (SDValue R = performANDORCSELCombine(N, DAG))
19619 return R;
19620
  if (SDValue R = performANDSETCCCombine(N, DCI))
19622 return R;
19623
19624 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19625 return SDValue();
19626
19627 if (VT.isScalableVector())
19628 return performSVEAndCombine(N, DCI);
19629
19630 // The combining code below works only for NEON vectors. In particular, it
19631 // does not work for SVE when dealing with vectors wider than 128 bits.
19632 if (!VT.is64BitVector() && !VT.is128BitVector())
19633 return SDValue();
19634
19635 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode());
19636 if (!BVN)
19637 return SDValue();
19638
19639 // AND does not accept an immediate, so check if we can use a BIC immediate
19640 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19641 // pattern in isel, because some immediates may be lowered to the preferred
19642 // (and x, (movi imm)) form, even though an mvni representation also exists.
19643 APInt DefBits(VT.getSizeInBits(), 0);
19644 APInt UndefBits(VT.getSizeInBits(), 0);
19645 if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) {
19646 SDValue NewOp;
19647
19648 // Any bits known to already be 0 need not be cleared again, which can help
19649 // reduce the size of the immediate to one supported by the instruction.
19650 KnownBits Known = DAG.computeKnownBits(Op: LHS);
19651 APInt ZeroSplat(VT.getSizeInBits(), 0);
19652 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19653 ZeroSplat |= Known.Zero.zext(width: VT.getSizeInBits())
19654 << (Known.Zero.getBitWidth() * I);
19655
19656 DefBits = ~(DefBits | ZeroSplat);
19657 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19658 Bits: DefBits, LHS: &LHS)) ||
19659 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19660 Bits: DefBits, LHS: &LHS)))
19661 return NewOp;
19662
19663 UndefBits = ~(UndefBits | ZeroSplat);
19664 if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19665 Bits: UndefBits, LHS: &LHS)) ||
19666 (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG,
19667 Bits: UndefBits, LHS: &LHS)))
19668 return NewOp;
19669 }
19670
19671 return SDValue();
19672}
19673
19674static SDValue performFADDCombine(SDNode *N,
19675 TargetLowering::DAGCombinerInfo &DCI) {
19676 SelectionDAG &DAG = DCI.DAG;
19677 SDValue LHS = N->getOperand(Num: 0);
19678 SDValue RHS = N->getOperand(Num: 1);
19679 EVT VT = N->getValueType(ResNo: 0);
19680 SDLoc DL(N);
19681
19682 if (!N->getFlags().hasAllowReassociation())
19683 return SDValue();
19684
  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19686 auto ReassocComplex = [&](SDValue A, SDValue B) {
19687 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19688 return SDValue();
19689 unsigned Opc = A.getConstantOperandVal(i: 0);
19690 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19691 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19692 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19693 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19694 return SDValue();
19695 SDValue VCMLA = DAG.getNode(
19696 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0),
19697 N2: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 1), N2: B, Flags: N->getFlags()),
19698 N3: A.getOperand(i: 2), N4: A.getOperand(i: 3));
19699 VCMLA->setFlags(A->getFlags());
19700 return VCMLA;
19701 };
19702 if (SDValue R = ReassocComplex(LHS, RHS))
19703 return R;
19704 if (SDValue R = ReassocComplex(RHS, LHS))
19705 return R;
19706
19707 return SDValue();
19708}
19709
19710static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19711 switch (Opcode) {
19712 case ISD::STRICT_FADD:
19713 case ISD::FADD:
19714 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19715 case ISD::ADD:
19716 return VT == MVT::i64;
19717 default:
19718 return false;
19719 }
19720}
19721
19722static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19723 AArch64CC::CondCode Cond);
19724
19725static bool isPredicateCCSettingOp(SDValue N) {
19726 if ((N.getOpcode() == ISD::SETCC) ||
19727 // get_active_lane_mask is lowered to a whilelo instruction.
19728 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
19729 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19730 (N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilege ||
19731 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilegt ||
19732 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehi ||
19733 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehs ||
19734 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilele ||
19735 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelo ||
19736 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilels ||
19737 N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelt)))
19738 return true;
19739
19740 return false;
19741}
19742
19743// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19744// ... into: "ptrue p, all" + PTEST
19745static SDValue
19746performFirstTrueTestVectorCombine(SDNode *N,
19747 TargetLowering::DAGCombinerInfo &DCI,
19748 const AArch64Subtarget *Subtarget) {
19749 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19750 // Make sure PTEST can be legalised with illegal types.
19751 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19752 return SDValue();
19753
19754 SDValue N0 = N->getOperand(Num: 0);
19755 EVT VT = N0.getValueType();
19756
19757 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19758 !isNullConstant(V: N->getOperand(Num: 1)))
19759 return SDValue();
19760
19761 // Restrict the DAG combine to only cases where we're extracting from a
19762 // flag-setting operation.
19763 if (!isPredicateCCSettingOp(N: N0))
19764 return SDValue();
19765
19766 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19767 SelectionDAG &DAG = DCI.DAG;
19768 SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT, Pattern: AArch64SVEPredPattern::all);
19769 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::FIRST_ACTIVE);
19770}
19771
19772// Materialize : Idx = (add (mul vscale, NumEls), -1)
19773// i1 = extract_vector_elt t37, Constant:i64<Idx>
19774// ... into: "ptrue p, all" + PTEST
19775static SDValue
19776performLastTrueTestVectorCombine(SDNode *N,
19777 TargetLowering::DAGCombinerInfo &DCI,
19778 const AArch64Subtarget *Subtarget) {
19779 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19780 // Make sure PTEST can be legalised with illegal types.
19781 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19782 return SDValue();
19783
19784 SDValue N0 = N->getOperand(Num: 0);
19785 EVT OpVT = N0.getValueType();
19786
19787 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19788 return SDValue();
19789
19790 // Idx == (add (mul vscale, NumEls), -1)
19791 SDValue Idx = N->getOperand(Num: 1);
19792 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(V: Idx.getOperand(i: 1)))
19793 return SDValue();
19794
19795 SDValue VS = Idx.getOperand(i: 0);
19796 if (VS.getOpcode() != ISD::VSCALE)
19797 return SDValue();
19798
19799 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19800 if (VS.getConstantOperandVal(i: 0) != NumEls)
19801 return SDValue();
19802
19803 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19804 SelectionDAG &DAG = DCI.DAG;
19805 SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT: OpVT, Pattern: AArch64SVEPredPattern::all);
19806 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::LAST_ACTIVE);
19807}
19808
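// Fold extract_vector_elt(Vec, vector_find_last_active(Mask)) into
// LASTB(Mask, Vec), which extracts the last active element directly.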
19809static SDValue
19810performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19811 const AArch64Subtarget *Subtarget) {
19812 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19813 SelectionDAG &DAG = DCI.DAG;
19814 SDValue Vec = N->getOperand(Num: 0);
19815 SDValue Idx = N->getOperand(Num: 1);
19816
19817 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
19818 return SDValue();
19819
19820 // Only legal for 8, 16, 32, and 64 bit element types.
19821 EVT EltVT = Vec.getValueType().getVectorElementType();
19822 if (!is_contained(Range: ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
19823 MVT::bf16, MVT::f32, MVT::f64}),
19824 Element: EltVT.getSimpleVT().SimpleTy))
19825 return SDValue();
19826
19827 SDValue Mask = Idx.getOperand(i: 0);
19828 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19829 if (!TLI.isOperationLegal(Op: ISD::VECTOR_FIND_LAST_ACTIVE, VT: Mask.getValueType()))
19830 return SDValue();
19831
19832 return DAG.getNode(Opcode: AArch64ISD::LASTB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Mask,
19833 N2: Vec);
19834}
19835
19836static SDValue
19837performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19838 const AArch64Subtarget *Subtarget) {
19839 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19840 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19841 return Res;
19842 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19843 return Res;
19844 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
19845 return Res;
19846
19847 SelectionDAG &DAG = DCI.DAG;
19848 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
19849
19850 EVT VT = N->getValueType(ResNo: 0);
19851 const bool FullFP16 = Subtarget->hasFullFP16();
19852 bool IsStrict = N0->isStrictFPOpcode();
19853
19854 // extract(dup x) -> x
19855 if (N0.getOpcode() == AArch64ISD::DUP)
19856 return VT.isInteger() ? DAG.getZExtOrTrunc(Op: N0.getOperand(i: 0), DL: SDLoc(N), VT)
19857 : N0.getOperand(i: 0);
19858
19859 // Rewrite for pairwise fadd pattern
19860 // (f32 (extract_vector_elt
19861 // (fadd (vXf32 Other)
19862 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19863 // ->
19864 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19865 // (extract_vector_elt (vXf32 Other) 1))
19866 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19867 // we can only do this when it's used only by the extract_vector_elt.
19868 if (isNullConstant(V: N1) && hasPairwiseAdd(Opcode: N0->getOpcode(), VT, FullFP16) &&
19869 (!IsStrict || N0.hasOneUse())) {
19870 SDLoc DL(N0);
19871 SDValue N00 = N0->getOperand(Num: IsStrict ? 1 : 0);
19872 SDValue N01 = N0->getOperand(Num: IsStrict ? 2 : 1);
19873
19874 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N01);
19875 SDValue Other = N00;
19876
19877 // And handle the commutative case.
19878 if (!Shuffle) {
19879 Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N00);
19880 Other = N01;
19881 }
19882
19883 if (Shuffle && Shuffle->getMaskElt(Idx: 0) == 1 &&
19884 Other == Shuffle->getOperand(Num: 0)) {
19885 SDValue Extract1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
19886 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
19887 SDValue Extract2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
19888 N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
19889 if (!IsStrict)
19890 return DAG.getNode(Opcode: N0->getOpcode(), DL, VT, N1: Extract1, N2: Extract2);
19891
19892 // For strict_fadd we need uses of the final extract_vector to be replaced
19893 // with the strict_fadd, but we also need uses of the chain output of the
19894 // original strict_fadd to use the chain output of the new strict_fadd as
19895 // otherwise it may not be deleted.
19896 SDValue Ret = DAG.getNode(Opcode: N0->getOpcode(), DL,
19897 ResultTys: {VT, MVT::Other},
19898 Ops: {N0->getOperand(Num: 0), Extract1, Extract2});
19899 DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Ret);
19900 DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Ret.getValue(R: 1));
19901 return SDValue(N, 0);
19902 }
19903 }
19904
19905 return SDValue();
19906}
19907
19908static SDValue performConcatVectorsCombine(SDNode *N,
19909 TargetLowering::DAGCombinerInfo &DCI,
19910 SelectionDAG &DAG) {
19911 SDLoc DL(N);
19912 EVT VT = N->getValueType(ResNo: 0);
19913 SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1);
19914 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19915
19916 if (VT.isScalableVector())
19917 return SDValue();
19918
19919 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19920 N1Opc == ISD::TRUNCATE) {
19921 SDValue N00 = N0->getOperand(Num: 0);
19922 SDValue N10 = N1->getOperand(Num: 0);
19923 EVT N00VT = N00.getValueType();
19924 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
19925
19926 // Optimize concat_vectors of truncated vectors, where the intermediate
19927 // type is illegal, to avoid said illegality, e.g.,
19928 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19929 // (v2i16 (truncate (v2i64)))))
19930 // ->
19931 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19932 // (v4i32 (bitcast (v2i64))),
19933 // <0, 2, 4, 6>)))
19934 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19935 // on both input and result type, so we might generate worse code.
19936 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19937 if (N00VT == N10.getValueType() &&
19938 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19939 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19940 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19941 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
19942 for (size_t i = 0; i < Mask.size(); ++i)
19943 Mask[i] = i * 2;
19944 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT,
19945 Operand: DAG.getVectorShuffle(
19946 VT: MidVT, dl: DL,
19947 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MidVT, Operand: N00),
19948 N2: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MidVT, Operand: N10), Mask));
19949 }
19950
19951 // Optimize two large shifts and a combine (uzp1) into a single combine and a
19952 // smaller shift. On AArch64, sequences like the following:
19953 //
19954 // ushr v0.4s, v0.4s, #20
19955 // ushr v1.4s, v1.4s, #20
19956 // uzp1 v0.8h, v0.8h, v1.8h
19957 //
19958 // Can be optimized to:
19959 //
19960 // uzp2 v0.8h, v0.8h, v1.8h
19961 // ushr v0.8h, v0.8h, #4
19962 //
19963 // This optimization reduces instruction count.
19964 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
19965 N00->getOperand(Num: 1) == N10->getOperand(Num: 1)) {
19966 SDValue N000 = N00->getOperand(Num: 0);
19967 SDValue N100 = N10->getOperand(Num: 0);
19968 uint64_t N001ConstVal = N00->getConstantOperandVal(Num: 1),
19969 N101ConstVal = N10->getConstantOperandVal(Num: 1),
19970 NScalarSize = N->getValueType(ResNo: 0).getScalarSizeInBits();
19971
19972 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
19973 N000 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: N000);
19974 N100 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: N100);
19975 SDValue Uzp = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT, N1: N000, N2: N100);
19976 SDValue NewShiftConstant =
19977 DAG.getConstant(Val: N001ConstVal - NScalarSize, DL, VT: MVT::i32);
19978
19979 return DAG.getNode(Opcode: AArch64ISD::VLSHR, DL, VT, N1: Uzp, N2: NewShiftConstant);
19980 }
19981 }
19982 }
19983
19984 if (N->getOperand(Num: 0).getValueType() == MVT::v4i8 ||
19985 N->getOperand(Num: 0).getValueType() == MVT::v2i16 ||
19986 N->getOperand(Num: 0).getValueType() == MVT::v2i8) {
19987 EVT SrcVT = N->getOperand(Num: 0).getValueType();
19988 // If we have a concat of small (v4i8/v2i16/v2i8) loads, convert them to a
19989 // build_vector of f32/f16 loads to avoid the small-type load legalization
19990 // that needs to extend each element into a larger type.
19991 if (N->getNumOperands() % 2 == 0 &&
19992 all_of(Range: N->op_values(), P: [SrcVT](SDValue V) {
19993 if (V.getValueType() != SrcVT)
19994 return false;
19995 if (V.isUndef())
19996 return true;
19997 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: V);
19998 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19999 LD->getExtensionType() == ISD::NON_EXTLOAD;
20000 })) {
20001 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20002 EVT NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: FVT, NumElements: N->getNumOperands());
20003 SmallVector<SDValue> Ops;
20004
20005 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20006 SDValue V = N->getOperand(Num: i);
20007 if (V.isUndef())
20008 Ops.push_back(Elt: DAG.getUNDEF(VT: FVT));
20009 else {
20010 LoadSDNode *LD = cast<LoadSDNode>(Val&: V);
20011 SDValue NewLoad = DAG.getLoad(VT: FVT, dl: DL, Chain: LD->getChain(),
20012 Ptr: LD->getBasePtr(), MMO: LD->getMemOperand());
20013 DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLoad.getValue(R: 1));
20014 Ops.push_back(Elt: NewLoad);
20015 }
20016 }
20017 return DAG.getBitcast(VT: N->getValueType(ResNo: 0),
20018 V: DAG.getBuildVector(VT: NVT, DL, Ops));
20019 }
20020 }
20021
20022 // Canonicalise concat_vectors to replace concatenations of truncated nots
20023 // with nots of concatenated truncates. This in some cases allows for multiple
20024 // redundant negations to be eliminated.
20025 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20026 // (v4i16 (truncate (not (v4i32)))))
20027 // ->
20028 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20029 // (v4i16 (truncate (v4i32)))))
20030 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20031 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N: N0.getNode()) &&
20032 N->isOnlyUserOf(N: N1.getNode())) {
20033 auto isBitwiseVectorNegate = [](SDValue V) {
20034 return V->getOpcode() == ISD::XOR &&
20035 ISD::isConstantSplatVectorAllOnes(N: V.getOperand(i: 1).getNode());
20036 };
20037 SDValue N00 = N0->getOperand(Num: 0);
20038 SDValue N10 = N1->getOperand(Num: 0);
20039 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N: N00.getNode()) &&
20040 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N: N10.getNode())) {
20041 return DAG.getNOT(
20042 DL,
20043 Val: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT,
20044 N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N0.getValueType(),
20045 Operand: N00->getOperand(Num: 0)),
20046 N2: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N1.getValueType(),
20047 Operand: N10->getOperand(Num: 0))),
20048 VT);
20049 }
20050 }
20051
20052 // Wait till after everything is legalized to try this. That way we have
20053 // legal vector types and such.
20054 if (DCI.isBeforeLegalizeOps())
20055 return SDValue();
20056
20057 // Optimise a concat_vectors of two binops with the same opcode and a 128-bit
20058 // destination size into a binop of two concats of the source vectors. eg:
20059 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20060 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20061 DAG.getTargetLoweringInfo().isBinOp(Opcode: N0Opc) && N0->hasOneUse() &&
20062 N1->hasOneUse()) {
20063 SDValue N00 = N0->getOperand(Num: 0);
20064 SDValue N01 = N0->getOperand(Num: 1);
20065 SDValue N10 = N1->getOperand(Num: 0);
20066 SDValue N11 = N1->getOperand(Num: 1);
20067
20068 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20069 SDValue Concat0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N00, N2: N10);
20070 SDValue Concat1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N01, N2: N11);
20071 return DAG.getNode(Opcode: N0Opc, DL, VT, N1: Concat0, N2: Concat1);
20072 }
20073 }
20074
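 // Matches the expanded form of a rounding shift right as used by rshrn:
 // (VLSHR (ADD X, 1 << (ShtAmt - 1)), ShtAmt), where the rounding constant is
 // either a MOVIshift or a constant DUP splat.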
20075 auto IsRSHRN = [](SDValue Shr) {
20076 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20077 return false;
20078 SDValue Op = Shr.getOperand(i: 0);
20079 EVT VT = Op.getValueType();
20080 unsigned ShtAmt = Shr.getConstantOperandVal(i: 1);
20081 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20082 return false;
20083
20084 APInt Imm;
20085 if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::MOVIshift)
20086 Imm = APInt(VT.getScalarSizeInBits(),
20087 Op.getOperand(i: 1).getConstantOperandVal(i: 0)
20088 << Op.getOperand(i: 1).getConstantOperandVal(i: 1));
20089 else if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::DUP &&
20090 isa<ConstantSDNode>(Val: Op.getOperand(i: 1).getOperand(i: 0)))
20091 Imm = APInt(VT.getScalarSizeInBits(),
20092 Op.getOperand(i: 1).getConstantOperandVal(i: 0));
20093 else
20094 return false;
20095
20096 if (Imm != 1ULL << (ShtAmt - 1))
20097 return false;
20098 return true;
20099 };
20100
20101 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
20102 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20103 ((IsRSHRN(N1) &&
20104 N0.getConstantOperandVal(i: 1) == N1.getConstantOperandVal(i: 1)) ||
20105 N1.isUndef())) {
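 // Redo the rounding add and shift at double the vector width: concat the
 // unshifted inputs, add back the rounding constant, then shift.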
20106 SDValue X = N0.getOperand(i: 0).getOperand(i: 0);
20107 SDValue Y = N1.isUndef() ? DAG.getUNDEF(VT: X.getValueType())
20108 : N1.getOperand(i: 0).getOperand(i: 0);
20109 EVT BVT =
20110 X.getValueType().getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
20111 SDValue CC = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: BVT, N1: X, N2: Y);
20112 SDValue Add = DAG.getNode(
20113 Opcode: ISD::ADD, DL, VT: BVT, N1: CC,
20114 N2: DAG.getConstant(Val: 1ULL << (N0.getConstantOperandVal(i: 1) - 1), DL, VT: BVT));
20115 SDValue Shr =
20116 DAG.getNode(Opcode: AArch64ISD::VLSHR, DL, VT: BVT, N1: Add, N2: N0.getOperand(i: 1));
20117 return Shr;
20118 }
20119
20120 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
20121 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20122 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(i: 0) == N1.getOperand(i: 0) &&
20123 N0.getOperand(i: 1) == N1.getOperand(i: 1)) {
20124 SDValue E0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N0.getOperand(i: 0),
20125 N2: DAG.getUNDEF(VT: N0.getValueType()));
20126 SDValue E1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: N0.getOperand(i: 1),
20127 N2: DAG.getUNDEF(VT: N0.getValueType()));
20128 return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT, N1: E0, N2: E1);
20129 }
20130
20131 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20132 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20133 // canonicalise to that.
20134 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20135 assert(VT.getScalarSizeInBits() == 64);
20136 return DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL, VT, N1: WidenVector(V64Reg: N0, DAG),
20137 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20138 }
20139
20140 // Canonicalise concat_vectors so that the right-hand vector has as few
20141 // bit-casts as possible before its real operation. The primary matching
20142 // destination for these operations will be the narrowing "2" instructions,
20143 // which depend on the operation being performed on this right-hand vector.
20144 // For example,
20145 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
20146 // becomes
20147 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
20148
20149 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
20150 return SDValue();
20151 SDValue RHS = N1->getOperand(Num: 0);
20152 MVT RHSTy = RHS.getValueType().getSimpleVT();
20153 // If the RHS is not a vector, this is not the pattern we're looking for.
20154 if (!RHSTy.isVector())
20155 return SDValue();
20156
20157 LLVM_DEBUG(
20158 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
20159
20160 MVT ConcatTy = MVT::getVectorVT(VT: RHSTy.getVectorElementType(),
20161 NumElements: RHSTy.getVectorNumElements() * 2);
20162 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT,
20163 Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatTy,
20164 N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RHSTy, Operand: N0),
20165 N2: RHS));
20166}
20167
20168static SDValue
20169performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20170 SelectionDAG &DAG) {
20171 if (DCI.isBeforeLegalizeOps())
20172 return SDValue();
20173
20174 EVT VT = N->getValueType(ResNo: 0);
20175 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
20176 return SDValue();
20177
20178 SDValue V = N->getOperand(Num: 0);
20179
20180 // NOTE: This combine exists in DAGCombiner, but that version's legality check
20181 // blocks this combine because the non-const case requires custom lowering.
20182 //
20183 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
20184 if (V.getOpcode() == ISD::SPLAT_VECTOR)
20185 if (isa<ConstantSDNode>(Val: V.getOperand(i: 0)))
20186 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT, Operand: V.getOperand(i: 0));
20187
20188 return SDValue();
20189}
20190
20191static SDValue
20192performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20193 SelectionDAG &DAG) {
20194 SDLoc DL(N);
20195 SDValue Vec = N->getOperand(Num: 0);
20196 SDValue SubVec = N->getOperand(Num: 1);
20197 uint64_t IdxVal = N->getConstantOperandVal(Num: 2);
20198 EVT VecVT = Vec.getValueType();
20199 EVT SubVT = SubVec.getValueType();
20200
20201 // Promote fixed length vector zeros.
20202 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
20203 Vec.isUndef() && isZerosVector(N: SubVec.getNode()))
20204 return VecVT.isInteger() ? DAG.getConstant(Val: 0, DL, VT: VecVT)
20205 : DAG.getConstantFP(Val: 0, DL, VT: VecVT);
20206
20207 // Only do this for legal fixed vector types.
20208 if (!VecVT.isFixedLengthVector() ||
20209 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) ||
20210 !DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT))
20211 return SDValue();
20212
20213 // Ignore widening patterns.
20214 if (IdxVal == 0 && Vec.isUndef())
20215 return SDValue();
20216
20217 // Subvector must be half the width and an "aligned" insertion.
20218 unsigned NumSubElts = SubVT.getVectorNumElements();
20219 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
20220 (IdxVal != 0 && IdxVal != NumSubElts))
20221 return SDValue();
20222
20223 // Fold insert_subvector -> concat_vectors
20224 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
20225 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
20226 SDValue Lo, Hi;
20227 if (IdxVal == 0) {
20228 Lo = SubVec;
20229 Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
20230 N2: DAG.getVectorIdxConstant(Val: NumSubElts, DL));
20231 } else {
20232 Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec,
20233 N2: DAG.getVectorIdxConstant(Val: 0, DL));
20234 Hi = SubVec;
20235 }
20236 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi);
20237}
20238
20239static SDValue tryCombineFixedPointConvert(SDNode *N,
20240 TargetLowering::DAGCombinerInfo &DCI,
20241 SelectionDAG &DAG) {
20242 // Wait until after everything is legalized to try this. That way we have
20243 // legal vector types and such.
20244 if (DCI.isBeforeLegalizeOps())
20245 return SDValue();
20246 // Transform a scalar conversion of a value from a lane extract into a
20247 // lane extract of a vector conversion. E.g., from foo1 to foo2:
20248 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
20249 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
20250 //
20251 // The second form interacts better with instruction selection and the
20252 // register allocator to avoid cross-class register copies that aren't
20253 // coalescable due to a lane reference.
20254
20255 // Check the operand and see if it originates from a lane extract.
20256 SDValue Op1 = N->getOperand(Num: 1);
20257 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
20258 return SDValue();
20259
20260 // Yep, no additional predication needed. Perform the transform.
20261 SDValue IID = N->getOperand(Num: 0);
20262 SDValue Shift = N->getOperand(Num: 2);
20263 SDValue Vec = Op1.getOperand(i: 0);
20264 SDValue Lane = Op1.getOperand(i: 1);
20265 EVT ResTy = N->getValueType(ResNo: 0);
20266 EVT VecResTy;
20267 SDLoc DL(N);
20268
20269 // The vector width should be 128 bits by the time we get here, even
20270 // if it started as 64 bits (the extract_vector handling will have
20271 // done so). Bail if it is not.
20272 if (Vec.getValueSizeInBits() != 128)
20273 return SDValue();
20274
20275 if (Vec.getValueType() == MVT::v4i32)
20276 VecResTy = MVT::v4f32;
20277 else if (Vec.getValueType() == MVT::v2i64)
20278 VecResTy = MVT::v2f64;
20279 else
20280 return SDValue();
20281
20282 SDValue Convert =
20283 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: VecResTy, N1: IID, N2: Vec, N3: Shift);
20284 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResTy, N1: Convert, N2: Lane);
20285}
20286
20287// AArch64 high-vector "long" operations are formed by performing the non-high
20288// version on an extract_subvector of each operand which gets the high half:
20289//
20290// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
20291//
20292// However, there are cases which don't have an extract_high explicitly, but
20293// have another operation that can be made compatible with one for free. For
20294// example:
20295//
20296// (dupv64 scalar) --> (extract_high (dup128 scalar))
20297//
20298// This routine does the actual conversion of such DUPs, once outer routines
20299// have determined that everything else is in order.
20300// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
20301// similarly here.
20302static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
20303 MVT VT = N.getSimpleValueType();
20304 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
20305 N.getConstantOperandVal(i: 1) == 0)
20306 N = N.getOperand(i: 0);
20307
20308 switch (N.getOpcode()) {
20309 case AArch64ISD::DUP:
20310 case AArch64ISD::DUPLANE8:
20311 case AArch64ISD::DUPLANE16:
20312 case AArch64ISD::DUPLANE32:
20313 case AArch64ISD::DUPLANE64:
20314 case AArch64ISD::MOVI:
20315 case AArch64ISD::MOVIshift:
20316 case AArch64ISD::MOVIedit:
20317 case AArch64ISD::MOVImsl:
20318 case AArch64ISD::MVNIshift:
20319 case AArch64ISD::MVNImsl:
20320 break;
20321 default:
20322 // FMOV could be supported, but isn't very useful, as it would only occur
20323 // if you passed a bitcast'd floating point immediate to an eligible long
20324 // integer op (addl, smull, ...).
20325 return SDValue();
20326 }
20327
20328 if (!VT.is64BitVector())
20329 return SDValue();
20330
20331 SDLoc DL(N);
20332 unsigned NumElems = VT.getVectorNumElements();
20333 if (N.getValueType().is64BitVector()) {
20334 MVT ElementTy = VT.getVectorElementType();
20335 MVT NewVT = MVT::getVectorVT(VT: ElementTy, NumElements: NumElems * 2);
20336 N = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NewVT, Ops: N->ops());
20337 }
20338
20339 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: N,
20340 N2: DAG.getConstant(Val: NumElems, DL, VT: MVT::i64));
20341}
20342
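// Returns true if N is (possibly looking through a bitcast) an
// extract_subvector of the high half of a fixed-length vector.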
20343static bool isEssentiallyExtractHighSubvector(SDValue N) {
20344 if (N.getOpcode() == ISD::BITCAST)
20345 N = N.getOperand(i: 0);
20346 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20347 return false;
20348 if (N.getOperand(i: 0).getValueType().isScalableVector())
20349 return false;
20350 return N.getConstantOperandAPInt(i: 1) ==
20351 N.getOperand(i: 0).getValueType().getVectorNumElements() / 2;
20352}
20353
20354/// Helper structure to keep track of ISD::SET_CC operands.
20355struct GenericSetCCInfo {
20356 const SDValue *Opnd0;
20357 const SDValue *Opnd1;
20358 ISD::CondCode CC;
20359};
20360
20361/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
20362struct AArch64SetCCInfo {
20363 const SDValue *Cmp;
20364 AArch64CC::CondCode CC;
20365};
20366
20367/// Helper structure to keep track of SetCC information.
20368union SetCCInfo {
20369 GenericSetCCInfo Generic;
20370 AArch64SetCCInfo AArch64;
20371};
20372
20373 /// Helper structure to be able to read SetCC information. If the IsAArch64
20374 /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
20375 /// GenericSetCCInfo.
20376struct SetCCInfoAndKind {
20377 SetCCInfo Info;
20378 bool IsAArch64;
20379};
20380
20381 /// Check whether or not \p Op is a SET_CC operation, either a generic or an
20382 /// AArch64 lowered one.
20383 /// \p SetCCInfo is filled accordingly.
20384 /// \post SetCCInfo is meaningful only when this function returns true.
20385 /// \return True when Op is a kind of SET_CC operation.
20387static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
20388 // If this is a setcc, this is straightforward.
20389 if (Op.getOpcode() == ISD::SETCC) {
20390 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(i: 0);
20391 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(i: 1);
20392 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get();
20393 SetCCInfo.IsAArch64 = false;
20394 return true;
20395 }
20396 // Otherwise, check if this is a matching csel instruction.
20397 // In other words:
20398 // - csel 1, 0, cc
20399 // - csel 0, 1, !cc
20400 if (Op.getOpcode() != AArch64ISD::CSEL)
20401 return false;
20402 // Set the information about the operands.
20403 // TODO: we want the operands of the Cmp, not the csel.
20404 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(i: 3);
20405 SetCCInfo.IsAArch64 = true;
20406 SetCCInfo.Info.AArch64.CC =
20407 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
20408
20409 // Check that the operands match the constraints:
20410 // (1) Both operands must be constants.
20411 // (2) One must be 1 and the other must be 0.
20412 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0));
20413 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1));
20414
20415 // Check (1).
20416 if (!TValue || !FValue)
20417 return false;
20418
20419 // Check (2).
20420 if (!TValue->isOne()) {
20421 // Update the comparison when we are interested in !cc.
20422 std::swap(a&: TValue, b&: FValue);
20423 SetCCInfo.Info.AArch64.CC =
20424 AArch64CC::getInvertedCondCode(Code: SetCCInfo.Info.AArch64.CC);
20425 }
20426 return TValue->isOne() && FValue->isZero();
20427}
20428
20429// Returns true if Op is setcc or zext of setcc.
20430static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
20431 if (isSetCC(Op, SetCCInfo&: Info))
20432 return true;
20433 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
20434 isSetCC(Op: Op->getOperand(Num: 0), SetCCInfo&: Info));
20435}
20436
20437// The folding we want to perform is:
20438// (add x, [zext] (setcc cc ...) )
20439// -->
20440// (csel x, (add x, 1), !cc ...)
20441//
20442// The latter will get matched to a CSINC instruction.
20443static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
20444 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
20445 SDValue LHS = Op->getOperand(Num: 0);
20446 SDValue RHS = Op->getOperand(Num: 1);
20447 SetCCInfoAndKind InfoAndKind;
20448
20449 // If both operands are a SET_CC, then we don't want to perform this
20450 // folding and create another csel as this results in more instructions
20451 // (and higher register usage).
20452 if (isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind) &&
20453 isSetCCOrZExtSetCC(Op: RHS, Info&: InfoAndKind))
20454 return SDValue();
20455
20456 // If neither operand is a SET_CC, give up.
20457 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) {
20458 std::swap(a&: LHS, b&: RHS);
20459 if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind))
20460 return SDValue();
20461 }
20462
20463 // FIXME: This could be generalized to work for FP comparisons.
20464 EVT CmpVT = InfoAndKind.IsAArch64
20465 ? InfoAndKind.Info.AArch64.Cmp->getOperand(i: 0).getValueType()
20466 : InfoAndKind.Info.Generic.Opnd0->getValueType();
20467 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
20468 return SDValue();
20469
20470 SDValue CCVal;
20471 SDValue Cmp;
20472 SDLoc DL(Op);
20473 if (InfoAndKind.IsAArch64) {
20474 CCVal = DAG.getConstant(
20475 Val: AArch64CC::getInvertedCondCode(Code: InfoAndKind.Info.AArch64.CC), DL,
20476 VT: MVT::i32);
20477 Cmp = *InfoAndKind.Info.AArch64.Cmp;
20478 } else
20479 Cmp = getAArch64Cmp(
20480 LHS: *InfoAndKind.Info.Generic.Opnd0, RHS: *InfoAndKind.Info.Generic.Opnd1,
20481 CC: ISD::getSetCCInverse(Operation: InfoAndKind.Info.Generic.CC, Type: CmpVT), AArch64cc&: CCVal, DAG,
20482 DL);
20483
20484 EVT VT = Op->getValueType(ResNo: 0);
20485 LHS = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: DAG.getConstant(Val: 1, DL, VT));
20486 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: RHS, N2: LHS, N3: CCVal, N4: Cmp);
20487}
20488
20489// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
20490static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
20491 EVT VT = N->getValueType(ResNo: 0);
20492 // Only handle scalar integer result types.
20493 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
20494 return SDValue();
20495
20496 SDValue LHS = N->getOperand(Num: 0);
20497 SDValue RHS = N->getOperand(Num: 1);
20498 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20499 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
20500 return SDValue();
20501
20502 auto *LHSN1 = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1));
20503 auto *RHSN1 = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 1));
20504 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
20505 return SDValue();
20506
20507 SDValue Op1 = LHS->getOperand(Num: 0);
20508 SDValue Op2 = RHS->getOperand(Num: 0);
20509 EVT OpVT1 = Op1.getValueType();
20510 EVT OpVT2 = Op2.getValueType();
20511 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
20512 Op2.getOpcode() != AArch64ISD::UADDV ||
20513 OpVT1.getVectorElementType() != VT)
20514 return SDValue();
20515
20516 SDValue Val1 = Op1.getOperand(i: 0);
20517 SDValue Val2 = Op2.getOperand(i: 0);
20518 EVT ValVT = Val1->getValueType(ResNo: 0);
20519 SDLoc DL(N);
20520 SDValue AddVal = DAG.getNode(Opcode: ISD::ADD, DL, VT: ValVT, N1: Val1, N2: Val2);
20521 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT,
20522 N1: DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: ValVT, Operand: AddVal),
20523 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
20524}
20525
20526/// Perform the scalar expression combine in the form of:
20527/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
20528/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
20529static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
20530 EVT VT = N->getValueType(ResNo: 0);
20531 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
20532 return SDValue();
20533
20534 SDValue LHS = N->getOperand(Num: 0);
20535 SDValue RHS = N->getOperand(Num: 1);
20536
20537 // Handle commutativity.
20538 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20539 LHS.getOpcode() != AArch64ISD::CSNEG) {
20540 std::swap(a&: LHS, b&: RHS);
20541 if (LHS.getOpcode() != AArch64ISD::CSEL &&
20542 LHS.getOpcode() != AArch64ISD::CSNEG) {
20543 return SDValue();
20544 }
20545 }
20546
20547 if (!LHS.hasOneUse())
20548 return SDValue();
20549
20550 AArch64CC::CondCode AArch64CC =
20551 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
20552
20553 // The CSEL should have a constant one operand, and the CSNEG should have a
20554 // constant one or negative-one operand.
20555 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 0));
20556 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1));
20557 if (!CTVal || !CFVal)
20558 return SDValue();
20559
20560 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
20561 (CTVal->isOne() || CFVal->isOne())) &&
20562 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
20563 (CTVal->isOne() || CFVal->isAllOnes())))
20564 return SDValue();
20565
20566 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
20567 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
20568 !CFVal->isOne()) {
20569 std::swap(a&: CTVal, b&: CFVal);
20570 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
20571 }
20572
20573 SDLoc DL(N);
20574 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
20575 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
20576 !CFVal->isAllOnes()) {
20577 APInt C = -1 * CFVal->getAPIntValue();
20578 CTVal = cast<ConstantSDNode>(Val: DAG.getConstant(Val: C, DL, VT));
20579 CFVal = cast<ConstantSDNode>(Val: DAG.getAllOnesConstant(DL, VT));
20580 AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC);
20581 }
20582
20583 // It might be neutral for larger constants, as the immediate needs to be
20584 // materialized in a register.
20585 APInt ADDC = CTVal->getAPIntValue();
20586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20587 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
20588 return SDValue();
20589
20590 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
20591 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
20592 "Unexpected constant value");
20593
20594 SDValue NewNode = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: SDValue(CTVal, 0));
20595 SDValue CCVal = DAG.getConstant(Val: AArch64CC, DL, VT: MVT::i32);
20596 SDValue Cmp = LHS.getOperand(i: 3);
20597
20598 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: NewNode, N2: RHS, N3: CCVal, N4: Cmp);
20599}
20600
20601// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
20602static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
20603 EVT VT = N->getValueType(ResNo: 0);
20604 if (N->getOpcode() != ISD::ADD)
20605 return SDValue();
20606
20607 SDValue Dot = N->getOperand(Num: 0);
20608 SDValue A = N->getOperand(Num: 1);
20609 // Handle commutativity.
20610 auto isZeroDot = [](SDValue Dot) {
20611 return (Dot.getOpcode() == AArch64ISD::UDOT ||
20612 Dot.getOpcode() == AArch64ISD::SDOT) &&
20613 isZerosVector(N: Dot.getOperand(i: 0).getNode());
20614 };
20615 if (!isZeroDot(Dot))
20616 std::swap(a&: Dot, b&: A);
20617 if (!isZeroDot(Dot))
20618 return SDValue();
20619
20620 return DAG.getNode(Opcode: Dot.getOpcode(), DL: SDLoc(N), VT, N1: A, N2: Dot.getOperand(i: 1),
20621 N3: Dot.getOperand(i: 2));
20622}
20623
20624static bool isNegatedInteger(SDValue Op) {
20625 return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0));
20626}
20627
20628static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
20629 SDLoc DL(Op);
20630 EVT VT = Op.getValueType();
20631 SDValue Zero = DAG.getConstant(Val: 0, DL, VT);
20632 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: Op);
20633}
20634
20635// Try to fold
20636//
20637// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
20638//
20639// The folding helps csel to be matched with csneg without generating
20640// redundant neg instruction, which includes negation of the csel expansion
20641// of abs node lowered by lowerABS.
20642static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
20643 if (!isNegatedInteger(Op: SDValue(N, 0)))
20644 return SDValue();
20645
20646 SDValue CSel = N->getOperand(Num: 1);
20647 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
20648 return SDValue();
20649
20650 SDValue N0 = CSel.getOperand(i: 0);
20651 SDValue N1 = CSel.getOperand(i: 1);
20652
20653 // If neither of them is a negation, the folding isn't worthwhile as it
20654 // introduces two additional negations while removing only one.
20655 if (!isNegatedInteger(Op: N0) && !isNegatedInteger(Op: N1))
20656 return SDValue();
20657
20658 SDValue N0N = getNegatedInteger(Op: N0, DAG);
20659 SDValue N1N = getNegatedInteger(Op: N1, DAG);
20660
20661 SDLoc DL(N);
20662 EVT VT = CSel.getValueType();
20663 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: N0N, N2: N1N, N3: CSel.getOperand(i: 2),
20664 N4: CSel.getOperand(i: 3));
20665}
20666
20667// The basic add/sub long vector instructions have variants with "2" on the end
20668// which act on the high-half of their inputs. They are normally matched by
20669// patterns like:
20670//
20671// (add (zeroext (extract_high LHS)),
20672// (zeroext (extract_high RHS)))
20673// -> uaddl2 vD, vN, vM
20674//
20675// However, if one of the extracts is something like a duplicate, this
20676// instruction can still be used profitably. This function puts the DAG into a
20677// more appropriate form for those patterns to trigger.
20678static SDValue performAddSubLongCombine(SDNode *N,
20679 TargetLowering::DAGCombinerInfo &DCI) {
20680 SelectionDAG &DAG = DCI.DAG;
20681 if (DCI.isBeforeLegalizeOps())
20682 return SDValue();
20683
20684 MVT VT = N->getSimpleValueType(ResNo: 0);
20685 if (!VT.is128BitVector()) {
20686 if (N->getOpcode() == ISD::ADD)
20687 return performSetccAddFolding(Op: N, DAG);
20688 return SDValue();
20689 }
20690
20691 // Make sure both branches are extended in the same way.
20692 SDValue LHS = N->getOperand(Num: 0);
20693 SDValue RHS = N->getOperand(Num: 1);
20694 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20695 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20696 LHS.getOpcode() != RHS.getOpcode())
20697 return SDValue();
20698
20699 unsigned ExtType = LHS.getOpcode();
20700
20701 // It's not worth doing if at least one of the inputs isn't already an
20702 // extract, but we don't know which it'll be so we have to try both.
20703 if (isEssentiallyExtractHighSubvector(N: LHS.getOperand(i: 0))) {
20704 RHS = tryExtendDUPToExtractHigh(N: RHS.getOperand(i: 0), DAG);
20705 if (!RHS.getNode())
20706 return SDValue();
20707
20708 RHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: RHS);
20709 } else if (isEssentiallyExtractHighSubvector(N: RHS.getOperand(i: 0))) {
20710 LHS = tryExtendDUPToExtractHigh(N: LHS.getOperand(i: 0), DAG);
20711 if (!LHS.getNode())
20712 return SDValue();
20713
20714 LHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: LHS);
20715 }
20716
20717 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT, N1: LHS, N2: RHS);
20718}
20719
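// A SUBS node whose integer result is unused acts as a compare: only the flags
// are consumed.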
20720static bool isCMP(SDValue Op) {
20721 return Op.getOpcode() == AArch64ISD::SUBS &&
20722 !Op.getNode()->hasAnyUseOfValue(Value: 0);
20723}
20724
20725// (CSEL 1 0 CC Cond) => CC
20726// (CSEL 0 1 CC Cond) => !CC
20727static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20728 if (Op.getOpcode() != AArch64ISD::CSEL)
20729 return std::nullopt;
20730 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2));
20731 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20732 return std::nullopt;
20733 SDValue OpLHS = Op.getOperand(i: 0);
20734 SDValue OpRHS = Op.getOperand(i: 1);
20735 if (isOneConstant(V: OpLHS) && isNullConstant(V: OpRHS))
20736 return CC;
20737 if (isNullConstant(V: OpLHS) && isOneConstant(V: OpRHS))
20738 return getInvertedCondCode(Code: CC);
20739
20740 return std::nullopt;
20741}
20742
20743// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20744// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
20745static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20746 SDValue CmpOp = Op->getOperand(Num: 2);
20747 if (!isCMP(Op: CmpOp))
20748 return SDValue();
20749
20750 if (IsAdd) {
20751 if (!isOneConstant(V: CmpOp.getOperand(i: 1)))
20752 return SDValue();
20753 } else {
20754 if (!isNullConstant(V: CmpOp.getOperand(i: 0)))
20755 return SDValue();
20756 }
20757
20758 SDValue CsetOp = CmpOp->getOperand(Num: IsAdd ? 0 : 1);
20759 auto CC = getCSETCondCode(Op: CsetOp);
20760 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20761 return SDValue();
20762
20763 return DAG.getNode(Opcode: Op->getOpcode(), DL: SDLoc(Op), VTList: Op->getVTList(),
20764 N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1),
20765 N3: CsetOp.getOperand(i: 3));
20766}
20767
20768// (ADC x 0 cond) => (CINC x HS cond)
20769static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20770 SDValue LHS = N->getOperand(Num: 0);
20771 SDValue RHS = N->getOperand(Num: 1);
20772 SDValue Cond = N->getOperand(Num: 2);
20773
20774 if (!isNullConstant(V: RHS))
20775 return SDValue();
20776
20777 EVT VT = N->getValueType(ResNo: 0);
20778 SDLoc DL(N);
20779
20780 // (CINC x cc cond) <=> (CSINC x x !cc cond)
20781 SDValue CC = DAG.getConstant(Val: AArch64CC::LO, DL, VT: MVT::i32);
20782 return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: LHS, N2: LHS, N3: CC, N4: Cond);
20783}
20784
20785static SDValue performBuildVectorCombine(SDNode *N,
20786 TargetLowering::DAGCombinerInfo &DCI,
20787 SelectionDAG &DAG) {
20788 SDLoc DL(N);
20789 EVT VT = N->getValueType(ResNo: 0);
20790
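 // Look for a v4f16/v4bf16 build_vector whose elements are fp_rounds of the
 // lanes of one or two v2f64 sources. This can instead be done as FCVTXN to
 // v2f32, a concat to v4f32 and a single fp_round of the whole vector.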
20791 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
20792 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20793 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1),
20794 Elt2 = N->getOperand(Num: 2), Elt3 = N->getOperand(Num: 3);
20795 if (Elt0->getOpcode() == ISD::FP_ROUND &&
20796 Elt1->getOpcode() == ISD::FP_ROUND &&
20797 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
20798 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
20799 Elt0->getConstantOperandVal(Num: 1) == Elt1->getConstantOperandVal(Num: 1) &&
20800 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20801 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20802 // Constant index.
20803 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
20804 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
20805 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
20806 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
20807 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
20808 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
20809 SDValue LowLanesSrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
20810 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20811 SDValue HighLanes;
20812 if (Elt2->getOpcode() == ISD::UNDEF &&
20813 Elt3->getOpcode() == ISD::UNDEF) {
20814 HighLanes = DAG.getUNDEF(VT: MVT::v2f32);
20815 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20816 Elt3->getOpcode() == ISD::FP_ROUND &&
20817 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 1)) &&
20818 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 1)) &&
20819 Elt2->getConstantOperandVal(Num: 1) ==
20820 Elt3->getConstantOperandVal(Num: 1) &&
20821 Elt2->getOperand(Num: 0)->getOpcode() ==
20822 ISD::EXTRACT_VECTOR_ELT &&
20823 Elt3->getOperand(Num: 0)->getOpcode() ==
20824 ISD::EXTRACT_VECTOR_ELT &&
20825 // Constant index.
20826 isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 0)->getOperand(Num: 1)) &&
20827 isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 0)->getOperand(Num: 1)) &&
20828 Elt2->getOperand(Num: 0)->getOperand(Num: 0) ==
20829 Elt3->getOperand(Num: 0)->getOperand(Num: 0) &&
20830 Elt2->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 &&
20831 Elt3->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) {
20832 SDValue HighLanesSrcVec = Elt2->getOperand(Num: 0)->getOperand(Num: 0);
20833 HighLanes =
20834 DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: HighLanesSrcVec);
20835 }
20836 if (HighLanes) {
20837 SDValue DoubleToSingleSticky =
20838 DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: LowLanesSrcVec);
20839 SDValue Concat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v4f32,
20840 N1: DoubleToSingleSticky, N2: HighLanes);
20841 return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Concat,
20842 N2: Elt0->getOperand(Num: 1));
20843 }
20844 }
20845 }
20846 }
20847
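 // Look for a v2f64 build_vector whose elements are fp_extends of two adjacent
 // lanes of a v4f16/v4bf16 source. This can be done by extending the whole
 // source to v4f32 and then fp_extending the relevant half to v2f64.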
20848 if (VT == MVT::v2f64) {
20849 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
20850 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20851 Elt1->getOpcode() == ISD::FP_EXTEND &&
20852 Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20853 Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20854 Elt0->getOperand(Num: 0)->getOperand(Num: 0) ==
20855 Elt1->getOperand(Num: 0)->getOperand(Num: 0) &&
20856 // Constant index.
20857 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) &&
20858 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) &&
20859 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) + 1 ==
20860 Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) &&
20861 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20862 // ResultType's known minimum vector length.
20863 Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) %
20864 VT.getVectorMinNumElements() ==
20865 0) {
20866 SDValue SrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0);
20867 if (SrcVec.getValueType() == MVT::v4f16 ||
20868 SrcVec.getValueType() == MVT::v4bf16) {
20869 SDValue HalfToSingle =
20870 DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::v4f32, Operand: SrcVec);
20871 SDValue SubvectorIdx = Elt0->getOperand(Num: 0)->getOperand(Num: 1);
20872 SDValue Extract = DAG.getNode(
20873 Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: VT.changeVectorElementType(EltVT: MVT::f32),
20874 N1: HalfToSingle, N2: SubvectorIdx);
20875 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Extract);
20876 }
20877 }
20878 }
20879
20880 // A build vector of two extracted elements is equivalent to an
20881 // extract subvector where the inner vector is any-extended to the
20882 // extract_vector_elt VT.
20883 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20884 // (extract_elt_iXX_to_i32 vec Idx+1))
20885 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
20886
20887 // For now, only consider the v2i32 case, which arises as a result of
20888 // legalization.
20889 if (VT != MVT::v2i32)
20890 return SDValue();
20891
20892 SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1);
20893 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20894 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20895 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20896 // Constant index.
20897 isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) &&
20898 isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) &&
20899 // Both EXTRACT_VECTOR_ELT from same vector...
20900 Elt0->getOperand(Num: 0) == Elt1->getOperand(Num: 0) &&
20901 // ... and contiguous. First element's index +1 == second element's index.
20902 Elt0->getConstantOperandVal(Num: 1) + 1 == Elt1->getConstantOperandVal(Num: 1) &&
20903 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20904 // ResultType's known minimum vector length.
20905 Elt0->getConstantOperandVal(Num: 1) % VT.getVectorMinNumElements() == 0) {
20906 SDValue VecToExtend = Elt0->getOperand(Num: 0);
20907 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(EltVT: MVT::i32);
20908 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ExtVT))
20909 return SDValue();
20910
20911 SDValue SubvectorIdx =
        DAG.getVectorIdxConstant(Val: Elt0->getConstantOperandVal(Num: 1), DL);
20912
20913 SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVT, Operand: VecToExtend);
20914 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: Ext,
20915 N2: SubvectorIdx);
20916 }
20917
20918 return SDValue();
20919}
20920
20921static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
20922 TargetLowering::DAGCombinerInfo &DCI) {
20923 SDLoc DL(N);
20924 EVT VT = N->getValueType(ResNo: 0);
20925 SDValue N0 = N->getOperand(Num: 0);
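 // Fold trunc(dup(x)) into a dup of the narrower type, truncating the scalar
 // operand when necessary.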
20926 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20927 N0.getOpcode() == AArch64ISD::DUP) {
20928 SDValue Op = N0.getOperand(i: 0);
20929 if (VT.getScalarType() == MVT::i32 &&
20930 N0.getOperand(i: 0).getValueType().getScalarType() == MVT::i64)
20931 Op = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: Op);
20932 return DAG.getNode(Opcode: N0.getOpcode(), DL, VT, Operand: Op);
20933 }
20934
20935 // Performing the following combine produces a preferable form for ISEL.
20936 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
20937 if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20938 N0.hasOneUse()) {
20939 SDValue Op = N0.getOperand(i: 0);
20940 SDValue ExtractIndexNode = N0.getOperand(i: 1);
20941 if (!isa<ConstantSDNode>(Val: ExtractIndexNode))
20942 return SDValue();
20943
20944 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
20945 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
20946 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
20947 "Unexpected legalisation result!");
20948
20949 EVT SrcVectorType = Op.getValueType();
20950 // We also assume that SrcVectorType cannot be a V64 (see
20951 // LowerEXTRACT_VECTOR_ELT).
20952 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
20953 "Unexpected legalisation result!");
20954
20955 unsigned ExtractIndex =
20956 cast<ConstantSDNode>(Val&: ExtractIndexNode)->getZExtValue();
20957 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
20958
20959 Op = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: CastVT, Operand: Op);
20960 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op,
20961 N2: DAG.getVectorIdxConstant(Val: ExtractIndex * 2, DL));
20962 }
20963
20964 return SDValue();
20965}
20966
20967 // Check whether a node is an extend or shift operand.
20968static bool isExtendOrShiftOperand(SDValue N) {
20969 unsigned Opcode = N.getOpcode();
20970 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20971 EVT SrcVT;
20972 if (Opcode == ISD::SIGN_EXTEND_INREG)
20973 SrcVT = cast<VTSDNode>(Val: N.getOperand(i: 1))->getVT();
20974 else
20975 SrcVT = N.getOperand(i: 0).getValueType();
20976
20977 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20978 } else if (Opcode == ISD::AND) {
20979 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1));
20980 if (!CSD)
20981 return false;
20982 uint64_t AndMask = CSD->getZExtValue();
20983 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20984 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20985 return isa<ConstantSDNode>(Val: N.getOperand(i: 1));
20986 }
20987
20988 return false;
20989}
20990
20991// (N - Y) + Z --> (Z - Y) + N
20992// when N is an extend or shift operand
20993static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20994 SelectionDAG &DAG) {
20995 auto IsOneUseExtend = [](SDValue N) {
20996 return N.hasOneUse() && isExtendOrShiftOperand(N);
20997 };
20998
20999 // DAGCombiner will revert the combination when Z is a constant, causing an
21000 // infinite loop, so don't enable the combination when Z is constant.
21001 // Likewise, if Z is a one-use extend or shift we can't do the optimization;
21002 // it would also fall into an infinite loop.
21003 if (isa<ConstantSDNode>(Val: Z) || IsOneUseExtend(Z))
21004 return SDValue();
21005
21006 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
21007 return SDValue();
21008
21009 SDValue Shift = SUB.getOperand(i: 0);
21010 if (!IsOneUseExtend(Shift))
21011 return SDValue();
21012
21013 SDLoc DL(N);
21014 EVT VT = N->getValueType(ResNo: 0);
21015
21016 SDValue Y = SUB.getOperand(i: 1);
21017 SDValue NewSub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Z, N2: Y);
21018 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: NewSub, N2: Shift);
21019}
21020
21021static SDValue performAddCombineForShiftedOperands(SDNode *N,
21022 SelectionDAG &DAG) {
21023 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
21024 // commutative.
21025 if (N->getOpcode() != ISD::ADD)
21026 return SDValue();
21027
21028 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
21029 // shifted register is only available for i32 and i64.
21030 EVT VT = N->getValueType(ResNo: 0);
21031 if (VT != MVT::i32 && VT != MVT::i64)
21032 return SDValue();
21033
21034 SDLoc DL(N);
21035 SDValue LHS = N->getOperand(Num: 0);
21036 SDValue RHS = N->getOperand(Num: 1);
21037
21038 if (SDValue Val = performAddCombineSubShift(N, SUB: LHS, Z: RHS, DAG))
21039 return Val;
21040 if (SDValue Val = performAddCombineSubShift(N, SUB: RHS, Z: LHS, DAG))
21041 return Val;
21042
21043 uint64_t LHSImm = 0, RHSImm = 0;
21044 // If both operands are shifted by an immediate and the shift amount is not
21045 // greater than 4 for one operand, swap LHS and RHS to put the operand with
21046 // the smaller shift amount on the RHS.
21047 //
21048 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
21049 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
21050 // with LSL (shift > 4). For other processors, this is a no-op with respect to
21051 // both performance and correctness.
21052 if (isOpcWithIntImmediate(N: LHS.getNode(), Opc: ISD::SHL, Imm&: LHSImm) &&
21053 isOpcWithIntImmediate(N: RHS.getNode(), Opc: ISD::SHL, Imm&: RHSImm) && LHSImm <= 4 &&
21054 RHSImm > 4 && LHS.hasOneUse())
21055 return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: LHS);
21056
21057 return SDValue();
21058}
21059
21060 // The middle end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
21061 // This reassociates it back to allow the creation of more mls instructions.
21062static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
21063 if (N->getOpcode() != ISD::SUB)
21064 return SDValue();
21065
21066 SDValue Add = N->getOperand(Num: 1);
21067 SDValue X = N->getOperand(Num: 0);
21068 if (Add.getOpcode() != ISD::ADD)
21069 return SDValue();
21070
21071 if (!Add.hasOneUse())
21072 return SDValue();
21073 if (DAG.isConstantIntBuildVectorOrConstantInt(N: X))
21074 return SDValue();
21075
21076 SDValue M1 = Add.getOperand(i: 0);
21077 SDValue M2 = Add.getOperand(i: 1);
21078 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
21079 M1.getOpcode() != AArch64ISD::UMULL)
21080 return SDValue();
21081 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
21082 M2.getOpcode() != AArch64ISD::UMULL)
21083 return SDValue();
21084
21085 EVT VT = N->getValueType(ResNo: 0);
21086 SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: X, N2: M1);
21087 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: Sub, N2: M2);
21088}
21089
21090// Combine into mla/mls.
21091// This works on the patterns of:
21092// add v1, (mul v2, v3)
21093// sub v1, (mul v2, v3)
21094// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
21095// It will transform the add/sub to a scalable version, so that we can
21096 // make use of SVE's MLA/MLS that will be generated for that pattern.
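// A rough sketch of the rewrite (operand names are made up):
//   add(v2i64 x, extract_subvector(MUL_PRED(pg, a, b), 0))
//     --> extract_subvector(add(insert_subvector(undef, x, 0),
//                               MUL_PRED(pg, a, b)), 0)
// so that isel can match the scalable add/sub of a predicated multiply as
// MLA/MLS.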
21097static SDValue
21098performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
21099 SelectionDAG &DAG = DCI.DAG;
21100 // Make sure that the types are legal
21101 if (!DCI.isAfterLegalizeDAG())
21102 return SDValue();
21103 // Before using SVE's features, check first if it's available.
21104 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
21105 return SDValue();
21106
21107 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
21108 return SDValue();
21109
21110 if (!N->getValueType(ResNo: 0).isFixedLengthVector())
21111 return SDValue();
21112
21113 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
21114 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21115 return SDValue();
21116
21117 if (!cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1))->isZero())
21118 return SDValue();
21119
21120 SDValue MulValue = Op1->getOperand(Num: 0);
21121 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
21122 return SDValue();
21123
21124 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
21125 return SDValue();
21126
21127 EVT ScalableVT = MulValue.getValueType();
21128 if (!ScalableVT.isScalableVector())
21129 return SDValue();
21130
21131 SDValue ScaledOp = convertToScalableVector(DAG, VT: ScalableVT, V: Op0);
21132 SDValue NewValue =
21133 DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: ScalableVT, Ops: {ScaledOp, MulValue});
21134 return convertFromScalableVector(DAG, VT: N->getValueType(ResNo: 0), V: NewValue);
21135 };
21136
21137 if (SDValue res = performOpt(N->getOperand(Num: 0), N->getOperand(Num: 1)))
21138 return res;
21139 else if (N->getOpcode() == ISD::ADD)
21140 return performOpt(N->getOperand(Num: 1), N->getOperand(Num: 0));
21141
21142 return SDValue();
21143}
21144
21145 // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
21146// help, for example, to produce ssra from sshr+add.
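// For example (illustrative):
//   add(i64 extract_elt(v1i64 sshr(v, #c), 0), i64 load p)
//     --> extract_elt(add(v1i64 sshr(v, #c), scalar_to_vector(load p)), 0)
// keeping the computation in the vector domain so that SSRA can be formed.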
21147static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
21148 EVT VT = N->getValueType(ResNo: 0);
21149 if (VT != MVT::i64 ||
21150 DAG.getTargetLoweringInfo().isOperationExpand(Op: N->getOpcode(), VT: MVT::v1i64))
21151 return SDValue();
21152 SDValue Op0 = N->getOperand(Num: 0);
21153 SDValue Op1 = N->getOperand(Num: 1);
21154
21155 // At least one of the operands should be an extract, and the other should be
21156 // something that is easy to convert to v1i64 type (in this case a load).
21157 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21158 Op0.getOpcode() != ISD::LOAD)
21159 return SDValue();
21160 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
21161 Op1.getOpcode() != ISD::LOAD)
21162 return SDValue();
21163
21164 SDLoc DL(N);
21165 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21166 Op0.getOperand(i: 0).getValueType() == MVT::v1i64) {
21167 Op0 = Op0.getOperand(i: 0);
21168 Op1 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op1);
21169 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21170 Op1.getOperand(i: 0).getValueType() == MVT::v1i64) {
21171 Op0 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op0);
21172 Op1 = Op1.getOperand(i: 0);
21173 } else
21174 return SDValue();
21175
21176 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64,
21177 N1: DAG.getNode(Opcode: N->getOpcode(), DL, VT: MVT::v1i64, N1: Op0, N2: Op1),
21178 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21179}
21180
21181static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
21182 SDValue BV = peekThroughOneUseBitcasts(V: B);
21183 if (!BV->hasOneUse())
21184 return false;
21185 if (auto *Ld = dyn_cast<LoadSDNode>(Val&: BV)) {
21186 if (!Ld || !Ld->isSimple())
21187 return false;
21188 Loads.push_back(Elt: Ld);
21189 return true;
21190 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
21191 BV.getOpcode() == ISD::CONCAT_VECTORS) {
21192 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
21193 auto *Ld = dyn_cast<LoadSDNode>(Val: BV.getOperand(i: Op));
21194 if (!Ld || !Ld->isSimple() || !BV.getOperand(i: Op).hasOneUse())
21195 return false;
21196 Loads.push_back(Elt: Ld);
21197 }
21198 return true;
21199 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
21200 // Try to find a tree of shuffles and concats from how IR shuffles of loads
21201 // are lowered. Note that this only comes up because we do not always visit
21202 // operands before uses. After that is fixed this can be removed and in the
21203 // meantime this is fairly specific to the lowering we expect from IR.
21204 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
21205 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
21206 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
21207 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
21208 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
21209 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
21210 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
21211 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
21212 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
21213 if (B.getOperand(i: 0).getOpcode() != ISD::VECTOR_SHUFFLE ||
21214 B.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::CONCAT_VECTORS ||
21215 B.getOperand(i: 0).getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
21216 B.getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS ||
21217 B.getOperand(i: 1).getNumOperands() != 4)
21218 return false;
21219 auto SV1 = cast<ShuffleVectorSDNode>(Val&: B);
21220 auto SV2 = cast<ShuffleVectorSDNode>(Val: B.getOperand(i: 0));
21221 int NumElts = B.getValueType().getVectorNumElements();
21222 int NumSubElts = NumElts / 4;
21223 for (int I = 0; I < NumSubElts; I++) {
21224 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
21225 if (SV1->getMaskElt(Idx: I) != I ||
21226 SV1->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
21227 SV1->getMaskElt(Idx: I + NumSubElts * 2) != I + NumSubElts * 2 ||
21228 SV1->getMaskElt(Idx: I + NumSubElts * 3) != I + NumElts)
21229 return false;
21230 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
21231 if (SV2->getMaskElt(Idx: I) != I ||
21232 SV2->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts ||
21233 SV2->getMaskElt(Idx: I + NumSubElts * 2) != I + NumElts)
21234 return false;
21235 }
21236 auto *Ld0 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 0));
21237 auto *Ld1 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 1));
21238 auto *Ld2 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 1).getOperand(i: 0));
21239 auto *Ld3 = dyn_cast<LoadSDNode>(Val: B.getOperand(i: 1).getOperand(i: 0));
21240 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
21241 !Ld2->isSimple() || !Ld3->isSimple())
21242 return false;
21243 Loads.push_back(Elt: Ld0);
21244 Loads.push_back(Elt: Ld1);
21245 Loads.push_back(Elt: Ld2);
21246 Loads.push_back(Elt: Ld3);
21247 return true;
21248 }
21249 return false;
21250}
21251
21252static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
21253 SelectionDAG &DAG,
21254 unsigned &NumSubLoads) {
21255 if (!Op0.hasOneUse() || !Op1.hasOneUse())
21256 return false;
21257
21258 SmallVector<LoadSDNode *> Loads0, Loads1;
21259 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
21260 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
21261 if (NumSubLoads && Loads0.size() != NumSubLoads)
21262 return false;
21263 NumSubLoads = Loads0.size();
21264 return Loads0.size() == Loads1.size() &&
21265 all_of(Range: zip(t&: Loads0, u&: Loads1), P: [&DAG](auto L) {
21266 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
21267 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
21268 DAG.areNonVolatileConsecutiveLoads(LD: get<1>(L), Base: get<0>(L),
21269 Bytes: Size / 8, Dist: 1);
21270 });
21271 }
21272
21273 if (Op0.getOpcode() != Op1.getOpcode())
21274 return false;
21275
21276 switch (Op0.getOpcode()) {
21277 case ISD::ADD:
21278 case ISD::SUB:
21279 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
21280 DAG, NumSubLoads) &&
21281 areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 1), Op1: Op1.getOperand(i: 1),
21282 DAG, NumSubLoads);
21283 case ISD::SIGN_EXTEND:
21284 case ISD::ANY_EXTEND:
21285 case ISD::ZERO_EXTEND:
21286 EVT XVT = Op0.getOperand(i: 0).getValueType();
21287 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
21288 XVT.getScalarSizeInBits() != 32)
21289 return false;
21290 return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0),
21291 DAG, NumSubLoads);
21292 }
21293 return false;
21294}
21295
21296 // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
21297 // into a single load of twice the size, from which we extract the bottom and
21298 // top parts so that the shl can use a shll2 instruction. The two loads in that
21299 // example can also be larger trees of instructions, which are identical except
21300 // for the leaves, which are all loads offset from the LHS, including
21301 // buildvectors of multiple loads. For example, the RHS tree could be
21302 // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
21303 // Whilst it can be common for the larger loads to replace LDP instructions
21304 // (which doesn't gain anything on its own), the larger loads can help create
21305 // more efficient code, and in buildvectors prevent the need for ld1 lane
21306 // inserts, which can be slower than normal loads.
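// An illustrative sketch, assuming v8i8 loads feeding a v8i16 add:
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), splat(C)))
//     --> t = v16i8 load p
//         add(zext(low half of t), shl(zext(high half of t), splat(C)))
// so only one wide load is issued and the high-half extend can become ushll2.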
21307static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
21308 EVT VT = N->getValueType(ResNo: 0);
21309 if (!VT.isFixedLengthVector() ||
21310 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
21311 VT.getScalarSizeInBits() != 64))
21312 return SDValue();
21313
21314 SDValue Other = N->getOperand(Num: 0);
21315 SDValue Shift = N->getOperand(Num: 1);
21316 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
21317 std::swap(a&: Shift, b&: Other);
21318 APInt ShiftAmt;
21319 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
21320 !ISD::isConstantSplatVector(N: Shift.getOperand(i: 1).getNode(), SplatValue&: ShiftAmt))
21321 return SDValue();
21322
21323 if (!ISD::isExtOpcode(Opcode: Shift.getOperand(i: 0).getOpcode()) ||
21324 !ISD::isExtOpcode(Opcode: Other.getOpcode()) ||
21325 Shift.getOperand(i: 0).getOperand(i: 0).getValueType() !=
21326 Other.getOperand(i: 0).getValueType() ||
21327 !Other.hasOneUse() || !Shift.getOperand(i: 0).hasOneUse())
21328 return SDValue();
21329
21330 SDValue Op0 = Other.getOperand(i: 0);
21331 SDValue Op1 = Shift.getOperand(i: 0).getOperand(i: 0);
21332
21333 unsigned NumSubLoads = 0;
21334 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
21335 return SDValue();
21336
21337 // Attempt to rule out some unprofitable cases using heuristics (some working
21338 // around suboptimal code generation), notably if the extend would not be able
21339 // to use ushll2 instructions because the types are not large enough. Otherwise
21340 // zips will need to be created, which can increase the instruction count.
21341 unsigned NumElts = Op0.getValueType().getVectorNumElements();
21342 unsigned NumSubElts = NumElts / NumSubLoads;
21343 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
21344 (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode() &&
21345 Op0.getValueType().getSizeInBits() < 128 &&
21346 !DAG.getTargetLoweringInfo().isTypeLegal(VT: Op0.getValueType())))
21347 return SDValue();
21348
21349 // Recreate the tree with the new combined loads.
21350 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
21351 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
21352 EVT DVT =
21353 Op0.getValueType().getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
21354
21355 SmallVector<LoadSDNode *> Loads0, Loads1;
21356 if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) &&
21357 isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) {
21358 EVT LoadVT = EVT::getVectorVT(
21359 Context&: *DAG.getContext(), VT: Op0.getValueType().getScalarType(),
21360 NumElements: Op0.getValueType().getVectorNumElements() / Loads0.size());
21361 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
21362
21363 SmallVector<SDValue> NewLoads;
21364 for (const auto &[L0, L1] : zip(t&: Loads0, u&: Loads1)) {
21365 SDValue Load = DAG.getLoad(VT: DLoadVT, dl: SDLoc(L0), Chain: L0->getChain(),
21366 Ptr: L0->getBasePtr(), PtrInfo: L0->getPointerInfo(),
21367 Alignment: L0->getBaseAlign());
21368 DAG.makeEquivalentMemoryOrdering(OldLoad: L0, NewMemOp: Load.getValue(R: 1));
21369 DAG.makeEquivalentMemoryOrdering(OldLoad: L1, NewMemOp: Load.getValue(R: 1));
21370 NewLoads.push_back(Elt: Load);
21371 }
21372 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op0), VT: DVT, Ops: NewLoads);
21373 }
21374
21375 SmallVector<SDValue> Ops;
21376 for (const auto &[O0, O1] : zip(t: Op0->op_values(), u: Op1->op_values()))
21377 Ops.push_back(Elt: GenCombinedTree(O0, O1, DAG));
21378 return DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: DVT, Ops);
21379 };
21380 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
21381
21382 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
21383 int Hi = NumSubElts, Lo = 0;
21384 for (unsigned i = 0; i < NumSubLoads; i++) {
21385 for (unsigned j = 0; j < NumSubElts; j++) {
21386 LowMask[i * NumSubElts + j] = Lo++;
21387 HighMask[i * NumSubElts + j] = Hi++;
21388 }
21389 Lo += NumSubElts;
21390 Hi += NumSubElts;
21391 }
21392 SDLoc DL(N);
21393 SDValue Ext0, Ext1;
21394 // Extract the top and bottom lanes, then extend the result. If the two extend
21395 // opcodes match, instead extend the result first and then extract the lanes,
21396 // as that produces slightly smaller code.
21397 if (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode()) {
21398 SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(),
21399 N1: NewOp, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21400 SDValue SubH =
21401 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(), N1: NewOp,
21402 N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64));
21403 SDValue Extr0 =
21404 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
21405 SDValue Extr1 =
21406 DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
21407 Ext0 = DAG.getNode(Opcode: Other.getOpcode(), DL, VT, Operand: Extr0);
21408 Ext1 = DAG.getNode(Opcode: Shift.getOperand(i: 0).getOpcode(), DL, VT, Operand: Extr1);
21409 } else {
21410 EVT DVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
21411 SDValue Ext = DAG.getNode(Opcode: Other.getOpcode(), DL, VT: DVT, Operand: NewOp);
21412 SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext,
21413 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21414 SDValue SubH =
21415 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext,
21416 N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64));
21417 Ext0 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: LowMask);
21418 Ext1 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: HighMask);
21419 }
21420 SDValue NShift =
21421 DAG.getNode(Opcode: Shift.getOpcode(), DL, VT, N1: Ext1, N2: Shift.getOperand(i: 1));
21422 return DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: Ext0, N2: NShift);
21423}
21424
21425static SDValue performAddSubCombine(SDNode *N,
21426 TargetLowering::DAGCombinerInfo &DCI) {
21427 // Try to change sum of two reductions.
21428 if (SDValue Val = performAddUADDVCombine(N, DAG&: DCI.DAG))
21429 return Val;
21430 if (SDValue Val = performAddDotCombine(N, DAG&: DCI.DAG))
21431 return Val;
21432 if (SDValue Val = performAddCSelIntoCSinc(N, DAG&: DCI.DAG))
21433 return Val;
21434 if (SDValue Val = performNegCSelCombine(N, DAG&: DCI.DAG))
21435 return Val;
21436 if (SDValue Val = performVectorExtCombine(N, DAG&: DCI.DAG))
21437 return Val;
21438 if (SDValue Val = performAddCombineForShiftedOperands(N, DAG&: DCI.DAG))
21439 return Val;
21440 if (SDValue Val = performSubAddMULCombine(N, DAG&: DCI.DAG))
21441 return Val;
21442 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
21443 return Val;
21444 if (SDValue Val = performAddSubIntoVectorOp(N, DAG&: DCI.DAG))
21445 return Val;
21446
21447 if (SDValue Val = performExtBinopLoadFold(N, DAG&: DCI.DAG))
21448 return Val;
21449
21450 return performAddSubLongCombine(N, DCI);
21451}
21452
21453// Massage DAGs which we can use the high-half "long" operations on into
21454// something isel will recognize better. E.g.
21455//
21456// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
21457 //   (aarch64_neon_umull (extract_high (v2i64 vec))
21458 //                       (extract_high (v2i64 (dup128 scalar))))
21459//
21460static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
21461 TargetLowering::DAGCombinerInfo &DCI,
21462 SelectionDAG &DAG) {
21463 if (DCI.isBeforeLegalizeOps())
21464 return SDValue();
21465
21466 SDValue LHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 0 : 1);
21467 SDValue RHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 1 : 2);
21468 assert(LHS.getValueType().is64BitVector() &&
21469 RHS.getValueType().is64BitVector() &&
21470 "unexpected shape for long operation");
21471
21472 // Either node could be a DUP, but it's not worth doing both of them (you'd
21473 // just as well use the non-high version) so look for a corresponding extract
21474 // operation on the other "wing".
21475 if (isEssentiallyExtractHighSubvector(N: LHS)) {
21476 RHS = tryExtendDUPToExtractHigh(N: RHS, DAG);
21477 if (!RHS.getNode())
21478 return SDValue();
21479 } else if (isEssentiallyExtractHighSubvector(N: RHS)) {
21480 LHS = tryExtendDUPToExtractHigh(N: LHS, DAG);
21481 if (!LHS.getNode())
21482 return SDValue();
21483 } else
21484 return SDValue();
21485
21486 if (IID == Intrinsic::not_intrinsic)
21487 return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: LHS, N2: RHS);
21488
21489 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
21490 N1: N->getOperand(Num: 0), N2: LHS, N3: RHS);
21491}
21492
21493static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21494 MVT ElemTy = N->getSimpleValueType(ResNo: 0).getScalarType();
21495 unsigned ElemBits = ElemTy.getSizeInBits();
21496
21497 int64_t ShiftAmount;
21498 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 2))) {
21499 APInt SplatValue, SplatUndef;
21500 unsigned SplatBitSize;
21501 bool HasAnyUndefs;
21502 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
21503 HasAnyUndefs, MinSplatBits: ElemBits) ||
21504 SplatBitSize != ElemBits)
21505 return SDValue();
21506
21507 ShiftAmount = SplatValue.getSExtValue();
21508 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) {
21509 ShiftAmount = CVN->getSExtValue();
21510 } else
21511 return SDValue();
21512
21513 // If the shift amount is zero, remove the shift intrinsic.
21514 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
21515 return N->getOperand(Num: 1);
21516
21517 unsigned Opcode;
21518 bool IsRightShift;
21519 switch (IID) {
21520 default:
21521 llvm_unreachable("Unknown shift intrinsic");
21522 case Intrinsic::aarch64_neon_sqshl:
21523 Opcode = AArch64ISD::SQSHL_I;
21524 IsRightShift = false;
21525 break;
21526 case Intrinsic::aarch64_neon_uqshl:
21527 Opcode = AArch64ISD::UQSHL_I;
21528 IsRightShift = false;
21529 break;
21530 case Intrinsic::aarch64_neon_srshl:
21531 Opcode = AArch64ISD::SRSHR_I;
21532 IsRightShift = true;
21533 break;
21534 case Intrinsic::aarch64_neon_urshl:
21535 Opcode = AArch64ISD::URSHR_I;
21536 IsRightShift = true;
21537 break;
21538 case Intrinsic::aarch64_neon_sqshlu:
21539 Opcode = AArch64ISD::SQSHLU_I;
21540 IsRightShift = false;
21541 break;
21542 case Intrinsic::aarch64_neon_sshl:
21543 case Intrinsic::aarch64_neon_ushl:
21544 // ushl/sshl perform a regular left shift for positive shift amounts, so for
21545 // those we can use SHL directly. For negative shift amounts we can use
21546 // VASHR/VLSHR as appropriate.
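// For example (illustrative):
//   ushl(v, splat(-3)) --> VLSHR(v, #3) and sshl(v, splat(2)) --> VSHL(v, #2).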
21547 if (ShiftAmount < 0) {
21548 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
21549 : AArch64ISD::VLSHR;
21550 ShiftAmount = -ShiftAmount;
21551 } else
21552 Opcode = AArch64ISD::VSHL;
21553 IsRightShift = false;
21554 break;
21555 }
21556
21557 EVT VT = N->getValueType(ResNo: 0);
21558 SDValue Op = N->getOperand(Num: 1);
21559 SDLoc DL(N);
21560 if (VT == MVT::i64) {
21561 Op = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op);
21562 VT = MVT::v1i64;
21563 }
21564
21565 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
21566 Op = DAG.getNode(Opcode, DL, VT, N1: Op,
21567 N2: DAG.getSignedConstant(Val: -ShiftAmount, DL, VT: MVT::i32));
21568 if (N->getValueType(ResNo: 0) == MVT::i64)
21569 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Op,
21570 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21571 return Op;
21572 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
21573 Op = DAG.getNode(Opcode, DL, VT, N1: Op,
21574 N2: DAG.getConstant(Val: ShiftAmount, DL, VT: MVT::i32));
21575 if (N->getValueType(ResNo: 0) == MVT::i64)
21576 Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, N1: Op,
21577 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21578 return Op;
21579 }
21580
21581 return SDValue();
21582}
21583
21584// The CRC32[BH] instructions ignore the high bits of their data operand. Since
21585// the intrinsics must be legal and take an i32, this means there's almost
21586// certainly going to be a zext in the DAG which we can eliminate.
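// For example (illustrative):
//   crc32b(crc, and(data, 0xff)) --> crc32b(crc, data)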
21587static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
21588 SDValue AndN = N->getOperand(Num: 2);
21589 if (AndN.getOpcode() != ISD::AND)
21590 return SDValue();
21591
21592 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: AndN.getOperand(i: 1));
21593 if (!CMask || CMask->getZExtValue() != Mask)
21594 return SDValue();
21595
21596 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: MVT::i32,
21597 N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: AndN.getOperand(i: 0));
21598}
21599
21600static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
21601 SelectionDAG &DAG) {
21602 SDLoc DL(N);
21603 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0),
21604 N1: DAG.getNode(Opcode: Opc, DL, VT: N->getOperand(Num: 1).getSimpleValueType(),
21605 Operand: N->getOperand(Num: 1)),
21606 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
21607}
21608
21609static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
21610 SDLoc DL(N);
21611 SDValue Op1 = N->getOperand(Num: 1);
21612 SDValue Op2 = N->getOperand(Num: 2);
21613 EVT ScalarTy = Op2.getValueType();
21614 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21615 ScalarTy = MVT::i32;
21616
21617 // Lower index_vector(base, step) to mul(step_vector(1), splat(step)) + splat(base).
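// For example (illustrative), index(base = 2, step = 3) becomes
//   add(mul(step_vector <0,1,2,...>, splat(3)), splat(2)) == <2, 5, 8, ...>.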
21618 SDValue StepVector = DAG.getStepVector(DL, ResVT: N->getValueType(ResNo: 0));
21619 SDValue Step = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op2);
21620 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: N->getValueType(ResNo: 0), N1: StepVector, N2: Step);
21621 SDValue Base = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op1);
21622 return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: Mul, N2: Base);
21623}
21624
21625static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
21626 SDLoc DL(N);
21627 SDValue Scalar = N->getOperand(Num: 3);
21628 EVT ScalarTy = Scalar.getValueType();
21629
21630 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21631 Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Scalar);
21632
21633 SDValue Passthru = N->getOperand(Num: 1);
21634 SDValue Pred = N->getOperand(Num: 2);
21635 return DAG.getNode(Opcode: AArch64ISD::DUP_MERGE_PASSTHRU, DL, VT: N->getValueType(ResNo: 0),
21636 N1: Pred, N2: Scalar, N3: Passthru);
21637}
21638
21639static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
21640 SDLoc DL(N);
21641 LLVMContext &Ctx = *DAG.getContext();
21642 EVT VT = N->getValueType(ResNo: 0);
21643
21644 assert(VT.isScalableVector() && "Expected a scalable vector.");
21645
21646 // Current lowering only supports the SVE-ACLE types.
21647 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
21648 return SDValue();
21649
21650 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
21651 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
21652 EVT ByteVT =
21653 EVT::getVectorVT(Context&: Ctx, VT: MVT::i8, EC: ElementCount::getScalable(MinVal: ByteSize));
21654
21655 // Convert everything to the domain of EXT (i.e. bytes).
21656 SDValue Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ByteVT, Operand: N->getOperand(Num: 1));
21657 SDValue Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ByteVT, Operand: N->getOperand(Num: 2));
21658 SDValue Op2 = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i32, N1: N->getOperand(Num: 3),
21659 N2: DAG.getConstant(Val: ElemSize, DL, VT: MVT::i32));
21660
21661 SDValue EXT = DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: ByteVT, N1: Op0, N2: Op1, N3: Op2);
21662 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: EXT);
21663}
21664
21665static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
21666 TargetLowering::DAGCombinerInfo &DCI,
21667 SelectionDAG &DAG) {
21668 if (DCI.isBeforeLegalize())
21669 return SDValue();
21670
21671 SDValue Comparator = N->getOperand(Num: 3);
21672 if (Comparator.getOpcode() == AArch64ISD::DUP ||
21673 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
21674 unsigned IID = getIntrinsicID(N);
21675 EVT VT = N->getValueType(ResNo: 0);
21676 EVT CmpVT = N->getOperand(Num: 2).getValueType();
21677 SDValue Pred = N->getOperand(Num: 1);
21678 SDValue Imm;
21679 SDLoc DL(N);
21680
21681 switch (IID) {
21682 default:
21683 llvm_unreachable("Called with wrong intrinsic!");
21684 break;
21685
21686 // Signed comparisons
21687 case Intrinsic::aarch64_sve_cmpeq_wide:
21688 case Intrinsic::aarch64_sve_cmpne_wide:
21689 case Intrinsic::aarch64_sve_cmpge_wide:
21690 case Intrinsic::aarch64_sve_cmpgt_wide:
21691 case Intrinsic::aarch64_sve_cmplt_wide:
21692 case Intrinsic::aarch64_sve_cmple_wide: {
21693 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
21694 int64_t ImmVal = CN->getSExtValue();
21695 if (ImmVal >= -16 && ImmVal <= 15)
21696 Imm = DAG.getSignedConstant(Val: ImmVal, DL, VT: MVT::i32);
21697 else
21698 return SDValue();
21699 }
21700 break;
21701 }
21702 // Unsigned comparisons
21703 case Intrinsic::aarch64_sve_cmphs_wide:
21704 case Intrinsic::aarch64_sve_cmphi_wide:
21705 case Intrinsic::aarch64_sve_cmplo_wide:
21706 case Intrinsic::aarch64_sve_cmpls_wide: {
21707 if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) {
21708 uint64_t ImmVal = CN->getZExtValue();
21709 if (ImmVal <= 127)
21710 Imm = DAG.getConstant(Val: ImmVal, DL, VT: MVT::i32);
21711 else
21712 return SDValue();
21713 }
21714 break;
21715 }
21716 }
21717
21718 if (!Imm)
21719 return SDValue();
21720
21721 SDValue Splat = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: CmpVT, Operand: Imm);
21722 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT, N1: Pred,
21723 N2: N->getOperand(Num: 2), N3: Splat, N4: DAG.getCondCode(Cond: CC));
21724 }
21725
21726 return SDValue();
21727}
21728
21729static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21730 AArch64CC::CondCode Cond) {
21731 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21732
21733 SDLoc DL(Op);
21734 assert(Op.getValueType().isScalableVector() &&
21735 TLI.isTypeLegal(Op.getValueType()) &&
21736 "Expected legal scalable vector type!");
21737 assert(Op.getValueType() == Pg.getValueType() &&
21738 "Expected same type for PTEST operands");
21739
21740 // Ensure target-specific opcodes are using a legal type.
21741 EVT OutVT = TLI.getTypeToTransformTo(Context&: *DAG.getContext(), VT);
21742 SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OutVT);
21743 SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OutVT);
21744
21745 // Ensure operands have type nxv16i1.
21746 if (Op.getValueType() != MVT::nxv16i1) {
21747 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
21748 isZeroingInactiveLanes(Op))
21749 Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Pg);
21750 else
21751 Pg = getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Pg, DAG);
21752 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Op);
21753 }
21754
21755 // Set condition code (CC) flags.
21756 SDValue Test = DAG.getNode(
21757 Opcode: Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21758 DL, VT: MVT::i32, N1: Pg, N2: Op);
21759
21760 // Convert CC to integer based on requested condition.
21761 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21762 SDValue CC = DAG.getConstant(Val: getInvertedCondCode(Code: Cond), DL, VT: MVT::i32);
21763 SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OutVT, N1: FVal, N2: TVal, N3: CC, N4: Test);
21764 return DAG.getZExtOrTrunc(Op: Res, DL, VT);
21765}
21766
21767static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21768 SelectionDAG &DAG) {
21769 SDLoc DL(N);
21770
21771 SDValue Pred = N->getOperand(Num: 1);
21772 SDValue VecToReduce = N->getOperand(Num: 2);
21773
21774 // NOTE: The integer reduction's result type is not always linked to the
21775 // operand's element type so we construct it from the intrinsic's result type.
21776 EVT ReduceVT = getPackedSVEVectorVT(VT: N->getValueType(ResNo: 0));
21777 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
21778
21779 // SVE reductions set the whole vector register with the first element
21780 // containing the reduction result, which we'll now extract.
21781 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21782 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21783 N2: Zero);
21784}
21785
21786static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21787 SelectionDAG &DAG) {
21788 SDLoc DL(N);
21789
21790 SDValue Pred = N->getOperand(Num: 1);
21791 SDValue VecToReduce = N->getOperand(Num: 2);
21792
21793 EVT ReduceVT = VecToReduce.getValueType();
21794 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce);
21795
21796 // SVE reductions set the whole vector register with the first element
21797 // containing the reduction result, which we'll now extract.
21798 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21799 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21800 N2: Zero);
21801}
21802
21803static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21804 SelectionDAG &DAG) {
21805 SDLoc DL(N);
21806
21807 SDValue Pred = N->getOperand(Num: 1);
21808 SDValue InitVal = N->getOperand(Num: 2);
21809 SDValue VecToReduce = N->getOperand(Num: 3);
21810 EVT ReduceVT = VecToReduce.getValueType();
21811
21812 // Ordered reductions use the first lane of the result vector as the
21813 // reduction's initial value.
21814 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
21815 InitVal = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ReduceVT,
21816 N1: DAG.getUNDEF(VT: ReduceVT), N2: InitVal, N3: Zero);
21817
21818 SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: InitVal, N3: VecToReduce);
21819
21820 // SVE reductions set the whole vector register with the first element
21821 // containing the reduction result, which we'll now extract.
21822 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce,
21823 N2: Zero);
21824}
21825
21826// If a merged operation has no inactive lanes we can relax it to a predicated
21827// or unpredicated operation, which potentially allows better isel (perhaps
21828// using immediate forms) or relaxing register reuse requirements.
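// For example (illustrative), with an all-active governing predicate:
//   aarch64_sve_sqadd(ptrue, a, b) --> saddsat(a, b)
// and similarly for the other callers of this helper below.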
21829static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21830 SelectionDAG &DAG, bool UnpredOp = false,
21831 bool SwapOperands = false) {
21832 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21833 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21834 SDValue Pg = N->getOperand(Num: 1);
21835 SDValue Op1 = N->getOperand(Num: SwapOperands ? 3 : 2);
21836 SDValue Op2 = N->getOperand(Num: SwapOperands ? 2 : 3);
21837
21838 // ISD way to specify an all active predicate.
21839 if (isAllActivePredicate(DAG, N: Pg)) {
21840 if (UnpredOp)
21841 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Op1, N2: Op2);
21842
21843 return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Pg, N2: Op1, N3: Op2);
21844 }
21845
21846 // FUTURE: SplatVector(true)
21847 return SDValue();
21848}
21849
21850SDValue tryLowerPartialReductionToDot(SDNode *N,
21851 const AArch64Subtarget *Subtarget,
21852 SelectionDAG &DAG) {
21853
21854 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21855 getIntrinsicID(N) ==
21856 Intrinsic::experimental_vector_partial_reduce_add &&
21857 "Expected a partial reduction node");
21858
21859 bool Scalable = N->getValueType(ResNo: 0).isScalableVector();
21860 if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
21861 return SDValue();
21862 if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
21863 return SDValue();
21864
21865 SDLoc DL(N);
21866
21867 SDValue Op2 = N->getOperand(Num: 2);
21868 unsigned Op2Opcode = Op2->getOpcode();
21869 SDValue MulOpLHS, MulOpRHS;
21870 bool MulOpLHSIsSigned, MulOpRHSIsSigned;
21871 if (ISD::isExtOpcode(Opcode: Op2Opcode)) {
21872 MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
21873 MulOpLHS = Op2->getOperand(Num: 0);
21874 MulOpRHS = DAG.getConstant(Val: 1, DL, VT: MulOpLHS.getValueType());
21875 } else if (Op2Opcode == ISD::MUL) {
21876 SDValue ExtMulOpLHS = Op2->getOperand(Num: 0);
21877 SDValue ExtMulOpRHS = Op2->getOperand(Num: 1);
21878
21879 unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
21880 unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
21881 if (!ISD::isExtOpcode(Opcode: ExtMulOpLHSOpcode) ||
21882 !ISD::isExtOpcode(Opcode: ExtMulOpRHSOpcode))
21883 return SDValue();
21884
21885 MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
21886 MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
21887
21888 MulOpLHS = ExtMulOpLHS->getOperand(Num: 0);
21889 MulOpRHS = ExtMulOpRHS->getOperand(Num: 0);
21890
21891 if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
21892 return SDValue();
21893 } else
21894 return SDValue();
21895
21896 SDValue Acc = N->getOperand(Num: 1);
21897 EVT ReducedVT = N->getValueType(ResNo: 0);
21898 EVT MulSrcVT = MulOpLHS.getValueType();
21899
21900 // Dot products operate on chunks of four elements, so there must be four times
21901 // as many elements in the wide type.
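// For example, a v16i8 multiply feeding a v4i32 accumulator can use a single
// [US]DOT; the checks below only admit such 4-to-1 element-count pairings.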
21902 if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
21903 !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
21904 !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
21905 !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
21906 !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
21907 !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
21908 return SDValue();
21909
21910 // If the extensions are mixed, we should lower it to a usdot instead
21911 unsigned Opcode = 0;
21912 if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
21913 if (!Subtarget->hasMatMulInt8())
21914 return SDValue();
21915
21916 bool Scalable = N->getValueType(ResNo: 0).isScalableVT();
21917 // There's no nxv2i64 version of usdot
21918 if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
21919 return SDValue();
21920
21921 Opcode = AArch64ISD::USDOT;
21922 // USDOT expects the signed operand to be last
21923 if (!MulOpRHSIsSigned)
21924 std::swap(a&: MulOpLHS, b&: MulOpRHS);
21925 } else
21926 Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
21927
21928 // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
21929 // product followed by a zero/sign extension.
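// An illustrative sketch for nxv16i8 inputs and an nxv4i64 accumulator:
//   dot = [us]dot(splat(0) : nxv4i32, lhs, rhs)
//   res = add(acc, extend(dot) to nxv4i64)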
21930 if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
21931 (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
21932 EVT ReducedVTI32 =
21933 (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
21934
21935 SDValue DotI32 =
21936 DAG.getNode(Opcode, DL, VT: ReducedVTI32,
21937 N1: DAG.getConstant(Val: 0, DL, VT: ReducedVTI32), N2: MulOpLHS, N3: MulOpRHS);
21938 SDValue Extended = DAG.getSExtOrTrunc(Op: DotI32, DL, VT: ReducedVT);
21939 return DAG.getNode(Opcode: ISD::ADD, DL, VT: ReducedVT, N1: Acc, N2: Extended);
21940 }
21941
21942 return DAG.getNode(Opcode, DL, VT: ReducedVT, N1: Acc, N2: MulOpLHS, N3: MulOpRHS);
21943}
21944
21945SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
21946 const AArch64Subtarget *Subtarget,
21947 SelectionDAG &DAG) {
21948
21949 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21950 getIntrinsicID(N) ==
21951 Intrinsic::experimental_vector_partial_reduce_add &&
21952 "Expected a partial reduction node");
21953
21954 if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
21955 return SDValue();
21956
21957 SDLoc DL(N);
21958
21959 if (!ISD::isExtOpcode(Opcode: N->getOperand(Num: 2).getOpcode()))
21960 return SDValue();
21961 SDValue Acc = N->getOperand(Num: 1);
21962 SDValue Ext = N->getOperand(Num: 2);
21963 EVT AccVT = Acc.getValueType();
21964 EVT ExtVT = Ext.getValueType();
21965 if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
21966 return SDValue();
21967
21968 SDValue ExtOp = Ext->getOperand(Num: 0);
21969 EVT ExtOpVT = ExtOp.getValueType();
21970
21971 if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
21972 !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
21973 !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
21974 return SDValue();
21975
21976 bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
21977 unsigned BottomOpcode =
21978 ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
21979 unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
21980 SDValue BottomNode = DAG.getNode(Opcode: BottomOpcode, DL, VT: AccVT, N1: Acc, N2: ExtOp);
21981 return DAG.getNode(Opcode: TopOpcode, DL, VT: AccVT, N1: BottomNode, N2: ExtOp);
21982}
21983
21984static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
21985 SDLoc DL(N);
21986 EVT VT = N->getValueType(ResNo: 0);
21987 SDValue Op1 = N->getOperand(Num: 1);
21988 SDValue Op2 = N->getOperand(Num: 2);
21989 SDValue Op3 = N->getOperand(Num: 3);
21990
21991 switch (IID) {
21992 default:
21993 llvm_unreachable("Called with wrong intrinsic!");
21994 case Intrinsic::aarch64_sve_bsl:
21995 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: Op1, N3: Op2);
21996 case Intrinsic::aarch64_sve_bsl1n:
21997 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: DAG.getNOT(DL, Val: Op1, VT),
21998 N3: Op2);
21999 case Intrinsic::aarch64_sve_bsl2n:
22000 return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: Op1,
22001 N3: DAG.getNOT(DL, Val: Op2, VT));
22002 case Intrinsic::aarch64_sve_nbsl:
22003 return DAG.getNOT(DL, Val: DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Op3, N2: Op1, N3: Op2),
22004 VT);
22005 }
22006}
22007
22008static SDValue performIntrinsicCombine(SDNode *N,
22009 TargetLowering::DAGCombinerInfo &DCI,
22010 const AArch64Subtarget *Subtarget) {
22011 SelectionDAG &DAG = DCI.DAG;
22012 unsigned IID = getIntrinsicID(N);
22013 switch (IID) {
22014 default:
22015 break;
22016 case Intrinsic::experimental_vector_partial_reduce_add: {
22017 if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
22018 return Dot;
22019 if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
22020 return WideAdd;
22021 SDLoc DL(N);
22022 SDValue Input = N->getOperand(Num: 2);
22023 return DAG.getNode(Opcode: ISD::PARTIAL_REDUCE_UMLA, DL, VT: N->getValueType(ResNo: 0),
22024 N1: N->getOperand(Num: 1), N2: Input,
22025 N3: DAG.getConstant(Val: 1, DL, VT: Input.getValueType()));
22026 }
22027 case Intrinsic::aarch64_neon_vcvtfxs2fp:
22028 case Intrinsic::aarch64_neon_vcvtfxu2fp:
22029 return tryCombineFixedPointConvert(N, DCI, DAG);
22030 case Intrinsic::aarch64_neon_saddv:
22031 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SADDV, N, DAG);
22032 case Intrinsic::aarch64_neon_uaddv:
22033 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UADDV, N, DAG);
22034 case Intrinsic::aarch64_neon_sminv:
22035 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMINV, N, DAG);
22036 case Intrinsic::aarch64_neon_uminv:
22037 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMINV, N, DAG);
22038 case Intrinsic::aarch64_neon_smaxv:
22039 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMAXV, N, DAG);
22040 case Intrinsic::aarch64_neon_umaxv:
22041 return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMAXV, N, DAG);
22042 case Intrinsic::aarch64_neon_fmax:
22043 return DAG.getNode(Opcode: ISD::FMAXIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22044 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22045 case Intrinsic::aarch64_neon_fmin:
22046 return DAG.getNode(Opcode: ISD::FMINIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22047 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22048 case Intrinsic::aarch64_neon_fmaxnm:
22049 return DAG.getNode(Opcode: ISD::FMAXNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22050 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22051 case Intrinsic::aarch64_neon_fminnm:
22052 return DAG.getNode(Opcode: ISD::FMINNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22053 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22054 case Intrinsic::aarch64_neon_smull:
22055 return DAG.getNode(Opcode: AArch64ISD::SMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22056 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22057 case Intrinsic::aarch64_neon_umull:
22058 return DAG.getNode(Opcode: AArch64ISD::UMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22059 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22060 case Intrinsic::aarch64_neon_pmull:
22061 return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22062 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22063 case Intrinsic::aarch64_neon_sqdmull:
22064 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
22065 case Intrinsic::aarch64_neon_sqshl:
22066 case Intrinsic::aarch64_neon_uqshl:
22067 case Intrinsic::aarch64_neon_sqshlu:
22068 case Intrinsic::aarch64_neon_srshl:
22069 case Intrinsic::aarch64_neon_urshl:
22070 case Intrinsic::aarch64_neon_sshl:
22071 case Intrinsic::aarch64_neon_ushl:
22072 return tryCombineShiftImm(IID, N, DAG);
22073 case Intrinsic::aarch64_neon_sabd:
22074 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22075 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22076 case Intrinsic::aarch64_neon_uabd:
22077 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22078 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22079 case Intrinsic::aarch64_crc32b:
22080 case Intrinsic::aarch64_crc32cb:
22081 return tryCombineCRC32(Mask: 0xff, N, DAG);
22082 case Intrinsic::aarch64_crc32h:
22083 case Intrinsic::aarch64_crc32ch:
22084 return tryCombineCRC32(Mask: 0xffff, N, DAG);
22085 case Intrinsic::aarch64_sve_saddv:
22086 // There is no i64 version of SADDV because the sign is irrelevant.
22087 if (N->getOperand(Num: 2)->getValueType(ResNo: 0).getVectorElementType() == MVT::i64)
22088 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
22089 else
22090 return combineSVEReductionInt(N, Opc: AArch64ISD::SADDV_PRED, DAG);
22091 case Intrinsic::aarch64_sve_uaddv:
22092 return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG);
22093 case Intrinsic::aarch64_sve_smaxv:
22094 return combineSVEReductionInt(N, Opc: AArch64ISD::SMAXV_PRED, DAG);
22095 case Intrinsic::aarch64_sve_umaxv:
22096 return combineSVEReductionInt(N, Opc: AArch64ISD::UMAXV_PRED, DAG);
22097 case Intrinsic::aarch64_sve_sminv:
22098 return combineSVEReductionInt(N, Opc: AArch64ISD::SMINV_PRED, DAG);
22099 case Intrinsic::aarch64_sve_uminv:
22100 return combineSVEReductionInt(N, Opc: AArch64ISD::UMINV_PRED, DAG);
22101 case Intrinsic::aarch64_sve_orv:
22102 return combineSVEReductionInt(N, Opc: AArch64ISD::ORV_PRED, DAG);
22103 case Intrinsic::aarch64_sve_eorv:
22104 return combineSVEReductionInt(N, Opc: AArch64ISD::EORV_PRED, DAG);
22105 case Intrinsic::aarch64_sve_andv:
22106 return combineSVEReductionInt(N, Opc: AArch64ISD::ANDV_PRED, DAG);
22107 case Intrinsic::aarch64_sve_index:
22108 return LowerSVEIntrinsicIndex(N, DAG);
22109 case Intrinsic::aarch64_sve_dup:
22110 return LowerSVEIntrinsicDUP(N, DAG);
22111 case Intrinsic::aarch64_sve_dup_x:
22112 return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22113 Operand: N->getOperand(Num: 1));
22114 case Intrinsic::aarch64_sve_ext:
22115 return LowerSVEIntrinsicEXT(N, DAG);
22116 case Intrinsic::aarch64_sve_mul_u:
22117 return DAG.getNode(Opcode: AArch64ISD::MUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22118 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22119 case Intrinsic::aarch64_sve_smulh_u:
22120 return DAG.getNode(Opcode: AArch64ISD::MULHS_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22121 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22122 case Intrinsic::aarch64_sve_umulh_u:
22123 return DAG.getNode(Opcode: AArch64ISD::MULHU_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22124 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22125 case Intrinsic::aarch64_sve_smin_u:
22126 return DAG.getNode(Opcode: AArch64ISD::SMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22127 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22128 case Intrinsic::aarch64_sve_umin_u:
22129 return DAG.getNode(Opcode: AArch64ISD::UMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22130 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22131 case Intrinsic::aarch64_sve_smax_u:
22132 return DAG.getNode(Opcode: AArch64ISD::SMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22133 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22134 case Intrinsic::aarch64_sve_umax_u:
22135 return DAG.getNode(Opcode: AArch64ISD::UMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22136 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22137 case Intrinsic::aarch64_sve_lsl_u:
22138 return DAG.getNode(Opcode: AArch64ISD::SHL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22139 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22140 case Intrinsic::aarch64_sve_lsr_u:
22141 return DAG.getNode(Opcode: AArch64ISD::SRL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22142 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22143 case Intrinsic::aarch64_sve_asr_u:
22144 return DAG.getNode(Opcode: AArch64ISD::SRA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22145 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22146 case Intrinsic::aarch64_sve_fadd_u:
22147 return DAG.getNode(Opcode: AArch64ISD::FADD_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22148 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22149 case Intrinsic::aarch64_sve_fdiv_u:
22150 return DAG.getNode(Opcode: AArch64ISD::FDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22151 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22152 case Intrinsic::aarch64_sve_fmax_u:
22153 return DAG.getNode(Opcode: AArch64ISD::FMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22154 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22155 case Intrinsic::aarch64_sve_fmaxnm_u:
22156 return DAG.getNode(Opcode: AArch64ISD::FMAXNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22157 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22158 case Intrinsic::aarch64_sve_fmla_u:
22159 return DAG.getNode(Opcode: AArch64ISD::FMA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22160 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 3), N3: N->getOperand(Num: 4),
22161 N4: N->getOperand(Num: 2));
22162 case Intrinsic::aarch64_sve_fmin_u:
22163 return DAG.getNode(Opcode: AArch64ISD::FMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22164 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22165 case Intrinsic::aarch64_sve_fminnm_u:
22166 return DAG.getNode(Opcode: AArch64ISD::FMINNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22167 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22168 case Intrinsic::aarch64_sve_fmul_u:
22169 return DAG.getNode(Opcode: AArch64ISD::FMUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22170 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22171 case Intrinsic::aarch64_sve_fsub_u:
22172 return DAG.getNode(Opcode: AArch64ISD::FSUB_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22173 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22174 case Intrinsic::aarch64_sve_add_u:
22175 return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22176 N2: N->getOperand(Num: 3));
22177 case Intrinsic::aarch64_sve_sub_u:
22178 return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22179 N2: N->getOperand(Num: 3));
22180 case Intrinsic::aarch64_sve_subr:
22181 return convertMergedOpToPredOp(N, Opc: ISD::SUB, DAG, UnpredOp: true, SwapOperands: true);
22182 case Intrinsic::aarch64_sve_and_u:
22183 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22184 N2: N->getOperand(Num: 3));
22185 case Intrinsic::aarch64_sve_bic_u:
22186 return DAG.getNode(Opcode: AArch64ISD::BIC, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22187 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22188 case Intrinsic::aarch64_sve_saddwb:
22189 return DAG.getNode(Opcode: AArch64ISD::SADDWB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22190 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22191 case Intrinsic::aarch64_sve_saddwt:
22192 return DAG.getNode(Opcode: AArch64ISD::SADDWT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22193 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22194 case Intrinsic::aarch64_sve_uaddwb:
22195 return DAG.getNode(Opcode: AArch64ISD::UADDWB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22196 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22197 case Intrinsic::aarch64_sve_uaddwt:
22198 return DAG.getNode(Opcode: AArch64ISD::UADDWT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22199 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22200 case Intrinsic::aarch64_sve_eor_u:
22201 return DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22202 N2: N->getOperand(Num: 3));
22203 case Intrinsic::aarch64_sve_orr_u:
22204 return DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2),
22205 N2: N->getOperand(Num: 3));
22206 case Intrinsic::aarch64_sve_sabd_u:
22207 return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22208 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22209 case Intrinsic::aarch64_sve_uabd_u:
22210 return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22211 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22212 case Intrinsic::aarch64_sve_sdiv_u:
22213 return DAG.getNode(Opcode: AArch64ISD::SDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22214 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22215 case Intrinsic::aarch64_sve_udiv_u:
22216 return DAG.getNode(Opcode: AArch64ISD::UDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22217 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22218 case Intrinsic::aarch64_sve_sqadd:
22219 return convertMergedOpToPredOp(N, Opc: ISD::SADDSAT, DAG, UnpredOp: true);
22220 case Intrinsic::aarch64_sve_sqsub_u:
22221 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22222 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22223 case Intrinsic::aarch64_sve_uqadd:
22224 return convertMergedOpToPredOp(N, Opc: ISD::UADDSAT, DAG, UnpredOp: true);
22225 case Intrinsic::aarch64_sve_uqsub_u:
22226 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22227 N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3));
22228 case Intrinsic::aarch64_sve_sqadd_x:
22229 return DAG.getNode(Opcode: ISD::SADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22230 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22231 case Intrinsic::aarch64_sve_sqsub_x:
22232 return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22233 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22234 case Intrinsic::aarch64_sve_uqadd_x:
22235 return DAG.getNode(Opcode: ISD::UADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22236 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22237 case Intrinsic::aarch64_sve_uqsub_x:
22238 return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22239 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22240 case Intrinsic::aarch64_sve_asrd:
22241 return DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22242 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22243 case Intrinsic::aarch64_sve_cmphs:
22244 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
22245 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22246 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22247 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGE));
22248 break;
22249 case Intrinsic::aarch64_sve_cmphi:
22250 if (!N->getOperand(Num: 2).getValueType().isFloatingPoint())
22251 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22252 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22253 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGT));
22254 break;
22255 case Intrinsic::aarch64_sve_fcmpge:
22256 case Intrinsic::aarch64_sve_cmpge:
22257 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22258 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22259 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGE));
22260 break;
22261 case Intrinsic::aarch64_sve_fcmpgt:
22262 case Intrinsic::aarch64_sve_cmpgt:
22263 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22264 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22265 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGT));
22266 break;
22267 case Intrinsic::aarch64_sve_fcmpeq:
22268 case Intrinsic::aarch64_sve_cmpeq:
22269 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22270 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22271 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETEQ));
22272 break;
22273 case Intrinsic::aarch64_sve_fcmpne:
22274 case Intrinsic::aarch64_sve_cmpne:
22275 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22276 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22277 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETNE));
22278 break;
22279 case Intrinsic::aarch64_sve_fcmpuo:
22280 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N),
22281 VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2),
22282 N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUO));
22283 break;
22284 case Intrinsic::aarch64_sve_fadda:
22285 return combineSVEReductionOrderedFP(N, Opc: AArch64ISD::FADDA_PRED, DAG);
22286 case Intrinsic::aarch64_sve_faddv:
22287 return combineSVEReductionFP(N, Opc: AArch64ISD::FADDV_PRED, DAG);
22288 case Intrinsic::aarch64_sve_fmaxnmv:
22289 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXNMV_PRED, DAG);
22290 case Intrinsic::aarch64_sve_fmaxv:
22291 return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXV_PRED, DAG);
22292 case Intrinsic::aarch64_sve_fminnmv:
22293 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINNMV_PRED, DAG);
22294 case Intrinsic::aarch64_sve_fminv:
22295 return combineSVEReductionFP(N, Opc: AArch64ISD::FMINV_PRED, DAG);
22296 case Intrinsic::aarch64_sve_sel:
22297 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22298 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
22299 case Intrinsic::aarch64_sve_cmpeq_wide:
22300 return tryConvertSVEWideCompare(N, CC: ISD::SETEQ, DCI, DAG);
22301 case Intrinsic::aarch64_sve_cmpne_wide:
22302 return tryConvertSVEWideCompare(N, CC: ISD::SETNE, DCI, DAG);
22303 case Intrinsic::aarch64_sve_cmpge_wide:
22304 return tryConvertSVEWideCompare(N, CC: ISD::SETGE, DCI, DAG);
22305 case Intrinsic::aarch64_sve_cmpgt_wide:
22306 return tryConvertSVEWideCompare(N, CC: ISD::SETGT, DCI, DAG);
22307 case Intrinsic::aarch64_sve_cmplt_wide:
22308 return tryConvertSVEWideCompare(N, CC: ISD::SETLT, DCI, DAG);
22309 case Intrinsic::aarch64_sve_cmple_wide:
22310 return tryConvertSVEWideCompare(N, CC: ISD::SETLE, DCI, DAG);
22311 case Intrinsic::aarch64_sve_cmphs_wide:
22312 return tryConvertSVEWideCompare(N, CC: ISD::SETUGE, DCI, DAG);
22313 case Intrinsic::aarch64_sve_cmphi_wide:
22314 return tryConvertSVEWideCompare(N, CC: ISD::SETUGT, DCI, DAG);
22315 case Intrinsic::aarch64_sve_cmplo_wide:
22316 return tryConvertSVEWideCompare(N, CC: ISD::SETULT, DCI, DAG);
22317 case Intrinsic::aarch64_sve_cmpls_wide:
22318 return tryConvertSVEWideCompare(N, CC: ISD::SETULE, DCI, DAG);
22319 case Intrinsic::aarch64_sve_ptest_any:
22320 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
22321 Cond: AArch64CC::ANY_ACTIVE);
22322 case Intrinsic::aarch64_sve_ptest_first:
22323 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
22324 Cond: AArch64CC::FIRST_ACTIVE);
22325 case Intrinsic::aarch64_sve_ptest_last:
22326 return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2),
22327 Cond: AArch64CC::LAST_ACTIVE);
22328 case Intrinsic::aarch64_sve_whilelo:
22329 return DAG.getNode(Opcode: ISD::GET_ACTIVE_LANE_MASK, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22330 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
22331 case Intrinsic::aarch64_sve_bsl:
22332 case Intrinsic::aarch64_sve_bsl1n:
22333 case Intrinsic::aarch64_sve_bsl2n:
22334 case Intrinsic::aarch64_sve_nbsl:
22335 return combineSVEBitSel(IID, N, DAG);
22336 }
22337 return SDValue();
22338}
22339
22340static bool isCheapToExtend(const SDValue &N) {
22341 unsigned OC = N->getOpcode();
22342 return OC == ISD::LOAD || OC == ISD::MLOAD ||
22343 ISD::isConstantSplatVectorAllZeros(N: N.getNode());
22344}
22345
22346static SDValue
22347performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22348 SelectionDAG &DAG) {
22349 // If we have (sext (setcc A B)) and A and B are cheap to extend,
22350 // we can move the sext into the arguments and have the same result. For
22351 // example, if A and B are both loads, we can make those extending loads and
22352 // avoid an extra instruction. This pattern appears often in VLS code
22353 // generation where the inputs to the setcc have a different size to the
22354 // instruction that wants to use the result of the setcc.
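  // A rough illustration (assuming both setcc operands are simple vector
  // loads): sext(setcc(load A, load B)) : v8i16 can be rewritten as
  // setcc(ext(load A) : v8i16, ext(load B) : v8i16), where ext is a sign or
  // zero extend chosen from the signedness of the condition code and is
  // expected to later fold into an extending load.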
22355 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
22356 N->getOperand(0)->getOpcode() == ISD::SETCC);
22357 const SDValue SetCC = N->getOperand(Num: 0);
22358
22359 const SDValue CCOp0 = SetCC.getOperand(i: 0);
22360 const SDValue CCOp1 = SetCC.getOperand(i: 1);
22361 if (!CCOp0->getValueType(ResNo: 0).isInteger() ||
22362 !CCOp1->getValueType(ResNo: 0).isInteger())
22363 return SDValue();
22364
22365 ISD::CondCode Code =
22366 cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get();
22367
22368 ISD::NodeType ExtType =
22369 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22370
22371 if (isCheapToExtend(N: SetCC.getOperand(i: 0)) &&
22372 isCheapToExtend(N: SetCC.getOperand(i: 1))) {
22373 const SDValue Ext1 =
22374 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp0);
22375 const SDValue Ext2 =
22376 DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp1);
22377
22378 return DAG.getSetCC(
22379 DL: SDLoc(SetCC), VT: N->getValueType(ResNo: 0), LHS: Ext1, RHS: Ext2,
22380 Cond: cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get());
22381 }
22382
22383 return SDValue();
22384}
22385
22386// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
22387// This comes from interleaved vectorization. It is performed late to capture
22388// uitofp converts too.
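// A sketch of the rewrite, assuming a little-endian lane layout:
//   zext(v4i16 extract_subvector(shuffle(v8i16 a, v8i16 b, [0,4,8,12,...]), 0))
// becomes
//   and(uzp1(nvcast(a), nvcast(b)) : v4i32, 0xffff)
// with an additional logical right shift by the narrow element width when the
// deinterleave index selects the odd 16-bit halves, and uzp2 when it selects
// the upper pair of indices.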
22389static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
22390 SelectionDAG &DAG) {
22391 EVT VT = N->getValueType(ResNo: 0);
22392 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
22393 N->getOpcode() != ISD::ZERO_EXTEND ||
22394 N->getOperand(Num: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
22395 return SDValue();
22396
22397 unsigned ExtOffset = N->getOperand(Num: 0).getConstantOperandVal(i: 1);
22398 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22399 return SDValue();
22400
22401 EVT InVT = N->getOperand(Num: 0).getOperand(i: 0).getValueType();
22402 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 0));
22403 if (!Shuffle ||
22404 InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
22405 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
22406 return SDValue();
22407
22408 unsigned Idx;
22409 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22410 Mask: Shuffle->getMask().slice(N: ExtOffset, M: VT.getVectorNumElements()), Factor: 4, Index&: Idx);
22411 // An undef interleave shuffle can come up after other canonicalizations,
22412 // where the shuffle has been converted to
22413 // zext(extract(shuffle b, undef, [u,u,0,4]))
22414 bool IsUndefDeInterleave = false;
22415 if (!IsDeInterleave)
22416 IsUndefDeInterleave =
22417 Shuffle->getOperand(Num: 1).isUndef() &&
22418 all_of(
22419 Range: Shuffle->getMask().slice(N: ExtOffset, M: VT.getVectorNumElements() / 2),
22420 P: [](int M) { return M < 0; }) &&
22421 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
22422 Mask: Shuffle->getMask().slice(N: ExtOffset + VT.getVectorNumElements() / 2,
22423 M: VT.getVectorNumElements() / 2),
22424 Factor: 4, Index&: Idx);
22425 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
22426 return SDValue();
22427 SDLoc DL(N);
22428 SDValue BC1 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
22429 Operand: Shuffle->getOperand(Num: IsUndefDeInterleave ? 1 : 0));
22430 SDValue BC2 = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
22431 Operand: Shuffle->getOperand(Num: IsUndefDeInterleave ? 0 : 1));
22432 SDValue UZP = DAG.getNode(Opcode: Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
22433 VT, N1: BC1, N2: BC2);
22434 if ((Idx & 1) == 1)
22435 UZP = DAG.getNode(Opcode: ISD::SRL, DL, VT, N1: UZP,
22436 N2: DAG.getConstant(Val: InVT.getScalarSizeInBits(), DL, VT));
22437 return DAG.getNode(
22438 Opcode: ISD::AND, DL, VT, N1: UZP,
22439 N2: DAG.getConstant(Val: (1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
22440}
22441
22442// This comes up similarly to the above when lowering deinterleaving shuffles
22443// from zexts. We have legalized the operations in the general case to
22444// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
22445// the extract is to the low half and the uzp is uzp1. There would be an extra
22446// shift if the uzp was uzp2 to grab the upper half. Due to the combine above
22447// there could also be an existing and / shift that can be combined in, either
22448// before or after the extract.
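// One accepted shape, shown here only as an illustration:
//   zext(v4i16 extract_subvector(uzp1(v8i16 x, v8i16 y), 0)) : v4i32
// becomes
//   and(nvcast(x) : v4i32, 0xffff)
// while a uzp2 source needs an extra right shift by the narrow element width
// to move the selected high halves down first.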
22449static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
22450 EVT VT = N->getValueType(ResNo: 0);
22451 if (N->getOpcode() != ISD::ZERO_EXTEND ||
22452 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
22453 return SDValue();
22454
22455 SDValue Op = N->getOperand(Num: 0);
22456 unsigned ExtOffset = (unsigned)-1;
22457 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22458 ExtOffset = Op.getConstantOperandVal(i: 1);
22459 Op = Op.getOperand(i: 0);
22460 }
22461
22462 unsigned Shift = 0;
22463 APInt Mask = APInt::getLowBitsSet(numBits: VT.getScalarSizeInBits(),
22464 loBitsSet: Op.getValueType().getScalarSizeInBits());
22465
22466 if (Op.getOpcode() == AArch64ISD::VLSHR) {
22467 Shift = Op.getConstantOperandVal(i: 1);
22468 Op = Op.getOperand(i: 0);
22469 Mask = Mask.lshr(shiftAmt: Shift);
22470 }
22471 if (Op.getOpcode() == ISD::AND &&
22472 ISD::isConstantSplatVector(N: Op.getOperand(i: 1).getNode(), SplatValue&: Mask)) {
22473 Op = Op.getOperand(i: 0);
22474 Mask = Mask.zext(width: VT.getScalarSizeInBits());
22475 } else if (Op.getOpcode() == AArch64ISD::BICi) {
22476 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
22477 Op.getConstantOperandVal(i: 1) << Op.getConstantOperandVal(i: 2));
22478 Mask = Mask.zext(width: VT.getScalarSizeInBits());
22479 Op = Op.getOperand(i: 0);
22480 }
22481
22482 if (ExtOffset == (unsigned)-1) {
22483 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22484 ExtOffset = Op.getConstantOperandVal(i: 1);
22485 Op = Op.getOperand(i: 0);
22486 } else
22487 return SDValue();
22488 }
22489 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
22490 return SDValue();
22491
22492 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
22493 return SDValue();
22494 if (Op.getOpcode() == AArch64ISD::UZP2)
22495 Shift += VT.getScalarSizeInBits() / 2;
22496
22497 SDLoc DL(N);
22498 SDValue BC = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT,
22499 Operand: Op.getOperand(i: ExtOffset == 0 ? 0 : 1));
22500 if (Shift != 0)
22501 BC = DAG.getNode(Opcode: AArch64ISD::VLSHR, DL, VT, N1: BC,
22502 N2: DAG.getConstant(Val: Shift, DL, VT: MVT::i32));
22503 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: BC, N2: DAG.getConstant(Val: Mask, DL, VT));
22504}
22505
22506static SDValue performExtendCombine(SDNode *N,
22507 TargetLowering::DAGCombinerInfo &DCI,
22508 SelectionDAG &DAG) {
22509 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
22510 // we can convert that DUP into another extract_high (of a bigger DUP), which
22511 // helps the backend to decide that an sabdl2 would be useful, saving a real
22512 // extract_high operation.
22513 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
22514 N->getOperand(Num: 0).getValueType().is64BitVector() &&
22515 (N->getOperand(Num: 0).getOpcode() == ISD::ABDU ||
22516 N->getOperand(Num: 0).getOpcode() == ISD::ABDS)) {
22517 SDNode *ABDNode = N->getOperand(Num: 0).getNode();
22518 SDValue NewABD =
22519 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N: ABDNode, DCI, DAG);
22520 if (!NewABD.getNode())
22521 return SDValue();
22522
22523 return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: NewABD);
22524 }
22525
22526 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
22527 return R;
22528 if (SDValue R = performZExtUZPCombine(N, DAG))
22529 return R;
22530
22531 if (N->getValueType(ResNo: 0).isFixedLengthVector() &&
22532 N->getOpcode() == ISD::SIGN_EXTEND &&
22533 N->getOperand(Num: 0)->getOpcode() == ISD::SETCC)
22534 return performSignExtendSetCCCombine(N, DCI, DAG);
22535
22536 // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
22537 // that the top half of the result register must be unused, due to the
22538 // any_extend. This means that we can replace this pattern with (rev16
22539 // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
22540 // ...)), which is what this pattern would otherwise be lowered to.
22541  // Only apply this optimisation if the any_extend in the original pattern
22542  // extends to i32 or i64, because this type will become the input type to
22543  // REV16 in the new pattern, so it must be a legitimate REV16 input type.
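  // Hand-written illustration of the difference (not compiler output): the
  // original pattern tends to lower to "rev w8, w8; lsr w8, w8, #16", whereas
  // "rev16 w8, w8" alone produces the same low 16 bits and leaves the dead top
  // half unspecified.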
22544 SDValue Bswap = N->getOperand(Num: 0);
22545 if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
22546 Bswap.getValueType() == MVT::i16 &&
22547 (N->getValueType(ResNo: 0) == MVT::i32 || N->getValueType(ResNo: 0) == MVT::i64)) {
22548 SDLoc DL(N);
22549 SDValue NewAnyExtend = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: N->getValueType(ResNo: 0),
22550 Operand: Bswap->getOperand(Num: 0));
22551 return DAG.getNode(Opcode: AArch64ISD::REV16, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
22552 Operand: NewAnyExtend);
22553 }
22554
22555 return SDValue();
22556}
22557
22558static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
22559 SDValue SplatVal, unsigned NumVecElts) {
22560 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
22561 Align OrigAlignment = St.getAlign();
22562 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
22563
22564 // Create scalar stores. This is at least as good as the code sequence for a
22565 // split unaligned store which is a dup.s, ext.b, and two stores.
22566 // Most of the time the three stores should be replaced by store pair
22567 // instructions (stp).
22568 SDLoc DL(&St);
22569 SDValue BasePtr = St.getBasePtr();
22570 uint64_t BaseOffset = 0;
22571
22572 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
22573 SDValue NewST1 =
22574 DAG.getStore(Chain: St.getChain(), dl: DL, Val: SplatVal, Ptr: BasePtr, PtrInfo,
22575 Alignment: OrigAlignment, MMOFlags: St.getMemOperand()->getFlags());
22576
22577  // As this is in ISel, we will not merge this add, which may degrade results.
22578 if (BasePtr->getOpcode() == ISD::ADD &&
22579 isa<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))) {
22580 BaseOffset = cast<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))->getSExtValue();
22581 BasePtr = BasePtr->getOperand(Num: 0);
22582 }
22583
22584 unsigned Offset = EltOffset;
22585 while (--NumVecElts) {
22586 Align Alignment = commonAlignment(A: OrigAlignment, Offset);
22587 SDValue OffsetPtr =
22588 DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr,
22589 N2: DAG.getConstant(Val: BaseOffset + Offset, DL, VT: MVT::i64));
22590 NewST1 = DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SplatVal, Ptr: OffsetPtr,
22591 PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment,
22592 MMOFlags: St.getMemOperand()->getFlags());
22593 Offset += EltOffset;
22594 }
22595 return NewST1;
22596}
22597
22598// Returns an SVE type that ContentTy can be trivially sign or zero extended
22599// into.
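// For example, nxv2i16 and nxv2f32 both map to the nxv2i64 container, and
// nxv8i8 maps to nxv8i16: the element count is preserved and only the element
// width grows to fill a full SVE register.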
22600static MVT getSVEContainerType(EVT ContentTy) {
22601 assert(ContentTy.isSimple() && "No SVE containers for extended types");
22602
22603 switch (ContentTy.getSimpleVT().SimpleTy) {
22604 default:
22605 llvm_unreachable("No known SVE container for this MVT type");
22606 case MVT::nxv2i8:
22607 case MVT::nxv2i16:
22608 case MVT::nxv2i32:
22609 case MVT::nxv2i64:
22610 case MVT::nxv2f32:
22611 case MVT::nxv2f64:
22612 return MVT::nxv2i64;
22613 case MVT::nxv4i8:
22614 case MVT::nxv4i16:
22615 case MVT::nxv4i32:
22616 case MVT::nxv4f32:
22617 return MVT::nxv4i32;
22618 case MVT::nxv8i8:
22619 case MVT::nxv8i16:
22620 case MVT::nxv8f16:
22621 case MVT::nxv8bf16:
22622 return MVT::nxv8i16;
22623 case MVT::nxv16i8:
22624 return MVT::nxv16i8;
22625 }
22626}
22627
22628static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
22629 SDLoc DL(N);
22630 EVT VT = N->getValueType(ResNo: 0);
22631
22632 if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
22633 return SDValue();
22634
22635 EVT ContainerVT = VT;
22636 if (ContainerVT.isInteger())
22637 ContainerVT = getSVEContainerType(ContentTy: ContainerVT);
22638
22639 SDVTList VTs = DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other);
22640 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
22641 N->getOperand(Num: 2), // Pg
22642 N->getOperand(Num: 3), // Base
22643 DAG.getValueType(VT) };
22644
22645 SDValue Load = DAG.getNode(Opcode: Opc, DL, VTList: VTs, Ops);
22646 SDValue LoadChain = SDValue(Load.getNode(), 1);
22647
22648 if (ContainerVT.isInteger() && (VT != ContainerVT))
22649 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Load.getValue(R: 0));
22650
22651 return DAG.getMergeValues(Ops: { Load, LoadChain }, dl: DL);
22652}
22653
22654static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
22655 SDLoc DL(N);
22656 EVT VT = N->getValueType(ResNo: 0);
22657 EVT PtrTy = N->getOperand(Num: 3).getValueType();
22658
22659 EVT LoadVT = VT;
22660 if (VT.isFloatingPoint())
22661 LoadVT = VT.changeTypeToInteger();
22662
22663 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
22664 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT: LoadVT);
22665 SDValue L = DAG.getMaskedLoad(VT: LoadVT, dl: DL, Chain: MINode->getChain(),
22666 Base: MINode->getOperand(Num: 3), Offset: DAG.getUNDEF(VT: PtrTy),
22667 Mask: MINode->getOperand(Num: 2), Src0: PassThru,
22668 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
22669 AM: ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding: false);
22670
22671 if (VT.isFloatingPoint()) {
22672 SDValue Ops[] = { DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: L), L.getValue(R: 1) };
22673 return DAG.getMergeValues(Ops, dl: DL);
22674 }
22675
22676 return L;
22677}
22678
22679template <unsigned Opcode>
22680static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
22681 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
22682 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
22683 "Unsupported opcode.");
22684 SDLoc DL(N);
22685 EVT VT = N->getValueType(ResNo: 0);
22686
22687 EVT LoadVT = VT;
22688 if (VT.isFloatingPoint())
22689 LoadVT = VT.changeTypeToInteger();
22690
22691 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 2), N->getOperand(Num: 3)};
22692 SDValue Load = DAG.getNode(Opcode, DL, ResultTys: {LoadVT, MVT::Other}, Ops);
22693 SDValue LoadChain = SDValue(Load.getNode(), 1);
22694
22695 if (VT.isFloatingPoint())
22696 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Load.getValue(R: 0));
22697
22698 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
22699}
22700
22701static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
22702 SDLoc DL(N);
22703 SDValue Data = N->getOperand(Num: 2);
22704 EVT DataVT = Data.getValueType();
22705 EVT HwSrcVt = getSVEContainerType(ContentTy: DataVT);
22706 SDValue InputVT = DAG.getValueType(DataVT);
22707
22708 if (DataVT.isFloatingPoint())
22709 InputVT = DAG.getValueType(HwSrcVt);
22710
22711 SDValue SrcNew;
22712 if (Data.getValueType().isFloatingPoint())
22713 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Data);
22714 else
22715 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Data);
22716
22717 SDValue Ops[] = { N->getOperand(Num: 0), // Chain
22718 SrcNew,
22719 N->getOperand(Num: 4), // Base
22720 N->getOperand(Num: 3), // Pg
22721 InputVT
22722 };
22723
22724 return DAG.getNode(Opcode: AArch64ISD::ST1_PRED, DL, VT: N->getValueType(ResNo: 0), Ops);
22725}
22726
22727static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
22728 SDLoc DL(N);
22729
22730 SDValue Data = N->getOperand(Num: 2);
22731 EVT DataVT = Data.getValueType();
22732 EVT PtrTy = N->getOperand(Num: 4).getValueType();
22733
22734 if (DataVT.isFloatingPoint())
22735 Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DataVT.changeTypeToInteger(), Operand: Data);
22736
22737 auto *MINode = cast<MemIntrinsicSDNode>(Val: N);
22738 return DAG.getMaskedStore(Chain: MINode->getChain(), dl: DL, Val: Data, Base: MINode->getOperand(Num: 4),
22739 Offset: DAG.getUNDEF(VT: PtrTy), Mask: MINode->getOperand(Num: 3),
22740 MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(),
22741 AM: ISD::UNINDEXED, IsTruncating: false, IsCompressing: false);
22742}
22743
22744/// Replace a vector store of a zero splat by scalar stores of WZR/XZR. The
22745/// load/store optimizer pass will merge them into store pair instructions. This
22746/// should be better than a movi to create the vector zero followed by a vector
22747/// store if the zero constant is not re-used, since one instruction and one
22748/// register live range will be removed.
22749///
22750/// For example, the final generated code should be:
22751///
22752/// stp xzr, xzr, [x0]
22753///
22754/// instead of:
22755///
22756/// movi v0.2d, #0
22757/// str q0, [x0]
22758///
22759static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22760 SDValue StVal = St.getValue();
22761 EVT VT = StVal.getValueType();
22762
22763 // Avoid scalarizing zero splat stores for scalable vectors.
22764 if (VT.isScalableVector())
22765 return SDValue();
22766
22767 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
22768 // 2, 3 or 4 i32 elements.
22769 int NumVecElts = VT.getVectorNumElements();
22770 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
22771 VT.getVectorElementType().getSizeInBits() == 64) ||
22772 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
22773 VT.getVectorElementType().getSizeInBits() == 32)))
22774 return SDValue();
22775
22776 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
22777 return SDValue();
22778
22779 // If the zero constant has more than one use then the vector store could be
22780 // better since the constant mov will be amortized and stp q instructions
22781 // should be able to be formed.
22782 if (!StVal.hasOneUse())
22783 return SDValue();
22784
22785 // If the store is truncating then it's going down to i16 or smaller, which
22786 // means it can be implemented in a single store anyway.
22787 if (St.isTruncatingStore())
22788 return SDValue();
22789
22790 // If the immediate offset of the address operand is too large for the stp
22791 // instruction, then bail out.
22792 if (DAG.isBaseWithConstantOffset(Op: St.getBasePtr())) {
22793 int64_t Offset = St.getBasePtr()->getConstantOperandVal(Num: 1);
22794 if (Offset < -512 || Offset > 504)
22795 return SDValue();
22796 }
22797
22798 for (int I = 0; I < NumVecElts; ++I) {
22799 SDValue EltVal = StVal.getOperand(i: I);
22800 if (!isNullConstant(V: EltVal) && !isNullFPConstant(V: EltVal))
22801 return SDValue();
22802 }
22803
22804 // Use a CopyFromReg WZR/XZR here to prevent
22805 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
22806 SDLoc DL(&St);
22807 unsigned ZeroReg;
22808 EVT ZeroVT;
22809 if (VT.getVectorElementType().getSizeInBits() == 32) {
22810 ZeroReg = AArch64::WZR;
22811 ZeroVT = MVT::i32;
22812 } else {
22813 ZeroReg = AArch64::XZR;
22814 ZeroVT = MVT::i64;
22815 }
22816 SDValue SplatVal =
22817 DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ZeroReg, VT: ZeroVT);
22818 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22819}
22820
22821/// Replace a vector store of a splatted scalar by scalar stores of the scalar
22822/// value. The load/store optimizer pass will merge them into store pair
22823/// instructions. This has better performance than a splat of the scalar
22824/// followed by a split vector store. Even if the stores are not merged it is
22825/// four stores vs. a dup, followed by an ext.b and two stores.
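/// For example (illustrative only), storing a v4i32 splat of w0 becomes four
/// "str w0" stores at consecutive offsets, which the load/store optimizer
/// normally merges into two "stp w0, w0" instructions.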
22826static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
22827 SDValue StVal = St.getValue();
22828 EVT VT = StVal.getValueType();
22829
22830  // Don't replace floating point stores; they possibly won't be transformed to
22831  // stp because of the store pair suppress pass.
22832 if (VT.isFloatingPoint())
22833 return SDValue();
22834
22835 // We can express a splat as store pair(s) for 2 or 4 elements.
22836 unsigned NumVecElts = VT.getVectorNumElements();
22837 if (NumVecElts != 4 && NumVecElts != 2)
22838 return SDValue();
22839
22840 // If the store is truncating then it's going down to i16 or smaller, which
22841 // means it can be implemented in a single store anyway.
22842 if (St.isTruncatingStore())
22843 return SDValue();
22844
22845 // Check that this is a splat.
22846  // Make sure that each of the relevant vector element locations is inserted
22847  // to, i.e. 0 and 1 for v2i64 and 0, 1, 2 and 3 for v4i32.
22848 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
22849 SDValue SplatVal;
22850 for (unsigned I = 0; I < NumVecElts; ++I) {
22851 // Check for insert vector elements.
22852 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
22853 return SDValue();
22854
22855 // Check that same value is inserted at each vector element.
22856 if (I == 0)
22857 SplatVal = StVal.getOperand(i: 1);
22858 else if (StVal.getOperand(i: 1) != SplatVal)
22859 return SDValue();
22860
22861 // Check insert element index.
22862 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(Val: StVal.getOperand(i: 2));
22863 if (!CIndex)
22864 return SDValue();
22865 uint64_t IndexVal = CIndex->getZExtValue();
22866 if (IndexVal >= NumVecElts)
22867 return SDValue();
22868 IndexNotInserted.reset(position: IndexVal);
22869
22870 StVal = StVal.getOperand(i: 0);
22871 }
22872 // Check that all vector element locations were inserted to.
22873 if (IndexNotInserted.any())
22874 return SDValue();
22875
22876 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
22877}
22878
22879static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
22880 SelectionDAG &DAG,
22881 const AArch64Subtarget *Subtarget) {
22882
22883 StoreSDNode *S = cast<StoreSDNode>(Val: N);
22884 if (S->isVolatile() || S->isIndexed())
22885 return SDValue();
22886
22887 SDValue StVal = S->getValue();
22888 EVT VT = StVal.getValueType();
22889
22890 if (!VT.isFixedLengthVector())
22891 return SDValue();
22892
22893 // If we get a splat of zeros, convert this vector store to a store of
22894 // scalars. They will be merged into store pairs of xzr thereby removing one
22895 // instruction and one register.
22896 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, St&: *S))
22897 return ReplacedZeroSplat;
22898
22899 // FIXME: The logic for deciding if an unaligned store should be split should
22900 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
22901 // a call to that function here.
22902
22903 if (!Subtarget->isMisaligned128StoreSlow())
22904 return SDValue();
22905
22906 // Don't split at -Oz.
22907 if (DAG.getMachineFunction().getFunction().hasMinSize())
22908 return SDValue();
22909
22910 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
22911 // those up regresses performance on micro-benchmarks and olden/bh.
22912 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
22913 return SDValue();
22914
22915 // Split unaligned 16B stores. They are terrible for performance.
22916 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
22917 // extensions can use this to mark that it does not want splitting to happen
22918 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
22919 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
22920 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
22921 S->getAlign() <= Align(2))
22922 return SDValue();
22923
22924 // If we get a splat of a scalar convert this vector store to a store of
22925 // scalars. They will be merged into store pairs thereby removing two
22926 // instructions.
22927 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, St&: *S))
22928 return ReplacedSplat;
22929
22930 SDLoc DL(S);
22931
22932 // Split VT into two.
22933 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
22934 unsigned NumElts = HalfVT.getVectorNumElements();
22935 SDValue SubVector0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal,
22936 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
22937 SDValue SubVector1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal,
22938 N2: DAG.getConstant(Val: NumElts, DL, VT: MVT::i64));
22939 SDValue BasePtr = S->getBasePtr();
22940 SDValue NewST1 =
22941 DAG.getStore(Chain: S->getChain(), dl: DL, Val: SubVector0, Ptr: BasePtr, PtrInfo: S->getPointerInfo(),
22942 Alignment: S->getAlign(), MMOFlags: S->getMemOperand()->getFlags());
22943 SDValue OffsetPtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr,
22944 N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64));
22945 return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SubVector1, Ptr: OffsetPtr,
22946 PtrInfo: S->getPointerInfo(), Alignment: S->getAlign(),
22947 MMOFlags: S->getMemOperand()->getFlags());
22948}
22949
22950static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
22951 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22952
22953 // splice(pg, op1, undef) -> op1
22954 if (N->getOperand(Num: 2).isUndef())
22955 return N->getOperand(Num: 1);
22956
22957 return SDValue();
22958}
22959
22960static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22961 const AArch64Subtarget *Subtarget) {
22962 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22963 N->getOpcode() == AArch64ISD::UUNPKLO) &&
22964 "Unexpected Opcode!");
22965
22966 // uunpklo/hi undef -> undef
22967 if (N->getOperand(Num: 0).isUndef())
22968 return DAG.getUNDEF(VT: N->getValueType(ResNo: 0));
22969
22970 // If this is a masked load followed by an UUNPKLO, fold this into a masked
22971 // extending load. We can do this even if this is already a masked
22972 // {z,}extload.
22973 if (N->getOperand(Num: 0).getOpcode() == ISD::MLOAD &&
22974 N->getOpcode() == AArch64ISD::UUNPKLO) {
22975 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(Val: N->getOperand(Num: 0));
22976 SDValue Mask = MLD->getMask();
22977 SDLoc DL(N);
22978
22979 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22980 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22981 (MLD->getPassThru()->isUndef() ||
22982 isZerosVector(N: MLD->getPassThru().getNode()))) {
22983 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22984 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
22985 EVT VT = N->getValueType(ResNo: 0);
22986
22987 // Ensure we can double the size of the predicate pattern
22988 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
22989 if (NumElts &&
22990 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22991 Mask =
22992 getPTrue(DAG, DL, VT: VT.changeVectorElementType(EltVT: MVT::i1), Pattern: PgPattern);
22993 SDValue PassThru = DAG.getConstant(Val: 0, DL, VT);
22994 SDValue NewLoad = DAG.getMaskedLoad(
22995 VT, dl: DL, Chain: MLD->getChain(), Base: MLD->getBasePtr(), Offset: MLD->getOffset(), Mask,
22996 Src0: PassThru, MemVT: MLD->getMemoryVT(), MMO: MLD->getMemOperand(),
22997 AM: MLD->getAddressingMode(), ISD::ZEXTLOAD);
22998
22999 DAG.ReplaceAllUsesOfValueWith(From: SDValue(MLD, 1), To: NewLoad.getValue(R: 1));
23000
23001 return NewLoad;
23002 }
23003 }
23004 }
23005
23006 return SDValue();
23007}
23008
23009static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
23010 if (N->getOpcode() != AArch64ISD::UZP1)
23011 return false;
23012 SDValue Op0 = N->getOperand(Num: 0);
23013 EVT SrcVT = Op0->getValueType(ResNo: 0);
23014 EVT DstVT = N->getValueType(ResNo: 0);
23015 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
23016 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
23017 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
23018}
23019
23020// Try to combine rounding shifts where the operands come from an extend, and
23021// the result is truncated and combined into one vector.
23022// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
23023static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
23024 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
23025 SDValue Op0 = N->getOperand(Num: 0);
23026 SDValue Op1 = N->getOperand(Num: 1);
23027 EVT ResVT = N->getValueType(ResNo: 0);
23028
23029 unsigned RshOpc = Op0.getOpcode();
23030 if (RshOpc != AArch64ISD::RSHRNB_I)
23031 return SDValue();
23032
23033 // Same op code and imm value?
23034 SDValue ShiftValue = Op0.getOperand(i: 1);
23035 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(i: 1))
23036 return SDValue();
23037
23038 // Same unextended operand value?
23039 SDValue Lo = Op0.getOperand(i: 0);
23040 SDValue Hi = Op1.getOperand(i: 0);
23041 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
23042 Hi.getOpcode() != AArch64ISD::UUNPKHI)
23043 return SDValue();
23044 SDValue OrigArg = Lo.getOperand(i: 0);
23045 if (OrigArg != Hi.getOperand(i: 0))
23046 return SDValue();
23047
23048 SDLoc DL(N);
23049 return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT: ResVT,
23050 N1: getPredicateForVector(DAG, DL, VT: ResVT), N2: OrigArg,
23051 N3: ShiftValue);
23052}
23053
23054// Try to simplify:
23055// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
23056// t2 = nxv8i16 srl(t1, ShiftValue)
23057// to
23058// t1 = nxv8i16 rshrnb(X, ShiftValue).
23059// rshrnb will zero the top half bits of each element. Therefore, this combine
23060// should only be performed when a following instruction with the rshrnb
23061// as an operand does not care about the top half of each element. For example,
23062// a uzp1 or a truncating store.
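// A concrete instance, assuming ShiftValue == 4 on nxv8i16:
//   srl(add(X, 8), 4)  ==>  nvcast(rshrnb(X, 4))
// which computes (X + 8) >> 4 into the bottom byte of each 16-bit element and
// zeroes the top byte, hence the restriction to users that ignore that half.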
23063static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
23064 const AArch64Subtarget *Subtarget) {
23065 EVT VT = Srl->getValueType(ResNo: 0);
23066 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
23067 return SDValue();
23068
23069 EVT ResVT;
23070 if (VT == MVT::nxv8i16)
23071 ResVT = MVT::nxv16i8;
23072 else if (VT == MVT::nxv4i32)
23073 ResVT = MVT::nxv8i16;
23074 else if (VT == MVT::nxv2i64)
23075 ResVT = MVT::nxv4i32;
23076 else
23077 return SDValue();
23078
23079 SDLoc DL(Srl);
23080 unsigned ShiftValue;
23081 SDValue RShOperand;
23082 if (!canLowerSRLToRoundingShiftForVT(Shift: Srl, ResVT, DAG, ShiftValue, RShOperand))
23083 return SDValue();
23084 SDValue Rshrnb = DAG.getNode(
23085 Opcode: AArch64ISD::RSHRNB_I, DL, VT: ResVT,
23086 Ops: {RShOperand, DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32)});
23087 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: Rshrnb);
23088}
23089
23090static SDValue isNVCastToHalfWidthElements(SDValue V) {
23091 if (V.getOpcode() != AArch64ISD::NVCAST)
23092 return SDValue();
23093
23094 SDValue Op = V.getOperand(i: 0);
23095 if (!Op.getValueType().isVector() ||
23096 V.getValueType().getVectorElementCount() !=
23097 Op.getValueType().getVectorElementCount() * 2)
23098 return SDValue();
23099
23100 return Op;
23101}
23102
23103static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
23104 const AArch64Subtarget *Subtarget) {
23105 SDLoc DL(N);
23106 SDValue Op0 = N->getOperand(Num: 0);
23107 SDValue Op1 = N->getOperand(Num: 1);
23108 EVT ResVT = N->getValueType(ResNo: 0);
23109
23110 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
23111 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23112 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23113 Op0.getOperand(i: 0) == Op1.getOperand(i: 0)) {
23114
23115 SDValue SourceVec = Op0.getOperand(i: 0);
23116 uint64_t ExtIdx0 = Op0.getConstantOperandVal(i: 1);
23117 uint64_t ExtIdx1 = Op1.getConstantOperandVal(i: 1);
23118 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
23119 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
23120 EVT OpVT = Op0.getOperand(i: 1).getValueType();
23121 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
23122 SDValue Uzp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: WidenedResVT, N1: SourceVec,
23123 N2: DAG.getUNDEF(VT: WidenedResVT));
23124 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: ResVT, N1: Uzp,
23125 N2: DAG.getConstant(Val: 0, DL, VT: OpVT));
23126 }
23127 }
23128
23129 // Following optimizations only work with uzp1.
23130 if (N->getOpcode() == AArch64ISD::UZP2)
23131 return SDValue();
23132
23133 // uzp1(x, undef) -> concat(truncate(x), undef)
23134 if (Op1.getOpcode() == ISD::UNDEF) {
23135 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
23136 switch (ResVT.getSimpleVT().SimpleTy) {
23137 default:
23138 break;
23139 case MVT::v16i8:
23140 BCVT = MVT::v8i16;
23141 HalfVT = MVT::v8i8;
23142 break;
23143 case MVT::v8i16:
23144 BCVT = MVT::v4i32;
23145 HalfVT = MVT::v4i16;
23146 break;
23147 case MVT::v4i32:
23148 BCVT = MVT::v2i64;
23149 HalfVT = MVT::v2i32;
23150 break;
23151 }
23152 if (BCVT != MVT::Other) {
23153 SDValue BC = DAG.getBitcast(VT: BCVT, V: Op0);
23154 SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: BC);
23155 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ResVT, N1: Trunc,
23156 N2: DAG.getUNDEF(VT: HalfVT));
23157 }
23158 }
23159
23160 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
23161 return Urshr;
23162
23163 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op0)) {
23164 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: PreCast, DAG, Subtarget)) {
23165 Rshrnb = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: ResVT, Operand: Rshrnb);
23166 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Rshrnb, N2: Op1);
23167 }
23168 }
23169
23170 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op1)) {
23171 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: PreCast, DAG, Subtarget)) {
23172 Rshrnb = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: ResVT, Operand: Rshrnb);
23173 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Rshrnb);
23174 }
23175 }
23176
23177 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
23178 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op0)) {
23179 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
23180 if (PreCast.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
23181 SDValue X = PreCast.getOperand(i: 0).getOperand(i: 0);
23182 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: X, N2: Op1);
23183 }
23184 }
23185 }
23186
23187 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
23188 if (SDValue PreCast = isNVCastToHalfWidthElements(V: Op1)) {
23189 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
23190 if (PreCast.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) {
23191 SDValue Z = PreCast.getOperand(i: 0).getOperand(i: 1);
23192 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Z);
23193 }
23194 }
23195 }
23196
23197 // These optimizations only work on little endian.
23198 if (!DAG.getDataLayout().isLittleEndian())
23199 return SDValue();
23200
23201 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
23202 // Example:
23203 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
23204 // to
23205 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
23206 if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) &&
23207 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
23208 if (Op0.getOperand(i: 0).getValueType() == Op1.getOperand(i: 0).getValueType()) {
23209 return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0.getOperand(i: 0),
23210 N2: Op1.getOperand(i: 0));
23211 }
23212 }
23213
23214 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
23215 return SDValue();
23216
23217 SDValue SourceOp0 = peekThroughBitcasts(V: Op0);
23218 SDValue SourceOp1 = peekThroughBitcasts(V: Op1);
23219
23220 // truncating uzp1(x, y) -> xtn(concat (x, y))
23221 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
23222 EVT Op0Ty = SourceOp0.getValueType();
23223 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
23224 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
23225 SDValue Concat =
23226 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL,
23227 VT: Op0Ty.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()),
23228 N1: SourceOp0, N2: SourceOp1);
23229 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Concat);
23230 }
23231 }
23232
23233 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
23234 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
23235 SourceOp1.getOpcode() != ISD::TRUNCATE)
23236 return SDValue();
23237 SourceOp0 = SourceOp0.getOperand(i: 0);
23238 SourceOp1 = SourceOp1.getOperand(i: 0);
23239
23240 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
23241 !SourceOp0.getValueType().isSimple())
23242 return SDValue();
23243
23244 EVT ResultTy;
23245
23246 switch (SourceOp0.getSimpleValueType().SimpleTy) {
23247 case MVT::v2i64:
23248 ResultTy = MVT::v4i32;
23249 break;
23250 case MVT::v4i32:
23251 ResultTy = MVT::v8i16;
23252 break;
23253 case MVT::v8i16:
23254 ResultTy = MVT::v16i8;
23255 break;
23256 default:
23257 return SDValue();
23258 }
23259
23260 SDValue UzpOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp0);
23261 SDValue UzpOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp1);
23262 SDValue UzpResult =
23263 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UzpOp0.getValueType(), N1: UzpOp0, N2: UzpOp1);
23264
23265 EVT BitcastResultTy;
23266
23267 switch (ResVT.getSimpleVT().SimpleTy) {
23268 case MVT::v2i32:
23269 BitcastResultTy = MVT::v2i64;
23270 break;
23271 case MVT::v4i16:
23272 BitcastResultTy = MVT::v4i32;
23273 break;
23274 case MVT::v8i8:
23275 BitcastResultTy = MVT::v8i16;
23276 break;
23277 default:
23278 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
23279 }
23280
23281 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT,
23282 Operand: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BitcastResultTy, Operand: UzpResult));
23283}
23284
23285static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
23286 unsigned Opc = N->getOpcode();
23287
23288 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
23289 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23290 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
23291 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
23292 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
23293 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
23294 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
23295 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
23296
23297 SDLoc DL(N);
23298 SDValue Chain = N->getOperand(Num: 0);
23299 SDValue Pg = N->getOperand(Num: 1);
23300 SDValue Base = N->getOperand(Num: 2);
23301 SDValue Offset = N->getOperand(Num: 3);
23302 SDValue Ty = N->getOperand(Num: 4);
23303
23304 EVT ResVT = N->getValueType(ResNo: 0);
23305
23306 const auto OffsetOpc = Offset.getOpcode();
23307 const bool OffsetIsZExt =
23308 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
23309 const bool OffsetIsSExt =
23310 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
23311
23312 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
23313 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
23314 SDValue ExtPg = Offset.getOperand(i: 0);
23315 VTSDNode *ExtFrom = cast<VTSDNode>(Val: Offset.getOperand(i: 2).getNode());
23316 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
23317
23318    // If the predicate for the sign- or zero-extended offset is the
23319    // same as the predicate used for this load and the sign-/zero-extension
23320    // was from a 32-bit integer...
23321 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
23322 SDValue UnextendedOffset = Offset.getOperand(i: 1);
23323
23324 unsigned NewOpc = getGatherVecOpcode(IsScaled: Scaled, IsSigned: OffsetIsSExt, NeedsExtend: true);
23325 if (Signed)
23326 NewOpc = getSignExtendedGatherOpcode(Opcode: NewOpc);
23327
23328 return DAG.getNode(Opcode: NewOpc, DL, ResultTys: {ResVT, MVT::Other},
23329 Ops: {Chain, Pg, Base, UnextendedOffset, Ty});
23330 }
23331 }
23332
23333 return SDValue();
23334}
23335
23336/// Optimize a vector shift instruction and its operand if shifted out
23337/// bits are not used.
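/// For example, (VASHR (VSHL x, #8), #8) on v8i16 is a sign_extend_inreg of the
/// low 8 bits and can be removed entirely when x already has more than 8 sign
/// bits. More generally, the low ShiftImm bits of a right-shift operand are not
/// demanded, which lets SimplifyDemandedBits clean up the producer.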
23338static SDValue performVectorShiftCombine(SDNode *N,
23339 const AArch64TargetLowering &TLI,
23340 TargetLowering::DAGCombinerInfo &DCI) {
23341 assert(N->getOpcode() == AArch64ISD::VASHR ||
23342 N->getOpcode() == AArch64ISD::VLSHR);
23343
23344 SDValue Op = N->getOperand(Num: 0);
23345 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
23346
23347 unsigned ShiftImm = N->getConstantOperandVal(Num: 1);
23348 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
23349
23350  // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
23351 if (N->getOpcode() == AArch64ISD::VASHR &&
23352 Op.getOpcode() == AArch64ISD::VSHL &&
23353 N->getOperand(Num: 1) == Op.getOperand(i: 1))
23354 if (DCI.DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0)) > ShiftImm)
23355 return Op.getOperand(i: 0);
23356
23357 // If the shift is exact, the shifted out bits matter.
23358 if (N->getFlags().hasExact())
23359 return SDValue();
23360
23361 APInt ShiftedOutBits = APInt::getLowBitsSet(numBits: OpScalarSize, loBitsSet: ShiftImm);
23362 APInt DemandedMask = ~ShiftedOutBits;
23363
23364 if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI))
23365 return SDValue(N, 0);
23366
23367 return SDValue();
23368}
23369
23370static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
23371 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
23372 // This transform works in partnership with performSetCCPunpkCombine to
23373 // remove unnecessary transfer of predicates into standard registers and back
23374 if (N->getOperand(Num: 0).getOpcode() == ISD::SIGN_EXTEND &&
23375 N->getOperand(Num: 0)->getOperand(Num: 0)->getValueType(ResNo: 0).getScalarType() ==
23376 MVT::i1) {
23377 SDValue CC = N->getOperand(Num: 0)->getOperand(Num: 0);
23378 auto VT = CC->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
23379 SDValue Unpk = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SDLoc(N), VT, N1: CC,
23380 N2: DAG.getVectorIdxConstant(Val: 0, DL: SDLoc(N)));
23381 return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: Unpk);
23382 }
23383
23384 return SDValue();
23385}
23386
23387/// Target-specific DAG combine function for post-increment LD1 (lane) and
23388/// post-increment LD1R.
23389static SDValue performPostLD1Combine(SDNode *N,
23390 TargetLowering::DAGCombinerInfo &DCI,
23391 bool IsLaneOp) {
23392 if (DCI.isBeforeLegalizeOps())
23393 return SDValue();
23394
23395 SelectionDAG &DAG = DCI.DAG;
23396 EVT VT = N->getValueType(ResNo: 0);
23397
23398 if (!VT.is128BitVector() && !VT.is64BitVector())
23399 return SDValue();
23400
23401  // If it is not a LOAD, we cannot do such a combine.
23402 unsigned LoadIdx = IsLaneOp ? 1 : 0;
23403 LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N->getOperand(Num: LoadIdx).getNode());
23404 if (!LD)
23405 return SDValue();
23406
23407 // If the Generic combiner already helped form a pre- or post-indexed load,
23408 // skip forming one here.
23409 if (LD->isIndexed())
23410 return SDValue();
23411
23412 // The vector lane must be a constant in the LD1LANE opcode.
23413 SDValue Lane;
23414 if (IsLaneOp) {
23415 Lane = N->getOperand(Num: 2);
23416 auto *LaneC = dyn_cast<ConstantSDNode>(Val&: Lane);
23417 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
23418 return SDValue();
23419 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(V: N->getOperand(Num: 0)))
23420 return SDValue();
23421 }
23422
23423 LoadSDNode *LoadSDN = cast<LoadSDNode>(Val: LD);
23424 EVT MemVT = LoadSDN->getMemoryVT();
23425 // Check if memory operand is the same type as the vector element.
23426 if (MemVT != VT.getVectorElementType())
23427 return SDValue();
23428
23429 // Check if there are other uses. If so, do not combine as it will introduce
23430 // an extra load.
23431 for (SDUse &U : LD->uses()) {
23432 if (U.getResNo() == 1) // Ignore uses of the chain result.
23433 continue;
23434 if (U.getUser() != N)
23435 return SDValue();
23436 }
23437
23438 // If there is one use and it can splat the value, prefer that operation.
23439 // TODO: This could be expanded to more operations if they reliably use the
23440 // index variants.
23441 if (N->hasOneUse()) {
23442 unsigned UseOpc = N->user_begin()->getOpcode();
23443 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
23444 return SDValue();
23445 }
23446
23447 SDValue Addr = LD->getOperand(Num: 1);
23448 SDValue Vector = N->getOperand(Num: 0);
23449 // Search for a use of the address operand that is an increment.
23450 for (SDUse &Use : Addr->uses()) {
23451 SDNode *User = Use.getUser();
23452 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
23453 continue;
23454
23455 // If the increment is a constant, it must match the memory ref size.
23456 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
23457 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
23458 uint32_t IncVal = CInc->getZExtValue();
23459 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
23460 if (IncVal != NumBytes)
23461 continue;
23462 Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
23463 }
23464
23465    // To avoid cycle construction make sure that neither the load nor the add
23466    // is a predecessor of the other or of the Vector.
23467 SmallPtrSet<const SDNode *, 32> Visited;
23468 SmallVector<const SDNode *, 16> Worklist;
23469 Visited.insert(Ptr: Addr.getNode());
23470 Worklist.push_back(Elt: User);
23471 Worklist.push_back(Elt: LD);
23472 Worklist.push_back(Elt: Vector.getNode());
23473 if (SDNode::hasPredecessorHelper(N: LD, Visited, Worklist) ||
23474 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
23475 continue;
23476
23477 SmallVector<SDValue, 8> Ops;
23478 Ops.push_back(Elt: LD->getOperand(Num: 0)); // Chain
23479 if (IsLaneOp) {
23480 Ops.push_back(Elt: Vector); // The vector to be inserted
23481 Ops.push_back(Elt: Lane); // The lane to be inserted in the vector
23482 }
23483 Ops.push_back(Elt: Addr);
23484 Ops.push_back(Elt: Inc);
23485
23486 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
23487 SDVTList SDTys = DAG.getVTList(VTs: Tys);
23488 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
23489 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOp, dl: SDLoc(N), VTList: SDTys, Ops,
23490 MemVT,
23491 MMO: LoadSDN->getMemOperand());
23492
23493 // Update the uses.
23494 SDValue NewResults[] = {
23495 SDValue(LD, 0), // The result of load
23496 SDValue(UpdN.getNode(), 2) // Chain
23497 };
23498 DCI.CombineTo(N: LD, To: NewResults);
23499 DCI.CombineTo(N, Res: SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
23500 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), 1)); // Write back register
23501
23502 break;
23503 }
23504 return SDValue();
23505}
23506
23507/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
23508/// address translation.
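/// For example, with top-byte-ignore only bits [55:0] of the address are
/// demanded, so an explicit mask such as "and x8, x8, #0x00ffffffffffffff"
/// feeding the address can usually be folded away.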
23509static bool performTBISimplification(SDValue Addr,
23510 TargetLowering::DAGCombinerInfo &DCI,
23511 SelectionDAG &DAG) {
23512 APInt DemandedMask = APInt::getLowBitsSet(numBits: 64, loBitsSet: 56);
23513 KnownBits Known;
23514 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
23515 !DCI.isBeforeLegalizeOps());
23516 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23517 if (TLI.SimplifyDemandedBits(Op: Addr, DemandedBits: DemandedMask, Known, TLO)) {
23518 DCI.CommitTargetLoweringOpt(TLO);
23519 return true;
23520 }
23521 return false;
23522}
23523
23524static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
23525 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
23526 "Expected STORE dag node in input!");
23527
23528 if (auto Store = dyn_cast<StoreSDNode>(Val: N)) {
23529 if (!Store->isTruncatingStore() || Store->isIndexed())
23530 return SDValue();
23531 SDValue Ext = Store->getValue();
23532 auto ExtOpCode = Ext.getOpcode();
23533 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
23534 ExtOpCode != ISD::ANY_EXTEND)
23535 return SDValue();
23536 SDValue Orig = Ext->getOperand(Num: 0);
23537 if (Store->getMemoryVT() != Orig.getValueType())
23538 return SDValue();
23539 return DAG.getStore(Chain: Store->getChain(), dl: SDLoc(Store), Val: Orig,
23540 Ptr: Store->getBasePtr(), MMO: Store->getMemOperand());
23541 }
23542
23543 return SDValue();
23544}
23545
23546// A custom combine to lower load <3 x i8> as the more efficient sequence
23547// below:
23548// ldrb wX, [x0, #2]
23549// ldrh wY, [x0]
23550// orr wX, wY, wX, lsl #16
23551// fmov s0, wX
23552//
23553// Note that an alternative sequence with even fewer (although usually more
23554// complex/expensive) instructions would be:
23555// ld1r.4h { v0 }, [x0], #2
23556// ld1.b { v0 }[2], [x0]
23557//
23558// Generating this sequence unfortunately results in noticeably worse codegen
23559// for code that extends the loaded v3i8, due to legalization breaking vector
23560// shuffle detection in a way that is very difficult to work around.
23561// TODO: Revisit once v3i8 legalization has been improved in general.
23562static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
23563 EVT MemVT = LD->getMemoryVT();
23564 if (MemVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3) ||
23565 LD->getBaseAlign() >= 4)
23566 return SDValue();
23567
23568 SDLoc DL(LD);
23569 MachineFunction &MF = DAG.getMachineFunction();
23570 SDValue Chain = LD->getChain();
23571 SDValue BasePtr = LD->getBasePtr();
23572 MachineMemOperand *MMO = LD->getMemOperand();
23573 assert(LD->getOffset().isUndef() && "undef offset expected");
23574
23575 // Load 2 x i8, then 1 x i8.
23576 SDValue L16 = DAG.getLoad(VT: MVT::i16, dl: DL, Chain, Ptr: BasePtr, MMO);
23577 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
23578 SDValue L8 = DAG.getLoad(VT: MVT::i8, dl: DL, Chain,
23579 Ptr: DAG.getMemBasePlusOffset(Base: BasePtr, Offset: Offset2, DL),
23580 MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
23581
23582 // Extend to i32.
23583 SDValue Ext16 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L16);
23584 SDValue Ext8 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L8);
23585
23586 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
23587 SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: Ext8,
23588 N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32));
23589 SDValue Or = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Ext16, N2: Shl);
23590 SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v4i8, Operand: Or);
23591
23592 // Extract v3i8 again.
23593 SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT, N1: Cast,
23594 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
23595 SDValue TokenFactor = DAG.getNode(
23596 Opcode: ISD::TokenFactor, DL, VT: MVT::Other,
23597 Ops: {SDValue(cast<SDNode>(Val&: L16), 1), SDValue(cast<SDNode>(Val&: L8), 1)});
23598 return DAG.getMergeValues(Ops: {Extract, TokenFactor}, dl: DL);
23599}
23600
23601// Perform TBI simplification if supported by the target and try to break up
23602// nontemporal loads larger than 256 bits for odd types, so that 256-bit LDNPQ
23603// load instructions can be selected.
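// For example (illustrative), a non-temporal load of <12 x i32> (384 bits) is
// split into a 256-bit <8 x i32> load at offset 0 plus a <4 x i32> load at
// offset 32 bytes; the remainder is inserted into an UNDEF <8 x i32>, the two
// pieces are concatenated, and the original <12 x i32> value is recovered via
// EXTRACT_SUBVECTOR at index 0.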
23604static SDValue performLOADCombine(SDNode *N,
23605 TargetLowering::DAGCombinerInfo &DCI,
23606 SelectionDAG &DAG,
23607 const AArch64Subtarget *Subtarget) {
23608 if (Subtarget->supportsAddressTopByteIgnored())
23609 performTBISimplification(Addr: N->getOperand(Num: 1), DCI, DAG);
23610
23611 LoadSDNode *LD = cast<LoadSDNode>(Val: N);
23612 EVT RegVT = LD->getValueType(ResNo: 0);
23613 EVT MemVT = LD->getMemoryVT();
23614 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23615 SDLoc DL(LD);
23616
23617 // Cast ptr32 and ptr64 pointers to the default address space before a load.
23618 unsigned AddrSpace = LD->getAddressSpace();
23619 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23620 AddrSpace == ARM64AS::PTR32_UPTR) {
23621 MVT PtrVT = TLI.getPointerTy(DL: DAG.getDataLayout());
23622 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
23623 SDValue Cast =
23624 DAG.getAddrSpaceCast(dl: DL, VT: PtrVT, Ptr: LD->getBasePtr(), SrcAS: AddrSpace, DestAS: 0);
23625 return DAG.getExtLoad(ExtType: LD->getExtensionType(), dl: DL, VT: RegVT, Chain: LD->getChain(),
23626 Ptr: Cast, PtrInfo: LD->getPointerInfo(), MemVT,
23627 Alignment: LD->getBaseAlign(),
23628 MMOFlags: LD->getMemOperand()->getFlags());
23629 }
23630 }
23631
23632 if (LD->isVolatile() || !Subtarget->isLittleEndian())
23633 return SDValue(N, 0);
23634
23635 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
23636 return Res;
23637
23638 if (!LD->isNonTemporal())
23639 return SDValue(N, 0);
23640
23641 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
23642 MemVT.getSizeInBits() % 256 == 0 ||
23643 256 % MemVT.getScalarSizeInBits() != 0)
23644 return SDValue(N, 0);
23645
23646 SDValue Chain = LD->getChain();
23647 SDValue BasePtr = LD->getBasePtr();
23648 SDNodeFlags Flags = LD->getFlags();
23649 SmallVector<SDValue, 4> LoadOps;
23650 SmallVector<SDValue, 4> LoadOpsChain;
23651 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
23652 // and a scalar/vector load of less than 256 bits. This way we can utilize
23653 // 256-bit loads and reduce the number of load instructions generated.
23654 MVT NewVT =
23655 MVT::getVectorVT(VT: MemVT.getVectorElementType().getSimpleVT(),
23656 NumElements: 256 / MemVT.getVectorElementType().getSizeInBits());
23657 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
23658 // Create all 256-bit loads, starting from offset 0 up to (Num256Loads - 1) * 32.
23659 for (unsigned I = 0; I < Num256Loads; I++) {
23660 unsigned PtrOffset = I * 32;
23661 SDValue NewPtr = DAG.getMemBasePlusOffset(
23662 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
23663 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
23664 SDValue NewLoad = DAG.getLoad(
23665 VT: NewVT, dl: DL, Chain, Ptr: NewPtr, PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset),
23666 Alignment: NewAlign, MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
23667 LoadOps.push_back(Elt: NewLoad);
23668 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: NewLoad), 1));
23669 }
23670
23671 // Process the remaining bits of the load operation.
23672 // This is done by creating an UNDEF vector that matches the size of the
23673 // 256-bit loads and inserting the remaining load into it. We extract the
23674 // original load type at the end using an EXTRACT_SUBVECTOR.
23675 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
23676 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
23677 MVT RemainingVT = MVT::getVectorVT(
23678 VT: MemVT.getVectorElementType().getSimpleVT(),
23679 NumElements: BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
23680 SDValue NewPtr = DAG.getMemBasePlusOffset(
23681 Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags);
23682 Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset);
23683 SDValue RemainingLoad =
23684 DAG.getLoad(VT: RemainingVT, dl: DL, Chain, Ptr: NewPtr,
23685 PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), Alignment: NewAlign,
23686 MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo());
23687 SDValue UndefVector = DAG.getUNDEF(VT: NewVT);
23688 SDValue InsertIdx = DAG.getVectorIdxConstant(Val: 0, DL);
23689 SDValue ExtendedRemainingLoad =
23690 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewVT,
23691 Ops: {UndefVector, RemainingLoad, InsertIdx});
23692 LoadOps.push_back(Elt: ExtendedRemainingLoad);
23693 LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: RemainingLoad), 1));
23694 EVT ConcatVT =
23695 EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getScalarType(),
23696 NumElements: LoadOps.size() * NewVT.getVectorNumElements());
23697 SDValue ConcatVectors =
23698 DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatVT, Ops: LoadOps);
23699 // Extract the original vector type size.
23700 SDValue ExtractSubVector =
23701 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT,
23702 Ops: {ConcatVectors, DAG.getVectorIdxConstant(Val: 0, DL)});
23703 SDValue TokenFactor =
23704 DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: LoadOpsChain);
23705 return DAG.getMergeValues(Ops: {ExtractSubVector, TokenFactor}, dl: DL);
23706}
23707
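// Walk up the DAG (to a small depth) to find the vector type an all-i1 boolean
// vector was originally derived from, e.g. the operand type of a SETCC or
// TRUNCATE. Returns INVALID_SIMPLE_VALUE_TYPE if no single consistent type can
// be found.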
23708static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
23709 EVT VecVT = Op.getValueType();
23710 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
23711 "Need boolean vector type.");
23712
23713 if (Depth > 3)
23714 return MVT::INVALID_SIMPLE_VALUE_TYPE;
23715
23716 // We can get the base type from a vector compare or truncate.
23717 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
23718 return Op.getOperand(i: 0).getValueType();
23719
23720 // If an operand is a bool vector, continue looking.
23721 EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
23722 for (SDValue Operand : Op->op_values()) {
23723 if (Operand.getValueType() != VecVT)
23724 continue;
23725
23726 EVT OperandVT = tryGetOriginalBoolVectorType(Op: Operand, Depth: Depth + 1);
23727 if (!BaseVT.isSimple())
23728 BaseVT = OperandVT;
23729 else if (OperandVT != BaseVT)
23730 return MVT::INVALID_SIMPLE_VALUE_TYPE;
23731 }
23732
23733 return BaseVT;
23734}
23735
23736// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
23737// iN, we can use a trick that extracts the i^th bit from the i^th element and
23738// then performs a vector reduce-add to get a scalar bitmask. This requires that
23739// each element's bits are either all 1 or all 0.
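// For example (illustrative), for a v4i32 comparison result: sign-extend so
// that each lane is all-ones or all-zeros, AND with the mask <1, 2, 4, 8> so
// lane i keeps only bit i, and VECREDUCE_ADD the lanes to obtain a 4-bit
// scalar bitmask (a value in the range [0, 15]).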
23740static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
23741 SDLoc DL(N);
23742 SDValue ComparisonResult(N, 0);
23743 EVT VecVT = ComparisonResult.getValueType();
23744 assert(VecVT.isVector() && "Must be a vector type");
23745
23746 unsigned NumElts = VecVT.getVectorNumElements();
23747 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
23748 return SDValue();
23749
23750 if (VecVT.getVectorElementType() != MVT::i1 &&
23751 !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT))
23752 return SDValue();
23753
23754 // If we can find the original types to work on instead of a vector of i1,
23755 // we can avoid extend/extract conversion instructions.
23756 if (VecVT.getVectorElementType() == MVT::i1) {
23757 VecVT = tryGetOriginalBoolVectorType(Op: ComparisonResult);
23758 if (!VecVT.isSimple()) {
23759 unsigned BitsPerElement = std::max(a: 64 / NumElts, b: 8u); // >= 64-bit vector
23760 VecVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: BitsPerElement), NumElements: NumElts);
23761 }
23762 }
23763 VecVT = VecVT.changeVectorElementTypeToInteger();
23764
23765 // Large vectors don't map directly to this conversion, so to avoid too many
23766 // edge cases, we don't apply it here. The conversion will likely still be
23767 // applied later via multiple smaller vectors, whose results are concatenated.
23768 if (VecVT.getSizeInBits() > 128)
23769 return SDValue();
23770
23771 // Ensure that all elements' bits are either 0s or 1s.
23772 ComparisonResult = DAG.getSExtOrTrunc(Op: ComparisonResult, DL, VT: VecVT);
23773
23774 SmallVector<SDValue, 16> MaskConstants;
23775 if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
23776 VecVT == MVT::v16i8) {
23777 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
23778 // per entry. We split it into two halves, apply the mask, zip the halves to
23779 // create 8x 16-bit values, and then perform the vector reduce.
23780 for (unsigned Half = 0; Half < 2; ++Half) {
23781 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
23782 MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i32));
23783 }
23784 }
23785 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
23786 SDValue RepresentativeBits =
23787 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
23788
23789 SDValue UpperRepresentativeBits =
23790 DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: VecVT, N1: RepresentativeBits,
23791 N2: RepresentativeBits, N3: DAG.getConstant(Val: 8, DL, VT: MVT::i32));
23792 SDValue Zipped = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: VecVT,
23793 N1: RepresentativeBits, N2: UpperRepresentativeBits);
23794 Zipped = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i16, Operand: Zipped);
23795 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i16, Operand: Zipped);
23796 }
23797
23798 // All other vector sizes.
23799 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
23800 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
23801 MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i64));
23802 }
23803
23804 SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants);
23805 SDValue RepresentativeBits =
23806 DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask);
23807 EVT ResultVT = MVT::getIntegerVT(BitWidth: std::max<unsigned>(
23808 a: NumElts, b: VecVT.getVectorElementType().getSizeInBits()));
23809 return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ResultVT, Operand: RepresentativeBits);
23810}
23811
23812static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
23813 StoreSDNode *Store) {
23814 if (!Store->isTruncatingStore())
23815 return SDValue();
23816
23817 SDLoc DL(Store);
23818 SDValue VecOp = Store->getValue();
23819 EVT VT = VecOp.getValueType();
23820 EVT MemVT = Store->getMemoryVT();
23821
23822 if (!MemVT.isVector() || !VT.isVector() ||
23823 MemVT.getVectorElementType() != MVT::i1)
23824 return SDValue();
23825
23826 // If we are storing a vector that we are currently building, let
23827 // `scalarizeVectorStore()` handle this more efficiently.
23828 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
23829 return SDValue();
23830
23831 VecOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: VecOp);
23832 SDValue VectorBits = vectorToScalarBitmask(N: VecOp.getNode(), DAG);
23833 if (!VectorBits)
23834 return SDValue();
23835
23836 EVT StoreVT =
23837 EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getStoreSizeInBits());
23838 SDValue ExtendedBits = DAG.getZExtOrTrunc(Op: VectorBits, DL, VT: StoreVT);
23839 return DAG.getStore(Chain: Store->getChain(), dl: DL, Val: ExtendedBits, Ptr: Store->getBasePtr(),
23840 MMO: Store->getMemOperand());
23841}
23842
23843bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
23844 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
23845 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
23846 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
23847}
23848
23849// Combine store (trunc X to <3 x i8>) to a sequence of ST1.b stores.
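// For example (illustrative), store (trunc <3 x i16> X to <3 x i8>) is lowered
// by widening X to <4 x i16>, bitcasting to <8 x i8>, and emitting three
// single-byte stores of the low byte of each lane (byte indices 4, 2 and 0)
// to offsets +2, +1 and +0 respectively.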
23850static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
23851 const AArch64Subtarget *Subtarget) {
23852 SDValue Value = ST->getValue();
23853 EVT ValueVT = Value.getValueType();
23854
23855 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23856 Value.getOpcode() != ISD::TRUNCATE ||
23857 ValueVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3))
23858 return SDValue();
23859
23860 assert(ST->getOffset().isUndef() && "undef offset expected");
23861 SDLoc DL(ST);
23862 auto WideVT = EVT::getVectorVT(
23863 Context&: *DAG.getContext(),
23864 VT: Value->getOperand(Num: 0).getValueType().getVectorElementType(), NumElements: 4);
23865 SDValue UndefVector = DAG.getUNDEF(VT: WideVT);
23866 SDValue WideTrunc = DAG.getNode(
23867 Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideVT,
23868 Ops: {UndefVector, Value->getOperand(Num: 0), DAG.getVectorIdxConstant(Val: 0, DL)});
23869 SDValue Cast = DAG.getNode(
23870 Opcode: ISD::BITCAST, DL, VT: WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
23871 Operand: WideTrunc);
23872
23873 MachineFunction &MF = DAG.getMachineFunction();
23874 SDValue Chain = ST->getChain();
23875 MachineMemOperand *MMO = ST->getMemOperand();
23876 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
23877 SDValue E2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
23878 N2: DAG.getConstant(Val: 2 * IdxScale, DL, VT: MVT::i64));
23879 TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2);
23880 SDValue Ptr2 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset2, DL);
23881 Chain = DAG.getStore(Chain, dl: DL, Val: E2, Ptr: Ptr2, MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1));
23882
23883 SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
23884 N2: DAG.getConstant(Val: 1 * IdxScale, DL, VT: MVT::i64));
23885 TypeSize Offset1 = TypeSize::getFixed(ExactSize: 1);
23886 SDValue Ptr1 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset1, DL);
23887 Chain = DAG.getStore(Chain, dl: DL, Val: E1, Ptr: Ptr1, MMO: MF.getMachineMemOperand(MMO, Offset: 1, Size: 1));
23888
23889 SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast,
23890 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
23891 Chain = DAG.getStore(Chain, dl: DL, Val: E0, Ptr: ST->getBasePtr(),
23892 MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: 1));
23893 return Chain;
23894}
23895
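// Return the FP/SIMD subregister index (bsub/hsub/ssub/dsub) used when storing
// a value of the given scalar type through an FP register.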
23896static unsigned getFPSubregForVT(EVT VT) {
23897 assert(VT.isSimple() && "Expected simple VT");
23898 switch (VT.getSimpleVT().SimpleTy) {
23899 case MVT::aarch64mfp8:
23900 return AArch64::bsub;
23901 case MVT::f16:
23902 return AArch64::hsub;
23903 case MVT::f32:
23904 return AArch64::ssub;
23905 case MVT::f64:
23906 return AArch64::dsub;
23907 default:
23908 llvm_unreachable("Unexpected VT!");
23909 }
23910}
23911
23912static SDValue performSTORECombine(SDNode *N,
23913 TargetLowering::DAGCombinerInfo &DCI,
23914 SelectionDAG &DAG,
23915 const AArch64Subtarget *Subtarget) {
23916 StoreSDNode *ST = cast<StoreSDNode>(Val: N);
23917 SDValue Chain = ST->getChain();
23918 SDValue Value = ST->getValue();
23919 SDValue Ptr = ST->getBasePtr();
23920 EVT ValueVT = Value.getValueType();
23921 EVT MemVT = ST->getMemoryVT();
23922 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23923 SDLoc DL(ST);
23924
23925 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
23926 EVT EltVT = VT.getVectorElementType();
23927 return EltVT == MVT::f32 || EltVT == MVT::f64;
23928 };
23929
23930 // Cast ptr32 and ptr64 pointers to the default address space before a store.
23931 unsigned AddrSpace = ST->getAddressSpace();
23932 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
23933 AddrSpace == ARM64AS::PTR32_UPTR) {
23934 MVT PtrVT = TLI.getPointerTy(DL: DAG.getDataLayout());
23935 if (PtrVT != Ptr.getSimpleValueType()) {
23936 SDValue Cast = DAG.getAddrSpaceCast(dl: DL, VT: PtrVT, Ptr, SrcAS: AddrSpace, DestAS: 0);
23937 return DAG.getStore(Chain, dl: DL, Val: Value, Ptr: Cast, PtrInfo: ST->getPointerInfo(),
23938 Alignment: ST->getBaseAlign(), MMOFlags: ST->getMemOperand()->getFlags(),
23939 AAInfo: ST->getAAInfo());
23940 }
23941 }
23942
23943 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
23944 return Res;
23945
23946 // If this is an FP_ROUND followed by a store, fold this into a truncating
23947 // store. We can do this even if this is already a truncstore.
23948 // We purposefully don't care about legality of the nodes here as we know
23949 // they can be split down into something legal.
23950 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
23951 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
23952 Subtarget->useSVEForFixedLengthVectors() &&
23953 ValueVT.isFixedLengthVector() &&
23954 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
23955 hasValidElementTypeForFPTruncStore(Value.getOperand(i: 0).getValueType()))
23956 return DAG.getTruncStore(Chain, dl: DL, Val: Value.getOperand(i: 0), Ptr, SVT: MemVT,
23957 MMO: ST->getMemOperand());
23958
23959 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
23960 return Split;
23961
23962 if (Subtarget->supportsAddressTopByteIgnored() &&
23963 performTBISimplification(Addr: N->getOperand(Num: 2), DCI, DAG))
23964 return SDValue(N, 0);
23965
23966 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
23967 return Store;
23968
23969 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, Store: ST))
23970 return Store;
23971
23972 if (ST->isTruncatingStore() &&
23973 isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT)) {
23974 if (SDValue Rshrnb =
23975 trySimplifySrlAddToRshrnb(Srl: ST->getOperand(Num: 1), DAG, Subtarget)) {
23976 return DAG.getTruncStore(Chain: ST->getChain(), dl: ST, Val: Rshrnb, Ptr: ST->getBasePtr(),
23977 SVT: MemVT, MMO: ST->getMemOperand());
23978 }
23979 }
23980
23981 // This is an integer vector_extract_elt followed by a (possibly truncating)
23982 // store. We may be able to replace this with a store of an FP subregister.
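  // For example (illustrative), (store (extract_vector_elt v4i32 V, 0), ptr)
  // can instead store the ssub subregister of V, which selects to a plain
  // "str s0, [ptr]" and avoids moving the lane into a general-purpose register.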
23983 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
23984 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
23985
23986 SDValue Vector = Value.getOperand(i: 0);
23987 SDValue ExtIdx = Value.getOperand(i: 1);
23988 EVT VectorVT = Vector.getValueType();
23989 EVT ElemVT = VectorVT.getVectorElementType();
23990
23991 if (!ValueVT.isInteger())
23992 return SDValue();
23993
23994 // Propagate zero constants (applying this fold may miss optimizations).
23995 if (ISD::isConstantSplatVectorAllZeros(N: Vector.getNode())) {
23996 SDValue ZeroElt = DAG.getConstant(Val: 0, DL, VT: ValueVT);
23997 DAG.ReplaceAllUsesWith(From: Value, To: ZeroElt);
23998 return SDValue();
23999 }
24000
24001 if (ValueVT != MemVT && !ST->isTruncatingStore())
24002 return SDValue();
24003
24004 // This could generate an additional extract if the index is non-zero and
24005 // the extracted value has multiple uses.
24006 auto *ExtCst = dyn_cast<ConstantSDNode>(Val&: ExtIdx);
24007 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
24008 return SDValue();
24009
24010 // These can lower to st1, which is preferable if we're unlikely to fold the
24011 // addressing into the store.
24012 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24013 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24014 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
24015 return SDValue();
24016
24017 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24018 // Heuristic: If there are other users of w/x integer scalars extracted
24019 // from this vector that won't fold into the store -- abandon folding.
24020 // Applying this fold may disrupt paired stores.
24021 for (const auto &Use : Vector->uses()) {
24022 if (Use.getResNo() != Vector.getResNo())
24023 continue;
24024 const SDNode *User = Use.getUser();
24025 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24026 (!User->hasOneUse() ||
24027 (*User->user_begin())->getOpcode() != ISD::STORE))
24028 return SDValue();
24029 }
24030 }
24031
24032 SDValue ExtVector = Vector;
24033 if (!ExtCst || !ExtCst->isZero()) {
24034 // Handle extracting from lanes != 0.
24035 SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL,
24036 VT: Value.getValueType(), N1: Vector, N2: ExtIdx);
24037 SDValue Zero = DAG.getVectorIdxConstant(Val: 0, DL);
24038 ExtVector = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT,
24039 N1: DAG.getUNDEF(VT: VectorVT), N2: Ext, N3: Zero);
24040 }
24041
24042 EVT FPMemVT = MemVT == MVT::i8
24043 ? MVT::aarch64mfp8
24044 : EVT::getFloatingPointVT(BitWidth: MemVT.getSizeInBits());
24045 SDValue FPSubreg = DAG.getTargetExtractSubreg(SRIdx: getFPSubregForVT(VT: FPMemVT), DL,
24046 VT: FPMemVT, Operand: ExtVector);
24047
24048 return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: FPSubreg, Ptr: ST->getBasePtr(),
24049 MMO: ST->getMemOperand());
24050 }
24051
24052 return SDValue();
24053}
24054
24055static SDValue performMSTORECombine(SDNode *N,
24056 TargetLowering::DAGCombinerInfo &DCI,
24057 SelectionDAG &DAG,
24058 const AArch64Subtarget *Subtarget) {
24059 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Val: N);
24060 SDValue Value = MST->getValue();
24061 SDValue Mask = MST->getMask();
24062 SDLoc DL(N);
24063
24064 // If this is a UZP1 followed by a masked store, fold this into a masked
24065 // truncating store. We can do this even if this is already a masked
24066 // truncstore.
24067 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
24068 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
24069 Value.getValueType().isInteger()) {
24070 Value = Value.getOperand(i: 0);
24071 if (Value.getOpcode() == ISD::BITCAST) {
24072 EVT HalfVT =
24073 Value.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext());
24074 EVT InVT = Value.getOperand(i: 0).getValueType();
24075
24076 if (HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()) == InVT) {
24077 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
24078 unsigned PgPattern = Mask->getConstantOperandVal(Num: 0);
24079
24080 // Ensure we can double the size of the predicate pattern
24081 unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern);
24082 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
24083 MinSVESize) {
24084 Mask = getPTrue(DAG, DL, VT: InVT.changeVectorElementType(EltVT: MVT::i1),
24085 Pattern: PgPattern);
24086 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Value.getOperand(i: 0),
24087 Base: MST->getBasePtr(), Offset: MST->getOffset(), Mask,
24088 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
24089 AM: MST->getAddressingMode(),
24090 /*IsTruncating=*/true);
24091 }
24092 }
24093 }
24094 }
24095
24096 if (MST->isTruncatingStore()) {
24097 EVT ValueVT = Value->getValueType(ResNo: 0);
24098 EVT MemVT = MST->getMemoryVT();
24099 if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT))
24100 return SDValue();
24101 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Value, DAG, Subtarget)) {
24102 return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Rshrnb, Base: MST->getBasePtr(),
24103 Offset: MST->getOffset(), Mask: MST->getMask(),
24104 MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(),
24105 AM: MST->getAddressingMode(), IsTruncating: true);
24106 }
24107 }
24108
24109 return SDValue();
24110}
24111
24112/// \return true if part of the index was folded into the Base.
24113static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
24114 SDLoc DL, SelectionDAG &DAG) {
24115 // This function assumes a vector of i64 indices.
24116 EVT IndexVT = Index.getValueType();
24117 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
24118 return false;
24119
24120 // Simplify:
24121 // BasePtr = Ptr
24122 // Index = X + splat(Offset)
24123 // ->
24124 // BasePtr = Ptr + Offset * scale.
24125 // Index = X
24126 if (Index.getOpcode() == ISD::ADD) {
24127 if (auto Offset = DAG.getSplatValue(V: Index.getOperand(i: 1))) {
24128 Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale);
24129 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset);
24130 Index = Index.getOperand(i: 0);
24131 return true;
24132 }
24133 }
24134
24135 // Simplify:
24136 // BasePtr = Ptr
24137 // Index = (X + splat(Offset)) << splat(Shift)
24138 // ->
24139 // BasePtr = Ptr + (Offset << Shift) * scale
24140 // Index = X << splat(Shift)
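  // For example (illustrative), with Offset = 4, Shift = 2 and Scale = 8:
  //   BasePtr = Ptr + ((4 << 2) * 8) = Ptr + 128
  //   Index   = X << splat(2)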
24141 if (Index.getOpcode() == ISD::SHL &&
24142 Index.getOperand(i: 0).getOpcode() == ISD::ADD) {
24143 SDValue Add = Index.getOperand(i: 0);
24144 SDValue ShiftOp = Index.getOperand(i: 1);
24145 SDValue OffsetOp = Add.getOperand(i: 1);
24146 if (auto Shift = DAG.getSplatValue(V: ShiftOp))
24147 if (auto Offset = DAG.getSplatValue(V: OffsetOp)) {
24148 Offset = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i64, N1: Offset, N2: Shift);
24149 Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale);
24150 BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset);
24151 Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: Index.getValueType(),
24152 N1: Add.getOperand(i: 0), N2: ShiftOp);
24153 return true;
24154 }
24155 }
24156
24157 return false;
24158}
24159
24160// Analyse the specified address returning true if a more optimal addressing
24161// mode is available. When returning true all parameters are updated to reflect
24162// their recommended values.
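// For example (illustrative), an nxv2i64 index of the form
// (step_vector 1) << splat(3) has a constant stride of 8 and, provided the
// offset of the last element still fits in an i32, can be replaced by the
// narrower nxv2i32 step_vector with step 8, which is cheaper to legalise.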
24163static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
24164 SDValue &BasePtr, SDValue &Index,
24165 SelectionDAG &DAG) {
24166 // Try to iteratively fold parts of the index into the base pointer to
24167 // simplify the index as much as possible.
24168 bool Changed = false;
24169 while (foldIndexIntoBase(BasePtr, Index, Scale: N->getScale(), DL: SDLoc(N), DAG))
24170 Changed = true;
24171
24172 // Only consider element types that are pointer sized as smaller types can
24173 // be easily promoted.
24174 EVT IndexVT = Index.getValueType();
24175 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
24176 return Changed;
24177
24178 // Can indices be trivially shrunk?
24179 EVT DataVT = N->getOperand(Num: 1).getValueType();
24180 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since
24181 // it will later be re-extended to 64 bits during legalization.
24182 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
24183 return Changed;
24184 if (ISD::isVectorShrinkable(N: Index.getNode(), NewEltSize: 32, Signed: N->isIndexSigned())) {
24185 EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32);
24186 Index = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: NewIndexVT, Operand: Index);
24187 return true;
24188 }
24189
24190 // Match:
24191 // Index = step(const)
24192 int64_t Stride = 0;
24193 if (Index.getOpcode() == ISD::STEP_VECTOR) {
24194 Stride = cast<ConstantSDNode>(Val: Index.getOperand(i: 0))->getSExtValue();
24195 }
24196 // Match:
24197 // Index = step(const) << shift(const)
24198 else if (Index.getOpcode() == ISD::SHL &&
24199 Index.getOperand(i: 0).getOpcode() == ISD::STEP_VECTOR) {
24200 SDValue RHS = Index.getOperand(i: 1);
24201 if (auto *Shift =
24202 dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: RHS))) {
24203 int64_t Step = (int64_t)Index.getOperand(i: 0).getConstantOperandVal(i: 1);
24204 Stride = Step << Shift->getZExtValue();
24205 }
24206 }
24207
24208 // Return early because no supported pattern is found.
24209 if (Stride == 0)
24210 return Changed;
24211
24212 if (Stride < std::numeric_limits<int32_t>::min() ||
24213 Stride > std::numeric_limits<int32_t>::max())
24214 return Changed;
24215
24216 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
24217 unsigned MaxVScale =
24218 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
24219 int64_t LastElementOffset =
24220 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
24221
24222 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
24223 LastElementOffset > std::numeric_limits<int32_t>::max())
24224 return Changed;
24225
24226 EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32);
24227 // The Stride is not scaled by 'Scale' explicitly, because that scaling
24228 // happens as part of the gather/scatter addressing mode.
24229 Index = DAG.getStepVector(DL: SDLoc(N), ResVT: NewIndexVT, StepVal: APInt(32, Stride, true));
24230 return true;
24231}
24232
24233static SDValue performMaskedGatherScatterCombine(
24234 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
24235 if (!DCI.isBeforeLegalize())
24236 return SDValue();
24237 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(Val: N);
24238
24239 SDLoc DL(MGS);
24240 SDValue Chain = MGS->getChain();
24241 SDValue Scale = MGS->getScale();
24242 SDValue Index = MGS->getIndex();
24243 SDValue Mask = MGS->getMask();
24244 SDValue BasePtr = MGS->getBasePtr();
24245 ISD::MemIndexType IndexType = MGS->getIndexType();
24246
24247 if (!findMoreOptimalIndexType(N: MGS, BasePtr, Index, DAG))
24248 return SDValue();
24249
24250 // Here we catch such cases early and change MGATHER's IndexType to allow
24251 // the use of an Index that's more legalisation friendly.
24252 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(Val: MGS)) {
24253 SDValue PassThru = MGT->getPassThru();
24254 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
24255 return DAG.getMaskedGather(
24256 VTs: DAG.getVTList(VT1: N->getValueType(ResNo: 0), VT2: MVT::Other), MemVT: MGT->getMemoryVT(), dl: DL,
24257 Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: MGT->getExtensionType());
24258 }
24259 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(Val: MGS)) {
24260 SDValue Data = MSC->getValue();
24261 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
24262 return DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT: MSC->getMemoryVT(),
24263 dl: DL, Ops, MMO: MSC->getMemOperand(), IndexType,
24264 IsTruncating: MSC->isTruncatingStore());
24265 }
24266 auto *HG = cast<MaskedHistogramSDNode>(Val: MGS);
24267 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
24268 Index, Scale, HG->getIntID()};
24269 return DAG.getMaskedHistogram(VTs: DAG.getVTList(VT: MVT::Other), MemVT: HG->getMemoryVT(),
24270 dl: DL, Ops, MMO: HG->getMemOperand(), IndexType);
24271}
24272
24273/// Target-specific DAG combine function for NEON load/store intrinsics
24274/// to merge base address updates.
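// For example (illustrative), when the address used by an aarch64.neon.ld2 of
// two <4 x i32> vectors is also incremented by 32 (the number of bytes
// accessed), the load and the ADD are merged into a single post-incremented
// LD2post node, i.e. "ld2 { v0.4s, v1.4s }, [x0], #32".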
24275static SDValue performNEONPostLDSTCombine(SDNode *N,
24276 TargetLowering::DAGCombinerInfo &DCI,
24277 SelectionDAG &DAG) {
24278 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
24279 return SDValue();
24280
24281 unsigned AddrOpIdx = N->getNumOperands() - 1;
24282 SDValue Addr = N->getOperand(Num: AddrOpIdx);
24283
24284 // Search for a use of the address operand that is an increment.
24285 for (SDUse &Use : Addr->uses()) {
24286 SDNode *User = Use.getUser();
24287 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
24288 continue;
24289
24290 // Check that the add is independent of the load/store. Otherwise, folding
24291 // it would create a cycle.
24292 SmallPtrSet<const SDNode *, 32> Visited;
24293 SmallVector<const SDNode *, 16> Worklist;
24294 Visited.insert(Ptr: Addr.getNode());
24295 Worklist.push_back(Elt: N);
24296 Worklist.push_back(Elt: User);
24297 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
24298 SDNode::hasPredecessorHelper(N: User, Visited, Worklist))
24299 continue;
24300
24301 // Find the new opcode for the updating load/store.
24302 bool IsStore = false;
24303 bool IsLaneOp = false;
24304 bool IsDupOp = false;
24305 unsigned NewOpc = 0;
24306 unsigned NumVecs = 0;
24307 unsigned IntNo = N->getConstantOperandVal(Num: 1);
24308 switch (IntNo) {
24309 default: llvm_unreachable("unexpected intrinsic for Neon base update");
24310 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
24311 NumVecs = 2; break;
24312 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
24313 NumVecs = 3; break;
24314 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
24315 NumVecs = 4; break;
24316 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
24317 NumVecs = 2; IsStore = true; break;
24318 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
24319 NumVecs = 3; IsStore = true; break;
24320 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
24321 NumVecs = 4; IsStore = true; break;
24322 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
24323 NumVecs = 2; break;
24324 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
24325 NumVecs = 3; break;
24326 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
24327 NumVecs = 4; break;
24328 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
24329 NumVecs = 2; IsStore = true; break;
24330 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
24331 NumVecs = 3; IsStore = true; break;
24332 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
24333 NumVecs = 4; IsStore = true; break;
24334 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
24335 NumVecs = 2; IsDupOp = true; break;
24336 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
24337 NumVecs = 3; IsDupOp = true; break;
24338 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
24339 NumVecs = 4; IsDupOp = true; break;
24340 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
24341 NumVecs = 2; IsLaneOp = true; break;
24342 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
24343 NumVecs = 3; IsLaneOp = true; break;
24344 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
24345 NumVecs = 4; IsLaneOp = true; break;
24346 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
24347 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
24348 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
24349 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
24350 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
24351 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
24352 }
24353
24354 EVT VecTy;
24355 if (IsStore)
24356 VecTy = N->getOperand(Num: 2).getValueType();
24357 else
24358 VecTy = N->getValueType(ResNo: 0);
24359
24360 // If the increment is a constant, it must match the memory ref size.
24361 SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0);
24362 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) {
24363 uint32_t IncVal = CInc->getZExtValue();
24364 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
24365 if (IsLaneOp || IsDupOp)
24366 NumBytes /= VecTy.getVectorNumElements();
24367 if (IncVal != NumBytes)
24368 continue;
24369 Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64);
24370 }
24371 SmallVector<SDValue, 8> Ops;
24372 Ops.push_back(Elt: N->getOperand(Num: 0)); // Incoming chain
24373 // Load lane and store have vector list as input.
24374 if (IsLaneOp || IsStore)
24375 for (unsigned i = 2; i < AddrOpIdx; ++i)
24376 Ops.push_back(Elt: N->getOperand(Num: i));
24377 Ops.push_back(Elt: Addr); // Base register
24378 Ops.push_back(Elt: Inc);
24379
24380 // Return Types.
24381 EVT Tys[6];
24382 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
24383 unsigned n;
24384 for (n = 0; n < NumResultVecs; ++n)
24385 Tys[n] = VecTy;
24386 Tys[n++] = MVT::i64; // Type of write back register
24387 Tys[n] = MVT::Other; // Type of the chain
24388 SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2));
24389
24390 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(Val: N);
24391 SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(N), VTList: SDTys, Ops,
24392 MemVT: MemInt->getMemoryVT(),
24393 MMO: MemInt->getMemOperand());
24394
24395 // Update the uses.
24396 std::vector<SDValue> NewResults;
24397 for (unsigned i = 0; i < NumResultVecs; ++i) {
24398 NewResults.push_back(x: SDValue(UpdN.getNode(), i));
24399 }
24400 NewResults.push_back(x: SDValue(UpdN.getNode(), NumResultVecs + 1));
24401 DCI.CombineTo(N, To: NewResults);
24402 DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs));
24403
24404 break;
24405 }
24406 return SDValue();
24407}
24408
24409// Checks to see if the value is the prescribed width and returns information
24410// about its extension mode.
24411static
24412bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
24413 ExtType = ISD::NON_EXTLOAD;
24414 switch(V.getNode()->getOpcode()) {
24415 default:
24416 return false;
24417 case ISD::LOAD: {
24418 LoadSDNode *LoadNode = cast<LoadSDNode>(Val: V.getNode());
24419 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
24420 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
24421 ExtType = LoadNode->getExtensionType();
24422 return true;
24423 }
24424 return false;
24425 }
24426 case ISD::AssertSext: {
24427 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
24428 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24429 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24430 ExtType = ISD::SEXTLOAD;
24431 return true;
24432 }
24433 return false;
24434 }
24435 case ISD::AssertZext: {
24436 VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1));
24437 if ((TypeNode->getVT() == MVT::i8 && width == 8)
24438 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
24439 ExtType = ISD::ZEXTLOAD;
24440 return true;
24441 }
24442 return false;
24443 }
24444 case ISD::Constant:
24445 case ISD::TargetConstant: {
24446 return std::abs(i: cast<ConstantSDNode>(Val: V.getNode())->getSExtValue()) <
24447 1LL << (width - 1);
24448 }
24449 }
24450
24451 return true;
24452}
24453
24454// This function does a whole lot of voodoo to determine if the tests are
24455// equivalent without and with a mask. Essentially what happens is that given a
24456// DAG resembling:
24457//
24458// +-------------+ +-------------+ +-------------+ +-------------+
24459// | Input | | AddConstant | | CompConstant| | CC |
24460// +-------------+ +-------------+ +-------------+ +-------------+
24461// | | | |
24462// V V | +----------+
24463// +-------------+ +----+ | |
24464// | ADD | |0xff| | |
24465// +-------------+ +----+ | |
24466// | | | |
24467// V V | |
24468// +-------------+ | |
24469// | AND | | |
24470// +-------------+ | |
24471// | | |
24472// +-----+ | |
24473// | | |
24474// V V V
24475// +-------------+
24476// | CMP |
24477// +-------------+
24478//
24479// The AND node may be safely removed for some combinations of inputs. In
24480// particular we need to take into account the extension type of the Input,
24481// the exact values of AddConstant, CompConstant, and CC, along with the nominal
24482// width of the input (this works for inputs of any width; the above graph is
24483// specific to 8 bits).
24484//
24485// The specific equations were worked out by generating output tables for each
24486// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
24487// problem was simplified by working with 4-bit inputs, which means we only
24488// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
24489// extension (8 to 15), 8 patterns unique to sign extension (-8 to -1), and 8
24490// patterns present in both extensions (0 to 7). For every distinct set of
24491// AddConstant and CompConstant bit patterns we can consider the masked and
24492// unmasked versions to be equivalent if the result of this function is true for
24493// all 16 distinct bit patterns for the current extension type of Input (w0).
24494//
24495// sub w8, w0, w1
24496// and w10, w8, #0x0f
24497// cmp w8, w2
24498// cset w9, AArch64CC
24499// cmp w10, w2
24500// cset w11, AArch64CC
24501// cmp w9, w11
24502// cset w0, eq
24503// ret
24504//
24505// Since the above function shows when the outputs are equivalent it defines
24506// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
24507// would be expensive to run during compiles. The equations below were written
24508// in a test harness that confirmed they gave outputs equivalent to the above
24509// function for all inputs, so they can be used to determine whether the
24510// removal is legal instead.
24511//
24512// isEquivalentMaskless() is the code for testing if the AND can be removed,
24513// factored out of the DAG recognition since the DAG can take several forms.
24514
24515static bool isEquivalentMaskless(unsigned CC, unsigned width,
24516 ISD::LoadExtType ExtType, int AddConstant,
24517 int CompConstant) {
24518 // By being careful about our equations and only writing them in terms of
24519 // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
24520 // make them generally applicable to all bit widths.
24521 int MaxUInt = (1 << width);
24522
24523 // For the purposes of these comparisons sign extending the type is
24524 // equivalent to zero extending the add and displacing it by half the integer
24525 // width. Provided we are careful and make sure our equations are valid over
24526 // the whole range we can just adjust the input and avoid writing equations
24527 // for sign extended inputs.
24528 if (ExtType == ISD::SEXTLOAD)
24529 AddConstant -= (1 << (width-1));
24530
24531 switch(CC) {
24532 case AArch64CC::LE:
24533 case AArch64CC::GT:
24534 if ((AddConstant == 0) ||
24535 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
24536 (AddConstant >= 0 && CompConstant < 0) ||
24537 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
24538 return true;
24539 break;
24540 case AArch64CC::LT:
24541 case AArch64CC::GE:
24542 if ((AddConstant == 0) ||
24543 (AddConstant >= 0 && CompConstant <= 0) ||
24544 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
24545 return true;
24546 break;
24547 case AArch64CC::HI:
24548 case AArch64CC::LS:
24549 if ((AddConstant >= 0 && CompConstant < 0) ||
24550 (AddConstant <= 0 && CompConstant >= -1 &&
24551 CompConstant < AddConstant + MaxUInt))
24552 return true;
24553 break;
24554 case AArch64CC::PL:
24555 case AArch64CC::MI:
24556 if ((AddConstant == 0) ||
24557 (AddConstant > 0 && CompConstant <= 0) ||
24558 (AddConstant < 0 && CompConstant <= AddConstant))
24559 return true;
24560 break;
24561 case AArch64CC::LO:
24562 case AArch64CC::HS:
24563 if ((AddConstant >= 0 && CompConstant <= 0) ||
24564 (AddConstant <= 0 && CompConstant >= 0 &&
24565 CompConstant <= AddConstant + MaxUInt))
24566 return true;
24567 break;
24568 case AArch64CC::EQ:
24569 case AArch64CC::NE:
24570 if ((AddConstant > 0 && CompConstant < 0) ||
24571 (AddConstant < 0 && CompConstant >= 0 &&
24572 CompConstant < AddConstant + MaxUInt) ||
24573 (AddConstant >= 0 && CompConstant >= 0 &&
24574 CompConstant >= AddConstant) ||
24575 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
24576 return true;
24577 break;
24578 case AArch64CC::VS:
24579 case AArch64CC::VC:
24580 case AArch64CC::AL:
24581 case AArch64CC::NV:
24582 return true;
24583 case AArch64CC::Invalid:
24584 break;
24585 }
24586
24587 return false;
24588}
24589
24590// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
24591// (X & C) <u Pow2 --> ((X & (C & ~(Pow2 - 1))) == 0)
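// For example (illustrative), with C = 0xff and Mask = 0x0f:
//   (X & 0xff) >u 0x0f  becomes  ((X & 0xf0) != 0),
// which is emitted as an ANDS with 0xf0 followed by a NE check on the flags,
// removing the separate SUBS-based compare.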
24592static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
24593 SDNode *AndNode, SelectionDAG &DAG,
24594 unsigned CCIndex, unsigned CmpIndex,
24595 unsigned CC) {
24596 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(Val: SubsNode->getOperand(Num: 1));
24597 if (!SubsC)
24598 return SDValue();
24599
24600 APInt SubsAP = SubsC->getAPIntValue();
24601 if (CC == AArch64CC::HI) {
24602 if (!SubsAP.isMask())
24603 return SDValue();
24604 } else if (CC == AArch64CC::LO) {
24605 if (!SubsAP.isPowerOf2())
24606 return SDValue();
24607 } else
24608 return SDValue();
24609
24610 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1));
24611 if (!AndC)
24612 return SDValue();
24613
24614 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
24615
24616 SDLoc DL(N);
24617 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
24618 SDValue ANDS = DAG.getNode(
24619 Opcode: AArch64ISD::ANDS, DL, VTList: SubsNode->getVTList(), N1: AndNode->getOperand(Num: 0),
24620 N2: DAG.getConstant(Val: AndSMask, DL, VT: SubsC->getValueType(ResNo: 0)));
24621 SDValue AArch64_CC =
24622 DAG.getConstant(Val: CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
24623 VT: N->getOperand(Num: CCIndex)->getValueType(ResNo: 0));
24624
24625 // For now, only performCSELCombine and performBRCONDCombine call this
24626 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
24627 // with 4 operands. So just initialize the ops directly to keep the code
24628 // simple. If we ever get a caller with a different CCIndex or CmpIndex, the
24629 // code here will need to be rewritten with a loop over the operands.
24630 // TODO: Do we need to assert that the number of operands is 4 here?
24631 assert((CCIndex == 2 && CmpIndex == 3) &&
24632 "Expected CCIndex to be 2 and CmpIndex to be 3.");
24633 SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), AArch64_CC,
24634 ANDS.getValue(R: 1)};
24635 return DAG.getNode(Opcode: N->getOpcode(), DL: N, VTList: N->getVTList(), Ops);
24636}
24637
24638static
24639SDValue performCONDCombine(SDNode *N,
24640 TargetLowering::DAGCombinerInfo &DCI,
24641 SelectionDAG &DAG, unsigned CCIndex,
24642 unsigned CmpIndex) {
24643 unsigned CC = cast<ConstantSDNode>(Val: N->getOperand(Num: CCIndex))->getSExtValue();
24644 SDNode *SubsNode = N->getOperand(Num: CmpIndex).getNode();
24645 unsigned CondOpcode = SubsNode->getOpcode();
24646
24647 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(Value: 0) ||
24648 !SubsNode->hasOneUse())
24649 return SDValue();
24650
24651 // There is a SUBS feeding this condition. Is it fed by a mask we can
24652 // use?
24653
24654 SDNode *AndNode = SubsNode->getOperand(Num: 0).getNode();
24655 unsigned MaskBits = 0;
24656
24657 if (AndNode->getOpcode() != ISD::AND)
24658 return SDValue();
24659
24660 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
24661 CmpIndex, CC))
24662 return Val;
24663
24664 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1))) {
24665 uint32_t CNV = CN->getZExtValue();
24666 if (CNV == 255)
24667 MaskBits = 8;
24668 else if (CNV == 65535)
24669 MaskBits = 16;
24670 }
24671
24672 if (!MaskBits)
24673 return SDValue();
24674
24675 SDValue AddValue = AndNode->getOperand(Num: 0);
24676
24677 if (AddValue.getOpcode() != ISD::ADD)
24678 return SDValue();
24679
24680 // The basic dag structure is correct, grab the inputs and validate them.
24681
24682 SDValue AddInputValue1 = AddValue.getNode()->getOperand(Num: 0);
24683 SDValue AddInputValue2 = AddValue.getNode()->getOperand(Num: 1);
24684 SDValue SubsInputValue = SubsNode->getOperand(Num: 1);
24685
24686 // The mask is present and the provenance of all the values is a smaller type;
24687 // let's see if the mask is superfluous.
24688
24689 if (!isa<ConstantSDNode>(Val: AddInputValue2.getNode()) ||
24690 !isa<ConstantSDNode>(Val: SubsInputValue.getNode()))
24691 return SDValue();
24692
24693 ISD::LoadExtType ExtType;
24694
24695 if (!checkValueWidth(V: SubsInputValue, width: MaskBits, ExtType) ||
24696 !checkValueWidth(V: AddInputValue2, width: MaskBits, ExtType) ||
24697 !checkValueWidth(V: AddInputValue1, width: MaskBits, ExtType) )
24698 return SDValue();
24699
24700 if (!isEquivalentMaskless(CC, width: MaskBits, ExtType,
24701 AddConstant: cast<ConstantSDNode>(Val: AddInputValue2.getNode())->getSExtValue(),
24702 CompConstant: cast<ConstantSDNode>(Val: SubsInputValue.getNode())->getSExtValue()))
24703 return SDValue();
24704
24705 // The AND is not necessary, remove it.
24706
24707 SDVTList VTs = DAG.getVTList(VT1: SubsNode->getValueType(ResNo: 0),
24708 VT2: SubsNode->getValueType(ResNo: 1));
24709 SDValue Ops[] = { AddValue, SubsNode->getOperand(Num: 1) };
24710
24711 SDValue NewValue = DAG.getNode(Opcode: CondOpcode, DL: SDLoc(SubsNode), VTList: VTs, Ops);
24712 DAG.ReplaceAllUsesWith(From: SubsNode, To: NewValue.getNode());
24713
24714 return SDValue(N, 0);
24715}
24716
24717// Optimize compare with zero and branch.
24718static SDValue performBRCONDCombine(SDNode *N,
24719 TargetLowering::DAGCombinerInfo &DCI,
24720 SelectionDAG &DAG) {
24721 MachineFunction &MF = DAG.getMachineFunction();
24722 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
24723 // will not be produced, as they are conditional branch instructions that do
24724 // not set flags.
24725 if (MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening))
24726 return SDValue();
24727
24728 if (SDValue NV = performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3))
24729 N = NV.getNode();
24730 SDValue Chain = N->getOperand(Num: 0);
24731 SDValue Dest = N->getOperand(Num: 1);
24732 SDValue CCVal = N->getOperand(Num: 2);
24733 SDValue Cmp = N->getOperand(Num: 3);
24734
24735 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
24736 unsigned CC = CCVal->getAsZExtVal();
24737 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
24738 return SDValue();
24739
24740 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
24741 if (isCMP(Op: Cmp) && CC == AArch64CC::NE && isOneConstant(V: Cmp.getOperand(i: 1))) {
24742 SDValue CSel = Cmp.getOperand(i: 0);
24743 auto CSelCC = getCSETCondCode(Op: CSel);
24744 if (CSelCC) {
24745 SDLoc DL(N);
24746 return DAG.getNode(
24747 Opcode: N->getOpcode(), DL, VTList: N->getVTList(), N1: Chain, N2: Dest,
24748 N3: DAG.getConstant(Val: getInvertedCondCode(Code: *CSelCC), DL, VT: MVT::i32),
24749 N4: CSel.getOperand(i: 3));
24750 }
24751 }
24752
24753 unsigned CmpOpc = Cmp.getOpcode();
24754 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
24755 return SDValue();
24756
24757 // Only attempt folding if there is only one use of the flag and no use of the
24758 // value.
24759 if (!Cmp->hasNUsesOfValue(NUses: 0, Value: 0) || !Cmp->hasNUsesOfValue(NUses: 1, Value: 1))
24760 return SDValue();
24761
24762 SDValue LHS = Cmp.getOperand(i: 0);
24763 SDValue RHS = Cmp.getOperand(i: 1);
24764
24765 assert(LHS.getValueType() == RHS.getValueType() &&
24766 "Expected the value type to be the same for both operands!");
24767 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
24768 return SDValue();
24769
24770 if (isNullConstant(V: LHS))
24771 std::swap(a&: LHS, b&: RHS);
24772
24773 if (!isNullConstant(V: RHS))
24774 return SDValue();
24775
24776 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
24777 LHS.getOpcode() == ISD::SRL)
24778 return SDValue();
24779
24780 // Fold the compare into the branch instruction.
24781 SDValue BR;
24782 if (CC == AArch64CC::EQ)
24783 BR = DAG.getNode(Opcode: AArch64ISD::CBZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
24784 else
24785 BR = DAG.getNode(Opcode: AArch64ISD::CBNZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest);
24786
24787 // Do not add new nodes to DAG combiner worklist.
24788 DCI.CombineTo(N, Res: BR, AddTo: false);
24789
24790 return SDValue();
24791}
24792
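// Fold CSEL(0, CTTZ(X), EQ, SUBS(X, 0)) and CSEL(CTTZ(X), 0, NE, SUBS(X, 0))
// (the CTTZ may also appear behind a TRUNCATE) into CTTZ(X) & (BitWidth - 1).
// This is valid because CTTZ returns BitWidth for a zero input, and
// BitWidth & (BitWidth - 1) == 0, so the select against zero is redundant.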
24793static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
24794 unsigned CC = N->getConstantOperandVal(Num: 2);
24795 SDValue SUBS = N->getOperand(Num: 3);
24796 SDValue Zero, CTTZ;
24797
24798 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
24799 Zero = N->getOperand(Num: 0);
24800 CTTZ = N->getOperand(Num: 1);
24801 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
24802 Zero = N->getOperand(Num: 1);
24803 CTTZ = N->getOperand(Num: 0);
24804 } else
24805 return SDValue();
24806
24807 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
24808 (CTTZ.getOpcode() == ISD::TRUNCATE &&
24809 CTTZ.getOperand(i: 0).getOpcode() != ISD::CTTZ))
24810 return SDValue();
24811
24812 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
24813 "Illegal type in CTTZ folding");
24814
24815 if (!isNullConstant(V: Zero) || !isNullConstant(V: SUBS.getOperand(i: 1)))
24816 return SDValue();
24817
24818 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
24819 ? CTTZ.getOperand(i: 0).getOperand(i: 0)
24820 : CTTZ.getOperand(i: 0);
24821
24822 if (X != SUBS.getOperand(i: 0))
24823 return SDValue();
24824
24825 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
24826 ? CTTZ.getOperand(i: 0).getValueSizeInBits()
24827 : CTTZ.getValueSizeInBits();
24828 SDValue BitWidthMinusOne =
24829 DAG.getConstant(Val: BitWidth - 1, DL: SDLoc(N), VT: CTTZ.getValueType());
24830 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: CTTZ.getValueType(), N1: CTTZ,
24831 N2: BitWidthMinusOne);
24832}
24833
24834// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
24835// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
24836// Where x and y are constants and x != y
24837
24838// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
24839// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
24840// Where x and y are constants and x != y
24841static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
24842 SDValue L = Op->getOperand(Num: 0);
24843 SDValue R = Op->getOperand(Num: 1);
24844 AArch64CC::CondCode OpCC =
24845 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
24846
24847 SDValue OpCmp = Op->getOperand(Num: 3);
24848 if (!isCMP(Op: OpCmp))
24849 return SDValue();
24850
24851 SDValue CmpLHS = OpCmp.getOperand(i: 0);
24852 SDValue CmpRHS = OpCmp.getOperand(i: 1);
24853
24854 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
24855 std::swap(a&: CmpLHS, b&: CmpRHS);
24856 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
24857 return SDValue();
24858
24859 SDValue X = CmpLHS->getOperand(Num: 0);
24860 SDValue Y = CmpLHS->getOperand(Num: 1);
24861 if (!isa<ConstantSDNode>(Val: X) || !isa<ConstantSDNode>(Val: Y) || X == Y) {
24862 return SDValue();
24863 }
24864
24865 // If one of the constants is an opaque constant, the X and Y SDNodes may
24866 // still be different even though the real values are the same. So check the
24867 // APInt values here to make sure the code is correct.
24868 ConstantSDNode *CX = cast<ConstantSDNode>(Val&: X);
24869 ConstantSDNode *CY = cast<ConstantSDNode>(Val&: Y);
24870 if (CX->getAPIntValue() == CY->getAPIntValue())
24871 return SDValue();
24872
24873 AArch64CC::CondCode CC =
24874 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(Num: 2));
24875 SDValue Cond = CmpLHS->getOperand(Num: 3);
24876
24877 if (CmpRHS == Y)
24878 CC = AArch64CC::getInvertedCondCode(Code: CC);
24879 else if (CmpRHS != X)
24880 return SDValue();
24881
24882 if (OpCC == AArch64CC::NE)
24883 CC = AArch64CC::getInvertedCondCode(Code: CC);
24884 else if (OpCC != AArch64CC::EQ)
24885 return SDValue();
24886
24887 SDLoc DL(Op);
24888 EVT VT = Op->getValueType(ResNo: 0);
24889
24890 SDValue CCValue = DAG.getConstant(Val: CC, DL, VT: MVT::i32);
24891 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: L, N2: R, N3: CCValue, N4: Cond);
24892}
24893
24894// Reassociate the true/false expressions of a CSEL instruction to obtain a
24895// common subexpression with the comparison instruction. For example, change
24896// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
24897// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
24898// subexpression.
24899static SDValue reassociateCSELOperandsForCSE(SDNode *N, SelectionDAG &DAG) {
24900 SDValue SubsNode = N->getOperand(Num: 3);
24901 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
24902 return SDValue();
24903
24904 SDValue CmpOpToMatch = SubsNode.getOperand(i: 1);
24905 SDValue CmpOpOther = SubsNode.getOperand(i: 0);
24906 EVT VT = N->getValueType(ResNo: 0);
24907
24908 unsigned ExpectedOpcode;
24909 SDValue ExpectedOp;
24910 SDValue SubsOp;
24911 auto *CmpOpConst = dyn_cast<ConstantSDNode>(Val&: CmpOpToMatch);
24912 if (CmpOpConst) {
24913 ExpectedOpcode = ISD::ADD;
24914 ExpectedOp =
24915 DAG.getConstant(Val: -CmpOpConst->getAPIntValue(), DL: SDLoc(CmpOpConst),
24916 VT: CmpOpConst->getValueType(ResNo: 0));
24917 SubsOp = DAG.getConstant(Val: CmpOpConst->getAPIntValue(), DL: SDLoc(CmpOpConst),
24918 VT: CmpOpConst->getValueType(ResNo: 0));
24919 } else {
24920 ExpectedOpcode = ISD::SUB;
24921 ExpectedOp = CmpOpToMatch;
24922 SubsOp = CmpOpToMatch;
24923 }
24924
24925 // Get the operand that can be reassociated with the SUBS instruction.
24926 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
24927 if (Op.getOpcode() != ExpectedOpcode)
24928 return SDValue();
24929 if (Op.getOperand(i: 0).getOpcode() != ISD::ADD ||
24930 !Op.getOperand(i: 0).hasOneUse())
24931 return SDValue();
24932 SDValue X = Op.getOperand(i: 0).getOperand(i: 0);
24933 SDValue Y = Op.getOperand(i: 0).getOperand(i: 1);
24934 if (X != CmpOpOther)
24935 std::swap(a&: X, b&: Y);
24936 if (X != CmpOpOther)
24937 return SDValue();
24938 if (ExpectedOp != Op.getOperand(i: 1))
24939 return SDValue();
24940 return Y;
24941 };
24942
24943 // Try the reassociation using the given constant and condition code.
24944 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
24945 SDValue SubsOp) {
24946 SDValue TReassocOp = GetReassociationOp(N->getOperand(Num: 0), ExpectedOp);
24947 SDValue FReassocOp = GetReassociationOp(N->getOperand(Num: 1), ExpectedOp);
24948 if (!TReassocOp && !FReassocOp)
24949 return SDValue();
24950
24951 SDValue NewCmp = DAG.getNode(Opcode: AArch64ISD::SUBS, DL: SDLoc(SubsNode),
24952 VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: CmpOpOther, N2: SubsOp);
24953
24954 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
24955 if (!ReassocOp)
24956 return N->getOperand(Num: OpNum);
24957 SDValue Res = DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N->getOperand(Num: OpNum)), VT,
24958 N1: NewCmp.getValue(R: 0), N2: ReassocOp);
24959 DAG.ReplaceAllUsesWith(From: N->getOperand(Num: OpNum), To: Res);
24960 return Res;
24961 };
24962
24963 SDValue TValReassoc = Reassociate(TReassocOp, 0);
24964 SDValue FValReassoc = Reassociate(FReassocOp, 1);
24965 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: SDLoc(N), VT, N1: TValReassoc, N2: FValReassoc,
24966 N3: DAG.getConstant(Val: NewCC, DL: SDLoc(N->getOperand(Num: 2)), VT: MVT_CC),
24967 N4: NewCmp.getValue(R: 1));
24968 };
24969
24970 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(Num: 2));
24971
24972 // First, try to eliminate the compare instruction by searching for a
24973 // subtraction with the same constant.
24974 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
24975 return R;
24976
24977 if (!CmpOpConst) {
24978 // Try again with the operands of the SUBS instruction and the condition
24979 // swapped. Due to canonicalization, this only helps for non-constant
24980 // operands of the SUBS instruction.
24981 std::swap(a&: CmpOpToMatch, b&: CmpOpOther);
24982 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
24983 return R;
24984 return SDValue();
24985 }
24986
24987 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
24988 return SDValue();
24989
24990 // Next, search for a subtraction with a slightly different constant. By
24991 // adjusting the condition code, we can still eliminate the compare
24992 // instruction. Adjusting the constant is only valid if it does not result
24993 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
24994 // Since such comparisons are trivially true/false, we should not encounter
24995 // them here but check for them nevertheless to be on the safe side.
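  // For example, an unsigned "x < c" (LO) test with c != 0 is equivalent to
  // "x <= c - 1" (LS), so flags produced by an existing SUBS of x and c - 1
  // can be reused by adjusting the condition code.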
24996 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
24997 AArch64CC::CondCode NewCC) {
24998 auto ExpectedOp = DAG.getConstant(Val: -NewCmpConst, DL: SDLoc(CmpOpConst),
24999 VT: CmpOpConst->getValueType(ResNo: 0));
25000 auto SubsOp = DAG.getConstant(Val: NewCmpConst, DL: SDLoc(CmpOpConst),
25001 VT: CmpOpConst->getValueType(ResNo: 0));
25002 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
25003 };
25004 switch (CC) {
25005 case AArch64CC::EQ:
25006 case AArch64CC::LS:
25007 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25008 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
25009 case AArch64CC::NE:
25010 case AArch64CC::HI:
25011 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
25012 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
25013 case AArch64CC::LO:
25014 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25015 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
25016 case AArch64CC::HS:
25017 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
25018 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
25019 case AArch64CC::LT:
25020 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25021 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
25022 case AArch64CC::LE:
25023 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25024 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
25025 case AArch64CC::GT:
25026 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
25027 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
25028 case AArch64CC::GE:
25029 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
25030 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
25031 default:
25032 return SDValue();
25033 }
25034}
25035
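// Fold a CSEL that selects between the last active element of a vector
// (LASTB) and a default value, where the condition checks that the same
// predicate has any active lane, into a single CLASTB_N node:
// (CSEL (LASTB P, Z), X, NE (PTEST_ANY TP, P)) -> (CLASTB_N P, X, Z)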
25036static SDValue foldCSELofLASTB(SDNode *Op, SelectionDAG &DAG) {
25037 AArch64CC::CondCode OpCC =
25038 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2));
25039
25040 if (OpCC != AArch64CC::NE)
25041 return SDValue();
25042
25043 SDValue PTest = Op->getOperand(Num: 3);
25044 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
25045 return SDValue();
25046
25047 SDValue TruePred = PTest.getOperand(i: 0);
25048 SDValue AnyPred = PTest.getOperand(i: 1);
25049
25050 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25051 TruePred = TruePred.getOperand(i: 0);
25052
25053 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
25054 AnyPred = AnyPred.getOperand(i: 0);
25055
25056 if (TruePred != AnyPred && !isAllActivePredicate(DAG, N: TruePred))
25057 return SDValue();
25058
25059 SDValue LastB = Op->getOperand(Num: 0);
25060 SDValue Default = Op->getOperand(Num: 1);
25061
25062 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(i: 0) != AnyPred)
25063 return SDValue();
25064
25065 return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL: SDLoc(Op), VT: Op->getValueType(ResNo: 0),
25066 N1: AnyPred, N2: Default, N3: LastB.getOperand(i: 1));
25067}
25068
25069// Optimize CSEL instructions
25070static SDValue performCSELCombine(SDNode *N,
25071 TargetLowering::DAGCombinerInfo &DCI,
25072 SelectionDAG &DAG) {
25073 // CSEL x, x, cc -> x
25074 if (N->getOperand(Num: 0) == N->getOperand(Num: 1))
25075 return N->getOperand(Num: 0);
25076
25077 if (SDValue R = foldCSELOfCSEL(Op: N, DAG))
25078 return R;
25079
25080 // Try to reassociate the true/false expressions so that we can do CSE with
25081 // a SUBS instruction used to perform the comparison.
25082 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
25083 return R;
25084
25085 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
25086 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
25087 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
25088 return Folded;
25089
25090 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
25091 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
25092 SDValue Cond = N->getOperand(Num: 3);
25093 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
25094 Cond.hasOneUse() && Cond->hasNUsesOfValue(NUses: 0, Value: 0) &&
25095 DAG.doesNodeExist(Opcode: ISD::SUB, VTList: N->getVTList(),
25096 Ops: {Cond.getOperand(i: 1), Cond.getOperand(i: 0)}) &&
25097 !DAG.doesNodeExist(Opcode: ISD::SUB, VTList: N->getVTList(),
25098 Ops: {Cond.getOperand(i: 0), Cond.getOperand(i: 1)}) &&
25099 !isNullConstant(V: Cond.getOperand(i: 1))) {
25100 AArch64CC::CondCode OldCond =
25101 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(Num: 2));
25102 AArch64CC::CondCode NewCond = getSwappedCondition(CC: OldCond);
25103 if (NewCond != AArch64CC::AL) {
25104 SDLoc DL(N);
25105 SDValue Sub = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: Cond->getVTList(),
25106 N1: Cond.getOperand(i: 1), N2: Cond.getOperand(i: 0));
25107 return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VTList: N->getVTList(), N1: N->getOperand(Num: 0),
25108 N2: N->getOperand(Num: 1),
25109 N3: DAG.getConstant(Val: NewCond, DL, VT: MVT::i32),
25110 N4: Sub.getValue(R: 1));
25111 }
25112 }
25113
25114 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
25115 if (SDValue CondLast = foldCSELofLASTB(Op: N, DAG))
25116 return CondLast;
25117
25118 return performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3);
25119}
25120
// Try to re-use an already extended operand of a vector SetCC feeding an
// extended select. Doing so avoids requiring another full extension of the
// SETCC result when lowering the select.
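// For example, if a setcc on vNi8 operands only feeds vNi16 vselects and a
// sign/zero extension of its first operand to vNi16 already exists in the DAG,
// the compare can instead be performed on the extended operands, so the i1
// result does not have to be widened again for the selects.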
25124static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
25125 EVT Op0MVT = Op->getOperand(Num: 0).getValueType();
25126 if (!Op0MVT.isVector() || Op->use_empty())
25127 return SDValue();
25128
  // Make sure that all uses of Op are VSELECTs with the same result type, and
  // that this result type has a larger element type than the SetCC operand.
25131 SDNode *FirstUse = *Op->user_begin();
25132 if (FirstUse->getOpcode() != ISD::VSELECT)
25133 return SDValue();
25134 EVT UseMVT = FirstUse->getValueType(ResNo: 0);
25135 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
25136 return SDValue();
25137 if (any_of(Range: Op->users(), P: [&UseMVT](const SDNode *N) {
25138 return N->getOpcode() != ISD::VSELECT || N->getValueType(ResNo: 0) != UseMVT;
25139 }))
25140 return SDValue();
25141
25142 APInt V;
25143 if (!ISD::isConstantSplatVector(N: Op->getOperand(Num: 1).getNode(), SplatValue&: V))
25144 return SDValue();
25145
25146 SDLoc DL(Op);
25147 SDValue Op0ExtV;
25148 SDValue Op1ExtV;
25149 ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op->getOperand(Num: 2))->get();
25150 // Check if the first operand of the SET_CC is already extended. If it is,
25151 // split the SET_CC and re-use the extended version of the operand.
25152 SDNode *Op0SExt = DAG.getNodeIfExists(Opcode: ISD::SIGN_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
25153 Ops: Op->getOperand(Num: 0));
25154 SDNode *Op0ZExt = DAG.getNodeIfExists(Opcode: ISD::ZERO_EXTEND, VTList: DAG.getVTList(VT: UseMVT),
25155 Ops: Op->getOperand(Num: 0));
25156 if (Op0SExt && (isSignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
25157 Op0ExtV = SDValue(Op0SExt, 0);
25158 Op1ExtV = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
25159 } else if (Op0ZExt && (isUnsignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) {
25160 Op0ExtV = SDValue(Op0ZExt, 0);
25161 Op1ExtV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1));
25162 } else
25163 return SDValue();
25164
25165 return DAG.getNode(Opcode: ISD::SETCC, DL, VT: UseMVT.changeVectorElementType(EltVT: MVT::i1),
25166 N1: Op0ExtV, N2: Op1ExtV, N3: Op->getOperand(Num: 2));
25167}
25168
25169static SDValue
25170performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25171 SelectionDAG &DAG) {
25172 SDValue Vec = N->getOperand(Num: 0);
25173 if (DCI.isBeforeLegalize() &&
25174 Vec.getValueType().getVectorElementType() == MVT::i1 &&
25175 Vec.getValueType().isFixedLengthVector() &&
25176 Vec.getValueType().isPow2VectorType()) {
25177 SDLoc DL(N);
25178 return getVectorBitwiseReduce(Opcode: N->getOpcode(), Vec, VT: N->getValueType(ResNo: 0), DL,
25179 DAG);
25180 }
25181
25182 return SDValue();
25183}
25184
25185static SDValue performSETCCCombine(SDNode *N,
25186 TargetLowering::DAGCombinerInfo &DCI,
25187 SelectionDAG &DAG) {
25188 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
25189 SDValue LHS = N->getOperand(Num: 0);
25190 SDValue RHS = N->getOperand(Num: 1);
25191 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get();
25192 SDLoc DL(N);
25193 EVT VT = N->getValueType(ResNo: 0);
25194
25195 if (SDValue V = tryToWidenSetCCOperands(Op: N, DAG))
25196 return V;
25197
25198 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
25199 if (Cond == ISD::SETNE && isOneConstant(V: RHS) &&
25200 LHS->getOpcode() == AArch64ISD::CSEL &&
25201 isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) &&
25202 LHS->hasOneUse()) {
25203 // Invert CSEL's condition.
25204 auto OldCond =
25205 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2));
25206 auto NewCond = getInvertedCondCode(Code: OldCond);
25207
25208 // csel 0, 1, !cond, X
25209 SDValue CSEL =
25210 DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: LHS.getValueType(), N1: LHS.getOperand(i: 0),
25211 N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: NewCond, DL, VT: MVT::i32),
25212 N4: LHS.getOperand(i: 3));
25213 return DAG.getZExtOrTrunc(Op: CSEL, DL, VT);
25214 }
25215
25216 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
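  // (x >> imm) is non-zero exactly when some bit at or above position imm is
  // set, which is what the AND with the (-1 << imm) mask tests; the AND form
  // is typically matched as a single TST with a logical immediate.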
25217 if (Cond == ISD::SETNE && isNullConstant(V: RHS) &&
25218 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: LHS->getOperand(Num: 1)) &&
25219 LHS->hasOneUse()) {
25220 EVT TstVT = LHS->getValueType(ResNo: 0);
25221 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
25222 LHS->getConstantOperandVal(Num: 1) < TstVT.getFixedSizeInBits()) {
      // This pattern is optimized further by emitComparison.
25224 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(Num: 1);
25225 SDValue TST = DAG.getNode(Opcode: ISD::AND, DL, VT: TstVT, N1: LHS->getOperand(Num: 0),
25226 N2: DAG.getSignedConstant(Val: TstImm, DL, VT: TstVT));
25227 return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: TST, N2: RHS, N3: N->getOperand(Num: 2));
25228 }
25229 }
25230
25231 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
25232 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
25233 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
25234 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
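  // The bitcast packs the i1 lanes into an integer: that integer is zero iff
  // no lane is set (vecreduce_or is false) and all-ones iff every lane is set
  // (vecreduce_and is true), so the comparisons are equivalent.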
25235 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
25236 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
25237 (isNullConstant(V: RHS) || isAllOnesConstant(V: RHS)) &&
25238 LHS->getOpcode() == ISD::BITCAST) {
25239 EVT ToVT = LHS->getValueType(ResNo: 0);
25240 EVT FromVT = LHS->getOperand(Num: 0).getValueType();
25241 if (FromVT.isFixedLengthVector() &&
25242 FromVT.getVectorElementType() == MVT::i1) {
25243 bool IsNull = isNullConstant(V: RHS);
25244 LHS = DAG.getNode(Opcode: IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
25245 DL, VT: MVT::i1, Operand: LHS->getOperand(Num: 0));
25246 LHS = DAG.getNode(Opcode: IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT: ToVT,
25247 Operand: LHS);
25248 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
25249 }
25250 }
25251
25252 // Try to perform the memcmp when the result is tested for [in]equality with 0
25253 if (SDValue V = performOrXorChainCombine(N, DAG))
25254 return V;
25255
25256 EVT CmpVT = LHS.getValueType();
25257
25258 // NOTE: This exists as a combine only because it proved too awkward to match
25259 // splat(1) across all the NEON types during isel.
25260 APInt SplatLHSVal;
25261 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
25262 ISD::isConstantSplatVector(N: LHS.getNode(), SplatValue&: SplatLHSVal) &&
25263 SplatLHSVal.isOne())
25264 return DAG.getSetCC(DL, VT, LHS: DAG.getConstant(Val: 0, DL, VT: CmpVT), RHS, Cond: ISD::SETGE);
25265
25266 return SDValue();
25267}
25268
// Replace a flag-setting operator (e.g. ANDS) with the generic version
// (e.g. AND) if the flag result is unused.
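// For example, an ANDS whose NZCV result is unused becomes a plain AND (and
// can then CSE with an existing AND); conversely, an existing AND with the
// same operands is folded into a live ANDS so the logical result is computed
// only once.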
25271static SDValue performFlagSettingCombine(SDNode *N,
25272 TargetLowering::DAGCombinerInfo &DCI,
25273 unsigned GenericOpcode) {
25274 SDLoc DL(N);
25275 SDValue LHS = N->getOperand(Num: 0);
25276 SDValue RHS = N->getOperand(Num: 1);
25277 EVT VT = N->getValueType(ResNo: 0);
25278
25279 // If the flag result isn't used, convert back to a generic opcode.
25280 if (!N->hasAnyUseOfValue(Value: 1)) {
25281 SDValue Res = DCI.DAG.getNode(Opcode: GenericOpcode, DL, VT, Ops: N->ops());
25282 return DCI.DAG.getMergeValues(Ops: {Res, DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i32)},
25283 dl: DL);
25284 }
25285
25286 // Combine identical generic nodes into this node, re-using the result.
25287 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
25288 Opcode: GenericOpcode, VTList: DCI.DAG.getVTList(VT), Ops: {LHS, RHS}))
25289 DCI.CombineTo(N: Generic, Res: SDValue(N, 0));
25290
25291 return SDValue();
25292}
25293
25294static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
25295 // setcc_merge_zero pred
25296 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
25297 // => extract_subvector (inner setcc_merge_zero)
25298 SDValue Pred = N->getOperand(Num: 0);
25299 SDValue LHS = N->getOperand(Num: 1);
25300 SDValue RHS = N->getOperand(Num: 2);
25301 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
25302
25303 if (Cond != ISD::SETNE || !isZerosVector(N: RHS.getNode()) ||
25304 LHS->getOpcode() != ISD::SIGN_EXTEND)
25305 return SDValue();
25306
25307 SDValue Extract = LHS->getOperand(Num: 0);
25308 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25309 Extract->getValueType(ResNo: 0) != N->getValueType(ResNo: 0) ||
25310 Extract->getConstantOperandVal(Num: 1) != 0)
25311 return SDValue();
25312
25313 SDValue InnerSetCC = Extract->getOperand(Num: 0);
25314 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
25315 return SDValue();
25316
25317 // By this point we've effectively got
25318 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
25319 // lanes are already zero then the trunc(sext()) sequence is redundant and we
25320 // can operate on A directly.
25321 SDValue InnerPred = InnerSetCC.getOperand(i: 0);
25322 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
25323 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
25324 Pred.getConstantOperandVal(i: 0) == InnerPred.getConstantOperandVal(i: 0) &&
25325 Pred->getConstantOperandVal(Num: 0) >= AArch64SVEPredPattern::vl1 &&
25326 Pred->getConstantOperandVal(Num: 0) <= AArch64SVEPredPattern::vl256)
25327 return Extract;
25328
25329 return SDValue();
25330}
25331
25332static SDValue
25333performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
25334 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25335 "Unexpected opcode!");
25336
25337 SelectionDAG &DAG = DCI.DAG;
25338 SDValue Pred = N->getOperand(Num: 0);
25339 SDValue LHS = N->getOperand(Num: 1);
25340 SDValue RHS = N->getOperand(Num: 2);
25341 ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get();
25342
25343 if (SDValue V = performSetCCPunpkCombine(N, DAG))
25344 return V;
25345
25346 if (Cond == ISD::SETNE && isZerosVector(N: RHS.getNode()) &&
25347 LHS->getOpcode() == ISD::SIGN_EXTEND &&
25348 LHS->getOperand(Num: 0)->getValueType(ResNo: 0) == N->getValueType(ResNo: 0)) {
25349 // setcc_merge_zero(
25350 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
25351 // => setcc_merge_zero(pred, ...)
25352 if (LHS->getOperand(Num: 0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
25353 LHS->getOperand(Num: 0)->getOperand(Num: 0) == Pred)
25354 return LHS->getOperand(Num: 0);
25355
25356 // setcc_merge_zero(
25357 // all_active, extend(nxvNi1 ...), != splat(0))
25358 // -> nxvNi1 ...
25359 if (isAllActivePredicate(DAG, N: Pred))
25360 return LHS->getOperand(Num: 0);
25361
25362 // setcc_merge_zero(
25363 // pred, extend(nxvNi1 ...), != splat(0))
25364 // -> nxvNi1 and(pred, ...)
25365 if (DCI.isAfterLegalizeDAG())
25366 // Do this after legalization to allow more folds on setcc_merge_zero
25367 // to be recognized.
25368 return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
25369 N1: LHS->getOperand(Num: 0), N2: Pred);
25370 }
25371
25372 return SDValue();
25373}
25374
25375// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
25376// as well as whether the test should be inverted. This code is required to
25377// catch these cases (as opposed to standard dag combines) because
25378// AArch64ISD::TBZ is matched during legalization.
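// For example, (tbz (and (srl x, 3), 1), 0) tests bit 3 of x directly, and
// (tbz (xor x, -1), b) becomes (tbnz x, b).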
25379static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
25380 SelectionDAG &DAG) {
25381
25382 if (!Op->hasOneUse())
25383 return Op;
25384
25385 // We don't handle undef/constant-fold cases below, as they should have
25386 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
25387 // etc.)
25388
25389 // (tbz (trunc x), b) -> (tbz x, b)
25390 // This case is just here to enable more of the below cases to be caught.
25391 if (Op->getOpcode() == ISD::TRUNCATE &&
25392 Bit < Op->getValueType(ResNo: 0).getSizeInBits()) {
25393 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25394 }
25395
25396 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
25397 if (Op->getOpcode() == ISD::ANY_EXTEND &&
25398 Bit < Op->getOperand(Num: 0).getValueSizeInBits()) {
25399 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25400 }
25401
25402 if (Op->getNumOperands() != 2)
25403 return Op;
25404
25405 auto *C = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1));
25406 if (!C)
25407 return Op;
25408
25409 switch (Op->getOpcode()) {
25410 default:
25411 return Op;
25412
25413 // (tbz (and x, m), b) -> (tbz x, b)
25414 case ISD::AND:
25415 if ((C->getZExtValue() >> Bit) & 1)
25416 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25417 return Op;
25418
25419 // (tbz (shl x, c), b) -> (tbz x, b-c)
25420 case ISD::SHL:
25421 if (C->getZExtValue() <= Bit &&
25422 (Bit - C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
25423 Bit = Bit - C->getZExtValue();
25424 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25425 }
25426 return Op;
25427
25428 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
25429 case ISD::SRA:
25430 Bit = Bit + C->getZExtValue();
25431 if (Bit >= Op->getValueType(ResNo: 0).getSizeInBits())
25432 Bit = Op->getValueType(ResNo: 0).getSizeInBits() - 1;
25433 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25434
25435 // (tbz (srl x, c), b) -> (tbz x, b+c)
25436 case ISD::SRL:
25437 if ((Bit + C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) {
25438 Bit = Bit + C->getZExtValue();
25439 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25440 }
25441 return Op;
25442
25443 // (tbz (xor x, -1), b) -> (tbnz x, b)
25444 case ISD::XOR:
25445 if ((C->getZExtValue() >> Bit) & 1)
25446 Invert = !Invert;
25447 return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG);
25448 }
25449}
25450
25451// Optimize test single bit zero/non-zero and branch.
25452static SDValue performTBZCombine(SDNode *N,
25453 TargetLowering::DAGCombinerInfo &DCI,
25454 SelectionDAG &DAG) {
25455 unsigned Bit = N->getConstantOperandVal(Num: 2);
25456 bool Invert = false;
25457 SDValue TestSrc = N->getOperand(Num: 1);
25458 SDValue NewTestSrc = getTestBitOperand(Op: TestSrc, Bit, Invert, DAG);
25459
25460 if (TestSrc == NewTestSrc)
25461 return SDValue();
25462
25463 unsigned NewOpc = N->getOpcode();
25464 if (Invert) {
25465 if (NewOpc == AArch64ISD::TBZ)
25466 NewOpc = AArch64ISD::TBNZ;
25467 else {
25468 assert(NewOpc == AArch64ISD::TBNZ);
25469 NewOpc = AArch64ISD::TBZ;
25470 }
25471 }
25472
25473 SDLoc DL(N);
25474 return DAG.getNode(Opcode: NewOpc, DL, VT: MVT::Other, N1: N->getOperand(Num: 0), N2: NewTestSrc,
25475 N3: DAG.getConstant(Val: Bit, DL, VT: MVT::i64), N4: N->getOperand(Num: 3));
25476}
25477
// Swap the vselect operands where doing so may allow a predicated operation to
// also perform the `sel`.
25480//
25481// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
25482// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
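// Swapping can be profitable for scalable vectors because a merging predicated
// FP operation leaves its first source operand unchanged in the inactive
// lanes; with the operation in the "true" position and its first operand as
// the "false" value, the vselect can fold into that single predicated
// instruction.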
25483static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
25484 auto SelectA = N->getOperand(Num: 1);
25485 auto SelectB = N->getOperand(Num: 2);
25486 auto NTy = N->getValueType(ResNo: 0);
25487
25488 if (!NTy.isScalableVector())
25489 return SDValue();
25490 SDValue SetCC = N->getOperand(Num: 0);
25491 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
25492 return SDValue();
25493
25494 switch (SelectB.getOpcode()) {
25495 default:
25496 return SDValue();
25497 case ISD::FMUL:
25498 case ISD::FSUB:
25499 case ISD::FADD:
25500 break;
25501 }
25502 if (SelectA != SelectB.getOperand(i: 0))
25503 return SDValue();
25504
25505 ISD::CondCode CC = cast<CondCodeSDNode>(Val: SetCC.getOperand(i: 2))->get();
25506 ISD::CondCode InverseCC =
25507 ISD::getSetCCInverse(Operation: CC, Type: SetCC.getOperand(i: 0).getValueType());
25508 auto InverseSetCC =
25509 DAG.getSetCC(DL: SDLoc(SetCC), VT: SetCC.getValueType(), LHS: SetCC.getOperand(i: 0),
25510 RHS: SetCC.getOperand(i: 1), Cond: InverseCC);
25511
25512 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: NTy,
25513 Ops: {InverseSetCC, SelectB, SelectA});
25514}
25515
25516// vselect (v1i1 setcc) ->
25517// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT with a v1i1
// condition. Once it can legalize "VSELECT v1i1" correctly, there will be no
// need to combine such VSELECTs.
25521static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
25522 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
25523 return SwapResult;
25524
25525 SDValue N0 = N->getOperand(Num: 0);
25526 EVT CCVT = N0.getValueType();
25527
25528 if (isAllActivePredicate(DAG, N: N0))
25529 return N->getOperand(Num: 1);
25530
25531 if (isAllInactivePredicate(N: N0))
25532 return N->getOperand(Num: 2);
25533
  // Check for the sign pattern (VSELECT (setgt iN lhs, -1), 1, -1) and
  // transform it into (OR (ASR lhs, N-1), 1), which requires fewer
  // instructions for the supported types.
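  // (ASR lhs, N-1) is 0 for non-negative lanes and all-ones for negative
  // lanes; OR-ing with splat(1) therefore yields 1 or -1 respectively, exactly
  // the values selected by (setgt lhs, -1).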
25537 SDValue SetCC = N->getOperand(Num: 0);
25538 if (SetCC.getOpcode() == ISD::SETCC &&
25539 SetCC.getOperand(i: 2) == DAG.getCondCode(Cond: ISD::SETGT)) {
25540 SDValue CmpLHS = SetCC.getOperand(i: 0);
25541 EVT VT = CmpLHS.getValueType();
25542 SDNode *CmpRHS = SetCC.getOperand(i: 1).getNode();
25543 SDNode *SplatLHS = N->getOperand(Num: 1).getNode();
25544 SDNode *SplatRHS = N->getOperand(Num: 2).getNode();
25545 APInt SplatLHSVal;
25546 if (CmpLHS.getValueType() == N->getOperand(Num: 1).getValueType() &&
25547 VT.isSimple() &&
25548 is_contained(Range: ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
25549 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
25550 Element: VT.getSimpleVT().SimpleTy) &&
25551 ISD::isConstantSplatVector(N: SplatLHS, SplatValue&: SplatLHSVal) &&
25552 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(N: CmpRHS) &&
25553 ISD::isConstantSplatVectorAllOnes(N: SplatRHS)) {
25554 unsigned NumElts = VT.getVectorNumElements();
25555 SmallVector<SDValue, 8> Ops(
25556 NumElts, DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SDLoc(N),
25557 VT: VT.getScalarType()));
25558 SDValue Val = DAG.getBuildVector(VT, DL: SDLoc(N), Ops);
25559
25560 auto Shift = DAG.getNode(Opcode: ISD::SRA, DL: SDLoc(N), VT, N1: CmpLHS, N2: Val);
25561 auto Or = DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT, N1: Shift, N2: N->getOperand(Num: 1));
25562 return Or;
25563 }
25564 }
25565
25566 EVT CmpVT = N0.getOperand(i: 0).getValueType();
25567 if (N0.getOpcode() != ISD::SETCC ||
25568 CCVT.getVectorElementCount() != ElementCount::getFixed(MinVal: 1) ||
25569 CCVT.getVectorElementType() != MVT::i1 ||
25570 CmpVT.getVectorElementType().isFloatingPoint())
25571 return SDValue();
25572
25573 EVT ResVT = N->getValueType(ResNo: 0);
25574 // Only combine when the result type is of the same size as the compared
25575 // operands.
25576 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
25577 return SDValue();
25578
25579 SDValue IfTrue = N->getOperand(Num: 1);
25580 SDValue IfFalse = N->getOperand(Num: 2);
25581 SetCC = DAG.getSetCC(DL: SDLoc(N), VT: CmpVT.changeVectorElementTypeToInteger(),
25582 LHS: N0.getOperand(i: 0), RHS: N0.getOperand(i: 1),
25583 Cond: cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get());
25584 return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: ResVT, N1: SetCC,
25585 N2: IfTrue, N3: IfFalse);
25586}
25587
/// A vector select: "(select (setcc LHS, RHS), vL, vR)" is best performed with
25589/// the compare-mask instructions rather than going via NZCV, even if LHS and
25590/// RHS are really scalar. This replaces any scalar setcc in the above pattern
25591/// with a vector one followed by a DUP shuffle on the result.
25592static SDValue performSelectCombine(SDNode *N,
25593 TargetLowering::DAGCombinerInfo &DCI) {
25594 SelectionDAG &DAG = DCI.DAG;
25595 SDValue N0 = N->getOperand(Num: 0);
25596 EVT ResVT = N->getValueType(ResNo: 0);
25597
25598 if (N0.getOpcode() != ISD::SETCC)
25599 return SDValue();
25600
25601 if (ResVT.isScalableVT())
25602 return SDValue();
25603
25604 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
25605 // scalar SetCCResultType. We also don't expect vectors, because we assume
25606 // that selects fed by vector SETCCs are canonicalized to VSELECT.
25607 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
25608 "Scalar-SETCC feeding SELECT has unexpected result type!");
25609
25610 // If NumMaskElts == 0, the comparison is larger than select result. The
25611 // largest real NEON comparison is 64-bits per lane, which means the result is
25612 // at most 32-bits and an illegal vector. Just bail out for now.
25613 EVT SrcVT = N0.getOperand(i: 0).getValueType();
25614
25615 // Don't try to do this optimization when the setcc itself has i1 operands.
25616 // There are no legal vectors of i1, so this would be pointless. v1f16 is
  // ruled out to prevent the creation of setccs that need to be scalarized.
25618 if (SrcVT == MVT::i1 ||
25619 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
25620 return SDValue();
25621
25622 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
25623 if (!ResVT.isVector() || NumMaskElts == 0)
25624 return SDValue();
25625
25626 SrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SrcVT, NumElements: NumMaskElts);
25627 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
25628
25629 // Also bail out if the vector CCVT isn't the same size as ResVT.
25630 // This can happen if the SETCC operand size doesn't divide the ResVT size
25631 // (e.g., f64 vs v3f32).
25632 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
25633 return SDValue();
25634
25635 // Make sure we didn't create illegal types, if we're not supposed to.
25636 assert(DCI.isBeforeLegalize() ||
25637 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
25638
25639 // First perform a vector comparison, where lane 0 is the one we're interested
25640 // in.
25641 SDLoc DL(N0);
25642 SDValue LHS =
25643 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 0));
25644 SDValue RHS =
25645 DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 1));
25646 SDValue SetCC = DAG.getNode(Opcode: ISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, N3: N0.getOperand(i: 2));
25647
25648 // Now duplicate the comparison mask we want across all other lanes.
25649 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
25650 SDValue Mask = DAG.getVectorShuffle(VT: CCVT, dl: DL, N1: SetCC, N2: SetCC, Mask: DUPMask);
25651 Mask = DAG.getNode(Opcode: ISD::BITCAST, DL,
25652 VT: ResVT.changeVectorElementTypeToInteger(), Operand: Mask);
25653
25654 return DAG.getSelect(DL, VT: ResVT, Cond: Mask, LHS: N->getOperand(Num: 1), RHS: N->getOperand(Num: 2));
25655}
25656
25657static SDValue performDUPCombine(SDNode *N,
25658 TargetLowering::DAGCombinerInfo &DCI) {
25659 EVT VT = N->getValueType(ResNo: 0);
25660 SDLoc DL(N);
25661 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
  // 128-bit vector version.
25663 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
25664 EVT LVT = VT.getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext());
25665 SmallVector<SDValue> Ops(N->ops());
25666 if (SDNode *LN = DCI.DAG.getNodeIfExists(Opcode: N->getOpcode(),
25667 VTList: DCI.DAG.getVTList(VT: LVT), Ops)) {
25668 return DCI.DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(LN, 0),
25669 N2: DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i64));
25670 }
25671 }
25672
25673 if (N->getOpcode() == AArch64ISD::DUP) {
25674 // If the instruction is known to produce a scalar in SIMD registers, we can
25675 // duplicate it across the vector lanes using DUPLANE instead of moving it
25676 // to a GPR first. For example, this allows us to handle:
25677 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
25678 SDValue Op = N->getOperand(Num: 0);
25679 // FIXME: Ideally, we should be able to handle all instructions that
25680 // produce a scalar value in FPRs.
25681 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
25682 Op.getOpcode() == AArch64ISD::FCMGE ||
25683 Op.getOpcode() == AArch64ISD::FCMGT) {
25684 EVT ElemVT = VT.getVectorElementType();
25685 EVT ExpandedVT = VT;
25686 // Insert into a 128-bit vector to match DUPLANE's pattern.
25687 if (VT.getSizeInBits() != 128)
25688 ExpandedVT = EVT::getVectorVT(Context&: *DCI.DAG.getContext(), VT: ElemVT,
25689 NumElements: 128 / ElemVT.getSizeInBits());
25690 SDValue Zero = DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i64);
25691 SDValue Vec = DCI.DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpandedVT,
25692 N1: DCI.DAG.getUNDEF(VT: ExpandedVT), N2: Op, N3: Zero);
25693 return DCI.DAG.getNode(Opcode: getDUPLANEOp(EltType: ElemVT), DL, VT, N1: Vec, N2: Zero);
25694 }
25695
25696 if (DCI.isAfterLegalizeDAG()) {
25697 // If scalar dup's operand is extract_vector_elt, try to combine them into
25698 // duplane. For example,
25699 //
25700 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
25701 // t18: v4i32 = AArch64ISD::DUP t21
25702 // ==>
25703 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
25704 SDValue EXTRACT_VEC_ELT = N->getOperand(Num: 0);
25705 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
25706 if (VT == EXTRACT_VEC_ELT.getOperand(i: 0).getValueType()) {
25707 unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType());
25708 return DCI.DAG.getNode(Opcode, DL, VT, N1: EXTRACT_VEC_ELT.getOperand(i: 0),
25709 N2: EXTRACT_VEC_ELT.getOperand(i: 1));
25710 }
25711 }
25712 }
25713
25714 return performPostLD1Combine(N, DCI, IsLaneOp: false);
25715 }
25716
25717 return SDValue();
25718}
25719
25720/// Get rid of unnecessary NVCASTs (that don't change the type).
25721static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
25722 if (N->getValueType(ResNo: 0) == N->getOperand(Num: 0).getValueType())
25723 return N->getOperand(Num: 0);
25724 if (N->getOperand(Num: 0).getOpcode() == AArch64ISD::NVCAST)
25725 return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(N), VT: N->getValueType(ResNo: 0),
25726 Operand: N->getOperand(Num: 0).getOperand(i: 0));
25727
25728 return SDValue();
25729}
25730
25731// If all users of the globaladdr are of the form (globaladdr + constant), find
25732// the smallest constant, fold it into the globaladdr's offset and rewrite the
25733// globaladdr as (globaladdr + constant) - constant.
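// For example, if the global is only used as (globaladdr + 8) and
// (globaladdr + 24), folding the minimum constant gives (globaladdr + 8) - 8,
// so the +8 can be materialised as part of the address calculation and the
// remaining adds become cheap offsets from it.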
25734static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
25735 const AArch64Subtarget *Subtarget,
25736 const TargetMachine &TM) {
25737 auto *GN = cast<GlobalAddressSDNode>(Val: N);
25738 if (Subtarget->ClassifyGlobalReference(GV: GN->getGlobal(), TM) !=
25739 AArch64II::MO_NO_FLAG)
25740 return SDValue();
25741
25742 uint64_t MinOffset = -1ull;
25743 for (SDNode *N : GN->users()) {
25744 if (N->getOpcode() != ISD::ADD)
25745 return SDValue();
25746 auto *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0));
25747 if (!C)
25748 C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
25749 if (!C)
25750 return SDValue();
25751 MinOffset = std::min(a: MinOffset, b: C->getZExtValue());
25752 }
25753 uint64_t Offset = MinOffset + GN->getOffset();
25754
25755 // Require that the new offset is larger than the existing one. Otherwise, we
25756 // can end up oscillating between two possible DAGs, for example,
25757 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
25758 if (Offset <= uint64_t(GN->getOffset()))
25759 return SDValue();
25760
25761 // Check whether folding this offset is legal. It must not go out of bounds of
25762 // the referenced object to avoid violating the code model, and must be
25763 // smaller than 2^20 because this is the largest offset expressible in all
25764 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
25765 // stores an immediate signed 21 bit offset.)
25766 //
25767 // This check also prevents us from folding negative offsets, which will end
25768 // up being treated in the same way as large positive ones. They could also
25769 // cause code model violations, and aren't really common enough to matter.
25770 if (Offset >= (1 << 20))
25771 return SDValue();
25772
25773 const GlobalValue *GV = GN->getGlobal();
25774 Type *T = GV->getValueType();
25775 if (!T->isSized() ||
25776 Offset > GV->getDataLayout().getTypeAllocSize(Ty: T))
25777 return SDValue();
25778
25779 SDLoc DL(GN);
25780 SDValue Result = DAG.getGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset);
25781 return DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Result,
25782 N2: DAG.getConstant(Val: MinOffset, DL, VT: MVT::i64));
25783}
25784
25785static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
25786 const AArch64Subtarget *Subtarget) {
25787 SDValue BR = N->getOperand(Num: 0);
25788 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
25789 !BR.getValueType().isScalarInteger())
25790 return SDValue();
25791
25792 SDLoc DL(N);
25793 return DAG.getNode(Opcode: ISD::CTTZ, DL, VT: BR.getValueType(), Operand: BR.getOperand(i: 0));
25794}
25795
// Turns the vector of indices into a vector of byte offsets by scaling Offset
// by (BitWidth / 8).
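// For example, with 32-bit elements each index is shifted left by two to turn
// it into a byte offset.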
25798static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
25799 SDLoc DL, unsigned BitWidth) {
25800 assert(Offset.getValueType().isScalableVector() &&
25801 "This method is only for scalable vectors of offsets");
25802
25803 SDValue Shift = DAG.getConstant(Val: Log2_32(Value: BitWidth / 8), DL, VT: MVT::i64);
25804 SDValue SplatShift = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Shift);
25805
25806 return DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::nxv2i64, N1: Offset, N2: SplatShift);
25807}
25808
25809/// Check if the value of \p OffsetInBytes can be used as an immediate for
25810/// the gather load/prefetch and scatter store instructions with vector base and
25811/// immediate offset addressing mode:
25812///
25813/// [<Zn>.[S|D]{, #<imm>}]
25814///
25815/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
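/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates
/// are 0, 4, 8, ..., 124.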
25816inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
25817 unsigned ScalarSizeInBytes) {
25818 // The immediate is not a multiple of the scalar size.
25819 if (OffsetInBytes % ScalarSizeInBytes)
25820 return false;
25821
25822 // The immediate is out of range.
25823 if (OffsetInBytes / ScalarSizeInBytes > 31)
25824 return false;
25825
25826 return true;
25827}
25828
25829/// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
25831/// immediate offset addressing mode:
25832///
25833/// [<Zn>.[S|D]{, #<imm>}]
25834///
25835/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
25836static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
25837 unsigned ScalarSizeInBytes) {
25838 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Val: Offset.getNode());
25839 return OffsetConst && isValidImmForSVEVecImmAddrMode(
25840 OffsetInBytes: OffsetConst->getZExtValue(), ScalarSizeInBytes);
25841}
25842
25843static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
25844 unsigned Opcode,
25845 bool OnlyPackedOffsets = true) {
25846 const SDValue Src = N->getOperand(Num: 2);
25847 const EVT SrcVT = Src->getValueType(ResNo: 0);
25848 assert(SrcVT.isScalableVector() &&
25849 "Scatter stores are only possible for SVE vectors");
25850
25851 SDLoc DL(N);
25852 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
25853
25854 // Make sure that source data will fit into an SVE register
25855 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
25856 return SDValue();
25857
25858 // For FPs, ACLE only supports _packed_ single and double precision types.
25859 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
25860 if (SrcElVT.isFloatingPoint())
25861 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
25862 ((Opcode != AArch64ISD::SST1Q_PRED &&
25863 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
25864 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
25865 return SDValue();
25866
25867 // Depending on the addressing mode, this is either a pointer or a vector of
25868 // pointers (that fits into one register)
25869 SDValue Base = N->getOperand(Num: 4);
25870 // Depending on the addressing mode, this is either a single offset or a
25871 // vector of offsets (that fits into one register)
25872 SDValue Offset = N->getOperand(Num: 5);
25873
25874 // For "scalar + vector of indices", just scale the indices. This only
25875 // applies to non-temporal scatters because there's no instruction that takes
25876 // indices.
25877 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
25878 Offset =
25879 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
25880 Opcode = AArch64ISD::SSTNT1_PRED;
25881 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
25882 Offset =
25883 getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits());
25884 Opcode = AArch64ISD::SST1Q_PRED;
25885 }
25886
  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size, with "vector + scalar" addressing, i.e.
25889 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25890 // Since we do have intrinsics that allow the arguments to be in a different
25891 // order, we may need to swap them to match the spec.
25892 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
25893 Offset.getValueType().isVector())
25894 std::swap(a&: Base, b&: Offset);
25895
25896 // SST1_IMM requires that the offset is an immediate that is:
25897 // * a multiple of #SizeInBytes,
25898 // * in the range [0, 31 x #SizeInBytes],
25899 // where #SizeInBytes is the size in bytes of the stored items. For
25900 // immediates outside that range and non-immediate scalar offsets use SST1 or
25901 // SST1_UXTW instead.
25902 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
25903 if (!isValidImmForSVEVecImmAddrMode(Offset,
25904 ScalarSizeInBytes: SrcVT.getScalarSizeInBits() / 8)) {
25905 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
25906 Opcode = AArch64ISD::SST1_UXTW_PRED;
25907 else
25908 Opcode = AArch64ISD::SST1_PRED;
25909
25910 std::swap(a&: Base, b&: Offset);
25911 }
25912 }
25913
25914 auto &TLI = DAG.getTargetLoweringInfo();
25915 if (!TLI.isTypeLegal(VT: Base.getValueType()))
25916 return SDValue();
25917
25918 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
25920 // nxv2i64. Legalize accordingly.
25921 if (!OnlyPackedOffsets &&
25922 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
25923 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0);
25924
25925 if (!TLI.isTypeLegal(VT: Offset.getValueType()))
25926 return SDValue();
25927
25928 // Source value type that is representable in hardware
25929 EVT HwSrcVt = getSVEContainerType(ContentTy: SrcVT);
25930
25931 // Keep the original type of the input data to store - this is needed to be
25932 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
25933 // FP values we want the integer equivalent, so just use HwSrcVt.
25934 SDValue InputVT = DAG.getValueType(SrcVT);
25935 if (SrcVT.isFloatingPoint())
25936 InputVT = DAG.getValueType(HwSrcVt);
25937
25938 SDVTList VTs = DAG.getVTList(VT: MVT::Other);
25939 SDValue SrcNew;
25940
25941 if (Src.getValueType().isFloatingPoint())
25942 SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Src);
25943 else
25944 SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Src);
25945
25946 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
25947 SrcNew,
25948 N->getOperand(Num: 3), // Pg
25949 Base,
25950 Offset,
25951 InputVT};
25952
25953 return DAG.getNode(Opcode, DL, VTList: VTs, Ops);
25954}
25955
25956static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
25957 unsigned Opcode,
25958 bool OnlyPackedOffsets = true) {
25959 const EVT RetVT = N->getValueType(ResNo: 0);
25960 assert(RetVT.isScalableVector() &&
25961 "Gather loads are only possible for SVE vectors");
25962
25963 SDLoc DL(N);
25964
25965 // Make sure that the loaded data will fit into an SVE register
25966 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
25967 return SDValue();
25968
25969 // Depending on the addressing mode, this is either a pointer or a vector of
25970 // pointers (that fits into one register)
25971 SDValue Base = N->getOperand(Num: 3);
25972 // Depending on the addressing mode, this is either a single offset or a
25973 // vector of offsets (that fits into one register)
25974 SDValue Offset = N->getOperand(Num: 4);
25975
25976 // For "scalar + vector of indices", scale the indices to obtain unscaled
25977 // offsets. This applies to non-temporal and quadword gathers, which do not
25978 // have an addressing mode with scaled offset.
25979 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
25980 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
25981 BitWidth: RetVT.getScalarSizeInBits());
25982 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
25983 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
25984 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
25985 BitWidth: RetVT.getScalarSizeInBits());
25986 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
25987 }
25988
25989 // In the case of non-temporal gather loads and quadword gather loads there's
  // only one addressing mode: "vector + scalar", e.g.
25991 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
25992 // Since we do have intrinsics that allow the arguments to be in a different
25993 // order, we may need to swap them to match the spec.
25994 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
25995 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
25996 Offset.getValueType().isVector())
25997 std::swap(a&: Base, b&: Offset);
25998
25999 // GLD{FF}1_IMM requires that the offset is an immediate that is:
26000 // * a multiple of #SizeInBytes,
26001 // * in the range [0, 31 x #SizeInBytes],
26002 // where #SizeInBytes is the size in bytes of the loaded items. For
26003 // immediates outside that range and non-immediate scalar offsets use
26004 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
26005 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
26006 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
26007 if (!isValidImmForSVEVecImmAddrMode(Offset,
26008 ScalarSizeInBytes: RetVT.getScalarSizeInBits() / 8)) {
26009 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
26010 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26011 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
26012 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
26013 else
26014 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
26015 ? AArch64ISD::GLD1_MERGE_ZERO
26016 : AArch64ISD::GLDFF1_MERGE_ZERO;
26017
26018 std::swap(a&: Base, b&: Offset);
26019 }
26020 }
26021
26022 auto &TLI = DAG.getTargetLoweringInfo();
26023 if (!TLI.isTypeLegal(VT: Base.getValueType()))
26024 return SDValue();
26025
26026 // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero-extended (uxtw) to
26028 // nxv2i64. Legalize accordingly.
26029 if (!OnlyPackedOffsets &&
26030 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
26031 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0);
26032
26033 // Return value type that is representable in hardware
26034 EVT HwRetVt = getSVEContainerType(ContentTy: RetVT);
26035
26036 // Keep the original output value type around - this is needed to be able to
26037 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
26038 // values we want the integer equivalent, so just use HwRetVT.
26039 SDValue OutVT = DAG.getValueType(RetVT);
26040 if (RetVT.isFloatingPoint())
26041 OutVT = DAG.getValueType(HwRetVt);
26042
26043 SDVTList VTs = DAG.getVTList(VT1: HwRetVt, VT2: MVT::Other);
26044 SDValue Ops[] = {N->getOperand(Num: 0), // Chain
26045 N->getOperand(Num: 2), // Pg
26046 Base, Offset, OutVT};
26047
26048 SDValue Load = DAG.getNode(Opcode, DL, VTList: VTs, Ops);
26049 SDValue LoadChain = SDValue(Load.getNode(), 1);
26050
26051 if (RetVT.isInteger() && (RetVT != HwRetVt))
26052 Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: RetVT, Operand: Load.getValue(R: 0));
26053
26054 // If the original return value was FP, bitcast accordingly. Doing it here
26055 // means that we can avoid adding TableGen patterns for FPs.
26056 if (RetVT.isFloatingPoint())
26057 Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RetVT, Operand: Load.getValue(R: 0));
26058
26059 return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL);
26060}
26061
26062static SDValue
26063performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26064 SelectionDAG &DAG) {
26065 SDLoc DL(N);
26066 SDValue Src = N->getOperand(Num: 0);
26067 unsigned Opc = Src->getOpcode();
26068
26069 // Sign extend of an unsigned unpack -> signed unpack
26070 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
26071
26072 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
26073 : AArch64ISD::SUNPKLO;
26074
26075 // Push the sign extend to the operand of the unpack
26076 // This is necessary where, for example, the operand of the unpack
26077 // is another unpack:
26078 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
26079 // ->
26080 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
26081 // ->
26082 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
26083 SDValue ExtOp = Src->getOperand(Num: 0);
26084 auto VT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
26085 EVT EltTy = VT.getVectorElementType();
26086 (void)EltTy;
26087
26088 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
26089 "Sign extending from an invalid type");
26090
26091 EVT ExtVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
26092
26093 SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ExtOp.getValueType(),
26094 N1: ExtOp, N2: DAG.getValueType(ExtVT));
26095
26096 return DAG.getNode(Opcode: SOpc, DL, VT: N->getValueType(ResNo: 0), Operand: Ext);
26097 }
26098
26099 if (DCI.isBeforeLegalizeOps())
26100 return SDValue();
26101
26102 if (!EnableCombineMGatherIntrinsics)
26103 return SDValue();
26104
26105 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
26106 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
26107 unsigned NewOpc;
26108 unsigned MemVTOpNum = 4;
26109 switch (Opc) {
26110 case AArch64ISD::LD1_MERGE_ZERO:
26111 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
26112 MemVTOpNum = 3;
26113 break;
26114 case AArch64ISD::LDNF1_MERGE_ZERO:
26115 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
26116 MemVTOpNum = 3;
26117 break;
26118 case AArch64ISD::LDFF1_MERGE_ZERO:
26119 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
26120 MemVTOpNum = 3;
26121 break;
26122 case AArch64ISD::GLD1_MERGE_ZERO:
26123 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
26124 break;
26125 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26126 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
26127 break;
26128 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26129 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
26130 break;
26131 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26132 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
26133 break;
26134 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26135 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
26136 break;
26137 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26138 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
26139 break;
26140 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26141 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
26142 break;
26143 case AArch64ISD::GLDFF1_MERGE_ZERO:
26144 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
26145 break;
26146 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
26147 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
26148 break;
26149 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
26150 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
26151 break;
26152 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
26153 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
26154 break;
26155 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
26156 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
26157 break;
26158 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
26159 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
26160 break;
26161 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
26162 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
26163 break;
26164 case AArch64ISD::GLDNT1_MERGE_ZERO:
26165 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
26166 break;
26167 default:
26168 return SDValue();
26169 }
26170
26171 EVT SignExtSrcVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT();
26172 EVT SrcMemVT = cast<VTSDNode>(Val: Src->getOperand(Num: MemVTOpNum))->getVT();
26173
26174 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
26175 return SDValue();
26176
26177 EVT DstVT = N->getValueType(ResNo: 0);
26178 SDVTList VTs = DAG.getVTList(VT1: DstVT, VT2: MVT::Other);
26179
26180 SmallVector<SDValue, 5> Ops;
26181 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
26182 Ops.push_back(Elt: Src->getOperand(Num: I));
26183
26184 SDValue ExtLoad = DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VTList: VTs, Ops);
26185 DCI.CombineTo(N, Res: ExtLoad);
26186 DCI.CombineTo(N: Src.getNode(), Res0: ExtLoad, Res1: ExtLoad.getValue(R: 1));
26187
26188 // Return N so it doesn't get rechecked
26189 return SDValue(N, 0);
26190}
26191
26192/// Legalize the gather prefetch (scalar + vector addressing mode) when the
26193/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
26194/// != nxv2i32) do not need legalization.
26195static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
26196 const unsigned OffsetPos = 4;
26197 SDValue Offset = N->getOperand(Num: OffsetPos);
26198
26199 // Not an unpacked vector, bail out.
26200 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
26201 return SDValue();
26202
26203 // Extend the unpacked offset vector to 64-bit lanes.
26204 SDLoc DL(N);
26205 Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset);
26206 SmallVector<SDValue, 5> Ops(N->ops());
26207 // Replace the offset operand with the 64-bit one.
26208 Ops[OffsetPos] = Offset;
26209
26210 return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops);
26211}
26212
26213/// Combines a node carrying the intrinsic
26214/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
26215/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
26216/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
/// SVE gather prefetch instruction with vector plus immediate addressing mode.
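/// In effect the "vector base + out-of-range immediate" form is re-expressed
/// as "scalar base + vector index" so that it can still be selected as a
/// gather prefetch.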
26218static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
26219 unsigned ScalarSizeInBytes) {
26220 const unsigned ImmPos = 4, OffsetPos = 3;
26221 // No need to combine the node if the immediate is valid...
26222 if (isValidImmForSVEVecImmAddrMode(Offset: N->getOperand(Num: ImmPos), ScalarSizeInBytes))
26223 return SDValue();
26224
26225 // ...otherwise swap the offset base with the offset...
26226 SmallVector<SDValue, 5> Ops(N->ops());
26227 std::swap(a&: Ops[ImmPos], b&: Ops[OffsetPos]);
26228 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
26229 // `aarch64_sve_prfb_gather_uxtw_index`.
26230 SDLoc DL(N);
26231 Ops[1] = DAG.getConstant(Val: Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
26232 VT: MVT::i64);
26233
26234 return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops);
26235}
26236
// Return true if the vector operation can guarantee that only the first lane
// of its result contains data, with all bits in the other lanes set to zero.
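// This holds for the SVE reductions below, which produce their scalar result
// in lane 0 and zero the remaining lanes of the destination register.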
26239static bool isLanes1toNKnownZero(SDValue Op) {
26240 switch (Op.getOpcode()) {
26241 default:
26242 return false;
26243 case AArch64ISD::ANDV_PRED:
26244 case AArch64ISD::EORV_PRED:
26245 case AArch64ISD::FADDA_PRED:
26246 case AArch64ISD::FADDV_PRED:
26247 case AArch64ISD::FMAXNMV_PRED:
26248 case AArch64ISD::FMAXV_PRED:
26249 case AArch64ISD::FMINNMV_PRED:
26250 case AArch64ISD::FMINV_PRED:
26251 case AArch64ISD::ORV_PRED:
26252 case AArch64ISD::SADDV_PRED:
26253 case AArch64ISD::SMAXV_PRED:
26254 case AArch64ISD::SMINV_PRED:
26255 case AArch64ISD::UADDV_PRED:
26256 case AArch64ISD::UMAXV_PRED:
26257 case AArch64ISD::UMINV_PRED:
26258 return true;
26259 }
26260}
26261
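// An illustrative instance of the pattern removed below (node names and types
// are invented for clarity):
//   t1: nxv4i32 = AArch64ISD::UADDV_PRED pg, x   // lanes 1-N known zero
//   t2: i32 = extract_vector_elt t1, Constant:i64<0>
//   t3: nxv4i32 = insert_vector_elt (zero splat), t2, Constant:i64<0>
// Here t3 merely recreates t1, so it can be replaced by t1.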
26262static SDValue removeRedundantInsertVectorElt(SDNode *N) {
26263 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
26264 SDValue InsertVec = N->getOperand(Num: 0);
26265 SDValue InsertElt = N->getOperand(Num: 1);
26266 SDValue InsertIdx = N->getOperand(Num: 2);
26267
26268 // We only care about inserts into the first element...
26269 if (!isNullConstant(V: InsertIdx))
26270 return SDValue();
26271 // ...of a zero'd vector...
26272 if (!ISD::isConstantSplatVectorAllZeros(N: InsertVec.getNode()))
26273 return SDValue();
26274 // ...where the inserted data was previously extracted...
26275 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
26276 return SDValue();
26277
26278 SDValue ExtractVec = InsertElt.getOperand(i: 0);
26279 SDValue ExtractIdx = InsertElt.getOperand(i: 1);
26280
26281 // ...from the first element of a vector.
26282 if (!isNullConstant(V: ExtractIdx))
26283 return SDValue();
26284
26285 // If we get here we are effectively trying to zero lanes 1-N of a vector.
26286
26287 // Ensure there's no type conversion going on.
26288 if (N->getValueType(ResNo: 0) != ExtractVec.getValueType())
26289 return SDValue();
26290
26291 if (!isLanes1toNKnownZero(Op: ExtractVec))
26292 return SDValue();
26293
26294 // The explicit zeroing is redundant.
26295 return ExtractVec;
26296}
26297
26298static SDValue
26299performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
26300 if (SDValue Res = removeRedundantInsertVectorElt(N))
26301 return Res;
26302
26303 return performPostLD1Combine(N, DCI, IsLaneOp: true);
26304}
26305
26306static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
26307 TargetLowering::DAGCombinerInfo &DCI,
26308 const AArch64Subtarget *Subtarget) {
26309 SDValue N0 = N->getOperand(Num: 0);
26310 EVT VT = N->getValueType(ResNo: 0);
26311
  // If our only user is an fp_round, don't fold this fp_extend; allow the
  // fp_round(fp_extend) pair to be folded instead.
26313 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
26314 return SDValue();
26315
26316 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
26317 EVT EltVT = VT.getVectorElementType();
26318 return EltVT == MVT::f32 || EltVT == MVT::f64;
26319 };
26320
26321 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
26322 // We purposefully don't care about legality of the nodes here as we know
26323 // they can be split down into something legal.
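  // For example (illustrative types only): a fixed-length v8f32 load feeding an
  // fp_extend to v8f64 becomes a single f64-element extending load, while other
  // users of the original f32 value get an fp_round of the extending load.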
26324 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N: N0.getNode()) &&
26325 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
26326 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
26327 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
26328 LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0);
26329 SDValue ExtLoad = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: SDLoc(N), VT,
26330 Chain: LN0->getChain(), Ptr: LN0->getBasePtr(),
26331 MemVT: N0.getValueType(), MMO: LN0->getMemOperand());
26332 DCI.CombineTo(N, Res: ExtLoad);
26333 DCI.CombineTo(
26334 N: N0.getNode(),
26335 Res0: DAG.getNode(Opcode: ISD::FP_ROUND, DL: SDLoc(N0), VT: N0.getValueType(), N1: ExtLoad,
26336 N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(N0), /*isTarget=*/true)),
26337 Res1: ExtLoad.getValue(R: 1));
26338 return SDValue(N, 0); // Return N so it doesn't get rechecked!
26339 }
26340
26341 return SDValue();
26342}
26343
26344static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
26345 const AArch64Subtarget *Subtarget) {
26346 EVT VT = N->getValueType(ResNo: 0);
26347
26348 // Don't expand for NEON, SVE2 or SME
26349 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
26350 return SDValue();
26351
26352 SDLoc DL(N);
26353
26354 SDValue Mask = N->getOperand(Num: 0);
26355 SDValue In1 = N->getOperand(Num: 1);
26356 SDValue In2 = N->getOperand(Num: 2);
26357
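  // Expand the bit select as (Mask & In1) | (~Mask & In2); e.g. for nxv16i8
  // operands this becomes a NOT, two ANDs and an OR (types illustrative).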
26358 SDValue InvMask = DAG.getNOT(DL, Val: Mask, VT);
26359 SDValue Sel = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Mask, N2: In1);
26360 SDValue SelInv = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: InvMask, N2: In2);
26361 return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Sel, N2: SelInv);
26362}
26363
26364static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
26365 EVT VT = N->getValueType(ResNo: 0);
26366
26367 SDValue Insert = N->getOperand(Num: 0);
26368 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
26369 return SDValue();
26370
26371 if (!Insert.getOperand(i: 0).isUndef())
26372 return SDValue();
26373
26374 uint64_t IdxInsert = Insert.getConstantOperandVal(i: 2);
26375 uint64_t IdxDupLane = N->getConstantOperandVal(Num: 1);
26376 if (IdxInsert != 0 || IdxDupLane != 0)
26377 return SDValue();
26378
26379 SDValue Bitcast = Insert.getOperand(i: 1);
26380 if (Bitcast.getOpcode() != ISD::BITCAST)
26381 return SDValue();
26382
26383 SDValue Subvec = Bitcast.getOperand(i: 0);
26384 EVT SubvecVT = Subvec.getValueType();
26385 if (!SubvecVT.is128BitVector())
26386 return SDValue();
26387 EVT NewSubvecVT =
26388 getPackedSVEVectorVT(VT: Subvec.getValueType().getVectorElementType());
26389
26390 SDLoc DL(N);
26391 SDValue NewInsert =
26392 DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewSubvecVT,
26393 N1: DAG.getUNDEF(VT: NewSubvecVT), N2: Subvec, N3: Insert->getOperand(Num: 2));
26394 SDValue NewDuplane128 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: NewSubvecVT,
26395 N1: NewInsert, N2: N->getOperand(Num: 1));
26396 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewDuplane128);
26397}
26398
26399// Try to combine mull with uzp1.
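// Roughly (an illustrative sketch): when a pair of S/UMULLs is fed by two
// separate truncates (one for the low mull, one for the high mull), both
// truncations can be done by a single UZP1 of the wide operands, with the two
// truncated halves recovered via extract_subvector.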
26400static SDValue tryCombineMULLWithUZP1(SDNode *N,
26401 TargetLowering::DAGCombinerInfo &DCI,
26402 SelectionDAG &DAG) {
26403 if (DCI.isBeforeLegalizeOps())
26404 return SDValue();
26405
26406 SDValue LHS = N->getOperand(Num: 0);
26407 SDValue RHS = N->getOperand(Num: 1);
26408
26409 SDValue ExtractHigh;
26410 SDValue ExtractLow;
26411 SDValue TruncHigh;
26412 SDValue TruncLow;
26413 SDLoc DL(N);
26414
26415 // Check the operands are trunc and extract_high.
26416 if (isEssentiallyExtractHighSubvector(N: LHS) &&
26417 RHS.getOpcode() == ISD::TRUNCATE) {
26418 TruncHigh = RHS;
26419 if (LHS.getOpcode() == ISD::BITCAST)
26420 ExtractHigh = LHS.getOperand(i: 0);
26421 else
26422 ExtractHigh = LHS;
26423 } else if (isEssentiallyExtractHighSubvector(N: RHS) &&
26424 LHS.getOpcode() == ISD::TRUNCATE) {
26425 TruncHigh = LHS;
26426 if (RHS.getOpcode() == ISD::BITCAST)
26427 ExtractHigh = RHS.getOperand(i: 0);
26428 else
26429 ExtractHigh = RHS;
26430 } else
26431 return SDValue();
26432
  // If the truncate's operand is a splat (DUP or an equivalent BUILD_VECTOR),
  // do not combine the op with uzp1; doing so regresses
  // test/CodeGen/AArch64/aarch64-smull.ll.
26436 SDValue TruncHighOp = TruncHigh.getOperand(i: 0);
26437 EVT TruncHighOpVT = TruncHighOp.getValueType();
26438 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
26439 DAG.isSplatValue(V: TruncHighOp, AllowUndefs: false))
26440 return SDValue();
26441
  // Check there is another extract_high with the same source vector.
26443 // For example,
26444 //
26445 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
26446 // t12: v4i16 = truncate t11
26447 // t31: v4i32 = AArch64ISD::SMULL t18, t12
26448 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
26449 // t16: v4i16 = truncate t15
  // t30: v4i32 = AArch64ISD::SMULL t23, t16
26451 //
  // This dagcombine assumes the two extract_high nodes use the same source
  // vector in order to detect the mull pair. If they use different source
  // vectors, this code will not work.
26455 // TODO: Should also try to look through a bitcast.
26456 bool HasFoundMULLow = true;
26457 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(i: 0);
26458 if (ExtractHighSrcVec->use_size() != 2)
26459 HasFoundMULLow = false;
26460
26461 // Find ExtractLow.
26462 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
26463 if (User == ExtractHigh.getNode())
26464 continue;
26465
26466 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
26467 !isNullConstant(V: User->getOperand(Num: 1))) {
26468 HasFoundMULLow = false;
26469 break;
26470 }
26471
26472 ExtractLow.setNode(User);
26473 }
26474
26475 if (!ExtractLow || !ExtractLow->hasOneUse())
26476 HasFoundMULLow = false;
26477
26478 // Check ExtractLow's user.
26479 if (HasFoundMULLow) {
26480 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
26481 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
26482 HasFoundMULLow = false;
26483 } else {
26484 if (ExtractLowUser->getOperand(Num: 0) == ExtractLow) {
26485 if (ExtractLowUser->getOperand(Num: 1).getOpcode() == ISD::TRUNCATE)
26486 TruncLow = ExtractLowUser->getOperand(Num: 1);
26487 else
26488 HasFoundMULLow = false;
26489 } else {
26490 if (ExtractLowUser->getOperand(Num: 0).getOpcode() == ISD::TRUNCATE)
26491 TruncLow = ExtractLowUser->getOperand(Num: 0);
26492 else
26493 HasFoundMULLow = false;
26494 }
26495 }
26496 }
26497
  // If the truncate's operand is a splat (DUP or an equivalent BUILD_VECTOR),
  // do not combine the op with uzp1; doing so regresses
  // test/CodeGen/AArch64/aarch64-smull.ll.
26501 EVT TruncHighVT = TruncHigh.getValueType();
26502 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
26503 SDValue TruncLowOp =
26504 HasFoundMULLow ? TruncLow.getOperand(i: 0) : DAG.getUNDEF(VT: UZP1VT);
26505 EVT TruncLowOpVT = TruncLowOp.getValueType();
26506 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
26507 DAG.isSplatValue(V: TruncLowOp, AllowUndefs: false)))
26508 return SDValue();
26509
26510 // Create uzp1, extract_high and extract_low.
26511 if (TruncHighOpVT != UZP1VT)
26512 TruncHighOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncHighOp);
26513 if (TruncLowOpVT != UZP1VT)
26514 TruncLowOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncLowOp);
26515
26516 SDValue UZP1 =
26517 DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UZP1VT, N1: TruncLowOp, N2: TruncHighOp);
26518 SDValue HighIdxCst =
26519 DAG.getConstant(Val: TruncHighVT.getVectorNumElements(), DL, VT: MVT::i64);
26520 SDValue NewTruncHigh =
26521 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncHighVT, N1: UZP1, N2: HighIdxCst);
26522 DAG.ReplaceAllUsesWith(From: TruncHigh, To: NewTruncHigh);
26523
26524 if (HasFoundMULLow) {
26525 EVT TruncLowVT = TruncLow.getValueType();
26526 SDValue NewTruncLow = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncLowVT,
26527 N1: UZP1, N2: ExtractLow.getOperand(i: 1));
26528 DAG.ReplaceAllUsesWith(From: TruncLow, To: NewTruncLow);
26529 }
26530
26531 return SDValue(N, 0);
26532}
26533
26534static SDValue performMULLCombine(SDNode *N,
26535 TargetLowering::DAGCombinerInfo &DCI,
26536 SelectionDAG &DAG) {
26537 if (SDValue Val =
26538 tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N, DCI, DAG))
26539 return Val;
26540
26541 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
26542 return Val;
26543
26544 return SDValue();
26545}
26546
26547static SDValue
26548performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
26549 SelectionDAG &DAG) {
  // Let's do the transform below.
26551 //
26552 // t34: v4i32 = AArch64ISD::UADDLV t2
26553 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
26554 // t7: i64 = zero_extend t35
26555 // t20: v1i64 = scalar_to_vector t7
26556 // ==>
26557 // t34: v4i32 = AArch64ISD::UADDLV t2
26558 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
26559 // t40: v1i64 = AArch64ISD::NVCAST t39
26560 if (DCI.isBeforeLegalizeOps())
26561 return SDValue();
26562
26563 EVT VT = N->getValueType(ResNo: 0);
26564 if (VT != MVT::v1i64)
26565 return SDValue();
26566
26567 SDValue ZEXT = N->getOperand(Num: 0);
26568 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
26569 return SDValue();
26570
26571 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(i: 0);
26572 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
26573 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
26574 return SDValue();
26575
26576 if (!isNullConstant(V: EXTRACT_VEC_ELT.getOperand(i: 1)))
26577 return SDValue();
26578
26579 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(i: 0);
26580 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
26581 UADDLV.getValueType() != MVT::v4i32 ||
26582 UADDLV.getOperand(i: 0).getValueType() != MVT::v8i8)
26583 return SDValue();
26584
  // Let's generate the new sequence with AArch64ISD::NVCAST.
26586 SDLoc DL(N);
26587 SDValue EXTRACT_SUBVEC =
26588 DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: UADDLV,
26589 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
26590 SDValue NVCAST =
26591 DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MVT::v1i64, Operand: EXTRACT_SUBVEC);
26592
26593 return NVCAST;
26594}
26595
/// If the shifted value is a bitwise AND with a constant RHS, the shift amount
/// is also a constant, and the shift is the AND's only user, we can pull the
/// AND out of the shift, i.e.
26598///
26599/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
26600///
26601/// We prefer this canonical form to match existing isel patterns.
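/// For example (illustrative constants): (shl (and X, 0xff), 8) becomes
/// (and (shl X, 8), 0xff00).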
26602static SDValue performSHLCombine(SDNode *N,
26603 TargetLowering::DAGCombinerInfo &DCI,
26604 SelectionDAG &DAG) {
26605 if (DCI.isBeforeLegalizeOps())
26606 return SDValue();
26607
26608 SDValue Op0 = N->getOperand(Num: 0);
26609 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
26610 return SDValue();
26611
26612 SDValue C1 = Op0->getOperand(Num: 1);
26613 SDValue C2 = N->getOperand(Num: 1);
26614 if (!isa<ConstantSDNode>(Val: C1) || !isa<ConstantSDNode>(Val: C2))
26615 return SDValue();
26616
  // The shift might be folded into the user as a shifted operand, so do not
  // lower it here.
26618 if (N->hasOneUse()) {
26619 unsigned UseOpc = N->user_begin()->getOpcode();
26620 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
26621 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
26622 return SDValue();
26623 }
26624
26625 SDLoc DL(N);
26626 EVT VT = N->getValueType(ResNo: 0);
26627
26628 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
26629 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
  // causing an infinite loop. The result may also be worse.
26631 SDValue NewRHS = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: C1, N2: C2);
26632 if (!isa<ConstantSDNode>(Val: NewRHS))
26633 return SDValue();
26634
26635 SDValue X = Op0->getOperand(Num: 0);
26636 SDValue NewShift = DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: X, N2: C2);
26637 return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: NewShift, N2: NewRHS);
26638}
26639
26640SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
26641 DAGCombinerInfo &DCI) const {
26642 SelectionDAG &DAG = DCI.DAG;
26643 switch (N->getOpcode()) {
26644 default:
26645 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
26646 break;
26647 case ISD::VECREDUCE_AND:
26648 case ISD::VECREDUCE_OR:
26649 case ISD::VECREDUCE_XOR:
26650 return performVecReduceBitwiseCombine(N, DCI, DAG);
26651 case ISD::ADD:
26652 case ISD::SUB:
26653 return performAddSubCombine(N, DCI);
26654 case ISD::BUILD_VECTOR:
26655 return performBuildVectorCombine(N, DCI, DAG);
26656 case ISD::TRUNCATE:
26657 return performTruncateCombine(N, DAG, DCI);
26658 case AArch64ISD::ANDS:
26659 return performFlagSettingCombine(N, DCI, GenericOpcode: ISD::AND);
26660 case AArch64ISD::ADC:
26661 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
26662 return R;
26663 return foldADCToCINC(N, DAG);
26664 case AArch64ISD::SBC:
26665 return foldOverflowCheck(Op: N, DAG, /* IsAdd */ false);
26666 case AArch64ISD::ADCS:
26667 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true))
26668 return R;
26669 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::ADC);
26670 case AArch64ISD::SBCS:
26671 if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ false))
26672 return R;
26673 return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::SBC);
26674 case AArch64ISD::BICi: {
26675 APInt DemandedBits =
26676 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getScalarSizeInBits());
26677 APInt DemandedElts =
26678 APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getVectorNumElements());
26679
26680 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
26681 Op: SDValue(N, 0), DemandedBits, DemandedElts, DCI))
26682 return SDValue();
26683
26684 break;
26685 }
26686 case ISD::XOR:
26687 return performXorCombine(N, DAG, DCI, Subtarget);
26688 case ISD::MUL:
26689 return performMulCombine(N, DAG, DCI, Subtarget);
26690 case ISD::SINT_TO_FP:
26691 case ISD::UINT_TO_FP:
26692 return performIntToFpCombine(N, DAG, DCI, Subtarget);
26693 case ISD::FP_TO_SINT:
26694 case ISD::FP_TO_UINT:
26695 case ISD::FP_TO_SINT_SAT:
26696 case ISD::FP_TO_UINT_SAT:
26697 return performFpToIntCombine(N, DAG, DCI, Subtarget);
26698 case ISD::OR:
26699 return performORCombine(N, DCI, Subtarget, TLI: *this);
26700 case ISD::AND:
26701 return performANDCombine(N, DCI);
26702 case ISD::FADD:
26703 return performFADDCombine(N, DCI);
26704 case ISD::INTRINSIC_WO_CHAIN:
26705 return performIntrinsicCombine(N, DCI, Subtarget);
26706 case ISD::ANY_EXTEND:
26707 case ISD::ZERO_EXTEND:
26708 case ISD::SIGN_EXTEND:
26709 return performExtendCombine(N, DCI, DAG);
26710 case ISD::SIGN_EXTEND_INREG:
26711 return performSignExtendInRegCombine(N, DCI, DAG);
26712 case ISD::CONCAT_VECTORS:
26713 return performConcatVectorsCombine(N, DCI, DAG);
26714 case ISD::EXTRACT_SUBVECTOR:
26715 return performExtractSubvectorCombine(N, DCI, DAG);
26716 case ISD::INSERT_SUBVECTOR:
26717 return performInsertSubvectorCombine(N, DCI, DAG);
26718 case ISD::SELECT:
26719 return performSelectCombine(N, DCI);
26720 case ISD::VSELECT:
26721 return performVSelectCombine(N, DAG&: DCI.DAG);
26722 case ISD::SETCC:
26723 return performSETCCCombine(N, DCI, DAG);
26724 case ISD::LOAD:
26725 return performLOADCombine(N, DCI, DAG, Subtarget);
26726 case ISD::STORE:
26727 return performSTORECombine(N, DCI, DAG, Subtarget);
26728 case ISD::MSTORE:
26729 return performMSTORECombine(N, DCI, DAG, Subtarget);
26730 case ISD::MGATHER:
26731 case ISD::MSCATTER:
26732 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
26733 return performMaskedGatherScatterCombine(N, DCI, DAG);
26734 case ISD::FP_EXTEND:
26735 return performFPExtendCombine(N, DAG, DCI, Subtarget);
26736 case AArch64ISD::BRCOND:
26737 return performBRCONDCombine(N, DCI, DAG);
26738 case AArch64ISD::TBNZ:
26739 case AArch64ISD::TBZ:
26740 return performTBZCombine(N, DCI, DAG);
26741 case AArch64ISD::CSEL:
26742 return performCSELCombine(N, DCI, DAG);
26743 case AArch64ISD::DUP:
26744 case AArch64ISD::DUPLANE8:
26745 case AArch64ISD::DUPLANE16:
26746 case AArch64ISD::DUPLANE32:
26747 case AArch64ISD::DUPLANE64:
26748 return performDUPCombine(N, DCI);
26749 case AArch64ISD::DUPLANE128:
26750 return performDupLane128Combine(N, DAG);
26751 case AArch64ISD::NVCAST:
26752 return performNVCASTCombine(N, DAG);
26753 case AArch64ISD::SPLICE:
26754 return performSpliceCombine(N, DAG);
26755 case AArch64ISD::UUNPKLO:
26756 case AArch64ISD::UUNPKHI:
26757 return performUnpackCombine(N, DAG, Subtarget);
26758 case AArch64ISD::UZP1:
26759 case AArch64ISD::UZP2:
26760 return performUzpCombine(N, DAG, Subtarget);
26761 case AArch64ISD::SETCC_MERGE_ZERO:
26762 return performSetccMergeZeroCombine(N, DCI);
26763 case AArch64ISD::REINTERPRET_CAST:
26764 return performReinterpretCastCombine(N);
26765 case AArch64ISD::GLD1_MERGE_ZERO:
26766 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
26767 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
26768 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
26769 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
26770 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
26771 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
26772 case AArch64ISD::GLD1S_MERGE_ZERO:
26773 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
26774 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
26775 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
26776 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
26777 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
26778 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
26779 return performGLD1Combine(N, DAG);
26780 case AArch64ISD::VASHR:
26781 case AArch64ISD::VLSHR:
26782 return performVectorShiftCombine(N, TLI: *this, DCI);
26783 case AArch64ISD::SUNPKLO:
26784 return performSunpkloCombine(N, DAG);
26785 case AArch64ISD::BSP:
26786 return performBSPExpandForSVE(N, DAG, Subtarget);
26787 case ISD::INSERT_VECTOR_ELT:
26788 return performInsertVectorEltCombine(N, DCI);
26789 case ISD::EXTRACT_VECTOR_ELT:
26790 return performExtractVectorEltCombine(N, DCI, Subtarget);
26791 case ISD::VECREDUCE_ADD:
26792 return performVecReduceAddCombine(N, DAG&: DCI.DAG, ST: Subtarget);
26793 case ISD::GET_ACTIVE_LANE_MASK:
26794 return performActiveLaneMaskCombine(N, DCI, ST: Subtarget);
26795 case AArch64ISD::UADDV:
26796 return performUADDVCombine(N, DAG);
26797 case AArch64ISD::SMULL:
26798 case AArch64ISD::UMULL:
26799 case AArch64ISD::PMULL:
26800 return performMULLCombine(N, DCI, DAG);
26801 case ISD::INTRINSIC_VOID:
26802 case ISD::INTRINSIC_W_CHAIN:
26803 switch (N->getConstantOperandVal(Num: 1)) {
26804 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
26805 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 1 /*=ScalarSizeInBytes*/);
26806 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
26807 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 2 /*=ScalarSizeInBytes*/);
26808 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
26809 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 4 /*=ScalarSizeInBytes*/);
26810 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
26811 return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 8 /*=ScalarSizeInBytes*/);
26812 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
26813 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
26814 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
26815 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
26816 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
26817 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
26818 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
26819 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
26820 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
26821 case Intrinsic::aarch64_neon_ld2:
26822 case Intrinsic::aarch64_neon_ld3:
26823 case Intrinsic::aarch64_neon_ld4:
26824 case Intrinsic::aarch64_neon_ld1x2:
26825 case Intrinsic::aarch64_neon_ld1x3:
26826 case Intrinsic::aarch64_neon_ld1x4:
26827 case Intrinsic::aarch64_neon_ld2lane:
26828 case Intrinsic::aarch64_neon_ld3lane:
26829 case Intrinsic::aarch64_neon_ld4lane:
26830 case Intrinsic::aarch64_neon_ld2r:
26831 case Intrinsic::aarch64_neon_ld3r:
26832 case Intrinsic::aarch64_neon_ld4r:
26833 case Intrinsic::aarch64_neon_st2:
26834 case Intrinsic::aarch64_neon_st3:
26835 case Intrinsic::aarch64_neon_st4:
26836 case Intrinsic::aarch64_neon_st1x2:
26837 case Intrinsic::aarch64_neon_st1x3:
26838 case Intrinsic::aarch64_neon_st1x4:
26839 case Intrinsic::aarch64_neon_st2lane:
26840 case Intrinsic::aarch64_neon_st3lane:
26841 case Intrinsic::aarch64_neon_st4lane:
26842 return performNEONPostLDSTCombine(N, DCI, DAG);
26843 case Intrinsic::aarch64_sve_ldnt1:
26844 return performLDNT1Combine(N, DAG);
26845 case Intrinsic::aarch64_sve_ld1rq:
26846 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
26847 case Intrinsic::aarch64_sve_ld1ro:
26848 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
26849 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
26850 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
26851 case Intrinsic::aarch64_sve_ldnt1_gather:
26852 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
26853 case Intrinsic::aarch64_sve_ldnt1_gather_index:
26854 return performGatherLoadCombine(N, DAG,
26855 Opcode: AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
26856 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
26857 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO);
26858 case Intrinsic::aarch64_sve_ld1:
26859 return performLD1Combine(N, DAG, Opc: AArch64ISD::LD1_MERGE_ZERO);
26860 case Intrinsic::aarch64_sve_ldnf1:
26861 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDNF1_MERGE_ZERO);
26862 case Intrinsic::aarch64_sve_ldff1:
26863 return performLD1Combine(N, DAG, Opc: AArch64ISD::LDFF1_MERGE_ZERO);
26864 case Intrinsic::aarch64_sve_st1:
26865 return performST1Combine(N, DAG);
26866 case Intrinsic::aarch64_sve_stnt1:
26867 return performSTNT1Combine(N, DAG);
26868 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
26869 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
26870 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
26871 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
26872 case Intrinsic::aarch64_sve_stnt1_scatter:
26873 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED);
26874 case Intrinsic::aarch64_sve_stnt1_scatter_index:
26875 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_INDEX_PRED);
26876 case Intrinsic::aarch64_sve_ld1_gather:
26877 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_MERGE_ZERO);
26878 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
26879 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
26880 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1Q_MERGE_ZERO);
26881 case Intrinsic::aarch64_sve_ld1q_gather_index:
26882 return performGatherLoadCombine(N, DAG,
26883 Opcode: AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
26884 case Intrinsic::aarch64_sve_ld1_gather_index:
26885 return performGatherLoadCombine(N, DAG,
26886 Opcode: AArch64ISD::GLD1_SCALED_MERGE_ZERO);
26887 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
26888 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_SXTW_MERGE_ZERO,
26889 /*OnlyPackedOffsets=*/false);
26890 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
26891 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_UXTW_MERGE_ZERO,
26892 /*OnlyPackedOffsets=*/false);
26893 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
26894 return performGatherLoadCombine(N, DAG,
26895 Opcode: AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
26896 /*OnlyPackedOffsets=*/false);
26897 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
26898 return performGatherLoadCombine(N, DAG,
26899 Opcode: AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
26900 /*OnlyPackedOffsets=*/false);
26901 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
26902 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_IMM_MERGE_ZERO);
26903 case Intrinsic::aarch64_sve_ldff1_gather:
26904 return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDFF1_MERGE_ZERO);
26905 case Intrinsic::aarch64_sve_ldff1_gather_index:
26906 return performGatherLoadCombine(N, DAG,
26907 Opcode: AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
26908 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
26909 return performGatherLoadCombine(N, DAG,
26910 Opcode: AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
26911 /*OnlyPackedOffsets=*/false);
26912 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
26913 return performGatherLoadCombine(N, DAG,
26914 Opcode: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
26915 /*OnlyPackedOffsets=*/false);
26916 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
26917 return performGatherLoadCombine(N, DAG,
26918 Opcode: AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
26919 /*OnlyPackedOffsets=*/false);
26920 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
26921 return performGatherLoadCombine(N, DAG,
26922 Opcode: AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
26923 /*OnlyPackedOffsets=*/false);
26924 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
26925 return performGatherLoadCombine(N, DAG,
26926 Opcode: AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
26927 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
26928 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
26929 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_PRED);
26930 case Intrinsic::aarch64_sve_st1q_scatter_index:
26931 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_INDEX_PRED);
26932 case Intrinsic::aarch64_sve_st1_scatter:
26933 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_PRED);
26934 case Intrinsic::aarch64_sve_st1_scatter_index:
26935 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SCALED_PRED);
26936 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
26937 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SXTW_PRED,
26938 /*OnlyPackedOffsets=*/false);
26939 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
26940 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_UXTW_PRED,
26941 /*OnlyPackedOffsets=*/false);
26942 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
26943 return performScatterStoreCombine(N, DAG,
26944 Opcode: AArch64ISD::SST1_SXTW_SCALED_PRED,
26945 /*OnlyPackedOffsets=*/false);
26946 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
26947 return performScatterStoreCombine(N, DAG,
26948 Opcode: AArch64ISD::SST1_UXTW_SCALED_PRED,
26949 /*OnlyPackedOffsets=*/false);
26950 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
26951 return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_IMM_PRED);
26952 case Intrinsic::aarch64_rndr:
26953 case Intrinsic::aarch64_rndrrs: {
26954 unsigned IntrinsicID = N->getConstantOperandVal(Num: 1);
26955 auto Register =
26956 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
26957 : AArch64SysReg::RNDRRS);
26958 SDLoc DL(N);
26959 SDValue A = DAG.getNode(
26960 Opcode: AArch64ISD::MRS, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32, VT3: MVT::Other),
26961 N1: N->getOperand(Num: 0), N2: DAG.getConstant(Val: Register, DL, VT: MVT::i32));
26962 SDValue B = DAG.getNode(
26963 Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
26964 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32),
26965 N3: DAG.getConstant(Val: AArch64CC::NE, DL, VT: MVT::i32), N4: A.getValue(R: 1));
26966 return DAG.getMergeValues(
26967 Ops: {A, DAG.getZExtOrTrunc(Op: B, DL, VT: MVT::i1), A.getValue(R: 2)}, dl: DL);
26968 }
26969 case Intrinsic::aarch64_sme_ldr_zt:
26970 return DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL: SDLoc(N),
26971 VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0),
26972 N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
26973 case Intrinsic::aarch64_sme_str_zt:
26974 return DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL: SDLoc(N),
26975 VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0),
26976 N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3));
26977 default:
26978 break;
26979 }
26980 break;
26981 case ISD::GlobalAddress:
26982 return performGlobalAddressCombine(N, DAG, Subtarget, TM: getTargetMachine());
26983 case ISD::CTLZ:
26984 return performCTLZCombine(N, DAG, Subtarget);
26985 case ISD::SCALAR_TO_VECTOR:
26986 return performScalarToVectorCombine(N, DCI, DAG);
26987 case ISD::SHL:
26988 return performSHLCombine(N, DCI, DAG);
26989 }
26990 return SDValue();
26991}
26992
// Check if the return value is used only as a return value, as otherwise
26994// we can't perform a tail-call. In particular, we need to check for
26995// target ISD nodes that are returns and any other "odd" constructs
26996// that the generic analysis code won't necessarily catch.
26997bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
26998 SDValue &Chain) const {
26999 if (N->getNumValues() != 1)
27000 return false;
27001 if (!N->hasNUsesOfValue(NUses: 1, Value: 0))
27002 return false;
27003
27004 SDValue TCChain = Chain;
27005 SDNode *Copy = *N->user_begin();
27006 if (Copy->getOpcode() == ISD::CopyToReg) {
27007 // If the copy has a glue operand, we conservatively assume it isn't safe to
27008 // perform a tail call.
27009 if (Copy->getOperand(Num: Copy->getNumOperands() - 1).getValueType() ==
27010 MVT::Glue)
27011 return false;
27012 TCChain = Copy->getOperand(Num: 0);
27013 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
27014 return false;
27015
27016 bool HasRet = false;
27017 for (SDNode *Node : Copy->users()) {
27018 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
27019 return false;
27020 HasRet = true;
27021 }
27022
27023 if (!HasRet)
27024 return false;
27025
27026 Chain = TCChain;
27027 return true;
27028}
27029
// Return whether an instruction can potentially be optimized to a tail
27031// call. This will cause the optimizers to attempt to move, or duplicate,
27032// return instructions to help enable tail call optimizations for this
27033// instruction.
27034bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
27035 return CI->isTailCall();
27036}
27037
27038bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
27039 Register Offset, bool IsPre,
27040 MachineRegisterInfo &MRI) const {
27041 auto CstOffset = getIConstantVRegVal(VReg: Offset, MRI);
27042 if (!CstOffset || CstOffset->isZero())
27043 return false;
27044
27045 // All of the indexed addressing mode instructions take a signed 9 bit
27046 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
27047 // encodes the sign/indexing direction.
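  // For example, offsets in [-256, 255] are accepted, whereas 256 or -257 are
  // rejected (illustrative values).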
27048 return isInt<9>(x: CstOffset->getSExtValue());
27049}
27050
27051bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
27052 SDValue &Base,
27053 SDValue &Offset,
27054 SelectionDAG &DAG) const {
27055 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
27056 return false;
27057
27058 // Non-null if there is exactly one user of the loaded value (ignoring chain).
27059 SDNode *ValOnlyUser = nullptr;
27060 for (SDUse &U : N->uses()) {
27061 if (U.getResNo() == 1)
27062 continue; // Ignore chain.
27063 if (ValOnlyUser == nullptr)
27064 ValOnlyUser = U.getUser();
27065 else {
27066 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
27067 break;
27068 }
27069 }
27070
27071 auto IsUndefOrZero = [](SDValue V) {
27072 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
27073 };
27074
27075 // If the only user of the value is a scalable vector splat, it is
27076 // preferable to do a replicating load (ld1r*).
27077 if (ValOnlyUser && ValOnlyUser->getValueType(ResNo: 0).isScalableVector() &&
27078 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
27079 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
27080 IsUndefOrZero(ValOnlyUser->getOperand(Num: 2)))))
27081 return false;
27082
27083 Base = Op->getOperand(Num: 0);
27084 // All of the indexed addressing mode instructions take a signed
27085 // 9 bit immediate offset.
27086 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1))) {
27087 int64_t RHSC = RHS->getSExtValue();
27088 if (Op->getOpcode() == ISD::SUB)
27089 RHSC = -(uint64_t)RHSC;
27090 if (!isInt<9>(x: RHSC))
27091 return false;
    // When big-endian VLD1/VST1 are used for a vector load or store, they only
    // allow an offset that's equal to the store size.
27094 EVT MemType = cast<MemSDNode>(Val: N)->getMemoryVT();
27095 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
27096 (uint64_t)RHSC != MemType.getStoreSize())
27097 return false;
27098 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
27099 // when dealing with subtraction.
27100 Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(N), VT: RHS->getValueType(ResNo: 0));
27101 return true;
27102 }
27103 return false;
27104}
27105
27106bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
27107 SDValue &Offset,
27108 ISD::MemIndexedMode &AM,
27109 SelectionDAG &DAG) const {
27110 EVT VT;
27111 SDValue Ptr;
27112 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
27113 VT = LD->getMemoryVT();
27114 Ptr = LD->getBasePtr();
27115 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
27116 VT = ST->getMemoryVT();
27117 Ptr = ST->getBasePtr();
27118 } else
27119 return false;
27120
27121 if (!getIndexedAddressParts(N, Op: Ptr.getNode(), Base, Offset, DAG))
27122 return false;
27123 AM = ISD::PRE_INC;
27124 return true;
27125}
27126
27127bool AArch64TargetLowering::getPostIndexedAddressParts(
27128 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
27129 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
27130 EVT VT;
27131 SDValue Ptr;
27132 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) {
27133 VT = LD->getMemoryVT();
27134 Ptr = LD->getBasePtr();
27135 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) {
27136 VT = ST->getMemoryVT();
27137 Ptr = ST->getBasePtr();
27138 } else
27139 return false;
27140
27141 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
27142 return false;
27143 // Post-indexing updates the base, so it's not a valid transform
27144 // if that's not the same as the load's pointer.
27145 if (Ptr != Base)
27146 return false;
27147 AM = ISD::POST_INC;
27148 return true;
27149}
27150
27151static void replaceBoolVectorBitcast(SDNode *N,
27152 SmallVectorImpl<SDValue> &Results,
27153 SelectionDAG &DAG) {
27154 SDLoc DL(N);
27155 SDValue Op = N->getOperand(Num: 0);
27156 EVT VT = N->getValueType(ResNo: 0);
27157 [[maybe_unused]] EVT SrcVT = Op.getValueType();
27158 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27159 "Must be bool vector.");
27160
27161 // Special handling for Clang's __builtin_convertvector. For vectors with <8
27162 // elements, it adds a vector concatenation with undef(s). If we encounter
27163 // this here, we can skip the concat.
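  // For example (illustrative): (concat_vectors x:v4i1, undef:v4i1) is treated
  // as just x when building the scalar bitmask.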
27164 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(i: 0).isUndef()) {
27165 bool AllUndef = true;
27166 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
27167 AllUndef &= Op.getOperand(i: I).isUndef();
27168
27169 if (AllUndef)
27170 Op = Op.getOperand(i: 0);
27171 }
27172
27173 SDValue VectorBits = vectorToScalarBitmask(N: Op.getNode(), DAG);
27174 if (VectorBits)
27175 Results.push_back(Elt: DAG.getZExtOrTrunc(Op: VectorBits, DL, VT));
27176}
27177
27178static void CustomNonLegalBITCASTResults(SDNode *N,
27179 SmallVectorImpl<SDValue> &Results,
27180 SelectionDAG &DAG, EVT ExtendVT,
27181 EVT CastVT) {
27182 SDLoc DL(N);
27183 SDValue Op = N->getOperand(Num: 0);
27184 EVT VT = N->getValueType(ResNo: 0);
27185
27186 // Use SCALAR_TO_VECTOR for lane zero
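  // For example, for the i32 -> v2i16 case handled by the caller (a sketch):
  //   v2i32 = scalar_to_vector i32 %src
  //   v4i16 = bitcast v2i32
  //   v2i16 = extract_subvector v4i16, Constant:i64<0>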
27187 SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: ExtendVT, Operand: Op);
27188 SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CastVT, Operand: Vec);
27189 SDValue IdxZero = DAG.getVectorIdxConstant(Val: 0, DL);
27190 Results.push_back(
27191 Elt: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: CastVal, N2: IdxZero));
27192}
27193
27194void AArch64TargetLowering::ReplaceBITCASTResults(
27195 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27196 SDLoc DL(N);
27197 SDValue Op = N->getOperand(Num: 0);
27198 EVT VT = N->getValueType(ResNo: 0);
27199 EVT SrcVT = Op.getValueType();
27200
27201 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
27202 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v4i16);
27203 return;
27204 }
27205
27206 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
27207 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v8i8);
27208 return;
27209 }
27210
27211 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
27212 CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v4i16, CastVT: MVT::v8i8);
27213 return;
27214 }
27215
27216 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(VT: SrcVT)) {
27217 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
27218 "Expected fp->int bitcast!");
27219
27220 // Bitcasting between unpacked vector types of different element counts is
27221 // not a NOP because the live elements are laid out differently.
27222 // 01234567
27223 // e.g. nxv2i32 = XX??XX??
27224 // nxv4f16 = X?X?X?X?
27225 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
27226 return;
27227
27228 SDValue CastResult = getSVESafeBitCast(VT: getSVEContainerType(ContentTy: VT), Op, DAG);
27229 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: CastResult));
27230 return;
27231 }
27232
27233 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
27234 !VT.isVector())
27235 return replaceBoolVectorBitcast(N, Results, DAG);
27236
27237 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
27238 return;
27239
27240 Op = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32,
27241 Operand: DAG.getUNDEF(VT: MVT::i32), Subreg: Op);
27242 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Op);
27243 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Op));
27244}
27245
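// An illustrative v8f32 instance of the pattern handled below (FP cases also
// require the reassoc flag):
//   t1 = vector_shuffle t0, undef, <1,0,3,2,5,4,7,6>
//   t2 = fadd t0, t1
// Every lane of t2 holds the sum of an adjacent pair of t0's elements, so t2
// can be rebuilt from ADDP on the two halves of t0 followed by a shuffle that
// duplicates each pairwise sum into both lanes of its pair.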
27246static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
27247 SelectionDAG &DAG,
27248 const AArch64Subtarget *Subtarget) {
27249 EVT VT = N->getValueType(ResNo: 0);
27250 if (!VT.is256BitVector() ||
27251 (VT.getScalarType().isFloatingPoint() &&
27252 !N->getFlags().hasAllowReassociation()) ||
27253 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
27254 VT.getScalarType() == MVT::bf16)
27255 return;
27256
27257 SDValue X = N->getOperand(Num: 0);
27258 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1));
27259 if (!Shuf) {
27260 Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0));
27261 X = N->getOperand(Num: 1);
27262 if (!Shuf)
27263 return;
27264 }
27265
27266 if (Shuf->getOperand(Num: 0) != X || !Shuf->getOperand(Num: 1)->isUndef())
27267 return;
27268
27269 // Check the mask is 1,0,3,2,5,4,...
27270 ArrayRef<int> Mask = Shuf->getMask();
27271 for (int I = 0, E = Mask.size(); I < E; I++)
27272 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
27273 return;
27274
27275 SDLoc DL(N);
27276 auto LoHi = DAG.SplitVector(N: X, DL);
27277 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
27278 SDValue Addp = DAG.getNode(Opcode: AArch64ISD::ADDP, DL: N, VT: LoHi.first.getValueType(),
27279 N1: LoHi.first, N2: LoHi.second);
27280
27281 // Shuffle the elements back into order.
27282 SmallVector<int> NMask;
27283 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
27284 NMask.push_back(Elt: I);
27285 NMask.push_back(Elt: I);
27286 }
27287 Results.push_back(
27288 Elt: DAG.getVectorShuffle(VT, dl: DL,
27289 N1: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Addp,
27290 N2: DAG.getUNDEF(VT: LoHi.first.getValueType())),
27291 N2: DAG.getUNDEF(VT), Mask: NMask));
27292}
27293
27294static void ReplaceReductionResults(SDNode *N,
27295 SmallVectorImpl<SDValue> &Results,
27296 SelectionDAG &DAG, unsigned InterOp,
27297 unsigned AcrossOp) {
27298 EVT LoVT, HiVT;
27299 SDValue Lo, Hi;
27300 SDLoc DL(N);
27301 std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: N->getValueType(ResNo: 0));
27302 std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0);
27303 SDValue InterVal = DAG.getNode(Opcode: InterOp, DL, VT: LoVT, N1: Lo, N2: Hi);
27304 SDValue SplitVal = DAG.getNode(Opcode: AcrossOp, DL, VT: LoVT, Operand: InterVal);
27305 Results.push_back(Elt: SplitVal);
27306}
27307
27308void AArch64TargetLowering::ReplaceExtractSubVectorResults(
27309 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27310 SDValue In = N->getOperand(Num: 0);
27311 EVT InVT = In.getValueType();
27312
27313 // Common code will handle these just fine.
27314 if (!InVT.isScalableVector() || !InVT.isInteger())
27315 return;
27316
27317 SDLoc DL(N);
27318 EVT VT = N->getValueType(ResNo: 0);
27319
27320 // The following checks bail if this is not a halving operation.
27321
27322 ElementCount ResEC = VT.getVectorElementCount();
27323
27324 if (InVT.getVectorElementCount() != (ResEC * 2))
27325 return;
27326
27327 auto *CIndex = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1));
27328 if (!CIndex)
27329 return;
27330
27331 unsigned Index = CIndex->getZExtValue();
27332 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
27333 return;
27334
27335 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
27336 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
27337
27338 SDValue Half = DAG.getNode(Opcode, DL, VT: ExtendedHalfVT, Operand: N->getOperand(Num: 0));
27339 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Half));
27340}
27341
27342void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
27343 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27344 assert((Subtarget->hasSVE2p1() ||
27345 (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
27346 "Custom lower of get.active.lane.mask missing required feature.");
27347
27348 assert(N->getValueType(0) == MVT::nxv32i1 &&
27349 "Unexpected result type for get.active.lane.mask");
27350
27351 SDLoc DL(N);
27352 SDValue Idx = N->getOperand(Num: 0);
27353 SDValue TC = N->getOperand(Num: 1);
27354
27355 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
27356 "Unexpected operand type for get.active.lane.mask");
27357
27358 if (Idx.getValueType() != MVT::i64) {
27359 Idx = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Idx);
27360 TC = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: TC);
27361 }
27362
27363 SDValue ID =
27364 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo_x2, DL, VT: MVT::i64);
27365 EVT HalfVT = N->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext());
27366 auto WideMask =
27367 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, ResultTys: {HalfVT, HalfVT}, Ops: {ID, Idx, TC});
27368
27369 Results.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: N->getValueType(ResNo: 0),
27370 Ops: {WideMask.getValue(R: 0), WideMask.getValue(R: 1)}));
27371}
27372
27373// Create an even/odd pair of X registers holding integer value V.
27374static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
27375 SDLoc DL(V.getNode());
27376 auto [VLo, VHi] = DAG.SplitScalar(N: V, DL, LoVT: MVT::i64, HiVT: MVT::i64);
27377 if (DAG.getDataLayout().isBigEndian())
    std::swap(a&: VLo, b&: VHi);
27379 SDValue RegClass =
27380 DAG.getTargetConstant(Val: AArch64::XSeqPairsClassRegClassID, DL, VT: MVT::i32);
27381 SDValue SubReg0 = DAG.getTargetConstant(Val: AArch64::sube64, DL, VT: MVT::i32);
27382 SDValue SubReg1 = DAG.getTargetConstant(Val: AArch64::subo64, DL, VT: MVT::i32);
27383 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
27384 return SDValue(
27385 DAG.getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl: DL, VT: MVT::Untyped, Ops), 0);
27386}
27387
27388static void ReplaceCMP_SWAP_128Results(SDNode *N,
27389 SmallVectorImpl<SDValue> &Results,
27390 SelectionDAG &DAG,
27391 const AArch64Subtarget *Subtarget) {
27392 assert(N->getValueType(0) == MVT::i128 &&
27393 "AtomicCmpSwap on types less than 128 should be legal");
27394
27395 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
27396 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
27397 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
27398 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
27399 SDValue Ops[] = {
27400 createGPRPairNode(DAG, V: N->getOperand(Num: 2)), // Compare value
27401 createGPRPairNode(DAG, V: N->getOperand(Num: 3)), // Store value
27402 N->getOperand(Num: 1), // Ptr
27403 N->getOperand(Num: 0), // Chain in
27404 };
27405
27406 unsigned Opcode;
27407 switch (MemOp->getMergedOrdering()) {
27408 case AtomicOrdering::Monotonic:
27409 Opcode = AArch64::CASPX;
27410 break;
27411 case AtomicOrdering::Acquire:
27412 Opcode = AArch64::CASPAX;
27413 break;
27414 case AtomicOrdering::Release:
27415 Opcode = AArch64::CASPLX;
27416 break;
27417 case AtomicOrdering::AcquireRelease:
27418 case AtomicOrdering::SequentiallyConsistent:
27419 Opcode = AArch64::CASPALX;
27420 break;
27421 default:
27422 llvm_unreachable("Unexpected ordering!");
27423 }
27424
27425 MachineSDNode *CmpSwap = DAG.getMachineNode(
27426 Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::Untyped, VT2: MVT::Other), Ops);
27427 DAG.setNodeMemRefs(N: CmpSwap, NewMemRefs: {MemOp});
27428
27429 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
27430 if (DAG.getDataLayout().isBigEndian())
27431 std::swap(a&: SubReg1, b&: SubReg2);
27432 SDValue Lo = DAG.getTargetExtractSubreg(SRIdx: SubReg1, DL: SDLoc(N), VT: MVT::i64,
27433 Operand: SDValue(CmpSwap, 0));
27434 SDValue Hi = DAG.getTargetExtractSubreg(SRIdx: SubReg2, DL: SDLoc(N), VT: MVT::i64,
27435 Operand: SDValue(CmpSwap, 0));
27436 Results.push_back(
27437 Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi));
27438 Results.push_back(Elt: SDValue(CmpSwap, 1)); // Chain out
27439 return;
27440 }
27441
27442 unsigned Opcode;
27443 switch (MemOp->getMergedOrdering()) {
27444 case AtomicOrdering::Monotonic:
27445 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
27446 break;
27447 case AtomicOrdering::Acquire:
27448 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
27449 break;
27450 case AtomicOrdering::Release:
27451 Opcode = AArch64::CMP_SWAP_128_RELEASE;
27452 break;
27453 case AtomicOrdering::AcquireRelease:
27454 case AtomicOrdering::SequentiallyConsistent:
27455 Opcode = AArch64::CMP_SWAP_128;
27456 break;
27457 default:
27458 llvm_unreachable("Unexpected ordering!");
27459 }
27460
27461 SDLoc DL(N);
27462 auto Desired = DAG.SplitScalar(N: N->getOperand(Num: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64);
27463 auto New = DAG.SplitScalar(N: N->getOperand(Num: 3), DL, LoVT: MVT::i64, HiVT: MVT::i64);
27464 SDValue Ops[] = {N->getOperand(Num: 1), Desired.first, Desired.second,
27465 New.first, New.second, N->getOperand(Num: 0)};
27466 SDNode *CmpSwap = DAG.getMachineNode(
27467 Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::i32, VT4: MVT::Other),
27468 Ops);
27469 DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp});
27470
27471 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128,
27472 N1: SDValue(CmpSwap, 0), N2: SDValue(CmpSwap, 1)));
27473 Results.push_back(Elt: SDValue(CmpSwap, 3));
27474}
27475
27476static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
27477 AtomicOrdering Ordering) {
27478 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
27479 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
27480 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
27481 // ATOMIC_LOAD_CLR at any point.
27482 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
27483 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
27484 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
27485 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
27486
27487 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27488 // The operand will need to be XORed in a separate step.
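    // (LDCLRP atomically clears the bits that are set in the operand, so
    // "x & v" is implemented as LDCLRP with ~v; the inversion is done by the
    // caller, see ReplaceATOMIC_LOAD_128Results.)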
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::LDCLRP;
    case AtomicOrdering::Acquire:
      return AArch64::LDCLRPA;
    case AtomicOrdering::Release:
      return AArch64::LDCLRPL;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::LDCLRPAL;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
27506 }
27507
27508 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::LDSETP;
    case AtomicOrdering::Acquire:
      return AArch64::LDSETPA;
    case AtomicOrdering::Release:
      return AArch64::LDSETPL;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::LDSETPAL;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
27526 }
27527
27528 if (ISDOpcode == ISD::ATOMIC_SWAP) {
    switch (Ordering) {
    case AtomicOrdering::Monotonic:
      return AArch64::SWPP;
    case AtomicOrdering::Acquire:
      return AArch64::SWPPA;
    case AtomicOrdering::Release:
      return AArch64::SWPPL;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      return AArch64::SWPPAL;
    default:
      llvm_unreachable("Unexpected ordering!");
    }
27546 }
27547
27548 llvm_unreachable("Unexpected ISDOpcode!");
27549}
27550
27551static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
27552 SmallVectorImpl<SDValue> &Results,
27553 SelectionDAG &DAG,
27554 const AArch64Subtarget *Subtarget) {
  // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
27556 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
27557 // rather than the CASP instructions, because CASP has register classes for
27558 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
27559 // to present them as single operands. LSE128 instructions use the GPR64
27560 // register class (because the pair does not have to be sequential), like
27561 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
27562
27563 assert(N->getValueType(0) == MVT::i128 &&
27564 "AtomicLoadXXX on types less than 128 should be legal");
27565
27566 if (!Subtarget->hasLSE128())
27567 return;
27568
27569 MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand();
27570 const SDValue &Chain = N->getOperand(Num: 0);
27571 const SDValue &Ptr = N->getOperand(Num: 1);
27572 const SDValue &Val128 = N->getOperand(Num: 2);
27573 std::pair<SDValue, SDValue> Val2x64 =
27574 DAG.SplitScalar(N: Val128, DL: SDLoc(Val128), LoVT: MVT::i64, HiVT: MVT::i64);
27575
27576 const unsigned ISDOpcode = N->getOpcode();
27577 const unsigned MachineOpcode =
27578 getAtomicLoad128Opcode(ISDOpcode, Ordering: MemOp->getMergedOrdering());
27579
27580 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
27581 SDLoc DL(Val128);
27582 Val2x64.first =
27583 DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64,
27584 N1: DAG.getAllOnesConstant(DL, VT: MVT::i64), N2: Val2x64.first);
27585 Val2x64.second =
27586 DAG.getNode(Opcode: ISD::XOR, DL, VT: MVT::i64,
27587 N1: DAG.getAllOnesConstant(DL, VT: MVT::i64), N2: Val2x64.second);
27588 }
27589
27590 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
27591 if (DAG.getDataLayout().isBigEndian())
27592 std::swap(a&: Ops[0], b&: Ops[1]);
27593
27594 MachineSDNode *AtomicInst =
27595 DAG.getMachineNode(Opcode: MachineOpcode, dl: SDLoc(N),
27596 VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other), Ops);
27597
27598 DAG.setNodeMemRefs(N: AtomicInst, NewMemRefs: {MemOp});
27599
27600 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
27601 if (DAG.getDataLayout().isBigEndian())
27602 std::swap(a&: Lo, b&: Hi);
27603
27604 Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi));
27605 Results.push_back(Elt: SDValue(AtomicInst, 2)); // Chain out
27606}
27607
27608void AArch64TargetLowering::ReplaceNodeResults(
27609 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
27610 switch (N->getOpcode()) {
27611 default:
27612 llvm_unreachable("Don't know how to custom expand this");
27613 case ISD::BITCAST:
27614 ReplaceBITCASTResults(N, Results, DAG);
27615 return;
27616 case ISD::VECREDUCE_ADD:
27617 case ISD::VECREDUCE_SMAX:
27618 case ISD::VECREDUCE_SMIN:
27619 case ISD::VECREDUCE_UMAX:
27620 case ISD::VECREDUCE_UMIN:
27621 Results.push_back(Elt: LowerVECREDUCE(Op: SDValue(N, 0), DAG));
27622 return;
27623 case ISD::VECTOR_COMPRESS:
27624 if (SDValue Res = LowerVECTOR_COMPRESS(Op: SDValue(N, 0), DAG))
27625 Results.push_back(Elt: Res);
27626 return;
27627 case ISD::ADD:
27628 case ISD::FADD:
27629 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
27630 return;
27631
27632 case ISD::CTPOP:
27633 case ISD::PARITY:
27634 if (SDValue Result = LowerCTPOP_PARITY(Op: SDValue(N, 0), DAG))
27635 Results.push_back(Elt: Result);
27636 return;
27637 case AArch64ISD::SADDV:
27638 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::SADDV);
27639 return;
27640 case AArch64ISD::UADDV:
27641 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::UADDV);
27642 return;
27643 case AArch64ISD::SMINV:
27644 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMIN, AcrossOp: AArch64ISD::SMINV);
27645 return;
27646 case AArch64ISD::UMINV:
27647 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMIN, AcrossOp: AArch64ISD::UMINV);
27648 return;
27649 case AArch64ISD::SMAXV:
27650 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMAX, AcrossOp: AArch64ISD::SMAXV);
27651 return;
27652 case AArch64ISD::UMAXV:
27653 ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMAX, AcrossOp: AArch64ISD::UMAXV);
27654 return;
27655 case ISD::MULHS:
27656 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
27657 Results.push_back(
27658 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHS_PRED));
27659 return;
27660 case ISD::MULHU:
27661 if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType()))
27662 Results.push_back(
27663 Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHU_PRED));
27664 return;
27665 case ISD::FP_TO_UINT:
27666 case ISD::FP_TO_SINT:
27667 case ISD::STRICT_FP_TO_SINT:
27668 case ISD::STRICT_FP_TO_UINT:
27669 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
27670 // Let normal code take care of it by not adding anything to Results.
27671 return;
27672 case ISD::ATOMIC_CMP_SWAP:
27673 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
27674 return;
27675 case ISD::ATOMIC_LOAD_CLR:
27676 assert(N->getValueType(0) != MVT::i128 &&
27677 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
27678 break;
27679 case ISD::ATOMIC_LOAD_AND:
27680 case ISD::ATOMIC_LOAD_OR:
27681 case ISD::ATOMIC_SWAP: {
27682 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
27683 "Expected 128-bit atomicrmw.");
    // These need custom type legalisation, so we lower directly to the machine
    // instruction.
27685 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
27686 return;
27687 }
27688 case ISD::ADDRSPACECAST: {
27689 SDValue V = LowerADDRSPACECAST(Op: SDValue(N, 0), DAG);
27690 Results.push_back(Elt: V);
27691 return;
27692 }
27693 case ISD::ATOMIC_LOAD:
27694 case ISD::LOAD: {
27695 MemSDNode *LoadNode = cast<MemSDNode>(Val: N);
27696 EVT MemVT = LoadNode->getMemoryVT();
    // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
    // targets.
27699 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
27700 MemVT.getSizeInBits() == 256u &&
27701 (MemVT.getScalarSizeInBits() == 8u ||
27702 MemVT.getScalarSizeInBits() == 16u ||
27703 MemVT.getScalarSizeInBits() == 32u ||
27704 MemVT.getScalarSizeInBits() == 64u)) {
27705
27706 SDValue Result = DAG.getMemIntrinsicNode(
27707 Opcode: AArch64ISD::LDNP, dl: SDLoc(N),
27708 VTList: DAG.getVTList(VTs: {MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
27709 MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()),
27710 MVT::Other}),
27711 Ops: {LoadNode->getChain(), LoadNode->getBasePtr()},
27712 MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand());
27713
27714 SDValue Pair = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: MemVT,
27715 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
27716 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
27717 return;
27718 }
27719
27720 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
27721 LoadNode->getMemoryVT() != MVT::i128) {
      // Loads that are neither volatile nor atomic, or that are not 128 bits
      // wide, are handled by normal lowering and AArch64's load/store optimizer.
27724 return;
27725 }
27726
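    // Volatile or atomic i128 loads are lowered to a 64-bit register-pair load:
    // LDP, or LDIAPP when acquire semantics are required (RCPC3).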
27727 if (SDValue(N, 0).getValueType() == MVT::i128) {
27728 auto *AN = dyn_cast<AtomicSDNode>(Val: LoadNode);
27729 bool isLoadAcquire =
27730 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
27731 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
27732
27733 if (isLoadAcquire)
27734 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
27735
27736 SDValue Result = DAG.getMemIntrinsicNode(
27737 Opcode, dl: SDLoc(N), VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}),
27738 Ops: {LoadNode->getChain(), LoadNode->getBasePtr()},
27739 MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand());
27740
27741 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
27742
27743 SDValue Pair =
27744 DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128,
27745 N1: Result.getValue(R: FirstRes), N2: Result.getValue(R: 1 - FirstRes));
27746 Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */});
27747 }
27748 return;
27749 }
27750 case ISD::EXTRACT_SUBVECTOR:
27751 ReplaceExtractSubVectorResults(N, Results, DAG);
27752 return;
27753 case ISD::INSERT_SUBVECTOR:
27754 case ISD::CONCAT_VECTORS:
27755 // Custom lowering has been requested for INSERT_SUBVECTOR and
27756 // CONCAT_VECTORS -- but delegate to common code for result type
27757 // legalisation
27758 return;
27759 case ISD::GET_ACTIVE_LANE_MASK:
27760 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
27761 return;
27762 case ISD::INTRINSIC_WO_CHAIN: {
27763 EVT VT = N->getValueType(ResNo: 0);
27764
27765 Intrinsic::ID IntID =
27766 static_cast<Intrinsic::ID>(N->getConstantOperandVal(Num: 0));
27767 switch (IntID) {
27768 default:
27769 return;
27770 case Intrinsic::aarch64_sve_clasta_n: {
27771 assert((VT == MVT::i8 || VT == MVT::i16) &&
27772 "custom lowering for unexpected type");
27773 SDLoc DL(N);
27774 auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2));
27775 auto V = DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL, VT: MVT::i32,
27776 N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3));
27777 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27778 return;
27779 }
27780 case Intrinsic::aarch64_sve_clastb_n: {
27781 assert((VT == MVT::i8 || VT == MVT::i16) &&
27782 "custom lowering for unexpected type");
27783 SDLoc DL(N);
27784 auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2));
27785 auto V = DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL, VT: MVT::i32,
27786 N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3));
27787 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27788 return;
27789 }
27790 case Intrinsic::aarch64_sve_lasta: {
27791 assert((VT == MVT::i8 || VT == MVT::i16) &&
27792 "custom lowering for unexpected type");
27793 SDLoc DL(N);
27794 auto V = DAG.getNode(Opcode: AArch64ISD::LASTA, DL, VT: MVT::i32,
27795 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
27796 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27797 return;
27798 }
27799 case Intrinsic::aarch64_sve_lastb: {
27800 assert((VT == MVT::i8 || VT == MVT::i16) &&
27801 "custom lowering for unexpected type");
27802 SDLoc DL(N);
27803 auto V = DAG.getNode(Opcode: AArch64ISD::LASTB, DL, VT: MVT::i32,
27804 N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2));
27805 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27806 return;
27807 }
27808 case Intrinsic::aarch64_sme_in_streaming_mode: {
27809 SDLoc DL(N);
27810 SDValue Chain = DAG.getEntryNode();
27811 SDValue RuntimePStateSM =
27812 getRuntimePStateSM(DAG, Chain, DL, VT: N->getValueType(ResNo: 0));
27813 Results.push_back(
27814 Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: RuntimePStateSM));
27815 return;
27816 }
27817 case Intrinsic::experimental_vector_match: {
27818 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
27819 return;
27820
27821 // NOTE: Only trivial type promotion is supported.
27822 EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT);
27823 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
27824 return;
27825
27826 SDLoc DL(N);
27827 auto V = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: NewVT, Ops: N->ops());
27828 Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V));
27829 return;
27830 }
27831 }
27832 }
27833 case ISD::READ_REGISTER: {
27834 SDLoc DL(N);
27835 assert(N->getValueType(0) == MVT::i128 &&
27836 "READ_REGISTER custom lowering is only for 128-bit sysregs");
27837 SDValue Chain = N->getOperand(Num: 0);
27838 SDValue SysRegName = N->getOperand(Num: 1);
27839
27840 SDValue Result = DAG.getNode(
27841 Opcode: AArch64ISD::MRRS, DL, VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}),
27842 N1: Chain, N2: SysRegName);
27843
    // System registers have no endianness; Result.getValue(0) always contains
    // the lower half of the 128-bit system register value.
27846 SDValue Pair = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128,
27847 N1: Result.getValue(R: 0), N2: Result.getValue(R: 1));
27848 Results.push_back(Elt: Pair);
27849 Results.push_back(Elt: Result.getValue(R: 2)); // Chain
27850 return;
27851 }
27852 }
27853}
27854
27855bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
27856 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
27857 return TargetLowering::useLoadStackGuardNode(M);
27858 return true;
27859}
27860
27861unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
27862 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
27863 // reciprocal if there are three or more FDIVs.
27864 return 3;
27865}
27866
27867TargetLoweringBase::LegalizeTypeAction
27868AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
  // During type legalization, we prefer to widen v1i8, v1i16 and v1i32 to v8i8,
  // v4i16 and v2i32 rather than promote them; v1f32 is widened for the same
  // reason.
27871 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
27872 VT == MVT::v1f32)
27873 return TypeWidenVector;
27874
27875 return TargetLoweringBase::getPreferredVectorAction(VT);
27876}
27877
27878// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
27879// provided the address is 16-byte aligned.
27880bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
27881 if (!Subtarget->hasLSE2())
27882 return false;
27883
27884 if (auto LI = dyn_cast<LoadInst>(Val: I))
27885 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27886 LI->getAlign() >= Align(16);
27887
27888 if (auto SI = dyn_cast<StoreInst>(Val: I))
27889 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27890 SI->getAlign() >= Align(16);
27891
27892 return false;
27893}
27894
27895bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
27896 if (!Subtarget->hasLSE128())
27897 return false;
27898
  // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
  // clobbers the two value registers (they receive the old memory contents).
27901 if (const auto *SI = dyn_cast<StoreInst>(Val: I))
27902 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27903 SI->getAlign() >= Align(16) &&
27904 (SI->getOrdering() == AtomicOrdering::Release ||
27905 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
27906
27907 if (const auto *RMW = dyn_cast<AtomicRMWInst>(Val: I))
27908 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27909 RMW->getAlign() >= Align(16) &&
27910 (RMW->getOperation() == AtomicRMWInst::Xchg ||
27911 RMW->getOperation() == AtomicRMWInst::And ||
27912 RMW->getOperation() == AtomicRMWInst::Or);
27913
27914 return false;
27915}
27916
27917bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
27918 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
27919 return false;
27920
27921 if (auto LI = dyn_cast<LoadInst>(Val: I))
27922 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
27923 LI->getAlign() >= Align(16) &&
27924 LI->getOrdering() == AtomicOrdering::Acquire;
27925
27926 if (auto SI = dyn_cast<StoreInst>(Val: I))
27927 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
27928 SI->getAlign() >= Align(16) &&
27929 SI->getOrdering() == AtomicOrdering::Release;
27930
27931 return false;
27932}
27933
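// LDP/STP give single-copy atomicity but carry no ordering, so 128-bit atomics
// implemented with them still need explicit fences. The RCPC3 and LSE128
// instructions have ordered forms and need no extra fences.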
27934bool AArch64TargetLowering::shouldInsertFencesForAtomic(
27935 const Instruction *I) const {
27936 if (isOpSuitableForRCPC3(I))
27937 return false;
27938 if (isOpSuitableForLSE128(I))
27939 return false;
27940 if (isOpSuitableForLDPSTP(I))
27941 return true;
27942 return false;
27943}
27944
27945bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
27946 const Instruction *I) const {
27947 // Store-Release instructions only provide seq_cst guarantees when paired with
27948 // Load-Acquire instructions. MSVC CRT does not use these instructions to
27949 // implement seq_cst loads and stores, so we need additional explicit fences
27950 // after memory writes.
27951 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
27952 return false;
27953
27954 switch (I->getOpcode()) {
27955 default:
27956 return false;
27957 case Instruction::AtomicCmpXchg:
27958 return cast<AtomicCmpXchgInst>(Val: I)->getSuccessOrdering() ==
27959 AtomicOrdering::SequentiallyConsistent;
27960 case Instruction::AtomicRMW:
27961 return cast<AtomicRMWInst>(Val: I)->getOrdering() ==
27962 AtomicOrdering::SequentiallyConsistent;
27963 case Instruction::Store:
27964 return cast<StoreInst>(Val: I)->getOrdering() ==
27965 AtomicOrdering::SequentiallyConsistent;
27966 }
27967}
27968
// Loads and stores narrower than 128 bits are already atomic; wider ones are
// doomed anyway, so defer to the default libcall and blame the OS when things
// go wrong.
27972TargetLoweringBase::AtomicExpansionKind
27973AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
27974 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
27975 if (Size != 128)
27976 return AtomicExpansionKind::None;
27977 if (isOpSuitableForRCPC3(I: SI))
27978 return AtomicExpansionKind::None;
27979 if (isOpSuitableForLSE128(I: SI))
27980 return AtomicExpansionKind::Expand;
27981 if (isOpSuitableForLDPSTP(I: SI))
27982 return AtomicExpansionKind::None;
27983 return AtomicExpansionKind::Expand;
27984}
27985
// Loads and stores narrower than 128 bits are already atomic; wider ones are
// doomed anyway, so defer to the default libcall and blame the OS when things
// go wrong.
27989TargetLowering::AtomicExpansionKind
27990AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
27991 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
27992
27993 if (Size != 128)
27994 return AtomicExpansionKind::None;
27995 if (isOpSuitableForRCPC3(I: LI))
27996 return AtomicExpansionKind::None;
27997 // No LSE128 loads
27998 if (isOpSuitableForLDPSTP(I: LI))
27999 return AtomicExpansionKind::None;
28000
28001 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28002 // implement atomicrmw without spilling. If the target address is also on the
28003 // stack and close enough to the spill slot, this can lead to a situation
28004 // where the monitor always gets cleared and the atomic operation can never
28005 // succeed. So at -O0 lower this operation to a CAS loop.
28006 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28007 return AtomicExpansionKind::CmpXChg;
28008
  // Using CAS for an atomic load has a better chance of succeeding in
  // high-contention situations, so use it if available.
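  // For illustration, AtomicExpand then rewrites such a load into IR along the
  // lines of:
  //   %pair = cmpxchg ptr %p, i128 0, i128 0 monotonic monotonic
  //   %val = extractvalue { i128, i1 } %pair, 0
  // which LSE can implement with CASP.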
28011 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
28012 : AtomicExpansionKind::LLSC;
28013}
28014
28015// Return true if the atomic operation expansion will lower to use a library
28016// call, and is thus ineligible to use an LLSC expansion.
28017static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
28018 const AtomicRMWInst *RMW) {
28019 if (!RMW->isFloatingPointOperation())
28020 return false;
28021 switch (RMW->getType()->getScalarType()->getTypeID()) {
28022 case Type::FloatTyID:
28023 case Type::DoubleTyID:
28024 case Type::HalfTyID:
28025 case Type::BFloatTyID:
28026 // Will use soft float
28027 return !Subtarget.hasFPARMv8();
28028 default:
28029 // fp128 will emit library calls.
28030 return true;
28031 }
28032
28033 llvm_unreachable("covered type switch");
28034}
28035
// The "default" for integer RMW operations is to expand to an LL/SC loop.
// However, with the LSE instructions (or outline-atomics mode, which provides
// library routines in place of the LSE instructions), we can directly emit many
// operations instead.
28040TargetLowering::AtomicExpansionKind
28041AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
28042 Type *Ty = AI->getType();
28043 unsigned Size = Ty->getPrimitiveSizeInBits();
28044 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
28045
28046 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
28047 (AI->getOperation() == AtomicRMWInst::Xchg ||
28048 AI->getOperation() == AtomicRMWInst::Or ||
28049 AI->getOperation() == AtomicRMWInst::And);
28050 if (CanUseLSE128)
28051 return AtomicExpansionKind::None;
28052
  // If LSFE is available, use atomic FP instructions in preference to expansion.
28054 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
28055 AI->getOperation() == AtomicRMWInst::FMax ||
28056 AI->getOperation() == AtomicRMWInst::FMin ||
28057 AI->getOperation() == AtomicRMWInst::FMaximum ||
28058 AI->getOperation() == AtomicRMWInst::FMinimum))
28059 return AtomicExpansionKind::None;
28060
28061 // Nand is not supported in LSE.
28062 // Leave 128 bits to LLSC or CmpXChg.
28063 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
28064 !AI->isFloatingPointOperation()) {
28065 if (Subtarget->hasLSE())
28066 return AtomicExpansionKind::None;
28067 if (Subtarget->outlineAtomics()) {
      // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
      // Don't outline them unless:
      // (1) high-level <atomic> support is approved:
      //   http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
      // (2) low-level libgcc and compiler-rt support is implemented by the
      //   min/max outline atomics helpers.
28074 if (AI->getOperation() != AtomicRMWInst::Min &&
28075 AI->getOperation() != AtomicRMWInst::Max &&
28076 AI->getOperation() != AtomicRMWInst::UMin &&
28077 AI->getOperation() != AtomicRMWInst::UMax) {
28078 return AtomicExpansionKind::None;
28079 }
28080 }
28081 }
28082
28083 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28084 // implement atomicrmw without spilling. If the target address is also on the
28085 // stack and close enough to the spill slot, this can lead to a situation
28086 // where the monitor always gets cleared and the atomic operation can never
28087 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
28088 // we have a single CAS instruction that can replace the loop.
28089 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
28090 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(Subtarget: *Subtarget, RMW: AI))
28091 return AtomicExpansionKind::CmpXChg;
28092
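  // As a sketch, the LL/SC expansion of, e.g., "atomicrmw add ptr %p, i64 %v
  // seq_cst" is eventually selected to a loop of roughly this shape:
  //   1:  ldaxr x8, [x0]
  //       add   x8, x8, x1
  //       stlxr w9, x8, [x0]
  //       cbnz  w9, 1b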
28093 return AtomicExpansionKind::LLSC;
28094}
28095
28096TargetLowering::AtomicExpansionKind
28097AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
28098 AtomicCmpXchgInst *AI) const {
28099 // If subtarget has LSE, leave cmpxchg intact for codegen.
28100 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
28101 return AtomicExpansionKind::None;
28102 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
28103 // implement cmpxchg without spilling. If the address being exchanged is also
28104 // on the stack and close enough to the spill slot, this can lead to a
28105 // situation where the monitor always gets cleared and the atomic operation
28106 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
28107 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
28108 return AtomicExpansionKind::None;
28109
28110 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
28111 // it.
28112 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
28113 if (Size > 64)
28114 return AtomicExpansionKind::None;
28115
28116 return AtomicExpansionKind::LLSC;
28117}
28118
28119Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
28120 Type *ValueTy, Value *Addr,
28121 AtomicOrdering Ord) const {
28122 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28123 bool IsAcquire = isAcquireOrStronger(AO: Ord);
28124
  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp and
  // ldaxp intrinsics must return {i64, i64} and we have to recombine the two
  // halves into a single i128 here.
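  // For example, an i128 acquire load becomes
  //   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
  // followed by the zext/shl/or sequence below to rebuild the i128 value.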
28128 if (ValueTy->getPrimitiveSizeInBits() == 128) {
28129 Intrinsic::ID Int =
28130 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
28131
28132 Value *LoHi =
28133 Builder.CreateIntrinsic(ID: Int, Args: Addr, /*FMFSource=*/nullptr, Name: "lohi");
28134
28135 Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo");
28136 Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi");
28137
28138 auto *Int128Ty = Type::getInt128Ty(C&: Builder.getContext());
28139 Lo = Builder.CreateZExt(V: Lo, DestTy: Int128Ty, Name: "lo64");
28140 Hi = Builder.CreateZExt(V: Hi, DestTy: Int128Ty, Name: "hi64");
28141
28142 Value *Or = Builder.CreateOr(
28143 LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: Int128Ty, V: 64)), Name: "val64");
28144 return Builder.CreateBitCast(V: Or, DestTy: ValueTy);
28145 }
28146
28147 Type *Tys[] = { Addr->getType() };
28148 Intrinsic::ID Int =
28149 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
28150
28151 const DataLayout &DL = M->getDataLayout();
28152 IntegerType *IntEltTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: ValueTy));
28153 CallInst *CI = Builder.CreateIntrinsic(ID: Int, Types: Tys, Args: Addr);
28154 CI->addParamAttr(ArgNo: 0, Attr: Attribute::get(Context&: Builder.getContext(),
28155 Kind: Attribute::ElementType, Ty: IntEltTy));
28156 Value *Trunc = Builder.CreateTrunc(V: CI, DestTy: IntEltTy);
28157
28158 return Builder.CreateBitCast(V: Trunc, DestTy: ValueTy);
28159}
28160
28161void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
28162 IRBuilderBase &Builder) const {
28163 Builder.CreateIntrinsic(ID: Intrinsic::aarch64_clrex, Args: {});
28164}
28165
28166Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
28167 Value *Val, Value *Addr,
28168 AtomicOrdering Ord) const {
28169 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
28170 bool IsRelease = isReleaseOrStronger(AO: Ord);
28171
28172 // Since the intrinsics must have legal type, the i128 intrinsics take two
28173 // parameters: "i64, i64". We must marshal Val into the appropriate form
28174 // before the call.
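  // For example, an i128 release store becomes
  //   %res = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
  // where a non-zero result means the store-exclusive failed.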
28175 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
28176 Intrinsic::ID Int =
28177 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
28178 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, id: Int);
28179 Type *Int64Ty = Type::getInt64Ty(C&: M->getContext());
28180 Type *Int128Ty = Type::getInt128Ty(C&: M->getContext());
28181
28182 Value *CastVal = Builder.CreateBitCast(V: Val, DestTy: Int128Ty);
28183
28184 Value *Lo = Builder.CreateTrunc(V: CastVal, DestTy: Int64Ty, Name: "lo");
28185 Value *Hi =
28186 Builder.CreateTrunc(V: Builder.CreateLShr(LHS: CastVal, RHS: 64), DestTy: Int64Ty, Name: "hi");
28187 return Builder.CreateCall(Callee: Stxr, Args: {Lo, Hi, Addr});
28188 }
28189
28190 Intrinsic::ID Int =
28191 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
28192 Type *Tys[] = { Addr->getType() };
28193 Function *Stxr = Intrinsic::getOrInsertDeclaration(M, id: Int, Tys);
28194
28195 const DataLayout &DL = M->getDataLayout();
28196 IntegerType *IntValTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: Val->getType()));
28197 Val = Builder.CreateBitCast(V: Val, DestTy: IntValTy);
28198
28199 CallInst *CI = Builder.CreateCall(
28200 Callee: Stxr, Args: {Builder.CreateZExtOrBitCast(
28201 V: Val, DestTy: Stxr->getFunctionType()->getParamType(i: 0)),
28202 Addr});
28203 CI->addParamAttr(ArgNo: 1, Attr: Attribute::get(Context&: Builder.getContext(),
28204 Kind: Attribute::ElementType, Ty: Val->getType()));
28205 return CI;
28206}
28207
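// SVE vector tuples (scalable types wider than 128 bits) and homogeneous
// aggregates (arrays whose flattened members all have the same type) need to
// be passed in consecutive registers.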
28208bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
28209 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
28210 const DataLayout &DL) const {
28211 if (!Ty->isArrayTy()) {
28212 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
28213 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
28214 }
28215
  // All non-aggregate members of the type must have the same type.
28217 SmallVector<EVT> ValueVTs;
28218 ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs);
28219 return all_equal(Range&: ValueVTs);
28220}
28221
28222bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
28223 EVT) const {
28224 return false;
28225}
28226
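// Build a pointer at a fixed byte offset from the thread pointer; used below
// for the Android and Fuchsia stack-guard and SafeStack TLS slots.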
28227static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
28228 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
28229 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
28230 M, id: Intrinsic::thread_pointer, Tys: IRB.getPtrTy());
28231 return IRB.CreatePointerCast(
28232 V: IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: IRB.CreateCall(Callee: ThreadPointerFunc),
28233 Idx0: Offset),
28234 DestTy: IRB.getPtrTy(AddrSpace: 0));
28235}
28236
28237Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
28238 // Android provides a fixed TLS slot for the stack cookie. See the definition
28239 // of TLS_SLOT_STACK_GUARD in
28240 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
28241 if (Subtarget->isTargetAndroid())
28242 return UseTlsOffset(IRB, Offset: 0x28);
28243
28244 // Fuchsia is similar.
28245 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
28246 if (Subtarget->isTargetFuchsia())
28247 return UseTlsOffset(IRB, Offset: -0x10);
28248
28249 return TargetLowering::getIRStackGuard(IRB);
28250}
28251
28252void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionality for stack protection.
28254 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
28255 // MSVC CRT has a global variable holding security cookie.
28256 M.getOrInsertGlobal(Name: "__security_cookie",
28257 Ty: PointerType::getUnqual(C&: M.getContext()));
28258
28259 // MSVC CRT has a function to validate security cookie.
28260 FunctionCallee SecurityCheckCookie =
28261 M.getOrInsertFunction(Name: Subtarget->getSecurityCheckCookieName(),
28262 RetTy: Type::getVoidTy(C&: M.getContext()),
28263 Args: PointerType::getUnqual(C&: M.getContext()));
28264 if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) {
28265 F->setCallingConv(CallingConv::Win64);
28266 F->addParamAttr(ArgNo: 0, Kind: Attribute::AttrKind::InReg);
28267 }
28268 return;
28269 }
28270 TargetLowering::insertSSPDeclarations(M);
28271}
28272
28273Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
28274 // MSVC CRT has a global variable holding security cookie.
28275 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28276 return M.getGlobalVariable(Name: "__security_cookie");
28277 return TargetLowering::getSDagStackGuard(M);
28278}
28279
28280Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
28281 // MSVC CRT has a function to validate security cookie.
28282 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
28283 return M.getFunction(Name: Subtarget->getSecurityCheckCookieName());
28284 return TargetLowering::getSSPStackGuardCheck(M);
28285}
28286
28287Value *
28288AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
28289 // Android provides a fixed TLS slot for the SafeStack pointer. See the
28290 // definition of TLS_SLOT_SAFESTACK in
28291 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
28292 if (Subtarget->isTargetAndroid())
28293 return UseTlsOffset(IRB, Offset: 0x48);
28294
28295 // Fuchsia is similar.
28296 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
28297 if (Subtarget->isTargetFuchsia())
28298 return UseTlsOffset(IRB, Offset: -0x8);
28299
28300 return TargetLowering::getSafeStackPointerLocation(IRB);
28301}
28302
28303/// If a physical register, this returns the register that receives the
28304/// exception address on entry to an EH pad.
28305Register AArch64TargetLowering::getExceptionPointerRegister(
28306 const Constant *PersonalityFn) const {
28307 // FIXME: This is a guess. Has this been defined yet?
28308 return AArch64::X0;
28309}
28310
28311/// If a physical register, this returns the register that receives the
28312/// exception typeid on entry to a landing pad.
28313Register AArch64TargetLowering::getExceptionSelectorRegister(
28314 const Constant *PersonalityFn) const {
28315 // FIXME: This is a guess. Has this been defined yet?
28316 return AArch64::X1;
28317}
28318
28319bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
28320 const Instruction &AndI) const {
  // Only sink the 'and' mask to the cmp's use block if it masks a single bit,
  // since this likely allows the and/cmp/br to be folded into a single tbz
  // instruction. It may be beneficial to sink in other cases, but we would have
  // to check that the cmp would not get folded into the br to form a cbz for
  // these to be beneficial.
28326 ConstantInt* Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1));
28327 if (!Mask)
28328 return false;
28329 return Mask->getValue().isPowerOf2();
28330}
28331
28332bool AArch64TargetLowering::
28333 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28334 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
28335 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
28336 SelectionDAG &DAG) const {
  // Does the baseline implementation recommend against performing the fold?
28338 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
28339 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
28340 return false;
28341 // Else, if this is a vector shift, prefer 'shl'.
28342 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
28343}
28344
28345TargetLowering::ShiftLegalizationStrategy
28346AArch64TargetLowering::preferredShiftLegalizationStrategy(
28347 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
28348 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
28349 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
28350 return ShiftLegalizationStrategy::LowerToLibcall;
28351 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
28352 ExpansionFactor);
28353}
28354
28355void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  // Update IsSplitCSR in AArch64FunctionInfo.
28357 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
28358 AFI->setIsSplitCSR(true);
28359}
28360
28361void AArch64TargetLowering::insertCopiesSplitCSR(
28362 MachineBasicBlock *Entry,
28363 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
28364 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
28365 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent());
28366 if (!IStart)
28367 return;
28368
28369 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28370 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28371 MachineBasicBlock::iterator MBBI = Entry->begin();
28372 for (const MCPhysReg *I = IStart; *I; ++I) {
28373 const TargetRegisterClass *RC = nullptr;
28374 if (AArch64::GPR64RegClass.contains(Reg: *I))
28375 RC = &AArch64::GPR64RegClass;
28376 else if (AArch64::FPR64RegClass.contains(Reg: *I))
28377 RC = &AArch64::FPR64RegClass;
28378 else
28379 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28380
28381 Register NewVR = MRI->createVirtualRegister(RegClass: RC);
28382 // Create copy from CSR to a virtual register.
28383 // FIXME: this currently does not emit CFI pseudo-instructions, it works
28384 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28385 // nounwind. If we want to generalize this later, we may need to emit
28386 // CFI pseudo-instructions.
28387 assert(Entry->getParent()->getFunction().hasFnAttribute(
28388 Attribute::NoUnwind) &&
28389 "Function should be nounwind in insertCopiesSplitCSR!");
28390 Entry->addLiveIn(PhysReg: *I);
28391 BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR)
28392 .addReg(RegNo: *I);
28393
28394 // Insert the copy-back instructions right before the terminator.
28395 for (auto *Exit : Exits)
28396 BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(),
28397 MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I)
28398 .addReg(RegNo: NewVR);
28399 }
28400}
28401
28402bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
28403 // Integer division on AArch64 is expensive. However, when aggressively
28404 // optimizing for code size, we prefer to use a div instruction, as it is
28405 // usually smaller than the alternative sequence.
28406 // The exception to this is vector division. Since AArch64 doesn't have vector
28407 // integer division, leaving the division as-is is a loss even in terms of
28408 // size, because it will have to be scalarized, while the alternative code
28409 // sequence can be performed in vector form.
28410 bool OptSize = Attr.hasFnAttr(Kind: Attribute::MinSize);
28411 return OptSize && !VT.isVector();
28412}
28413
28414bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
28415 const MachineFunction &MF) const {
  // Avoid merging stores into fixed-length vectors when NEON is unavailable.
  // In the future, we could allow this when SVE is available, but currently,
  // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
  // the general lowering may introduce stack spills/reloads).
28420 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
28421 return false;
28422
  // Do not merge up to the float value size (128 bits) if the no-implicit-float
  // attribute is set.
28425 bool NoFloat = MF.getFunction().hasFnAttribute(Kind: Attribute::NoImplicitFloat);
28426 return !NoFloat || MemVT.getSizeInBits() <= 64;
28427}
28428
28429bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
28430 // We want inc-of-add for scalars and sub-of-not for vectors.
28431 return VT.isScalarInteger();
28432}
28433
28434bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
28435 EVT VT) const {
  // v8f16 without fp16 needs to be extended to v8f32, which is more difficult
  // to legalize.
28438 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
28439 return false;
28440 if (FPVT == MVT::v8bf16)
28441 return false;
28442 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
28443}
28444
28445bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
28446 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
28447 // avoid vselect becoming bsl / unrolling.
28448 return !VT.isFixedLengthVector();
28449}
28450
28451MachineInstr *
28452AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
28453 MachineBasicBlock::instr_iterator &MBBI,
28454 const TargetInstrInfo *TII) const {
28455 assert(MBBI->isCall() && MBBI->getCFIType() &&
28456 "Invalid call instruction for a KCFI check");
28457
28458 switch (MBBI->getOpcode()) {
28459 case AArch64::BLR:
28460 case AArch64::BLRNoIP:
28461 case AArch64::TCRETURNri:
28462 case AArch64::TCRETURNrix16x17:
28463 case AArch64::TCRETURNrix17:
28464 case AArch64::TCRETURNrinotx16:
28465 break;
28466 default:
28467 llvm_unreachable("Unexpected CFI call opcode");
28468 }
28469
28470 MachineOperand &Target = MBBI->getOperand(i: 0);
28471 assert(Target.isReg() && "Invalid target operand for an indirect call");
28472 Target.setIsRenamable(false);
28473
28474 return BuildMI(BB&: MBB, I: MBBI, MIMD: MBBI->getDebugLoc(), MCID: TII->get(Opcode: AArch64::KCFI_CHECK))
28475 .addReg(RegNo: Target.getReg())
28476 .addImm(Val: MBBI->getCFIType())
28477 .getInstr();
28478}
28479
28480bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
28481 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
28482}
28483
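// On AAPCS64 targets va_list is the struct
//   { ptr __stack, ptr __gr_top, ptr __vr_top, i32 __gr_offs, i32 __vr_offs },
// i.e. three pointers plus two 32-bit offsets. Darwin and Windows use a plain
// pointer instead.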
28484unsigned
28485AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
28486 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
28487 return getPointerTy(DL).getSizeInBits();
28488
28489 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
28490}
28491
28492void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
28493 MachineFrameInfo &MFI = MF.getFrameInfo();
28494 // If we have any vulnerable SVE stack objects then the stack protector
28495 // needs to be placed at the top of the SVE stack area, as the SVE locals
28496 // are placed above the other locals, so we allocate it as if it were a
28497 // scalable vector.
28498 // FIXME: It may be worthwhile having a specific interface for this rather
28499 // than doing it here in finalizeLowering.
28500 if (MFI.hasStackProtectorIndex()) {
28501 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
28502 if (MFI.getStackID(ObjectIdx: i) == TargetStackID::ScalableVector &&
28503 MFI.getObjectSSPLayout(ObjectIdx: i) != MachineFrameInfo::SSPLK_None) {
28504 MFI.setStackID(ObjectIdx: MFI.getStackProtectorIndex(),
28505 ID: TargetStackID::ScalableVector);
28506 MFI.setObjectAlignment(ObjectIdx: MFI.getStackProtectorIndex(), Alignment: Align(16));
28507 break;
28508 }
28509 }
28510 }
28511 MFI.computeMaxCallFrameSize(MF);
28512 TargetLoweringBase::finalizeLowering(MF);
28513}
28514
28515// Unlike X86, we let frame lowering assign offsets to all catch objects.
28516bool AArch64TargetLowering::needsFixedCatchObjects() const {
28517 return false;
28518}
28519
28520bool AArch64TargetLowering::shouldLocalize(
28521 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
28522 auto &MF = *MI.getMF();
28523 auto &MRI = MF.getRegInfo();
28524 auto maxUses = [](unsigned RematCost) {
28525 // A cost of 1 means remats are basically free.
28526 if (RematCost == 1)
28527 return std::numeric_limits<unsigned>::max();
28528 if (RematCost == 2)
28529 return 2U;
28530
28531 // Remat is too expensive, only sink if there's one user.
28532 if (RematCost > 2)
28533 return 1U;
28534 llvm_unreachable("Unexpected remat cost");
28535 };
28536
28537 unsigned Opc = MI.getOpcode();
28538 switch (Opc) {
28539 case TargetOpcode::G_GLOBAL_VALUE: {
    // On Darwin, TLS global vars get selected into function calls, which
    // we don't want localized, as they can get moved into the middle of
    // another call sequence.
28543 const GlobalValue &GV = *MI.getOperand(i: 1).getGlobal();
28544 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
28545 return false;
28546 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
28547 }
28548 case TargetOpcode::G_FCONSTANT:
28549 case TargetOpcode::G_CONSTANT: {
28550 const ConstantInt *CI;
28551 unsigned AdditionalCost = 0;
28552
28553 if (Opc == TargetOpcode::G_CONSTANT)
28554 CI = MI.getOperand(i: 1).getCImm();
28555 else {
28556 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
28557 // We try to estimate cost of 32/64b fpimms, as they'll likely be
28558 // materialized as integers.
28559 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
28560 break;
28561 auto APF = MI.getOperand(i: 1).getFPImm()->getValueAPF();
28562 bool OptForSize = MF.getFunction().hasOptSize();
28563 if (isFPImmLegal(Imm: APF, VT: EVT::getFloatingPointVT(BitWidth: Ty.getScalarSizeInBits()),
28564 OptForSize))
28565 return true; // Constant should be cheap.
28566 CI =
28567 ConstantInt::get(Context&: MF.getFunction().getContext(), V: APF.bitcastToAPInt());
28568 // FP materialization also costs an extra move, from gpr to fpr.
28569 AdditionalCost = 1;
28570 }
28571 APInt Imm = CI->getValue();
28572 InstructionCost Cost = TTI->getIntImmCost(
28573 Imm, Ty: CI->getType(), CostKind: TargetTransformInfo::TCK_CodeSize);
28574 assert(Cost.isValid() && "Expected a valid imm cost");
28575
28576 unsigned RematCost = Cost.getValue();
28577 RematCost += AdditionalCost;
28578 Register Reg = MI.getOperand(i: 0).getReg();
28579 unsigned MaxUses = maxUses(RematCost);
28580 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
28581 if (MaxUses == std::numeric_limits<unsigned>::max())
28582 --MaxUses;
28583 return MRI.hasAtMostUserInstrs(Reg, MaxUsers: MaxUses);
28584 }
28585 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
28586 // localizable.
28587 case AArch64::ADRP:
28588 case AArch64::G_ADD_LOW:
28589 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
28590 case TargetOpcode::G_PTR_ADD:
28591 return true;
28592 default:
28593 break;
28594 }
28595 return TargetLoweringBase::shouldLocalize(MI, TTI);
28596}
28597
28598bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
28599 // Fallback for scalable vectors.
28600 // Note that if EnableSVEGISel is true, we allow scalable vector types for
28601 // all instructions, regardless of whether they are actually supported.
28602 if (!EnableSVEGISel) {
28603 if (Inst.getType()->isScalableTy()) {
28604 return true;
28605 }
28606
28607 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
28608 if (Inst.getOperand(i)->getType()->isScalableTy())
28609 return true;
28610
28611 if (const AllocaInst *AI = dyn_cast<AllocaInst>(Val: &Inst)) {
28612 if (AI->getAllocatedType()->isScalableTy())
28613 return true;
28614 }
28615 }
28616
  // Fall back for calls that require SME-specific handling (streaming-mode
  // changes, lazy saves, or ZT0/ZA state preservation).
28618 if (auto *Base = dyn_cast<CallBase>(Val: &Inst)) {
28619 auto CallAttrs = SMECallAttrs(*Base);
28620 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
28621 CallAttrs.requiresPreservingZT0() ||
28622 CallAttrs.requiresPreservingAllZAState())
28623 return true;
28624 }
28625 return false;
28626}
28627
28628// Return the largest legal scalable vector type that matches VT's element type.
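// For example, any fixed-length vector of i32 maps to nxv4i32 and any vector
// of f16 maps to nxv8f16, i.e. one full SVE register's worth of the element
// type.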
28629static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
28630 assert(VT.isFixedLengthVector() &&
28631 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28632 "Expected legal fixed length vector!");
28633 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28634 default:
28635 llvm_unreachable("unexpected element type for SVE container");
28636 case MVT::i8:
28637 return EVT(MVT::nxv16i8);
28638 case MVT::i16:
28639 return EVT(MVT::nxv8i16);
28640 case MVT::i32:
28641 return EVT(MVT::nxv4i32);
28642 case MVT::i64:
28643 return EVT(MVT::nxv2i64);
28644 case MVT::bf16:
28645 return EVT(MVT::nxv8bf16);
28646 case MVT::f16:
28647 return EVT(MVT::nxv8f16);
28648 case MVT::f32:
28649 return EVT(MVT::nxv4f32);
28650 case MVT::f64:
28651 return EVT(MVT::nxv2f64);
28652 }
28653}
28654
28655// Return a predicate with active lanes corresponding to the extent of VT.
28656static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
28657 EVT VT) {
28658 assert(VT.isFixedLengthVector() &&
28659 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28660 "Expected legal fixed length vector!");
28661
28662 std::optional<unsigned> PgPattern =
28663 getSVEPredPatternFromNumElements(MinNumElts: VT.getVectorNumElements());
28664 assert(PgPattern && "Unexpected element count for SVE predicate");
28665
28666 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
28667 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
28668 // variants of instructions when available.
28669 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28670 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28671 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28672 if (MaxSVESize && MinSVESize == MaxSVESize &&
28673 MaxSVESize == VT.getSizeInBits())
28674 PgPattern = AArch64SVEPredPattern::all;
28675
28676 MVT MaskVT;
28677 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
28678 default:
28679 llvm_unreachable("unexpected element type for SVE predicate");
28680 case MVT::i8:
28681 MaskVT = MVT::nxv16i1;
28682 break;
28683 case MVT::i16:
28684 case MVT::f16:
28685 case MVT::bf16:
28686 MaskVT = MVT::nxv8i1;
28687 break;
28688 case MVT::i32:
28689 case MVT::f32:
28690 MaskVT = MVT::nxv4i1;
28691 break;
28692 case MVT::i64:
28693 case MVT::f64:
28694 MaskVT = MVT::nxv2i1;
28695 break;
28696 }
28697
28698 return getPTrue(DAG, DL, VT: MaskVT, Pattern: *PgPattern);
28699}
28700
28701static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
28702 EVT VT) {
28703 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
28704 "Expected legal scalable vector!");
28705 auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1);
28706 return getPTrue(DAG, DL, VT: PredTy, Pattern: AArch64SVEPredPattern::all);
28707}
28708
28709static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
28710 if (VT.isFixedLengthVector())
28711 return getPredicateForFixedLengthVector(DAG, DL, VT);
28712
28713 return getPredicateForScalableVector(DAG, DL, VT);
28714}
28715
28716// Grow V to consume an entire SVE register.
28717static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28718 assert(VT.isScalableVector() &&
28719 "Expected to convert into a scalable vector!");
28720 assert(V.getValueType().isFixedLengthVector() &&
28721 "Expected a fixed length vector operand!");
28722 SDLoc DL(V);
28723 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
28724 return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT, N1: DAG.getUNDEF(VT), N2: V, N3: Zero);
28725}
28726
28727// Shrink V so it's just big enough to maintain a VT's worth of data.
28728static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
28729 assert(VT.isFixedLengthVector() &&
28730 "Expected to convert into a fixed length vector!");
28731 assert(V.getValueType().isScalableVector() &&
28732 "Expected a scalable vector operand!");
28733 SDLoc DL(V);
28734 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
28735 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: V, N2: Zero);
28736}
28737
28738// Convert all fixed length vector loads larger than NEON to masked_loads.
28739SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
28740 SDValue Op, SelectionDAG &DAG) const {
28741 auto Load = cast<LoadSDNode>(Val&: Op);
28742
28743 SDLoc DL(Op);
28744 EVT VT = Op.getValueType();
28745 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28746 EVT LoadVT = ContainerVT;
28747 EVT MemVT = Load->getMemoryVT();
28748
28749 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28750
28751 if (VT.isFloatingPoint()) {
28752 LoadVT = ContainerVT.changeTypeToInteger();
28753 MemVT = MemVT.changeTypeToInteger();
28754 }
28755
28756 SDValue NewLoad = DAG.getMaskedLoad(
28757 VT: LoadVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), Mask: Pg,
28758 Src0: DAG.getUNDEF(VT: LoadVT), MemVT, MMO: Load->getMemOperand(),
28759 AM: Load->getAddressingMode(), Load->getExtensionType());
28760
28761 SDValue Result = NewLoad;
28762 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
28763 EVT ExtendVT = ContainerVT.changeVectorElementType(
28764 EltVT: Load->getMemoryVT().getVectorElementType());
28765
28766 Result = getSVESafeBitCast(VT: ExtendVT, Op: Result, DAG);
28767 Result = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
28768 N1: Pg, N2: Result, N3: DAG.getUNDEF(VT: ContainerVT));
28769 } else if (VT.isFloatingPoint()) {
28770 Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Result);
28771 }
28772
28773 Result = convertFromScalableVector(DAG, VT, V: Result);
28774 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
28775 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
28776}
28777
28778static SDValue convertFixedMaskToScalableVector(SDValue Mask,
28779 SelectionDAG &DAG) {
28780 SDLoc DL(Mask);
28781 EVT InVT = Mask.getValueType();
28782 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
28783 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
28784
28785 if (ISD::isBuildVectorAllOnes(N: Mask.getNode()))
28786 return Pg;
28787
28788 bool InvertCond = false;
28789 if (isBitwiseNot(V: Mask)) {
28790 InvertCond = true;
28791 Mask = Mask.getOperand(i: 0);
28792 }
28793
28794 SDValue Op1, Op2;
28795 ISD::CondCode CC;
28796
28797 // When Mask is the result of a SETCC, it's better to regenerate the compare.
28798 if (Mask.getOpcode() == ISD::SETCC) {
28799 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask.getOperand(i: 0));
28800 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask.getOperand(i: 1));
28801 CC = cast<CondCodeSDNode>(Val: Mask.getOperand(i: 2))->get();
28802 } else {
28803 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask);
28804 Op2 = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
28805 CC = ISD::SETNE;
28806 }
28807
28808 if (InvertCond)
28809 CC = getSetCCInverse(Operation: CC, Type: Op1.getValueType());
28810
28811 return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: Pg.getValueType(),
28812 Ops: {Pg, Op1, Op2, DAG.getCondCode(Cond: CC)});
28813}
28814
28815// Convert all fixed length vector loads larger than NEON to masked_loads.
28816SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
28817 SDValue Op, SelectionDAG &DAG) const {
28818 auto Load = cast<MaskedLoadSDNode>(Val&: Op);
28819
28820 SDLoc DL(Op);
28821 EVT VT = Op.getValueType();
28822 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28823
28824 SDValue Mask = Load->getMask();
  // If this is an extending load and the mask type is not the same as the
  // load's type, then we have to extend the mask type.
28827 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
28828 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
28829 "Incorrect mask type");
28830 Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT, Operand: Mask);
28831 }
28832 Mask = convertFixedMaskToScalableVector(Mask, DAG);
28833
28834 SDValue PassThru;
28835 bool IsPassThruZeroOrUndef = false;
28836
28837 if (Load->getPassThru()->isUndef()) {
28838 PassThru = DAG.getUNDEF(VT: ContainerVT);
28839 IsPassThruZeroOrUndef = true;
28840 } else {
28841 if (ContainerVT.isInteger())
28842 PassThru = DAG.getConstant(Val: 0, DL, VT: ContainerVT);
28843 else
28844 PassThru = DAG.getConstantFP(Val: 0, DL, VT: ContainerVT);
28845 if (isZerosVector(N: Load->getPassThru().getNode()))
28846 IsPassThruZeroOrUndef = true;
28847 }
28848
28849 SDValue NewLoad = DAG.getMaskedLoad(
28850 VT: ContainerVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(),
28851 Mask, Src0: PassThru, MemVT: Load->getMemoryVT(), MMO: Load->getMemOperand(),
28852 AM: Load->getAddressingMode(), Load->getExtensionType());
28853
28854 SDValue Result = NewLoad;
28855 if (!IsPassThruZeroOrUndef) {
28856 SDValue OldPassThru =
28857 convertToScalableVector(DAG, VT: ContainerVT, V: Load->getPassThru());
28858 Result = DAG.getSelect(DL, VT: ContainerVT, Cond: Mask, LHS: Result, RHS: OldPassThru);
28859 }
28860
28861 Result = convertFromScalableVector(DAG, VT, V: Result);
28862 SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)};
28863 return DAG.getMergeValues(Ops: MergedValues, dl: DL);
28864}
28865
28866// Convert all fixed length vector stores larger than NEON to masked_stores.
28867SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
28868 SDValue Op, SelectionDAG &DAG) const {
28869 auto Store = cast<StoreSDNode>(Val&: Op);
28870
28871 SDLoc DL(Op);
28872 EVT VT = Store->getValue().getValueType();
28873 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28874 EVT MemVT = Store->getMemoryVT();
28875
28876 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28877 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
28878
28879 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
28880 EVT TruncVT = ContainerVT.changeVectorElementType(
28881 EltVT: Store->getMemoryVT().getVectorElementType());
28882 MemVT = MemVT.changeTypeToInteger();
28883 NewValue = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: TruncVT, N1: Pg,
28884 N2: NewValue, N3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64),
28885 N4: DAG.getUNDEF(VT: TruncVT));
28886 NewValue =
28887 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
28888 } else if (VT.isFloatingPoint()) {
28889 MemVT = MemVT.changeTypeToInteger();
28890 NewValue =
28891 getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG);
28892 }
28893
28894 return DAG.getMaskedStore(Chain: Store->getChain(), dl: DL, Val: NewValue,
28895 Base: Store->getBasePtr(), Offset: Store->getOffset(), Mask: Pg, MemVT,
28896 MMO: Store->getMemOperand(), AM: Store->getAddressingMode(),
28897 IsTruncating: Store->isTruncatingStore());
28898}
28899
28900SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
28901 SDValue Op, SelectionDAG &DAG) const {
28902 auto *Store = cast<MaskedStoreSDNode>(Val&: Op);
28903
28904 SDLoc DL(Op);
28905 EVT VT = Store->getValue().getValueType();
28906 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28907
28908 auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue());
28909 SDValue Mask = convertFixedMaskToScalableVector(Mask: Store->getMask(), DAG);
28910
28911 return DAG.getMaskedStore(
28912 Chain: Store->getChain(), dl: DL, Val: NewValue, Base: Store->getBasePtr(), Offset: Store->getOffset(),
28913 Mask, MemVT: Store->getMemoryVT(), MMO: Store->getMemOperand(),
28914 AM: Store->getAddressingMode(), IsTruncating: Store->isTruncatingStore());
28915}
28916
28917SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
28918 SDValue Op, SelectionDAG &DAG) const {
28919 SDLoc DL(Op);
28920 EVT VT = Op.getValueType();
28921 EVT EltVT = VT.getVectorElementType();
28922
28923 bool Signed = Op.getOpcode() == ISD::SDIV;
28924 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
28925
28926 bool Negated;
28927 uint64_t SplatVal;
28928 if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) {
28929 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28930 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
28931 SDValue Op2 = DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL, VT: MVT::i32);
28932
28933 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28934 SDValue Res =
28935 DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL, VT: ContainerVT, N1: Pg, N2: Op1, N3: Op2);
28936 if (Negated)
28937 Res = DAG.getNode(Opcode: ISD::SUB, DL, VT: ContainerVT,
28938 N1: DAG.getConstant(Val: 0, DL, VT: ContainerVT), N2: Res);
28939
28940 return convertFromScalableVector(DAG, VT, V: Res);
28941 }
28942
28943 // Scalable vector i32/i64 DIV is supported.
28944 if (EltVT == MVT::i32 || EltVT == MVT::i64)
28945 return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode);
28946
28947 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
28948 EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext());
28949 EVT PromVT = HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext());
28950 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28951
28952 // If the wider type is legal: extend, op, and truncate.
28953 EVT WideVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext());
28954 if (DAG.getTargetLoweringInfo().isTypeLegal(VT: WideVT)) {
28955 SDValue Op0 = DAG.getNode(Opcode: ExtendOpcode, DL, VT: WideVT, Operand: Op.getOperand(i: 0));
28956 SDValue Op1 = DAG.getNode(Opcode: ExtendOpcode, DL, VT: WideVT, Operand: Op.getOperand(i: 1));
28957 SDValue Div = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: WideVT, N1: Op0, N2: Op1);
28958 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Div);
28959 }
28960
28961 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
28962 &ExtendOpcode](SDValue Op) {
28963 SDValue IdxZero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
28964 SDValue IdxHalf =
28965 DAG.getConstant(Val: HalfVT.getVectorNumElements(), DL, VT: MVT::i64);
28966 SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Op, N2: IdxZero);
28967 SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Op, N2: IdxHalf);
28968 return std::pair<SDValue, SDValue>(
28969 {DAG.getNode(Opcode: ExtendOpcode, DL, VT: PromVT, Operand: Lo),
28970 DAG.getNode(Opcode: ExtendOpcode, DL, VT: PromVT, Operand: Hi)});
28971 };
28972
  // If the wider type is not legal: split, extend, op, truncate and concatenate.
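  // For example, assuming a 256-bit SVE vector length where v32i16 is not
  // legal: a v32i8 sdiv is split into two v16i8 halves, each half is
  // sign-extended to v16i16, the two halves are divided (and lowered again
  // recursively), truncated back to v16i8, and concatenated into the v32i8
  // result.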
28974 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(i: 0));
28975 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(i: 1));
28976 SDValue Lo = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: PromVT, N1: Op0LoExt, N2: Op1LoExt);
28977 SDValue Hi = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: PromVT, N1: Op0HiExt, N2: Op1HiExt);
28978 SDValue LoTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: Lo);
28979 SDValue HiTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: Hi);
28980 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops: {LoTrunc, HiTrunc});
28981}
28982
28983SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
28984 SDValue Op, SelectionDAG &DAG) const {
28985 EVT VT = Op.getValueType();
28986 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28987
28988 SDLoc DL(Op);
28989 SDValue Val = Op.getOperand(i: 0);
28990 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
28991 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
28992
28993 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
28994 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
28995
28996 // Repeatedly unpack Val until the result is of the desired element type.
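  // e.g. extending i8 elements all the way to i64 unpacks
  // nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64, using SUNPKLO for sign extension
  // and UUNPKLO for zero extension at each step.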
28997 switch (ContainerVT.getSimpleVT().SimpleTy) {
28998 default:
28999 llvm_unreachable("unimplemented container type");
29000 case MVT::nxv16i8:
29001 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv8i16, Operand: Val);
29002 if (VT.getVectorElementType() == MVT::i16)
29003 break;
29004 [[fallthrough]];
29005 case MVT::nxv8i16:
29006 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv4i32, Operand: Val);
29007 if (VT.getVectorElementType() == MVT::i32)
29008 break;
29009 [[fallthrough]];
29010 case MVT::nxv4i32:
29011 Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv2i64, Operand: Val);
29012 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
29013 break;
29014 }
29015
29016 return convertFromScalableVector(DAG, VT, V: Val);
29017}
29018
29019SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
29020 SDValue Op, SelectionDAG &DAG) const {
29021 EVT VT = Op.getValueType();
29022 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29023
29024 SDLoc DL(Op);
29025 SDValue Val = Op.getOperand(i: 0);
29026 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType());
29027 Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val);
29028
29029 // Repeatedly truncate Val until the result is of the desired element type.
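  // e.g. truncating i64 elements down to i8 steps through
  // nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8, with each step bitcasting to the
  // narrower packed type and using UZP1 to deinterleave the sub-elements that
  // form the truncated result.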
29030 switch (ContainerVT.getSimpleVT().SimpleTy) {
29031 default:
29032 llvm_unreachable("unimplemented container type");
29033 case MVT::nxv2i64:
29034 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv4i32, Operand: Val);
29035 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv4i32, N1: Val, N2: Val);
29036 if (VT.getVectorElementType() == MVT::i32)
29037 break;
29038 [[fallthrough]];
29039 case MVT::nxv4i32:
29040 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv8i16, Operand: Val);
29041 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv8i16, N1: Val, N2: Val);
29042 if (VT.getVectorElementType() == MVT::i16)
29043 break;
29044 [[fallthrough]];
29045 case MVT::nxv8i16:
29046 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i8, Operand: Val);
29047 Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv16i8, N1: Val, N2: Val);
29048 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
29049 break;
29050 }
29051
29052 return convertFromScalableVector(DAG, VT, V: Val);
29053}
29054
29055SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
29056 SDValue Op, SelectionDAG &DAG) const {
29057 EVT VT = Op.getValueType();
29058 EVT InVT = Op.getOperand(i: 0).getValueType();
29059 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
29060
29061 SDLoc DL(Op);
29062 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29063 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
29064
29065 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op0, N2: Op.getOperand(i: 1));
29066}
29067
29068SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
29069 SDValue Op, SelectionDAG &DAG) const {
29070 EVT VT = Op.getValueType();
29071 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29072
29073 SDLoc DL(Op);
29074 EVT InVT = Op.getOperand(i: 0).getValueType();
29075 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29076 SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0));
29077
29078 auto ScalableRes = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, N1: Op0,
29079 N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2));
29080
29081 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29082}
29083
29084// Convert vector operation 'Op' to an equivalent predicated operation whereby
29085// the original operation's type is used to construct a suitable predicate.
29086// NOTE: The results for inactive lanes are undefined.
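// For example, a fixed-length (fadd x, y) is emitted as (NewOp pg, x', y'),
// where x' and y' are the operands widened to the scalable container type, pg
// is a predicate covering only the original type's lanes, and an undef
// passthru operand is appended for merge-passthru opcodes.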
29087SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
29088 SelectionDAG &DAG,
29089 unsigned NewOp) const {
29090 EVT VT = Op.getValueType();
29091 SDLoc DL(Op);
29092 auto Pg = getPredicateForVector(DAG, DL, VT);
29093
29094 if (VT.isFixedLengthVector()) {
29095 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
29096 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29097
29098 // Create list of operands by converting existing ones to scalable types.
29099 SmallVector<SDValue, 4> Operands = {Pg};
29100 for (const SDValue &V : Op->op_values()) {
29101 if (isa<CondCodeSDNode>(Val: V)) {
29102 Operands.push_back(Elt: V);
29103 continue;
29104 }
29105
29106 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(Val: V)) {
29107 EVT VTArg = VTNode->getVT().getVectorElementType();
29108 EVT NewVTArg = ContainerVT.changeVectorElementType(EltVT: VTArg);
29109 Operands.push_back(Elt: DAG.getValueType(NewVTArg));
29110 continue;
29111 }
29112
29113 assert(isTypeLegal(V.getValueType()) &&
29114 "Expected only legal fixed-width types");
29115 Operands.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
29116 }
29117
29118 if (isMergePassthruOpcode(Opc: NewOp))
29119 Operands.push_back(Elt: DAG.getUNDEF(VT: ContainerVT));
29120
29121 auto ScalableRes = DAG.getNode(Opcode: NewOp, DL, VT: ContainerVT, Ops: Operands);
29122 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29123 }
29124
29125 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
29126
29127 SmallVector<SDValue, 4> Operands = {Pg};
29128 for (const SDValue &V : Op->op_values()) {
29129 assert((!V.getValueType().isVector() ||
29130 V.getValueType().isScalableVector()) &&
29131 "Only scalable vectors are supported!");
29132 Operands.push_back(Elt: V);
29133 }
29134
29135 if (isMergePassthruOpcode(Opc: NewOp))
29136 Operands.push_back(Elt: DAG.getUNDEF(VT));
29137
29138 return DAG.getNode(Opcode: NewOp, DL, VT, Ops: Operands, Flags: Op->getFlags());
29139}
29140
29141// If a fixed length vector operation has no side effects when applied to
29142// undefined elements, we can safely use scalable vectors to perform the same
29143// operation without needing to worry about predication.
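// For example, a fixed-length (xor x, y) can simply be widened to a scalable
// (xor x', y') on the container type; whatever the operation computes in the
// lanes beyond the original fixed-length vector is discarded when the result
// is converted back.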
29144SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
29145 SelectionDAG &DAG) const {
29146 EVT VT = Op.getValueType();
29147 assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
29148 "Only expected to lower fixed length vector operation!");
29149 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29150
29151 // Create list of operands by converting existing ones to scalable types.
29152 SmallVector<SDValue, 4> Ops;
29153 for (const SDValue &V : Op->op_values()) {
29154 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
29155
29156 // Pass through non-vector operands.
29157 if (!V.getValueType().isVector()) {
29158 Ops.push_back(Elt: V);
29159 continue;
29160 }
29161
29162 // "cast" fixed length vector to a scalable vector.
29163 assert(V.getValueType().isFixedLengthVector() &&
29164 isTypeLegal(V.getValueType()) &&
29165 "Only fixed length vectors are supported!");
29166 Ops.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V));
29167 }
29168
29169 auto ScalableRes = DAG.getNode(Opcode: Op.getOpcode(), DL: SDLoc(Op), VT: ContainerVT, Ops);
29170 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29171}
29172
29173SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
29174 SelectionDAG &DAG) const {
29175 SDLoc DL(ScalarOp);
29176 SDValue AccOp = ScalarOp.getOperand(i: 0);
29177 SDValue VecOp = ScalarOp.getOperand(i: 1);
29178 EVT SrcVT = VecOp.getValueType();
29179 EVT ResVT = SrcVT.getVectorElementType();
29180
29181 EVT ContainerVT = SrcVT;
29182 if (SrcVT.isFixedLengthVector()) {
29183 ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29184 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
29185 }
29186
29187 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
29188 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
29189
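  // FADDA performs an ordered floating-point accumulation across the active
  // lanes, so the scalar accumulator is first placed in lane 0 of an otherwise
  // undef vector and the scalar result is read back from lane 0 of the
  // reduction.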
29190 // Convert operands to Scalable.
29191 AccOp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT,
29192 N1: DAG.getUNDEF(VT: ContainerVT), N2: AccOp, N3: Zero);
29193
29194 // Perform reduction.
29195 SDValue Rdx = DAG.getNode(Opcode: AArch64ISD::FADDA_PRED, DL, VT: ContainerVT,
29196 N1: Pg, N2: AccOp, N3: VecOp);
29197
29198 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, N1: Rdx, N2: Zero);
29199}
29200
29201SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
29202 SelectionDAG &DAG) const {
29203 SDLoc DL(ReduceOp);
29204 SDValue Op = ReduceOp.getOperand(i: 0);
29205 EVT OpVT = Op.getValueType();
29206 EVT VT = ReduceOp.getValueType();
29207
29208 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
29209 return SDValue();
29210
29211 SDValue Pg = getPredicateForVector(DAG, DL, VT: OpVT);
29212
29213 switch (ReduceOp.getOpcode()) {
29214 default:
29215 return SDValue();
29216 case ISD::VECREDUCE_OR:
29217 if (isAllActivePredicate(DAG, N: Pg) && OpVT == MVT::nxv16i1)
29218 // The predicate can be 'Op' because
29219 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
29220 return getPTest(DAG, VT, Pg: Op, Op, Cond: AArch64CC::ANY_ACTIVE);
29221 else
29222 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::ANY_ACTIVE);
29223 case ISD::VECREDUCE_AND: {
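    // vecreduce_and(Op) is true iff every lane of Op governed by Pg is set.
    // XORing Op with the all-active predicate Pg inverts those lanes, so a
    // NONE_ACTIVE test on the inverted value checks exactly that condition.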
29224 Op = DAG.getNode(Opcode: ISD::XOR, DL, VT: OpVT, N1: Op, N2: Pg);
29225 return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::NONE_ACTIVE);
29226 }
29227 case ISD::VECREDUCE_XOR: {
29228 SDValue ID =
29229 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64);
29230 if (OpVT == MVT::nxv1i1) {
29231 // Emulate a CNTP on .Q using .D and a different governing predicate.
29232 Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Pg);
29233 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Op);
29234 }
29235 SDValue Cntp =
29236 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64, N1: ID, N2: Pg, N3: Op);
29237 return DAG.getAnyExtOrTrunc(Op: Cntp, DL, VT);
29238 }
29239 }
29240
29241 return SDValue();
29242}
29243
29244SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
29245 SDValue ScalarOp,
29246 SelectionDAG &DAG) const {
29247 SDLoc DL(ScalarOp);
29248 SDValue VecOp = ScalarOp.getOperand(i: 0);
29249 EVT SrcVT = VecOp.getValueType();
29250
29251 if (useSVEForFixedLengthVectorVT(
29252 VT: SrcVT,
29253 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
29254 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29255 VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp);
29256 }
29257
29258 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
29259 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
29260 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
29261 SDValue BoolVec = VecOp.getOperand(i: 0);
29262 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
29263 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
29264 SDValue CntpOp = DAG.getNode(
29265 Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64,
29266 N1: DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64),
29267 N2: BoolVec, N3: BoolVec);
29268 return DAG.getAnyExtOrTrunc(Op: CntpOp, DL, VT: ScalarOp.getValueType());
29269 }
29270 }
29271
29272 // UADDV always returns an i64 result.
29273 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
29274 SrcVT.getVectorElementType();
29275 EVT RdxVT = SrcVT;
29276 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
29277 RdxVT = getPackedSVEVectorVT(VT: ResVT);
29278
29279 SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT);
29280 SDValue Rdx = DAG.getNode(Opcode, DL, VT: RdxVT, N1: Pg, N2: VecOp);
29281 SDValue Res = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT,
29282 N1: Rdx, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
29283
  // The VEC_REDUCE nodes expect an element-sized result.
29285 if (ResVT != ScalarOp.getValueType())
29286 Res = DAG.getAnyExtOrTrunc(Op: Res, DL, VT: ScalarOp.getValueType());
29287
29288 return Res;
29289}
29290
29291SDValue
29292AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
29293 SelectionDAG &DAG) const {
29294 EVT VT = Op.getValueType();
29295 SDLoc DL(Op);
29296
29297 EVT InVT = Op.getOperand(i: 1).getValueType();
29298 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29299 SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 1));
29300 SDValue Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 2));
29301
  // Convert the mask to a predicate (NOTE: We don't need to worry about
  // inactive lanes since VSELECT is safe when given undefined elements).
29304 EVT MaskVT = Op.getOperand(i: 0).getValueType();
29305 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskVT);
29306 auto Mask = convertToScalableVector(DAG, VT: MaskContainerVT, V: Op.getOperand(i: 0));
29307 Mask = DAG.getNode(Opcode: ISD::TRUNCATE, DL,
29308 VT: MaskContainerVT.changeVectorElementType(EltVT: MVT::i1), Operand: Mask);
29309
29310 auto ScalableRes = DAG.getNode(Opcode: ISD::VSELECT, DL, VT: ContainerVT,
29311 N1: Mask, N2: Op1, N3: Op2);
29312
29313 return convertFromScalableVector(DAG, VT, V: ScalableRes);
29314}
29315
29316SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
29317 SDValue Op, SelectionDAG &DAG) const {
29318 SDLoc DL(Op);
29319 EVT InVT = Op.getOperand(i: 0).getValueType();
29320 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT);
29321
29322 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
29323 "Only expected to lower fixed length vector operation!");
29324 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
29325 "Expected integer result of the same bit length as the inputs!");
29326
29327 auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0));
29328 auto Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 1));
29329 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT);
29330
29331 EVT CmpVT = Pg.getValueType();
29332 auto Cmp = DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: CmpVT,
29333 Ops: {Pg, Op1, Op2, Op.getOperand(i: 2)});
29334
29335 EVT PromoteVT = ContainerVT.changeTypeToInteger();
29336 auto Promote = DAG.getBoolExtOrTrunc(Op: Cmp, SL: DL, VT: PromoteVT, OpVT: InVT);
29337 return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Promote);
29338}
29339
29340SDValue
29341AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
29342 SelectionDAG &DAG) const {
29343 SDLoc DL(Op);
29344 auto SrcOp = Op.getOperand(i: 0);
29345 EVT VT = Op.getValueType();
29346 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29347 EVT ContainerSrcVT =
29348 getContainerForFixedLengthVector(DAG, VT: SrcOp.getValueType());
29349
29350 SrcOp = convertToScalableVector(DAG, VT: ContainerSrcVT, V: SrcOp);
29351 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerDstVT, Operand: SrcOp);
29352 return convertFromScalableVector(DAG, VT, V: Op);
29353}
29354
29355SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
29356 SDValue Op, SelectionDAG &DAG) const {
29357 SDLoc DL(Op);
29358 unsigned NumOperands = Op->getNumOperands();
29359
29360 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
29361 "Unexpected number of operands in CONCAT_VECTORS");
29362
29363 auto SrcOp1 = Op.getOperand(i: 0);
29364 auto SrcOp2 = Op.getOperand(i: 1);
29365 EVT VT = Op.getValueType();
29366 EVT SrcVT = SrcOp1.getValueType();
29367
29368 // Match a splat of 128b segments that fit in a single register.
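  // e.g. (concat_vectors x, x, x, x), where x is a v2i64, becomes a
  // DUPLANE128 of segment 0, splatting the 128-bit segment across the whole
  // SVE register.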
29369 if (SrcVT.is128BitVector() && all_equal(Range: Op.getNode()->op_values())) {
29370 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29371 SDValue Splat =
29372 DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: ContainerVT,
29373 N1: convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1),
29374 N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64, /*isTarget=*/true));
29375 return convertFromScalableVector(DAG, VT, V: Splat);
29376 }
29377
29378 if (NumOperands > 2) {
29379 SmallVector<SDValue, 4> Ops;
29380 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext());
29381 for (unsigned I = 0; I < NumOperands; I += 2)
29382 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: PairVT,
29383 N1: Op->getOperand(Num: I), N2: Op->getOperand(Num: I + 1)));
29384
29385 return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops);
29386 }
29387
29388 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29389
29390 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
29391 SrcOp1 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1);
29392 SrcOp2 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp2);
29393
29394 Op = DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: ContainerVT, N1: Pg, N2: SrcOp1, N3: SrcOp2);
29395
29396 return convertFromScalableVector(DAG, VT, V: Op);
29397}
29398
29399SDValue
29400AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
29401 SelectionDAG &DAG) const {
29402 EVT VT = Op.getValueType();
29403 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29404
29405 SDLoc DL(Op);
29406 SDValue Val = Op.getOperand(i: 0);
29407 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29408 EVT SrcVT = Val.getValueType();
29409 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29410 EVT ExtendVT = ContainerVT.changeVectorElementType(
29411 EltVT: SrcVT.getVectorElementType());
29412
29413 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
29414 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VT.changeTypeToInteger(), Operand: Val);
29415
29416 Val = convertToScalableVector(DAG, VT: ContainerVT.changeTypeToInteger(), V: Val);
29417 Val = getSVESafeBitCast(VT: ExtendVT, Op: Val, DAG);
29418 Val = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT,
29419 N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: ContainerVT));
29420
29421 return convertFromScalableVector(DAG, VT, V: Val);
29422}
29423
29424SDValue
29425AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
29426 SelectionDAG &DAG) const {
29427 EVT VT = Op.getValueType();
29428 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29429
29430 SDLoc DL(Op);
29431 SDValue Val = Op.getOperand(i: 0);
29432 EVT SrcVT = Val.getValueType();
29433 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29434 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
29435 EltVT: VT.getVectorElementType());
29436 SDValue Pg = getPredicateForVector(DAG, DL, VT: RoundVT);
29437
29438 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
29439 Val = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: RoundVT, N1: Pg, N2: Val,
29440 N3: Op.getOperand(i: 1), N4: DAG.getUNDEF(VT: RoundVT));
29441 Val = getSVESafeBitCast(VT: ContainerSrcVT.changeTypeToInteger(), Op: Val, DAG);
29442 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
29443
29444 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
29445 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
29446}
29447
29448SDValue
29449AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
29450 SelectionDAG &DAG) const {
29451 EVT VT = Op.getValueType();
29452 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29453
29454 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
29455 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
29456 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
29457
29458 SDLoc DL(Op);
29459 SDValue Val = Op.getOperand(i: 0);
29460 EVT SrcVT = Val.getValueType();
29461 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29462 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29463
29464 if (VT.bitsGE(VT: SrcVT)) {
29465 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29466
29467 Val = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
29468 VT: VT.changeTypeToInteger(), Operand: Val);
29469
    // It is safe to use a larger than specified operand because promoting the
    // value changes nothing from an arithmetic point of view.
29472 Val =
29473 convertToScalableVector(DAG, VT: ContainerDstVT.changeTypeToInteger(), V: Val);
29474 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
29475 N3: DAG.getUNDEF(VT: ContainerDstVT));
29476 return convertFromScalableVector(DAG, VT, V: Val);
29477 } else {
29478 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
29479 EltVT: ContainerDstVT.getVectorElementType());
29480 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
29481
29482 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
29483 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
29484 Val = getSVESafeBitCast(VT: ContainerSrcVT, Op: Val, DAG);
29485 Val = convertFromScalableVector(DAG, VT: SrcVT, V: Val);
29486
29487 Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val);
29488 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val);
29489 }
29490}
29491
29492SDValue
29493AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
29494 SelectionDAG &DAG) const {
29495 SDLoc DL(Op);
29496 EVT OpVT = Op.getValueType();
29497 assert(OpVT.isScalableVector() &&
29498 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
29499
29500 // Are multi-register uzp instructions available?
29501 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
29502 OpVT.getVectorElementType() != MVT::i1) {
29503 Intrinsic::ID IntID;
29504 switch (Op->getNumOperands()) {
29505 default:
29506 return SDValue();
29507 case 2:
29508 IntID = Intrinsic::aarch64_sve_uzp_x2;
29509 break;
29510 case 4:
29511 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
29512 OpVT.getScalarSizeInBits() == 64)
29513 return SDValue();
29514 IntID = Intrinsic::aarch64_sve_uzp_x4;
29515 break;
29516 }
29517
29518 SmallVector<SDValue, 5> Ops;
29519 Ops.push_back(Elt: DAG.getTargetConstant(Val: IntID, DL, VT: MVT::i64));
29520 Ops.append(in_start: Op->op_values().begin(), in_end: Op->op_values().end());
29521 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VTList: Op->getVTList(), Ops);
29522 }
29523
29524 if (Op->getNumOperands() != 2)
29525 return SDValue();
29526
29527 SDValue Even = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29528 N2: Op.getOperand(i: 1));
29529 SDValue Odd = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29530 N2: Op.getOperand(i: 1));
29531 return DAG.getMergeValues(Ops: {Even, Odd}, dl: DL);
29532}
29533
29534SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
29535 SelectionDAG &DAG) const {
29536 SDLoc DL(Op);
29537 EVT OpVT = Op.getValueType();
29538 assert(OpVT.isScalableVector() &&
29539 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
29540
29541 // Are multi-register zip instructions available?
29542 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
29543 OpVT.getVectorElementType() != MVT::i1) {
29544 Intrinsic::ID IntID;
29545 switch (Op->getNumOperands()) {
29546 default:
29547 return SDValue();
29548 case 2:
29549 IntID = Intrinsic::aarch64_sve_zip_x2;
29550 break;
29551 case 4:
29552 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
29553 OpVT.getScalarSizeInBits() == 64)
29554 return SDValue();
29555 IntID = Intrinsic::aarch64_sve_zip_x4;
29556 break;
29557 }
29558
29559 SmallVector<SDValue, 5> Ops;
29560 Ops.push_back(Elt: DAG.getTargetConstant(Val: IntID, DL, VT: MVT::i64));
29561 Ops.append(in_start: Op->op_values().begin(), in_end: Op->op_values().end());
29562 return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VTList: Op->getVTList(), Ops);
29563 }
29564
29565 if (Op->getNumOperands() != 2)
29566 return SDValue();
29567
29568 SDValue Lo = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29569 N2: Op.getOperand(i: 1));
29570 SDValue Hi = DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: OpVT, N1: Op.getOperand(i: 0),
29571 N2: Op.getOperand(i: 1));
29572 return DAG.getMergeValues(Ops: {Lo, Hi}, dl: DL);
29573}
29574
29575SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
29576 SelectionDAG &DAG) const {
29577 // FIXME: Maybe share some code with LowerMGather/Scatter?
29578 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Val&: Op);
29579 SDLoc DL(HG);
29580 SDValue Chain = HG->getChain();
29581 SDValue Inc = HG->getInc();
29582 SDValue Mask = HG->getMask();
29583 SDValue Ptr = HG->getBasePtr();
29584 SDValue Index = HG->getIndex();
29585 SDValue Scale = HG->getScale();
29586 SDValue IntID = HG->getIntID();
29587
29588 // The Intrinsic ID determines the type of update operation.
29589 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(Val: IntID.getNode());
29590 // Right now, we only support 'add' as an update.
29591 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
29592 "Unexpected histogram update operation");
29593
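  // The update is lowered as: gather the current bucket values, use the
  // sve_histcnt intrinsic to count matching indices per lane, multiply the
  // counts by the increment, add the products to the gathered values, and
  // scatter the sums back to the buckets.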
29594 EVT IndexVT = Index.getValueType();
29595 LLVMContext &Ctx = *DAG.getContext();
29596 ElementCount EC = IndexVT.getVectorElementCount();
29597 EVT MemVT = EVT::getVectorVT(Context&: Ctx, VT: HG->getMemoryVT(), EC);
29598 EVT IncExtVT =
29599 EVT::getIntegerVT(Context&: Ctx, BitWidth: AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
29600 EVT IncSplatVT = EVT::getVectorVT(Context&: Ctx, VT: IncExtVT, EC);
29601 bool ExtTrunc = IncSplatVT != MemVT;
29602
29603 SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64);
29604 SDValue PassThru = DAG.getSplatVector(VT: IncSplatVT, DL, Op: Zero);
29605 SDValue IncSplat = DAG.getSplatVector(
29606 VT: IncSplatVT, DL, Op: DAG.getAnyExtOrTrunc(Op: Inc, DL, VT: IncExtVT));
29607 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
29608
29609 MachineMemOperand *MMO = HG->getMemOperand();
29610 // Create an MMO for the gather, without load|store flags.
29611 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
29612 PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOLoad, Size: MMO->getSize(),
29613 BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo());
29614 ISD::MemIndexType IndexType = HG->getIndexType();
29615 SDValue Gather = DAG.getMaskedGather(
29616 VTs: DAG.getVTList(VT1: IncSplatVT, VT2: MVT::Other), MemVT, dl: DL, Ops, MMO: GMMO, IndexType,
29617 ExtTy: ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
29618
29619 SDValue GChain = Gather.getValue(R: 1);
29620
29621 // Perform the histcnt, multiply by inc, add to bucket data.
29622 SDValue ID =
29623 DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_histcnt, DL, VT: IncExtVT);
29624 SDValue HistCnt =
29625 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, N1: ID, N2: Mask, N3: Index, N4: Index);
29626 SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: IncSplatVT, N1: HistCnt, N2: IncSplat);
29627 SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: IncSplatVT, N1: Gather, N2: Mul);
29628
29629 // Create an MMO for the scatter, without load|store flags.
29630 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
29631 PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOStore, Size: MMO->getSize(),
29632 BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo());
29633
29634 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
29635 SDValue Scatter = DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT, dl: DL,
29636 Ops: ScatterOps, MMO: SMMO, IndexType, IsTruncating: ExtTrunc);
29637 return Scatter;
29638}
29639
29640/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
29641/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
29642/// however still make use of the dot product instruction by instead
29643/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
29644/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
29645/// the following pattern is emitted:
/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))), ext(EXTRACT_SUBVECTOR(N, NTy/2)))
29648SDValue
29649AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
29650 SelectionDAG &DAG) const {
29651 SDLoc DL(Op);
29652
29653 SDValue Acc = Op.getOperand(i: 0);
29654 SDValue LHS = Op.getOperand(i: 1);
29655 SDValue RHS = Op.getOperand(i: 2);
29656 EVT ResultVT = Op.getValueType();
29657 EVT OrigResultVT = ResultVT;
29658 EVT OpVT = LHS.getValueType();
29659
29660 bool ConvertToScalable =
29661 ResultVT.isFixedLengthVector() &&
29662 useSVEForFixedLengthVectorVT(VT: ResultVT, /*OverrideNEON=*/true);
29663
29664 if (ConvertToScalable) {
29665 ResultVT = getContainerForFixedLengthVector(DAG, VT: ResultVT);
29666 OpVT = getContainerForFixedLengthVector(DAG, VT: LHS.getValueType());
29667 Acc = convertToScalableVector(DAG, VT: ResultVT, V: Acc);
29668 LHS = convertToScalableVector(DAG, VT: OpVT, V: LHS);
29669 RHS = convertToScalableVector(DAG, VT: OpVT, V: RHS);
29670 Op = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: ResultVT, Ops: {Acc, LHS, RHS});
29671 }
29672
29673 // Two-way and four-way partial reductions are supported by patterns.
29674 // We only need to handle the 8-way partial reduction.
29675 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
29676 return ConvertToScalable ? convertFromScalableVector(DAG, VT: OrigResultVT, V: Op)
29677 : Op;
29678
29679 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
29680 SDValue DotNode = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DotVT,
29681 N1: DAG.getConstant(Val: 0, DL, VT: DotVT), N2: LHS, N3: RHS);
29682
29683 SDValue Res;
29684 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
29685 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
29686 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
29687 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
29688 SDValue Lo = DAG.getNode(Opcode: LoOpcode, DL, VT: ResultVT, N1: Acc, N2: DotNode);
29689 Res = DAG.getNode(Opcode: HiOpcode, DL, VT: ResultVT, N1: Lo, N2: DotNode);
29690 } else {
29691 // Fold (nx)v4i32 into (nx)v2i64
29692 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(N: DotNode, DL);
29693 if (IsUnsigned) {
29694 DotNodeLo = DAG.getZExtOrTrunc(Op: DotNodeLo, DL, VT: ResultVT);
29695 DotNodeHi = DAG.getZExtOrTrunc(Op: DotNodeHi, DL, VT: ResultVT);
29696 } else {
29697 DotNodeLo = DAG.getSExtOrTrunc(Op: DotNodeLo, DL, VT: ResultVT);
29698 DotNodeHi = DAG.getSExtOrTrunc(Op: DotNodeHi, DL, VT: ResultVT);
29699 }
29700 auto Lo = DAG.getNode(Opcode: ISD::ADD, DL, VT: ResultVT, N1: Acc, N2: DotNodeLo);
29701 Res = DAG.getNode(Opcode: ISD::ADD, DL, VT: ResultVT, N1: Lo, N2: DotNodeHi);
29702 }
29703
29704 return ConvertToScalable ? convertFromScalableVector(DAG, VT: OrigResultVT, V: Res)
29705 : Res;
29706}
29707
29708SDValue
29709AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
29710 SelectionDAG &DAG) const {
29711 EVT VT = Op.getValueType();
29712 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29713
29714 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
29715 "Lowering fixed length get_active_lane_mask requires SVE!");
29716
29717 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
29718 // but we can use SVE when available.
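  // The mask is computed as a scalable i1 predicate, sign-extended to the
  // integer container type, and the fixed-length prefix is then extracted as
  // the result.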
29719
29720 SDLoc DL(Op);
29721 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29722 EVT WhileVT = ContainerVT.changeElementType(EltVT: MVT::i1);
29723
29724 SDValue Mask = DAG.getNode(Opcode: ISD::GET_ACTIVE_LANE_MASK, DL, VT: WhileVT,
29725 N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1));
29726 SDValue MaskAsInt = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: ContainerVT, Operand: Mask);
29727 return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: MaskAsInt,
29728 N2: DAG.getVectorIdxConstant(Val: 0, DL));
29729}
29730
29731SDValue
29732AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
29733 SelectionDAG &DAG) const {
29734 EVT VT = Op.getValueType();
29735 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29736
29737 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
29738 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
29739 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
29740
29741 SDLoc DL(Op);
29742 SDValue Val = Op.getOperand(i: 0);
29743 EVT SrcVT = Val.getValueType();
29744 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
29745 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT);
29746
29747 if (VT.bitsGT(VT: SrcVT)) {
29748 EVT CvtVT = ContainerDstVT.changeVectorElementType(
29749 EltVT: ContainerSrcVT.getVectorElementType());
29750 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
29751
29752 Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val);
29753 Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Val);
29754
29755 Val = convertToScalableVector(DAG, VT: ContainerDstVT, V: Val);
29756 Val = getSVESafeBitCast(VT: CvtVT, Op: Val, DAG);
29757 Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val,
29758 N3: DAG.getUNDEF(VT: ContainerDstVT));
29759 return convertFromScalableVector(DAG, VT, V: Val);
29760 } else {
29761 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
29762 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT);
29763
29764 // Safe to use a larger than specified result since an fp_to_int where the
29765 // result doesn't fit into the destination is undefined.
29766 Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val);
29767 Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT));
29768 Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val);
29769
29770 return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Val);
29771 }
29772}
29773
29774static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
29775 ArrayRef<int> ShuffleMask, EVT VT,
29776 EVT ContainerVT, SelectionDAG &DAG) {
29777 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
29778 SDLoc DL(Op);
29779 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
29780 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
29781 bool IsSingleOp =
29782 ShuffleVectorInst::isSingleSourceMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size());
29783
29784 if (!Subtarget.isNeonAvailable() && !MinSVESize)
29785 MinSVESize = 128;
29786
  // Two-operand shuffles can only be lowered when SVE2's TBL2 is available;
  // indices that cannot be represented in the mask element type are rejected
  // below.
29789 if (!IsSingleOp && !Subtarget.hasSVE2())
29790 return SDValue();
29791
29792 EVT VTOp1 = Op.getOperand(i: 0).getValueType();
29793 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
29794 unsigned IndexLen = MinSVESize / BitsPerElt;
29795 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
29796 uint64_t MaxOffset = maxUIntN(N: BitsPerElt);
29797 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
29798 EVT MaskType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MaskEltType, NumElements: IndexLen);
29799 bool MinMaxEqual = (MinSVESize == MaxSVESize);
29800 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
29801 "Incorrectly legalised shuffle operation");
29802
29803 SmallVector<SDValue, 8> TBLMask;
29804 // If MinSVESize is not equal to MaxSVESize then we need to know which
29805 // TBL mask element needs adjustment.
29806 SmallVector<SDValue, 8> AddRuntimeVLMask;
29807
  // Bail out for 8-bit element types, because with a 2048-bit SVE register
  // size 8 bits are only sufficient to index into the first source vector.
29810 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
29811 return SDValue();
29812
29813 for (int Index : ShuffleMask) {
    // Treat a poison index as if it referenced element 0.
29815 if (Index < 0)
29816 Index = 0;
29817 // If the mask refers to elements in the second operand, then we have to
    // offset the index by the number of elements in a vector. If this number
    // is not known at compile-time, we need to maintain a mask with 'VL' values
29820 // to add at runtime.
29821 if ((unsigned)Index >= ElementsPerVectorReg) {
29822 if (MinMaxEqual) {
29823 Index += IndexLen - ElementsPerVectorReg;
29824 } else {
29825 Index = Index - ElementsPerVectorReg;
29826 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
29827 }
29828 } else if (!MinMaxEqual)
29829 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
    // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
    // 255, this might point to the last element of the second operand of the
    // shufflevector, so we reject this transform.
29833 if ((unsigned)Index >= MaxOffset)
29834 return SDValue();
29835 TBLMask.push_back(Elt: DAG.getConstant(Val: Index, DL, VT: MVT::i64));
29836 }
29837
  // Choosing an out-of-range index leads to the lane being zeroed, whereas a
  // zero index value would instead duplicate the first lane for these padding
  // elements. Note that for i8 elements an out-of-range index could still be
  // valid for a 2048-bit vector register size.
29842 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
29843 TBLMask.push_back(Elt: DAG.getConstant(Val: (int)MaxOffset, DL, VT: MVT::i64));
29844 if (!MinMaxEqual)
29845 AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
29846 }
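  // For example (assuming MinSVESize == MaxSVESize == 128), shuffling two
  // v4i32 operands with mask <0, 4, 1, 5> keeps the TBL mask as <0, 4, 1, 5>:
  // IndexLen equals ElementsPerVectorReg, so indices 4-7 already select from
  // the second operand of TBL2 and no zeroing padding entries are needed.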
29847
29848 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskType);
29849 SDValue VecMask =
29850 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
29851 SDValue SVEMask = convertToScalableVector(DAG, VT: MaskContainerVT, V: VecMask);
29852
29853 SDValue Shuffle;
29854 if (IsSingleOp)
29855 Shuffle =
29856 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
29857 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl, DL, VT: MVT::i32),
29858 N2: Op1, N3: SVEMask);
29859 else if (Subtarget.hasSVE2()) {
29860 if (!MinMaxEqual) {
29861 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
29862 SDValue VScale = (BitsPerElt == 64)
29863 ? DAG.getVScale(DL, VT: MVT::i64, MulImm: APInt(64, MinNumElts))
29864 : DAG.getVScale(DL, VT: MVT::i32, MulImm: APInt(32, MinNumElts));
29865 SDValue VecMask =
29866 DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen));
29867 SDValue MulByMask = DAG.getNode(
29868 Opcode: ISD::MUL, DL, VT: MaskType,
29869 N1: DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MaskType, Operand: VScale),
29870 N2: DAG.getBuildVector(VT: MaskType, DL,
29871 Ops: ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
29872 SDValue UpdatedVecMask =
29873 DAG.getNode(Opcode: ISD::ADD, DL, VT: MaskType, N1: VecMask, N2: MulByMask);
29874 SVEMask = convertToScalableVector(
29875 DAG, VT: getContainerForFixedLengthVector(DAG, VT: MaskType), V: UpdatedVecMask);
29876 }
29877 Shuffle =
29878 DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
29879 N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl2, DL, VT: MVT::i32),
29880 N2: Op1, N3: Op2, N4: SVEMask);
29881 }
29882 Shuffle = convertFromScalableVector(DAG, VT, V: Shuffle);
29883 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle);
29884}
29885
29886SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
29887 SDValue Op, SelectionDAG &DAG) const {
29888 EVT VT = Op.getValueType();
29889 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
29890
29891 auto *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
29892 auto ShuffleMask = SVN->getMask();
29893
29894 SDLoc DL(Op);
29895 SDValue Op1 = Op.getOperand(i: 0);
29896 SDValue Op2 = Op.getOperand(i: 1);
29897
29898 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
29899 Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op1);
29900 Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op2);
29901
29902 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
29903 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
29904 return MVT::i32;
29905 return ScalarTy;
29906 };
29907
29908 if (SVN->isSplat()) {
29909 unsigned Lane = std::max(a: 0, b: SVN->getSplatIndex());
29910 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29911 SDValue SplatEl = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1,
29912 N2: DAG.getConstant(Val: Lane, DL, VT: MVT::i64));
29913 Op = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: ContainerVT, Operand: SplatEl);
29914 return convertFromScalableVector(DAG, VT, V: Op);
29915 }
29916
29917 bool ReverseEXT = false;
29918 unsigned Imm;
29919 if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm) &&
29920 Imm == VT.getVectorNumElements() - 1) {
29921 if (ReverseEXT)
29922 std::swap(a&: Op1, b&: Op2);
29923 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
29924 SDValue Scalar = DAG.getNode(
29925 Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1,
29926 N2: DAG.getConstant(Val: VT.getVectorNumElements() - 1, DL, VT: MVT::i64));
29927 Op = DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: ContainerVT, N1: Op2, N2: Scalar);
29928 return convertFromScalableVector(DAG, VT, V: Op);
29929 }
29930
29931 unsigned EltSize = VT.getScalarSizeInBits();
29932 for (unsigned BlockSize : {64U, 32U, 16U}) {
29933 if (isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize)) {
29934 unsigned RevOp;
29935 if (EltSize == 8)
29936 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
29937 else if (EltSize == 16)
29938 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
29939 else
29940 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
29941 EVT BlockedVT =
29942 getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: BlockSize));
29943 SDValue Pg = getPredicateForVector(DAG, DL, VT: BlockedVT);
29944 SDValue BlockedOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BlockedVT, Operand: Op1);
29945 SDValue BlockedRev = DAG.getNode(Opcode: RevOp, DL, VT: BlockedVT, N1: Pg, N2: BlockedOp1,
29946 N3: DAG.getUNDEF(VT: BlockedVT));
29947 SDValue Container =
29948 DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: BlockedRev);
29949 return convertFromScalableVector(DAG, VT, V: Container);
29950 }
29951 }
29952
29953 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
29954 isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize: 128)) {
29955 SDValue Pg = getPredicateForVector(DAG, DL, VT);
29956 SDValue Revd = DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL, VT: ContainerVT,
29957 N1: Pg, N2: Op1, N3: DAG.getUNDEF(VT: ContainerVT));
29958 return convertFromScalableVector(DAG, VT, V: Revd);
29959 }
29960
29961 unsigned WhichResult;
29962 if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) &&
29963 WhichResult == 0)
29964 return convertFromScalableVector(
29965 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op2));
29966
29967 if (isTRNMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResult)) {
29968 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29969 return convertFromScalableVector(
29970 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
29971 }
29972
29973 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult == 0)
29974 return convertFromScalableVector(
29975 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op1));
29976
29977 if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
29978 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
29979 return convertFromScalableVector(
29980 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
29981 }
29982
29983 // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask
29984 // represents the same logical operation as performed by a ZIP instruction. In
29985 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
29986 // equivalent to an AArch64 instruction. There's the extra component of
29987 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
  // only operated on 64/128-bit vector types that have a direct mapping to a
29989 // target register and so an exact mapping is implied.
29990 // However, when using SVE for fixed length vectors, most legal vector types
29991 // are actually sub-vectors of a larger SVE register. When mapping
29992 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
29993 // how the mask's indices translate. Specifically, when the mapping requires
29994 // an exact meaning for a specific vector index (e.g. Index X is the last
29995 // vector element in the register) then such mappings are often only safe when
  // the exact SVE register size is known. The main exception to this is when
29997 // indices are logically relative to the first element of either
29998 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
29999 // when converting from fixed-length to scalable vector types (i.e. the start
30000 // of a fixed length vector is always the start of a scalable vector).
30001 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
30002 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
30003 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
30004 if (ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()) &&
30005 Op2.isUndef()) {
30006 Op = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: ContainerVT, Operand: Op1);
30007 return convertFromScalableVector(DAG, VT, V: Op);
30008 }
30009
30010 if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) &&
30011 WhichResult != 0)
30012 return convertFromScalableVector(
30013 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op2));
30014
30015 if (isUZPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult)) {
30016 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30017 return convertFromScalableVector(
30018 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2));
30019 }
30020
30021 if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult != 0)
30022 return convertFromScalableVector(
30023 DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op1));
30024
30025 if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) {
30026 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
30027 return convertFromScalableVector(
30028 DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1));
30029 }
30030
30031 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
30032 Subtarget->isSVEorStreamingSVEAvailable()) {
30033 assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
30034 "Unsupported SVE vector size");
30035
30036 unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
30037 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
30038 if (std::optional<unsigned> Lane =
30039 isDUPQMask(Mask: ShuffleMask, Segments, SegmentSize: SegmentElts)) {
30040 SDValue IID =
30041 DAG.getConstant(Val: Intrinsic::aarch64_sve_dup_laneq, DL, VT: MVT::i64);
30042 return convertFromScalableVector(
30043 DAG, VT,
30044 V: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT,
30045 Ops: {IID, Op1,
30046 DAG.getConstant(Val: *Lane, DL, VT: MVT::i64,
30047 /*isTarget=*/true)}));
30048 }
30049 }
30050 }
30051
30052 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
30053 // This may allow the shuffle to be matched as something cheaper like ZIP1.
30054 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
30055 return WideOp;
30056
  // Avoid producing a TBL instruction if we don't know the minimal SVE register
  // size, unless NEON is not available and we can assume the minimal SVE
  // register size is 128 bits.
30060 if (MinSVESize || !Subtarget->isNeonAvailable())
30061 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
30062 DAG);
30063
30064 return SDValue();
30065}
30066
30067SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
30068 SelectionDAG &DAG) const {
30069 SDLoc DL(Op);
30070 EVT InVT = Op.getValueType();
30071
30072 assert(VT.isScalableVector() && isTypeLegal(VT) &&
30073 InVT.isScalableVector() && isTypeLegal(InVT) &&
30074 "Only expect to cast between legal scalable vector types!");
30075 assert(VT.getVectorElementType() != MVT::i1 &&
30076 InVT.getVectorElementType() != MVT::i1 &&
30077 "For predicate bitcasts, use getSVEPredicateBitCast");
30078
30079 if (InVT == VT)
30080 return Op;
30081
30082 EVT PackedVT = getPackedSVEVectorVT(VT: VT.getVectorElementType());
30083 EVT PackedInVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType());
30084
30085 // Safe bitcasting between unpacked vector types of different element counts
30086 // is currently unsupported because the following is missing the necessary
30087 // work to ensure the result's elements live where they're supposed to within
30088 // an SVE register.
30089 // 01234567
30090 // e.g. nxv2i32 = XX??XX??
30091 // nxv4f16 = X?X?X?X?
30092 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
30093 VT == PackedVT || InVT == PackedInVT) &&
30094 "Unexpected bitcast!");
30095
30096 // Pack input if required.
30097 if (InVT != PackedInVT)
30098 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: PackedInVT, Operand: Op);
30099
30100 if (Subtarget->isLittleEndian() ||
30101 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
30102 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
30103 else {
30104 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
30105 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
30106
30107 // Simulate the effect of casting through memory.
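    // e.g. a big-endian nxv2i64 -> nxv4f32 cast byte-swaps within the i64
    // elements, reinterprets the register as packed 32-bit elements, and
    // byte-swaps within the i32 elements, matching the layout a store/load
    // round trip through memory would produce.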
30108 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedInVTAsInt, Operand: Op);
30109 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
30110 Op = DAG.getNode(Opcode: ISD::BSWAP, DL, VT: PackedInVTAsInt, Operand: Op);
30111 Op = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: PackedVTAsInt, Operand: Op);
30112 if (PackedVTAsInt.getScalarSizeInBits() != 8)
30113 Op = DAG.getNode(Opcode: ISD::BSWAP, DL, VT: PackedVTAsInt, Operand: Op);
30114 Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op);
30115 }
30116
30117 // Unpack result if required.
30118 if (VT != PackedVT)
30119 Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op);
30120
30121 return Op;
30122}
30123
30124bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
30125 SDValue N) const {
30126 return ::isAllActivePredicate(DAG, N);
30127}
30128
30129EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
30130 return ::getPromotedVTForPredicate(VT);
30131}
30132
30133bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
30134 SDValue Op, const APInt &OriginalDemandedBits,
30135 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
30136 unsigned Depth) const {
30137
30138 unsigned Opc = Op.getOpcode();
30139 switch (Opc) {
30140 case AArch64ISD::VSHL: {
30141 // Match (VSHL (VLSHR Val X) X)
30142 SDValue ShiftL = Op;
30143 SDValue ShiftR = Op->getOperand(Num: 0);
30144 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
30145 return false;
30146
30147 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
30148 return false;
30149
30150 unsigned ShiftLBits = ShiftL->getConstantOperandVal(Num: 1);
30151 unsigned ShiftRBits = ShiftR->getConstantOperandVal(Num: 1);
30152
30153 // Other cases can be handled as well, but this is not
30154 // implemented.
30155 if (ShiftRBits != ShiftLBits)
30156 return false;
30157
30158 unsigned ScalarSize = Op.getScalarValueSizeInBits();
30159 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
30160
30161 APInt ZeroBits = APInt::getLowBitsSet(numBits: ScalarSize, loBitsSet: ShiftLBits);
30162 APInt UnusedBits = ~OriginalDemandedBits;
30163
30164 if ((ZeroBits & UnusedBits) != ZeroBits)
30165 return false;
30166
30167 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
30168 // used - simplify to just Val.
30169 return TLO.CombineTo(O: Op, N: ShiftR->getOperand(Num: 0));
30170 }
30171 case AArch64ISD::BICi: {
    // Fold BICi if all destination bits are already known to be zeroed.
30173 SDValue Op0 = Op.getOperand(i: 0);
30174 KnownBits KnownOp0 =
30175 TLO.DAG.computeKnownBits(Op: Op0, DemandedElts: OriginalDemandedElts, Depth: Depth + 1);
30176 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
30177 APInt BitsToClear =
30178 (Op->getConstantOperandAPInt(Num: 1) << Op->getConstantOperandAPInt(Num: 2))
30179 .trunc(width: KnownOp0.getBitWidth());
30180 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
30181 if (BitsToClear.isSubsetOf(RHS: AlreadyZeroedBitsToClear))
30182 return TLO.CombineTo(O: Op, N: Op0);
30183
30184 Known = KnownOp0 & KnownBits::makeConstant(C: ~BitsToClear);
30185 return false;
30186 }
30187 case ISD::INTRINSIC_WO_CHAIN: {
30188 if (auto ElementSize = IsSVECntIntrinsic(S: Op)) {
30189 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
30190 if (!MaxSVEVectorSizeInBits)
30191 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
30192 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
30193 // The SVE count intrinsics don't support the multiplier immediate so we
30194 // don't have to account for that here. The value returned may be slightly
30195 // over the true required bits, as this is based on the "ALL" pattern. The
30196 // other patterns are also exposed by these intrinsics, but they all
30197 // return a value that's strictly less than "ALL".
      unsigned RequiredBits = llvm::bit_width(MaxElements);
      unsigned BitWidth = Known.Zero.getBitWidth();
      if (RequiredBits < BitWidth)
        Known.Zero.setHighBits(BitWidth - RequiredBits);
      return false;
    }
  }
  }

  return TargetLowering::SimplifyDemandedBitsForTargetNode(
      Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}

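// Splatted constants (DUP, MOVI, or an EXTRACT_SUBVECTOR of a DUP) are already
// in the backend's preferred form, so treat them as canonical constant nodes.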
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
  return Op.getOpcode() == AArch64ISD::DUP ||
         Op.getOpcode() == AArch64ISD::MOVI ||
         (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
          Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
         TargetLowering::isTargetCanonicalConstantNode(Op);
}

bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
  return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
         Subtarget->hasComplxNum();
}

bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
    ComplexDeinterleavingOperation Operation, Type *Ty) const {
  auto *VTy = dyn_cast<VectorType>(Ty);
  if (!VTy)
    return false;

  // If the vector is scalable, SVE is enabled, implying support for complex
  // numbers. Otherwise, we need to ensure complex number support is available.
  if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
    return false;

  auto *ScalarTy = VTy->getScalarType();
  unsigned NumElements = VTy->getElementCount().getKnownMinValue();

  // We can only process vectors with a bit size of 128 or higher (NEON
  // additionally supports 64-bit vectors). These vectors must also have a
  // power-of-2 size, as we later split them into the smallest supported size
  // and merge them back together after applying the complex operation.
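  // For example, a 256-bit vector is split into two 128-bit halves, each half
  // is lowered with the native complex instructions, and the results are
  // concatenated back together.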
  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
  if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
      !llvm::isPowerOf2_32(VTyWidth))
    return false;

  if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
    unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();

    if (Operation == ComplexDeinterleavingOperation::CDot)
      return ScalarWidth == 32 || ScalarWidth == 64;
    return 8 <= ScalarWidth && ScalarWidth <= 64;
  }

  // CDot is not supported outside of scalable/SVE contexts.
  if (Operation == ComplexDeinterleavingOperation::CDot)
    return false;

  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}

Value *AArch64TargetLowering::createComplexDeinterleavingIR(
    IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
    Value *Accumulator) const {
  VectorType *Ty = cast<VectorType>(InputA->getType());
  if (Accumulator == nullptr)
    Accumulator = Constant::getNullValue(Ty);
  bool IsScalable = Ty->isScalableTy();
  bool IsInt = Ty->getElementType()->isIntegerTy();

  unsigned TyWidth =
      Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();

  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
         "Vector type must be either 64 or a power of 2 that is at least 128");

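  // Types wider than 128 bits are handled by splitting the operands (and the
  // accumulator) in half, recursing on each half, and re-joining the results.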
  if (TyWidth > 128) {
    int Stride = Ty->getElementCount().getKnownMinValue() / 2;
    int AccStride = cast<VectorType>(Accumulator->getType())
                        ->getElementCount()
                        .getKnownMinValue() /
                    2;
    auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
    auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
    auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
    auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
    auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
    Value *LowerSplitAcc = nullptr;
    Value *UpperSplitAcc = nullptr;
    Type *FullTy = Accumulator->getType();
    auto *HalfAccTy = VectorType::getHalfElementsVectorType(
        cast<VectorType>(Accumulator->getType()));
    LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
    UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
    auto *LowerSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
    auto *UpperSplitInt = createComplexDeinterleavingIR(
        B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);

    auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
                                        LowerSplitInt, uint64_t(0));
    return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
  }

  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
    if (IsScalable) {
      if (IsInt)
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_cmla_x, Ty,
            {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});

      auto *Mask = B.getAllOnesMask(Ty->getElementCount());
      return B.CreateIntrinsic(
          Intrinsic::aarch64_sve_fcmla, Ty,
          {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
    }

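    // For fixed-width (NEON) types, pick the FCMLA intrinsic variant matching
    // the requested rotation (0, 90, 180 or 270 degrees).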
    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
                              Intrinsic::aarch64_neon_vcmla_rot90,
                              Intrinsic::aarch64_neon_vcmla_rot180,
                              Intrinsic::aarch64_neon_vcmla_rot270};

    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
                             {Accumulator, InputA, InputB});
  }

  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
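    // Only the 90 and 270 degree rotations map to complex-add instructions;
    // any other rotation falls through and returns nullptr.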
    if (IsScalable) {
      if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
          Rotation == ComplexDeinterleavingRotation::Rotation_270) {
        if (IsInt)
          return B.CreateIntrinsic(
              Intrinsic::aarch64_sve_cadd_x, Ty,
              {InputA, InputB, B.getInt32((int)Rotation * 90)});

        auto *Mask = B.getAllOnesMask(Ty->getElementCount());
        return B.CreateIntrinsic(
            Intrinsic::aarch64_sve_fcadd, Ty,
            {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
      }
      return nullptr;
    }

    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
      IntId = Intrinsic::aarch64_neon_vcadd_rot270;

    if (IntId == Intrinsic::not_intrinsic)
      return nullptr;

    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
  }

  if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
      IsScalable) {
    return B.CreateIntrinsic(
        Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
        {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
  }

  return nullptr;
}

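// Avoid scalarizing a splatted extend when it is used by a multiply, so the
// extend can still be combined with the multiply as a vector operation.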
bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
  unsigned Opc = N->getOpcode();
  if (ISD::isExtOpcode(Opc)) {
    if (any_of(N->users(),
               [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
      return false;
  }
  return true;
}

unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
  return Subtarget->getMinimumJumpTableEntries();
}

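// When fixed-length vectors are lowered using SVE registers, derive the
// calling-convention register type from the vector-type breakdown below so
// that arguments and returns are still passed in NEON-sized pieces.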
MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                         CallingConv::ID CC,
                                                         EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT RegisterVT;
  unsigned NumIntermediates;
  getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
                                       RegisterVT);
  return RegisterVT;
}

unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
  bool NonUnitFixedLengthVector =
      VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
  if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);

  EVT VT1;
  MVT VT2;
  unsigned NumIntermediates;
  return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
                                              NumIntermediates, VT2);
}

unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (!RegisterVT.isFixedLengthVector() ||
      RegisterVT.getFixedSizeInBits() <= 128)
    return NumRegs;

  assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
  assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
  assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");

  // A size mismatch here implies either type promotion or widening and would
  // have resulted in scalarisation if larger vectors had not been available.
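  // In that case, fall back to passing one element per register, using a
  // single-element vector type when it is legal and the raw element type
  // otherwise.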
  if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
    EVT EltTy = VT.getVectorElementType();
    EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
    if (!isTypeLegal(NewVT))
      NewVT = EltTy;

    IntermediateVT = NewVT;
    NumIntermediates = VT.getVectorNumElements();
    RegisterVT = getRegisterType(Context, NewVT);
    return NumIntermediates;
  }

  // SVE VLS support does not introduce a new ABI, so we should use NEON-sized
  // types for vector arguments and returns.
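  // For example, with 256-bit SVE a fixed-length v8f32 would otherwise occupy
  // a single 256-bit register; it is instead split into two v4f32 registers.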

  unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
  NumIntermediates *= NumSubRegs;
  NumRegs *= NumSubRegs;

  switch (RegisterVT.getVectorElementType().SimpleTy) {
  default:
    llvm_unreachable("unexpected element type for vector");
  case MVT::i8:
    IntermediateVT = RegisterVT = MVT::v16i8;
    break;
  case MVT::i16:
    IntermediateVT = RegisterVT = MVT::v8i16;
    break;
  case MVT::i32:
    IntermediateVT = RegisterVT = MVT::v4i32;
    break;
  case MVT::i64:
    IntermediateVT = RegisterVT = MVT::v2i64;
    break;
  case MVT::f16:
    IntermediateVT = RegisterVT = MVT::v8f16;
    break;
  case MVT::f32:
    IntermediateVT = RegisterVT = MVT::v4f32;
    break;
  case MVT::f64:
    IntermediateVT = RegisterVT = MVT::v2f64;
    break;
  case MVT::bf16:
    IntermediateVT = RegisterVT = MVT::v8bf16;
    break;
  }

  return NumRegs;
}

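// Inline stack probes are used when the function requests stack probing and
// the target is not Windows, which handles stack probing separately.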
bool AArch64TargetLowering::hasInlineStackProbe(
    const MachineFunction &MF) const {
  return !Subtarget->isTargetWindows() &&
         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
}

bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  switch (Opc) {
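  // Saturating truncates producing 64-bit NEON vectors (v8i8, v4i16, v2i32)
  // map directly onto the native narrowing instructions, so keep these types.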
  case ISD::TRUNCATE_SSAT_S:
  case ISD::TRUNCATE_SSAT_U:
  case ISD::TRUNCATE_USAT_U:
    if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
      return true;
  }

  return TargetLowering::isTypeDesirableForOp(Opc, VT);
}

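// Pointer arithmetic is preserved only when the subtarget implements CPA and
// the FEAT_CPA codegen option is enabled.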
bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
                                                   EVT VT) const {
  return Subtarget->hasCPA() && UseFEATCPACodegen;
}
