1 | //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the AArch64TargetLowering class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "AArch64ISelLowering.h" |
14 | #include "AArch64CallingConvention.h" |
15 | #include "AArch64ExpandImm.h" |
16 | #include "AArch64MachineFunctionInfo.h" |
17 | #include "AArch64PerfectShuffle.h" |
18 | #include "AArch64RegisterInfo.h" |
19 | #include "AArch64Subtarget.h" |
20 | #include "MCTargetDesc/AArch64AddressingModes.h" |
21 | #include "Utils/AArch64BaseInfo.h" |
22 | #include "llvm/ADT/APFloat.h" |
23 | #include "llvm/ADT/APInt.h" |
24 | #include "llvm/ADT/ArrayRef.h" |
25 | #include "llvm/ADT/STLExtras.h" |
26 | #include "llvm/ADT/SmallSet.h" |
27 | #include "llvm/ADT/SmallVector.h" |
28 | #include "llvm/ADT/Statistic.h" |
29 | #include "llvm/ADT/StringRef.h" |
30 | #include "llvm/ADT/Twine.h" |
31 | #include "llvm/Analysis/LoopInfo.h" |
32 | #include "llvm/Analysis/MemoryLocation.h" |
33 | #include "llvm/Analysis/ObjCARCUtil.h" |
34 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
35 | #include "llvm/Analysis/TargetTransformInfo.h" |
36 | #include "llvm/Analysis/ValueTracking.h" |
37 | #include "llvm/Analysis/VectorUtils.h" |
38 | #include "llvm/CodeGen/Analysis.h" |
39 | #include "llvm/CodeGen/CallingConvLower.h" |
40 | #include "llvm/CodeGen/ComplexDeinterleavingPass.h" |
41 | #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" |
42 | #include "llvm/CodeGen/GlobalISel/Utils.h" |
43 | #include "llvm/CodeGen/ISDOpcodes.h" |
44 | #include "llvm/CodeGen/MachineBasicBlock.h" |
45 | #include "llvm/CodeGen/MachineFrameInfo.h" |
46 | #include "llvm/CodeGen/MachineFunction.h" |
47 | #include "llvm/CodeGen/MachineInstr.h" |
48 | #include "llvm/CodeGen/MachineInstrBuilder.h" |
49 | #include "llvm/CodeGen/MachineMemOperand.h" |
50 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
51 | #include "llvm/CodeGen/RuntimeLibcallUtil.h" |
52 | #include "llvm/CodeGen/SelectionDAG.h" |
53 | #include "llvm/CodeGen/SelectionDAGNodes.h" |
54 | #include "llvm/CodeGen/TargetCallingConv.h" |
55 | #include "llvm/CodeGen/TargetInstrInfo.h" |
56 | #include "llvm/CodeGen/TargetOpcodes.h" |
57 | #include "llvm/CodeGen/ValueTypes.h" |
58 | #include "llvm/CodeGenTypes/MachineValueType.h" |
59 | #include "llvm/IR/Attributes.h" |
60 | #include "llvm/IR/Constants.h" |
61 | #include "llvm/IR/DataLayout.h" |
62 | #include "llvm/IR/DebugLoc.h" |
63 | #include "llvm/IR/DerivedTypes.h" |
64 | #include "llvm/IR/Function.h" |
65 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
66 | #include "llvm/IR/GlobalValue.h" |
67 | #include "llvm/IR/IRBuilder.h" |
68 | #include "llvm/IR/Instruction.h" |
69 | #include "llvm/IR/Instructions.h" |
70 | #include "llvm/IR/IntrinsicInst.h" |
71 | #include "llvm/IR/Intrinsics.h" |
72 | #include "llvm/IR/IntrinsicsAArch64.h" |
73 | #include "llvm/IR/Module.h" |
74 | #include "llvm/IR/PatternMatch.h" |
75 | #include "llvm/IR/Type.h" |
76 | #include "llvm/IR/Use.h" |
77 | #include "llvm/IR/Value.h" |
78 | #include "llvm/MC/MCRegisterInfo.h" |
79 | #include "llvm/Support/AtomicOrdering.h" |
80 | #include "llvm/Support/Casting.h" |
81 | #include "llvm/Support/CodeGen.h" |
82 | #include "llvm/Support/CommandLine.h" |
83 | #include "llvm/Support/Debug.h" |
84 | #include "llvm/Support/ErrorHandling.h" |
85 | #include "llvm/Support/InstructionCost.h" |
86 | #include "llvm/Support/KnownBits.h" |
87 | #include "llvm/Support/MathExtras.h" |
88 | #include "llvm/Support/SipHash.h" |
89 | #include "llvm/Support/raw_ostream.h" |
90 | #include "llvm/Target/TargetMachine.h" |
91 | #include "llvm/Target/TargetOptions.h" |
92 | #include "llvm/TargetParser/Triple.h" |
93 | #include <algorithm> |
94 | #include <bitset> |
95 | #include <cassert> |
96 | #include <cctype> |
97 | #include <cstdint> |
98 | #include <cstdlib> |
99 | #include <iterator> |
100 | #include <limits> |
101 | #include <optional> |
102 | #include <tuple> |
103 | #include <utility> |
104 | #include <vector> |
105 | |
106 | using namespace llvm; |
107 | using namespace llvm::PatternMatch; |
108 | |
109 | #define DEBUG_TYPE "aarch64-lower" |
110 | |
111 | STATISTIC(NumTailCalls, "Number of tail calls" ); |
112 | STATISTIC(NumShiftInserts, "Number of vector shift inserts" ); |
113 | STATISTIC(NumOptimizedImms, "Number of times immediates were optimized" ); |
114 | |
115 | // FIXME: The necessary dtprel relocations don't seem to be supported |
116 | // well in the GNU bfd and gold linkers at the moment. Therefore, by |
117 | // default, for now, fall back to GeneralDynamic code generation. |
118 | cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( |
119 | "aarch64-elf-ldtls-generation" , cl::Hidden, |
120 | cl::desc("Allow AArch64 Local Dynamic TLS code generation" ), |
121 | cl::init(Val: false)); |
122 | |
123 | static cl::opt<bool> |
124 | EnableOptimizeLogicalImm("aarch64-enable-logical-imm" , cl::Hidden, |
125 | cl::desc("Enable AArch64 logical imm instruction " |
126 | "optimization" ), |
127 | cl::init(Val: true)); |
128 | |
129 | // Temporary option added for the purpose of testing functionality added |
130 | // to DAGCombiner.cpp in D92230. It is expected that this can be removed |
131 | // in the future, once both implementations are based on MGATHER rather |
132 | // than the GLD1 nodes added for the SVE gather load intrinsics. |
133 | static cl::opt<bool> |
134 | EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine" , cl::Hidden, |
135 | cl::desc("Combine extends of AArch64 masked " |
136 | "gather intrinsics" ), |
137 | cl::init(Val: true)); |
138 | |
139 | static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl" , cl::Hidden, |
140 | cl::desc("Combine ext and trunc to TBL" ), |
141 | cl::init(Val: true)); |
142 | |
143 | // All of the XOR, OR and CMP operations use ALU ports, and the data dependency |
144 | // becomes the bottleneck after this transform on high-end CPUs, so this maximum |
145 | // leaf-node limit guards that the cmp+ccmp transform remains profitable. |
146 | static cl::opt<unsigned> MaxXors("aarch64-max-xors" , cl::init(Val: 16), cl::Hidden, |
147 | cl::desc("Maximum of xors" )); |
148 | |
149 | // By turning this on, we will not fall back to DAG ISel when encountering |
150 | // scalable vector types for any instruction, even if SVE is not yet supported |
151 | // for some instructions. |
152 | // See [AArch64TargetLowering::fallbackToDAGISel] for implementation details. |
153 | cl::opt<bool> EnableSVEGISel( |
154 | "aarch64-enable-gisel-sve" , cl::Hidden, |
155 | cl::desc("Enable / disable SVE scalable vectors in Global ISel" ), |
156 | cl::init(Val: false)); |
157 | |
158 | /// Value type used for condition codes. |
159 | static const MVT MVT_CC = MVT::i32; |
160 | |
161 | static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2, |
162 | AArch64::X3, AArch64::X4, AArch64::X5, |
163 | AArch64::X6, AArch64::X7}; |
164 | static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, |
165 | AArch64::Q3, AArch64::Q4, AArch64::Q5, |
166 | AArch64::Q6, AArch64::Q7}; |
167 | |
168 | ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; } |
169 | |
170 | ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; } |
171 | |
172 | static inline EVT getPackedSVEVectorVT(EVT VT) { |
173 | switch (VT.getSimpleVT().SimpleTy) { |
174 | default: |
175 | llvm_unreachable("unexpected element type for vector" ); |
176 | case MVT::i8: |
177 | return MVT::nxv16i8; |
178 | case MVT::i16: |
179 | return MVT::nxv8i16; |
180 | case MVT::i32: |
181 | return MVT::nxv4i32; |
182 | case MVT::i64: |
183 | return MVT::nxv2i64; |
184 | case MVT::f16: |
185 | return MVT::nxv8f16; |
186 | case MVT::f32: |
187 | return MVT::nxv4f32; |
188 | case MVT::f64: |
189 | return MVT::nxv2f64; |
190 | case MVT::bf16: |
191 | return MVT::nxv8bf16; |
192 | } |
193 | } |
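// For illustration (derived from the switch above): the result is always the
// packed container whose 128-bit granule is completely filled, e.g.
//   getPackedSVEVectorVT(MVT::f32) == MVT::nxv4f32  // 4 x f32 per 128 bits
//   getPackedSVEVectorVT(MVT::i8)  == MVT::nxv16i8  // 16 x i8 per 128 bits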
194 | |
195 | // NOTE: Currently there's only a need to return integer vector types. If this |
196 | // changes then just add an extra "type" parameter. |
197 | static inline EVT getPackedSVEVectorVT(ElementCount EC) { |
198 | switch (EC.getKnownMinValue()) { |
199 | default: |
200 | llvm_unreachable("unexpected element count for vector" ); |
201 | case 16: |
202 | return MVT::nxv16i8; |
203 | case 8: |
204 | return MVT::nxv8i16; |
205 | case 4: |
206 | return MVT::nxv4i32; |
207 | case 2: |
208 | return MVT::nxv2i64; |
209 | } |
210 | } |
211 | |
212 | static inline EVT getPromotedVTForPredicate(EVT VT) { |
213 | assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) && |
214 | "Expected scalable predicate vector type!" ); |
215 | switch (VT.getVectorMinNumElements()) { |
216 | default: |
217 | llvm_unreachable("unexpected element count for vector" ); |
218 | case 2: |
219 | return MVT::nxv2i64; |
220 | case 4: |
221 | return MVT::nxv4i32; |
222 | case 8: |
223 | return MVT::nxv8i16; |
224 | case 16: |
225 | return MVT::nxv16i8; |
226 | } |
227 | } |
228 | |
229 | /// Returns true if VT's elements occupy the lowest bit positions of its |
230 | /// associated register class without any intervening space. |
231 | /// |
232 | /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the |
233 | /// same register class, but only nxv8f16 can be treated as a packed vector. |
234 | static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) { |
235 | assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
236 | "Expected legal vector type!" ); |
237 | return VT.isFixedLengthVector() || |
238 | VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock; |
239 | } |
240 | |
241 | // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading |
242 | // predicate and end with a passthru value matching the result type. |
243 | static bool isMergePassthruOpcode(unsigned Opc) { |
244 | switch (Opc) { |
245 | default: |
246 | return false; |
247 | case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: |
248 | case AArch64ISD::BSWAP_MERGE_PASSTHRU: |
249 | case AArch64ISD::REVH_MERGE_PASSTHRU: |
250 | case AArch64ISD::REVW_MERGE_PASSTHRU: |
251 | case AArch64ISD::REVD_MERGE_PASSTHRU: |
252 | case AArch64ISD::CTLZ_MERGE_PASSTHRU: |
253 | case AArch64ISD::CTPOP_MERGE_PASSTHRU: |
254 | case AArch64ISD::DUP_MERGE_PASSTHRU: |
255 | case AArch64ISD::ABS_MERGE_PASSTHRU: |
256 | case AArch64ISD::NEG_MERGE_PASSTHRU: |
257 | case AArch64ISD::FNEG_MERGE_PASSTHRU: |
258 | case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU: |
259 | case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU: |
260 | case AArch64ISD::FCEIL_MERGE_PASSTHRU: |
261 | case AArch64ISD::FFLOOR_MERGE_PASSTHRU: |
262 | case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU: |
263 | case AArch64ISD::FRINT_MERGE_PASSTHRU: |
264 | case AArch64ISD::FROUND_MERGE_PASSTHRU: |
265 | case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU: |
266 | case AArch64ISD::FTRUNC_MERGE_PASSTHRU: |
267 | case AArch64ISD::FP_ROUND_MERGE_PASSTHRU: |
268 | case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU: |
269 | case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU: |
270 | case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU: |
271 | case AArch64ISD::FCVTZU_MERGE_PASSTHRU: |
272 | case AArch64ISD::FCVTZS_MERGE_PASSTHRU: |
273 | case AArch64ISD::FSQRT_MERGE_PASSTHRU: |
274 | case AArch64ISD::FRECPX_MERGE_PASSTHRU: |
275 | case AArch64ISD::FABS_MERGE_PASSTHRU: |
276 | return true; |
277 | } |
278 | } |
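// Illustrative sketch of the shared operand layout (value names here are
// placeholders, not taken from this file): such nodes are created as
//   DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, VT,
//               Pg /*predicate*/, Op /*input*/, Passthru /*inactive lanes*/);
// so inactive lanes of the result take their values from Passthru.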
279 | |
280 | // Returns true if inactive lanes are known to be zeroed by construction. |
281 | static bool isZeroingInactiveLanes(SDValue Op) { |
282 | switch (Op.getOpcode()) { |
283 | default: |
284 | return false; |
285 | // We guarantee i1 splat_vectors to zero the other lanes |
286 | case ISD::SPLAT_VECTOR: |
287 | case AArch64ISD::PTRUE: |
288 | case AArch64ISD::SETCC_MERGE_ZERO: |
289 | return true; |
290 | case ISD::INTRINSIC_WO_CHAIN: |
291 | switch (Op.getConstantOperandVal(i: 0)) { |
292 | default: |
293 | return false; |
294 | case Intrinsic::aarch64_sve_ptrue: |
295 | case Intrinsic::aarch64_sve_pnext: |
296 | case Intrinsic::aarch64_sve_cmpeq: |
297 | case Intrinsic::aarch64_sve_cmpne: |
298 | case Intrinsic::aarch64_sve_cmpge: |
299 | case Intrinsic::aarch64_sve_cmpgt: |
300 | case Intrinsic::aarch64_sve_cmphs: |
301 | case Intrinsic::aarch64_sve_cmphi: |
302 | case Intrinsic::aarch64_sve_cmpeq_wide: |
303 | case Intrinsic::aarch64_sve_cmpne_wide: |
304 | case Intrinsic::aarch64_sve_cmpge_wide: |
305 | case Intrinsic::aarch64_sve_cmpgt_wide: |
306 | case Intrinsic::aarch64_sve_cmplt_wide: |
307 | case Intrinsic::aarch64_sve_cmple_wide: |
308 | case Intrinsic::aarch64_sve_cmphs_wide: |
309 | case Intrinsic::aarch64_sve_cmphi_wide: |
310 | case Intrinsic::aarch64_sve_cmplo_wide: |
311 | case Intrinsic::aarch64_sve_cmpls_wide: |
312 | case Intrinsic::aarch64_sve_fcmpeq: |
313 | case Intrinsic::aarch64_sve_fcmpne: |
314 | case Intrinsic::aarch64_sve_fcmpge: |
315 | case Intrinsic::aarch64_sve_fcmpgt: |
316 | case Intrinsic::aarch64_sve_fcmpuo: |
317 | case Intrinsic::aarch64_sve_facgt: |
318 | case Intrinsic::aarch64_sve_facge: |
319 | case Intrinsic::aarch64_sve_whilege: |
320 | case Intrinsic::aarch64_sve_whilegt: |
321 | case Intrinsic::aarch64_sve_whilehi: |
322 | case Intrinsic::aarch64_sve_whilehs: |
323 | case Intrinsic::aarch64_sve_whilele: |
324 | case Intrinsic::aarch64_sve_whilelo: |
325 | case Intrinsic::aarch64_sve_whilels: |
326 | case Intrinsic::aarch64_sve_whilelt: |
327 | case Intrinsic::aarch64_sve_match: |
328 | case Intrinsic::aarch64_sve_nmatch: |
329 | case Intrinsic::aarch64_sve_whilege_x2: |
330 | case Intrinsic::aarch64_sve_whilegt_x2: |
331 | case Intrinsic::aarch64_sve_whilehi_x2: |
332 | case Intrinsic::aarch64_sve_whilehs_x2: |
333 | case Intrinsic::aarch64_sve_whilele_x2: |
334 | case Intrinsic::aarch64_sve_whilelo_x2: |
335 | case Intrinsic::aarch64_sve_whilels_x2: |
336 | case Intrinsic::aarch64_sve_whilelt_x2: |
337 | return true; |
338 | } |
339 | } |
340 | } |
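// For example (illustrative, not exhaustive): a predicate produced by
// aarch64_sve_cmpeq only sets lanes that are active in its governing
// predicate; all inactive lanes are false, so a consumer that requires zeroed
// inactive lanes can use the result directly without ANDing it with the
// governing predicate again.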
341 | |
342 | static std::tuple<SDValue, SDValue> |
343 | extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) { |
344 | SDLoc DL(Disc); |
345 | SDValue AddrDisc; |
346 | SDValue ConstDisc; |
347 | |
348 | // If this is a blend, remember the constant and address discriminators. |
349 | // Otherwise, it's either a constant discriminator, or a non-blended |
350 | // address discriminator. |
351 | if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN && |
352 | Disc->getConstantOperandVal(Num: 0) == Intrinsic::ptrauth_blend) { |
353 | AddrDisc = Disc->getOperand(Num: 1); |
354 | ConstDisc = Disc->getOperand(Num: 2); |
355 | } else { |
356 | ConstDisc = Disc; |
357 | } |
358 | |
359 | // If the constant discriminator (either the blend RHS, or the entire |
360 | // discriminator value) isn't a 16-bit constant, bail out, and let the |
361 | // discriminator be computed separately. |
362 | const auto *ConstDiscN = dyn_cast<ConstantSDNode>(Val&: ConstDisc); |
363 | if (!ConstDiscN || !isUInt<16>(x: ConstDiscN->getZExtValue())) |
364 | return std::make_tuple(args: DAG->getTargetConstant(Val: 0, DL, VT: MVT::i64), args&: Disc); |
365 | |
366 | // If there's no address discriminator, use NoRegister, which we'll later |
367 | // replace with XZR, or directly use a Z variant of the inst. when available. |
368 | if (!AddrDisc) |
369 | AddrDisc = DAG->getRegister(Reg: AArch64::NoRegister, VT: MVT::i64); |
370 | |
371 | return std::make_tuple( |
372 | args: DAG->getTargetConstant(Val: ConstDiscN->getZExtValue(), DL, VT: MVT::i64), |
373 | args&: AddrDisc); |
374 | } |
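// Illustrative sketch of the decomposition above (value names are
// placeholders):
//   ptrauth.blend(%addrdisc, 1234) -> (TargetConstant 1234, %addrdisc)
//   1234                           -> (TargetConstant 1234, NoRegister/XZR)
//   %dynamic_disc                  -> (TargetConstant 0,    %dynamic_disc)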
375 | |
376 | AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, |
377 | const AArch64Subtarget &STI) |
378 | : TargetLowering(TM), Subtarget(&STI) { |
379 | // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so |
380 | // we have to make something up. Arbitrarily, choose ZeroOrOne. |
381 | setBooleanContents(ZeroOrOneBooleanContent); |
382 | // When comparing vectors, the result sets each element of the result vector |
383 | // to all-ones or all-zeros. |
384 | setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
385 | |
386 | // Set up the register classes. |
387 | addRegisterClass(VT: MVT::i32, RC: &AArch64::GPR32allRegClass); |
388 | addRegisterClass(VT: MVT::i64, RC: &AArch64::GPR64allRegClass); |
389 | |
390 | if (Subtarget->hasLS64()) { |
391 | addRegisterClass(VT: MVT::i64x8, RC: &AArch64::GPR64x8ClassRegClass); |
392 | setOperationAction(Op: ISD::LOAD, VT: MVT::i64x8, Action: Custom); |
393 | setOperationAction(Op: ISD::STORE, VT: MVT::i64x8, Action: Custom); |
394 | } |
395 | |
396 | if (Subtarget->hasFPARMv8()) { |
397 | addRegisterClass(VT: MVT::f16, RC: &AArch64::FPR16RegClass); |
398 | addRegisterClass(VT: MVT::bf16, RC: &AArch64::FPR16RegClass); |
399 | addRegisterClass(VT: MVT::f32, RC: &AArch64::FPR32RegClass); |
400 | addRegisterClass(VT: MVT::f64, RC: &AArch64::FPR64RegClass); |
401 | addRegisterClass(VT: MVT::f128, RC: &AArch64::FPR128RegClass); |
402 | } |
403 | |
404 | if (Subtarget->hasNEON()) { |
405 | addRegisterClass(VT: MVT::v16i8, RC: &AArch64::FPR8RegClass); |
406 | addRegisterClass(VT: MVT::v8i16, RC: &AArch64::FPR16RegClass); |
407 | |
408 | addDRType(VT: MVT::v2f32); |
409 | addDRType(VT: MVT::v8i8); |
410 | addDRType(VT: MVT::v4i16); |
411 | addDRType(VT: MVT::v2i32); |
412 | addDRType(VT: MVT::v1i64); |
413 | addDRType(VT: MVT::v1f64); |
414 | addDRType(VT: MVT::v4f16); |
415 | addDRType(VT: MVT::v4bf16); |
416 | |
417 | addQRType(VT: MVT::v4f32); |
418 | addQRType(VT: MVT::v2f64); |
419 | addQRType(VT: MVT::v16i8); |
420 | addQRType(VT: MVT::v8i16); |
421 | addQRType(VT: MVT::v4i32); |
422 | addQRType(VT: MVT::v2i64); |
423 | addQRType(VT: MVT::v8f16); |
424 | addQRType(VT: MVT::v8bf16); |
425 | } |
426 | |
427 | if (Subtarget->isSVEorStreamingSVEAvailable()) { |
428 | // Add legal sve predicate types |
429 | addRegisterClass(VT: MVT::nxv1i1, RC: &AArch64::PPRRegClass); |
430 | addRegisterClass(VT: MVT::nxv2i1, RC: &AArch64::PPRRegClass); |
431 | addRegisterClass(VT: MVT::nxv4i1, RC: &AArch64::PPRRegClass); |
432 | addRegisterClass(VT: MVT::nxv8i1, RC: &AArch64::PPRRegClass); |
433 | addRegisterClass(VT: MVT::nxv16i1, RC: &AArch64::PPRRegClass); |
434 | |
435 | // Add legal sve data types |
436 | addRegisterClass(VT: MVT::nxv16i8, RC: &AArch64::ZPRRegClass); |
437 | addRegisterClass(VT: MVT::nxv8i16, RC: &AArch64::ZPRRegClass); |
438 | addRegisterClass(VT: MVT::nxv4i32, RC: &AArch64::ZPRRegClass); |
439 | addRegisterClass(VT: MVT::nxv2i64, RC: &AArch64::ZPRRegClass); |
440 | |
441 | addRegisterClass(VT: MVT::nxv2f16, RC: &AArch64::ZPRRegClass); |
442 | addRegisterClass(VT: MVT::nxv4f16, RC: &AArch64::ZPRRegClass); |
443 | addRegisterClass(VT: MVT::nxv8f16, RC: &AArch64::ZPRRegClass); |
444 | addRegisterClass(VT: MVT::nxv2f32, RC: &AArch64::ZPRRegClass); |
445 | addRegisterClass(VT: MVT::nxv4f32, RC: &AArch64::ZPRRegClass); |
446 | addRegisterClass(VT: MVT::nxv2f64, RC: &AArch64::ZPRRegClass); |
447 | |
448 | addRegisterClass(VT: MVT::nxv2bf16, RC: &AArch64::ZPRRegClass); |
449 | addRegisterClass(VT: MVT::nxv4bf16, RC: &AArch64::ZPRRegClass); |
450 | addRegisterClass(VT: MVT::nxv8bf16, RC: &AArch64::ZPRRegClass); |
451 | |
452 | if (Subtarget->useSVEForFixedLengthVectors()) { |
453 | for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) |
454 | if (useSVEForFixedLengthVectorVT(VT)) |
455 | addRegisterClass(VT, RC: &AArch64::ZPRRegClass); |
456 | |
457 | for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) |
458 | if (useSVEForFixedLengthVectorVT(VT)) |
459 | addRegisterClass(VT, RC: &AArch64::ZPRRegClass); |
460 | } |
461 | } |
462 | |
463 | if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { |
464 | addRegisterClass(VT: MVT::aarch64svcount, RC: &AArch64::PPRRegClass); |
465 | setOperationPromotedToType(Opc: ISD::LOAD, OrigVT: MVT::aarch64svcount, DestVT: MVT::nxv16i1); |
466 | setOperationPromotedToType(Opc: ISD::STORE, OrigVT: MVT::aarch64svcount, DestVT: MVT::nxv16i1); |
467 | |
468 | setOperationAction(Op: ISD::SELECT, VT: MVT::aarch64svcount, Action: Custom); |
469 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::aarch64svcount, Action: Expand); |
470 | } |
471 | |
472 | // Compute derived properties from the register classes |
473 | computeRegisterProperties(TRI: Subtarget->getRegisterInfo()); |
474 | |
475 | // Provide all sorts of operation actions |
476 | setOperationAction(Op: ISD::GlobalAddress, VT: MVT::i64, Action: Custom); |
477 | setOperationAction(Op: ISD::GlobalTLSAddress, VT: MVT::i64, Action: Custom); |
478 | setOperationAction(Op: ISD::SETCC, VT: MVT::i32, Action: Custom); |
479 | setOperationAction(Op: ISD::SETCC, VT: MVT::i64, Action: Custom); |
480 | setOperationAction(Op: ISD::SETCC, VT: MVT::bf16, Action: Custom); |
481 | setOperationAction(Op: ISD::SETCC, VT: MVT::f16, Action: Custom); |
482 | setOperationAction(Op: ISD::SETCC, VT: MVT::f32, Action: Custom); |
483 | setOperationAction(Op: ISD::SETCC, VT: MVT::f64, Action: Custom); |
484 | setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::bf16, Action: Custom); |
485 | setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f16, Action: Custom); |
486 | setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f32, Action: Custom); |
487 | setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f64, Action: Custom); |
488 | setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f16, Action: Custom); |
489 | setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f32, Action: Custom); |
490 | setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f64, Action: Custom); |
491 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i32, Action: Legal); |
492 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::i64, Action: Legal); |
493 | setOperationAction(Op: ISD::BRCOND, VT: MVT::Other, Action: Custom); |
494 | setOperationAction(Op: ISD::BR_CC, VT: MVT::i32, Action: Custom); |
495 | setOperationAction(Op: ISD::BR_CC, VT: MVT::i64, Action: Custom); |
496 | setOperationAction(Op: ISD::BR_CC, VT: MVT::f16, Action: Custom); |
497 | setOperationAction(Op: ISD::BR_CC, VT: MVT::f32, Action: Custom); |
498 | setOperationAction(Op: ISD::BR_CC, VT: MVT::f64, Action: Custom); |
499 | setOperationAction(Op: ISD::SELECT, VT: MVT::i32, Action: Custom); |
500 | setOperationAction(Op: ISD::SELECT, VT: MVT::i64, Action: Custom); |
501 | setOperationAction(Op: ISD::SELECT, VT: MVT::f16, Action: Custom); |
502 | setOperationAction(Op: ISD::SELECT, VT: MVT::bf16, Action: Custom); |
503 | setOperationAction(Op: ISD::SELECT, VT: MVT::f32, Action: Custom); |
504 | setOperationAction(Op: ISD::SELECT, VT: MVT::f64, Action: Custom); |
505 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i32, Action: Custom); |
506 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::i64, Action: Custom); |
507 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f16, Action: Custom); |
508 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::bf16, Action: Custom); |
509 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f32, Action: Custom); |
510 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f64, Action: Custom); |
511 | setOperationAction(Op: ISD::BR_JT, VT: MVT::Other, Action: Custom); |
512 | setOperationAction(Op: ISD::JumpTable, VT: MVT::i64, Action: Custom); |
513 | setOperationAction(Op: ISD::BRIND, VT: MVT::Other, Action: Custom); |
514 | setOperationAction(Op: ISD::SETCCCARRY, VT: MVT::i64, Action: Custom); |
515 | |
516 | setOperationAction(Op: ISD::PtrAuthGlobalAddress, VT: MVT::i64, Action: Custom); |
517 | |
518 | setOperationAction(Op: ISD::SHL_PARTS, VT: MVT::i64, Action: Custom); |
519 | setOperationAction(Op: ISD::SRA_PARTS, VT: MVT::i64, Action: Custom); |
520 | setOperationAction(Op: ISD::SRL_PARTS, VT: MVT::i64, Action: Custom); |
521 | |
522 | setOperationAction(Op: ISD::FREM, VT: MVT::f32, Action: Expand); |
523 | setOperationAction(Op: ISD::FREM, VT: MVT::f64, Action: Expand); |
524 | setOperationAction(Op: ISD::FREM, VT: MVT::f80, Action: Expand); |
525 | |
526 | setOperationAction(Op: ISD::BUILD_PAIR, VT: MVT::i64, Action: Expand); |
527 | |
528 | // Custom lowering hooks are needed for XOR |
529 | // to fold it into CSINC/CSINV. |
530 | setOperationAction(Op: ISD::XOR, VT: MVT::i32, Action: Custom); |
531 | setOperationAction(Op: ISD::XOR, VT: MVT::i64, Action: Custom); |
532 | |
533 | // Virtually no operation on f128 is legal, but LLVM can't expand them when |
534 | // there's a valid register class, so we need custom operations in most cases. |
535 | setOperationAction(Op: ISD::FABS, VT: MVT::f128, Action: Expand); |
536 | setOperationAction(Op: ISD::FADD, VT: MVT::f128, Action: LibCall); |
537 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f128, Action: Expand); |
538 | setOperationAction(Op: ISD::FCOS, VT: MVT::f128, Action: Expand); |
539 | setOperationAction(Op: ISD::FDIV, VT: MVT::f128, Action: LibCall); |
540 | setOperationAction(Op: ISD::FMA, VT: MVT::f128, Action: Expand); |
541 | setOperationAction(Op: ISD::FMUL, VT: MVT::f128, Action: LibCall); |
542 | setOperationAction(Op: ISD::FNEG, VT: MVT::f128, Action: Expand); |
543 | setOperationAction(Op: ISD::FPOW, VT: MVT::f128, Action: Expand); |
544 | setOperationAction(Op: ISD::FREM, VT: MVT::f128, Action: Expand); |
545 | setOperationAction(Op: ISD::FRINT, VT: MVT::f128, Action: Expand); |
546 | setOperationAction(Op: ISD::FSIN, VT: MVT::f128, Action: Expand); |
547 | setOperationAction(Op: ISD::FSINCOS, VT: MVT::f128, Action: Expand); |
548 | setOperationAction(Op: ISD::FSQRT, VT: MVT::f128, Action: Expand); |
549 | setOperationAction(Op: ISD::FSUB, VT: MVT::f128, Action: LibCall); |
550 | setOperationAction(Op: ISD::FTAN, VT: MVT::f128, Action: Expand); |
551 | setOperationAction(Op: ISD::FTRUNC, VT: MVT::f128, Action: Expand); |
552 | setOperationAction(Op: ISD::SETCC, VT: MVT::f128, Action: Custom); |
553 | setOperationAction(Op: ISD::STRICT_FSETCC, VT: MVT::f128, Action: Custom); |
554 | setOperationAction(Op: ISD::STRICT_FSETCCS, VT: MVT::f128, Action: Custom); |
555 | setOperationAction(Op: ISD::BR_CC, VT: MVT::f128, Action: Custom); |
556 | setOperationAction(Op: ISD::SELECT, VT: MVT::f128, Action: Custom); |
557 | setOperationAction(Op: ISD::SELECT_CC, VT: MVT::f128, Action: Custom); |
558 | setOperationAction(Op: ISD::FP_EXTEND, VT: MVT::f128, Action: Custom); |
559 | // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently |
560 | // aren't handled. |
561 | |
562 | // Lowering for many of the conversions is actually specified by the non-f128 |
563 | // type. The LowerXXX function will be trivial when f128 isn't involved. |
564 | setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i32, Action: Custom); |
565 | setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i64, Action: Custom); |
566 | setOperationAction(Op: ISD::FP_TO_SINT, VT: MVT::i128, Action: Custom); |
567 | setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i32, Action: Custom); |
568 | setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i64, Action: Custom); |
569 | setOperationAction(Op: ISD::STRICT_FP_TO_SINT, VT: MVT::i128, Action: Custom); |
570 | setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i32, Action: Custom); |
571 | setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i64, Action: Custom); |
572 | setOperationAction(Op: ISD::FP_TO_UINT, VT: MVT::i128, Action: Custom); |
573 | setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i32, Action: Custom); |
574 | setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i64, Action: Custom); |
575 | setOperationAction(Op: ISD::STRICT_FP_TO_UINT, VT: MVT::i128, Action: Custom); |
576 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i32, Action: Custom); |
577 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i64, Action: Custom); |
578 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::i128, Action: Custom); |
579 | setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i32, Action: Custom); |
580 | setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i64, Action: Custom); |
581 | setOperationAction(Op: ISD::STRICT_SINT_TO_FP, VT: MVT::i128, Action: Custom); |
582 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i32, Action: Custom); |
583 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i64, Action: Custom); |
584 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::i128, Action: Custom); |
585 | setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i32, Action: Custom); |
586 | setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i64, Action: Custom); |
587 | setOperationAction(Op: ISD::STRICT_UINT_TO_FP, VT: MVT::i128, Action: Custom); |
588 | if (Subtarget->hasFPARMv8()) { |
589 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f16, Action: Custom); |
590 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::bf16, Action: Custom); |
591 | } |
592 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f32, Action: Custom); |
593 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::f64, Action: Custom); |
594 | if (Subtarget->hasFPARMv8()) { |
595 | setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f16, Action: Custom); |
596 | setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::bf16, Action: Custom); |
597 | } |
598 | setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f32, Action: Custom); |
599 | setOperationAction(Op: ISD::STRICT_FP_ROUND, VT: MVT::f64, Action: Custom); |
600 | |
601 | setOperationAction(Op: ISD::FP_TO_UINT_SAT, VT: MVT::i32, Action: Custom); |
602 | setOperationAction(Op: ISD::FP_TO_UINT_SAT, VT: MVT::i64, Action: Custom); |
603 | setOperationAction(Op: ISD::FP_TO_SINT_SAT, VT: MVT::i32, Action: Custom); |
604 | setOperationAction(Op: ISD::FP_TO_SINT_SAT, VT: MVT::i64, Action: Custom); |
605 | |
606 | // Variable arguments. |
607 | setOperationAction(Op: ISD::VASTART, VT: MVT::Other, Action: Custom); |
608 | setOperationAction(Op: ISD::VAARG, VT: MVT::Other, Action: Custom); |
609 | setOperationAction(Op: ISD::VACOPY, VT: MVT::Other, Action: Custom); |
610 | setOperationAction(Op: ISD::VAEND, VT: MVT::Other, Action: Expand); |
611 | |
612 | // Variable-sized objects. |
613 | setOperationAction(Op: ISD::STACKSAVE, VT: MVT::Other, Action: Expand); |
614 | setOperationAction(Op: ISD::STACKRESTORE, VT: MVT::Other, Action: Expand); |
615 | |
616 | // Lowering Funnel Shifts to EXTR |
617 | setOperationAction(Op: ISD::FSHR, VT: MVT::i32, Action: Custom); |
618 | setOperationAction(Op: ISD::FSHR, VT: MVT::i64, Action: Custom); |
619 | setOperationAction(Op: ISD::FSHL, VT: MVT::i32, Action: Custom); |
620 | setOperationAction(Op: ISD::FSHL, VT: MVT::i64, Action: Custom); |
621 | |
622 | setOperationAction(Op: ISD::DYNAMIC_STACKALLOC, VT: MVT::i64, Action: Custom); |
623 | |
624 | // Constant pool entries |
625 | setOperationAction(Op: ISD::ConstantPool, VT: MVT::i64, Action: Custom); |
626 | |
627 | // BlockAddress |
628 | setOperationAction(Op: ISD::BlockAddress, VT: MVT::i64, Action: Custom); |
629 | |
630 | // AArch64 lacks both left-rotate and popcount instructions. |
631 | setOperationAction(Op: ISD::ROTL, VT: MVT::i32, Action: Expand); |
632 | setOperationAction(Op: ISD::ROTL, VT: MVT::i64, Action: Expand); |
633 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
634 | setOperationAction(Op: ISD::ROTL, VT, Action: Expand); |
635 | setOperationAction(Op: ISD::ROTR, VT, Action: Expand); |
636 | } |
637 | |
638 | // AArch64 doesn't have i32 MULH{S|U}. |
639 | setOperationAction(Op: ISD::MULHU, VT: MVT::i32, Action: Expand); |
640 | setOperationAction(Op: ISD::MULHS, VT: MVT::i32, Action: Expand); |
641 | |
642 | // AArch64 doesn't have {U|S}MUL_LOHI. |
643 | setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i32, Action: Expand); |
644 | setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i32, Action: Expand); |
645 | setOperationAction(Op: ISD::UMUL_LOHI, VT: MVT::i64, Action: Expand); |
646 | setOperationAction(Op: ISD::SMUL_LOHI, VT: MVT::i64, Action: Expand); |
647 | |
648 | if (Subtarget->hasCSSC()) { |
649 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i32, Action: Legal); |
650 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i64, Action: Legal); |
651 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i128, Action: Expand); |
652 | |
653 | setOperationAction(Op: ISD::PARITY, VT: MVT::i128, Action: Expand); |
654 | |
655 | setOperationAction(Op: ISD::CTTZ, VT: MVT::i32, Action: Legal); |
656 | setOperationAction(Op: ISD::CTTZ, VT: MVT::i64, Action: Legal); |
657 | setOperationAction(Op: ISD::CTTZ, VT: MVT::i128, Action: Expand); |
658 | |
659 | setOperationAction(Op: ISD::ABS, VT: MVT::i32, Action: Legal); |
660 | setOperationAction(Op: ISD::ABS, VT: MVT::i64, Action: Legal); |
661 | |
662 | setOperationAction(Op: ISD::SMAX, VT: MVT::i32, Action: Legal); |
663 | setOperationAction(Op: ISD::SMAX, VT: MVT::i64, Action: Legal); |
664 | setOperationAction(Op: ISD::UMAX, VT: MVT::i32, Action: Legal); |
665 | setOperationAction(Op: ISD::UMAX, VT: MVT::i64, Action: Legal); |
666 | |
667 | setOperationAction(Op: ISD::SMIN, VT: MVT::i32, Action: Legal); |
668 | setOperationAction(Op: ISD::SMIN, VT: MVT::i64, Action: Legal); |
669 | setOperationAction(Op: ISD::UMIN, VT: MVT::i32, Action: Legal); |
670 | setOperationAction(Op: ISD::UMIN, VT: MVT::i64, Action: Legal); |
671 | } else { |
672 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i32, Action: Custom); |
673 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i64, Action: Custom); |
674 | setOperationAction(Op: ISD::CTPOP, VT: MVT::i128, Action: Custom); |
675 | |
676 | setOperationAction(Op: ISD::PARITY, VT: MVT::i64, Action: Custom); |
677 | setOperationAction(Op: ISD::PARITY, VT: MVT::i128, Action: Custom); |
678 | |
679 | setOperationAction(Op: ISD::ABS, VT: MVT::i32, Action: Custom); |
680 | setOperationAction(Op: ISD::ABS, VT: MVT::i64, Action: Custom); |
681 | } |
682 | |
683 | setOperationAction(Op: ISD::SDIVREM, VT: MVT::i32, Action: Expand); |
684 | setOperationAction(Op: ISD::SDIVREM, VT: MVT::i64, Action: Expand); |
685 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
686 | setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand); |
687 | setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand); |
688 | } |
689 | setOperationAction(Op: ISD::SREM, VT: MVT::i32, Action: Expand); |
690 | setOperationAction(Op: ISD::SREM, VT: MVT::i64, Action: Expand); |
691 | setOperationAction(Op: ISD::UDIVREM, VT: MVT::i32, Action: Expand); |
692 | setOperationAction(Op: ISD::UDIVREM, VT: MVT::i64, Action: Expand); |
693 | setOperationAction(Op: ISD::UREM, VT: MVT::i32, Action: Expand); |
694 | setOperationAction(Op: ISD::UREM, VT: MVT::i64, Action: Expand); |
695 | |
696 | // Custom lower Add/Sub/Mul with overflow. |
697 | setOperationAction(Op: ISD::SADDO, VT: MVT::i32, Action: Custom); |
698 | setOperationAction(Op: ISD::SADDO, VT: MVT::i64, Action: Custom); |
699 | setOperationAction(Op: ISD::UADDO, VT: MVT::i32, Action: Custom); |
700 | setOperationAction(Op: ISD::UADDO, VT: MVT::i64, Action: Custom); |
701 | setOperationAction(Op: ISD::SSUBO, VT: MVT::i32, Action: Custom); |
702 | setOperationAction(Op: ISD::SSUBO, VT: MVT::i64, Action: Custom); |
703 | setOperationAction(Op: ISD::USUBO, VT: MVT::i32, Action: Custom); |
704 | setOperationAction(Op: ISD::USUBO, VT: MVT::i64, Action: Custom); |
705 | setOperationAction(Op: ISD::SMULO, VT: MVT::i32, Action: Custom); |
706 | setOperationAction(Op: ISD::SMULO, VT: MVT::i64, Action: Custom); |
707 | setOperationAction(Op: ISD::UMULO, VT: MVT::i32, Action: Custom); |
708 | setOperationAction(Op: ISD::UMULO, VT: MVT::i64, Action: Custom); |
709 | |
710 | setOperationAction(Op: ISD::UADDO_CARRY, VT: MVT::i32, Action: Custom); |
711 | setOperationAction(Op: ISD::UADDO_CARRY, VT: MVT::i64, Action: Custom); |
712 | setOperationAction(Op: ISD::USUBO_CARRY, VT: MVT::i32, Action: Custom); |
713 | setOperationAction(Op: ISD::USUBO_CARRY, VT: MVT::i64, Action: Custom); |
714 | setOperationAction(Op: ISD::SADDO_CARRY, VT: MVT::i32, Action: Custom); |
715 | setOperationAction(Op: ISD::SADDO_CARRY, VT: MVT::i64, Action: Custom); |
716 | setOperationAction(Op: ISD::SSUBO_CARRY, VT: MVT::i32, Action: Custom); |
717 | setOperationAction(Op: ISD::SSUBO_CARRY, VT: MVT::i64, Action: Custom); |
718 | |
719 | setOperationAction(Op: ISD::FSIN, VT: MVT::f32, Action: Expand); |
720 | setOperationAction(Op: ISD::FSIN, VT: MVT::f64, Action: Expand); |
721 | setOperationAction(Op: ISD::FCOS, VT: MVT::f32, Action: Expand); |
722 | setOperationAction(Op: ISD::FCOS, VT: MVT::f64, Action: Expand); |
723 | setOperationAction(Op: ISD::FPOW, VT: MVT::f32, Action: Expand); |
724 | setOperationAction(Op: ISD::FPOW, VT: MVT::f64, Action: Expand); |
725 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f64, Action: Custom); |
726 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f32, Action: Custom); |
727 | if (Subtarget->hasFullFP16()) { |
728 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f16, Action: Custom); |
729 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Custom); |
730 | } else { |
731 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::f16, Action: Promote); |
732 | setOperationAction(Op: ISD::FCOPYSIGN, VT: MVT::bf16, Action: Promote); |
733 | } |
734 | |
735 | for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI, |
736 | ISD::FCOS, ISD::FSIN, ISD::FSINCOS, |
737 | ISD::FACOS, ISD::FASIN, ISD::FATAN, |
738 | ISD::FCOSH, ISD::FSINH, ISD::FTANH, |
739 | ISD::FTAN, ISD::FEXP, ISD::FEXP2, |
740 | ISD::FEXP10, ISD::FLOG, ISD::FLOG2, |
741 | ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW, |
742 | ISD::STRICT_FPOWI, ISD::STRICT_FCOS, ISD::STRICT_FSIN, |
743 | ISD::STRICT_FACOS, ISD::STRICT_FASIN, ISD::STRICT_FATAN, |
744 | ISD::STRICT_FCOSH, ISD::STRICT_FSINH, ISD::STRICT_FTANH, |
745 | ISD::STRICT_FEXP, ISD::STRICT_FEXP2, ISD::STRICT_FLOG, |
746 | ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) { |
747 | setOperationAction(Op, VT: MVT::f16, Action: Promote); |
748 | setOperationAction(Op, VT: MVT::v4f16, Action: Expand); |
749 | setOperationAction(Op, VT: MVT::v8f16, Action: Expand); |
750 | setOperationAction(Op, VT: MVT::bf16, Action: Promote); |
751 | setOperationAction(Op, VT: MVT::v4bf16, Action: Expand); |
752 | setOperationAction(Op, VT: MVT::v8bf16, Action: Expand); |
753 | } |
754 | |
755 | auto LegalizeNarrowFP = [this](MVT ScalarVT) { |
756 | for (auto Op : { |
757 | ISD::SETCC, |
758 | ISD::SELECT_CC, |
759 | ISD::BR_CC, |
760 | ISD::FADD, |
761 | ISD::FSUB, |
762 | ISD::FMUL, |
763 | ISD::FDIV, |
764 | ISD::FMA, |
765 | ISD::FCEIL, |
766 | ISD::FSQRT, |
767 | ISD::FFLOOR, |
768 | ISD::FNEARBYINT, |
769 | ISD::FRINT, |
770 | ISD::FROUND, |
771 | ISD::FROUNDEVEN, |
772 | ISD::FTRUNC, |
773 | ISD::FMINNUM, |
774 | ISD::FMAXNUM, |
775 | ISD::FMINIMUM, |
776 | ISD::FMAXIMUM, |
777 | ISD::STRICT_FADD, |
778 | ISD::STRICT_FSUB, |
779 | ISD::STRICT_FMUL, |
780 | ISD::STRICT_FDIV, |
781 | ISD::STRICT_FMA, |
782 | ISD::STRICT_FCEIL, |
783 | ISD::STRICT_FFLOOR, |
784 | ISD::STRICT_FSQRT, |
785 | ISD::STRICT_FRINT, |
786 | ISD::STRICT_FNEARBYINT, |
787 | ISD::STRICT_FROUND, |
788 | ISD::STRICT_FTRUNC, |
789 | ISD::STRICT_FROUNDEVEN, |
790 | ISD::STRICT_FMINNUM, |
791 | ISD::STRICT_FMAXNUM, |
792 | ISD::STRICT_FMINIMUM, |
793 | ISD::STRICT_FMAXIMUM, |
794 | }) |
795 | setOperationAction(Op, VT: ScalarVT, Action: Promote); |
796 | |
797 | for (auto Op : {ISD::FNEG, ISD::FABS}) |
798 | setOperationAction(Op, VT: ScalarVT, Action: Legal); |
799 | |
800 | // Round-to-integer operations need custom lowering for fp16, as Promote |
801 | // doesn't work because the result type is integer. |
802 | for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT, |
803 | ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT, |
804 | ISD::STRICT_LLRINT}) |
805 | setOperationAction(Op, VT: ScalarVT, Action: Custom); |
806 | |
807 | // promote v4f16 to v4f32 when that is known to be safe. |
808 | auto V4Narrow = MVT::getVectorVT(VT: ScalarVT, NumElements: 4); |
809 | setOperationPromotedToType(Opc: ISD::FADD, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
810 | setOperationPromotedToType(Opc: ISD::FSUB, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
811 | setOperationPromotedToType(Opc: ISD::FMUL, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
812 | setOperationPromotedToType(Opc: ISD::FDIV, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
813 | setOperationPromotedToType(Opc: ISD::FCEIL, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
814 | setOperationPromotedToType(Opc: ISD::FFLOOR, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
815 | setOperationPromotedToType(Opc: ISD::FROUND, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
816 | setOperationPromotedToType(Opc: ISD::FTRUNC, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
817 | setOperationPromotedToType(Opc: ISD::FROUNDEVEN, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
818 | setOperationPromotedToType(Opc: ISD::FRINT, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
819 | setOperationPromotedToType(Opc: ISD::FNEARBYINT, OrigVT: V4Narrow, DestVT: MVT::v4f32); |
820 | |
821 | setOperationAction(Op: ISD::FABS, VT: V4Narrow, Action: Legal); |
822 | setOperationAction(Op: ISD::FNEG, VT: V4Narrow, Action: Legal); |
823 | setOperationAction(Op: ISD::FMA, VT: V4Narrow, Action: Expand); |
824 | setOperationAction(Op: ISD::SETCC, VT: V4Narrow, Action: Custom); |
825 | setOperationAction(Op: ISD::BR_CC, VT: V4Narrow, Action: Expand); |
826 | setOperationAction(Op: ISD::SELECT, VT: V4Narrow, Action: Expand); |
827 | setOperationAction(Op: ISD::SELECT_CC, VT: V4Narrow, Action: Expand); |
828 | setOperationAction(Op: ISD::FCOPYSIGN, VT: V4Narrow, Action: Custom); |
829 | setOperationAction(Op: ISD::FSQRT, VT: V4Narrow, Action: Expand); |
830 | |
831 | auto V8Narrow = MVT::getVectorVT(VT: ScalarVT, NumElements: 8); |
832 | setOperationAction(Op: ISD::FABS, VT: V8Narrow, Action: Legal); |
833 | setOperationAction(Op: ISD::FADD, VT: V8Narrow, Action: Legal); |
834 | setOperationAction(Op: ISD::FCEIL, VT: V8Narrow, Action: Legal); |
835 | setOperationAction(Op: ISD::FCOPYSIGN, VT: V8Narrow, Action: Custom); |
836 | setOperationAction(Op: ISD::FDIV, VT: V8Narrow, Action: Legal); |
837 | setOperationAction(Op: ISD::FFLOOR, VT: V8Narrow, Action: Legal); |
838 | setOperationAction(Op: ISD::FMA, VT: V8Narrow, Action: Expand); |
839 | setOperationAction(Op: ISD::FMUL, VT: V8Narrow, Action: Legal); |
840 | setOperationAction(Op: ISD::FNEARBYINT, VT: V8Narrow, Action: Legal); |
841 | setOperationAction(Op: ISD::FNEG, VT: V8Narrow, Action: Legal); |
842 | setOperationAction(Op: ISD::FROUND, VT: V8Narrow, Action: Legal); |
843 | setOperationAction(Op: ISD::FROUNDEVEN, VT: V8Narrow, Action: Legal); |
844 | setOperationAction(Op: ISD::FRINT, VT: V8Narrow, Action: Legal); |
845 | setOperationAction(Op: ISD::FSQRT, VT: V8Narrow, Action: Expand); |
846 | setOperationAction(Op: ISD::FSUB, VT: V8Narrow, Action: Legal); |
847 | setOperationAction(Op: ISD::FTRUNC, VT: V8Narrow, Action: Legal); |
848 | setOperationAction(Op: ISD::SETCC, VT: V8Narrow, Action: Expand); |
849 | setOperationAction(Op: ISD::BR_CC, VT: V8Narrow, Action: Expand); |
850 | setOperationAction(Op: ISD::SELECT, VT: V8Narrow, Action: Expand); |
851 | setOperationAction(Op: ISD::SELECT_CC, VT: V8Narrow, Action: Expand); |
852 | setOperationAction(Op: ISD::FP_EXTEND, VT: V8Narrow, Action: Expand); |
853 | }; |
854 | |
855 | if (!Subtarget->hasFullFP16()) { |
856 | LegalizeNarrowFP(MVT::f16); |
857 | } |
858 | LegalizeNarrowFP(MVT::bf16); |
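// Rough illustration of what the Promote actions above mean for scalar f16
// without FullFP16 (a sketch of the typical lowering, not a guarantee):
//   half c = a + b;
//     fcvt s0, h0        // extend both operands to f32
//     fcvt s1, h1
//     fadd s0, s0, s1    // operate in f32
//     fcvt h0, s0        // round the result back to f16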
859 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::v4f32, Action: Custom); |
860 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::v4bf16, Action: Custom); |
861 | |
862 | // AArch64 has implementations of a lot of rounding-like FP operations. |
863 | for (auto Op : |
864 | {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, |
865 | ISD::FRINT, ISD::FTRUNC, ISD::FROUND, |
866 | ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM, |
867 | ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND, |
868 | ISD::LLROUND, ISD::LRINT, ISD::LLRINT, |
869 | ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT, |
870 | ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN, |
871 | ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, |
872 | ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND, |
873 | ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) { |
874 | for (MVT Ty : {MVT::f32, MVT::f64}) |
875 | setOperationAction(Op, VT: Ty, Action: Legal); |
876 | if (Subtarget->hasFullFP16()) |
877 | setOperationAction(Op, VT: MVT::f16, Action: Legal); |
878 | } |
879 | |
880 | // Basic strict FP operations are legal |
881 | for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, |
882 | ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) { |
883 | for (MVT Ty : {MVT::f32, MVT::f64}) |
884 | setOperationAction(Op, VT: Ty, Action: Legal); |
885 | if (Subtarget->hasFullFP16()) |
886 | setOperationAction(Op, VT: MVT::f16, Action: Legal); |
887 | } |
888 | |
889 | // Strict conversion to a larger type is legal |
890 | for (auto VT : {MVT::f32, MVT::f64}) |
891 | setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT, Action: Legal); |
892 | |
893 | setOperationAction(Op: ISD::PREFETCH, VT: MVT::Other, Action: Custom); |
894 | |
895 | setOperationAction(Op: ISD::GET_ROUNDING, VT: MVT::i32, Action: Custom); |
896 | setOperationAction(Op: ISD::SET_ROUNDING, VT: MVT::Other, Action: Custom); |
897 | setOperationAction(Op: ISD::GET_FPMODE, VT: MVT::i32, Action: Custom); |
898 | setOperationAction(Op: ISD::SET_FPMODE, VT: MVT::i32, Action: Custom); |
899 | setOperationAction(Op: ISD::RESET_FPMODE, VT: MVT::Other, Action: Custom); |
900 | |
901 | setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i128, Action: Custom); |
902 | if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) { |
903 | setOperationAction(Op: ISD::ATOMIC_LOAD_SUB, VT: MVT::i32, Action: LibCall); |
904 | setOperationAction(Op: ISD::ATOMIC_LOAD_SUB, VT: MVT::i64, Action: LibCall); |
905 | } else { |
906 | setOperationAction(Op: ISD::ATOMIC_LOAD_SUB, VT: MVT::i32, Action: Expand); |
907 | setOperationAction(Op: ISD::ATOMIC_LOAD_SUB, VT: MVT::i64, Action: Expand); |
908 | } |
909 | setOperationAction(Op: ISD::ATOMIC_LOAD_AND, VT: MVT::i32, Action: Custom); |
910 | setOperationAction(Op: ISD::ATOMIC_LOAD_AND, VT: MVT::i64, Action: Custom); |
911 | |
912 | // Generate outline atomics library calls only if LSE was not specified for |
913 | // subtarget |
914 | if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) { |
915 | setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i8, Action: LibCall); |
916 | setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i16, Action: LibCall); |
917 | setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i32, Action: LibCall); |
918 | setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i64, Action: LibCall); |
919 | setOperationAction(Op: ISD::ATOMIC_CMP_SWAP, VT: MVT::i128, Action: LibCall); |
920 | setOperationAction(Op: ISD::ATOMIC_SWAP, VT: MVT::i8, Action: LibCall); |
921 | setOperationAction(Op: ISD::ATOMIC_SWAP, VT: MVT::i16, Action: LibCall); |
922 | setOperationAction(Op: ISD::ATOMIC_SWAP, VT: MVT::i32, Action: LibCall); |
923 | setOperationAction(Op: ISD::ATOMIC_SWAP, VT: MVT::i64, Action: LibCall); |
924 | setOperationAction(Op: ISD::ATOMIC_LOAD_ADD, VT: MVT::i8, Action: LibCall); |
925 | setOperationAction(Op: ISD::ATOMIC_LOAD_ADD, VT: MVT::i16, Action: LibCall); |
926 | setOperationAction(Op: ISD::ATOMIC_LOAD_ADD, VT: MVT::i32, Action: LibCall); |
927 | setOperationAction(Op: ISD::ATOMIC_LOAD_ADD, VT: MVT::i64, Action: LibCall); |
928 | setOperationAction(Op: ISD::ATOMIC_LOAD_OR, VT: MVT::i8, Action: LibCall); |
929 | setOperationAction(Op: ISD::ATOMIC_LOAD_OR, VT: MVT::i16, Action: LibCall); |
930 | setOperationAction(Op: ISD::ATOMIC_LOAD_OR, VT: MVT::i32, Action: LibCall); |
931 | setOperationAction(Op: ISD::ATOMIC_LOAD_OR, VT: MVT::i64, Action: LibCall); |
932 | setOperationAction(Op: ISD::ATOMIC_LOAD_CLR, VT: MVT::i8, Action: LibCall); |
933 | setOperationAction(Op: ISD::ATOMIC_LOAD_CLR, VT: MVT::i16, Action: LibCall); |
934 | setOperationAction(Op: ISD::ATOMIC_LOAD_CLR, VT: MVT::i32, Action: LibCall); |
935 | setOperationAction(Op: ISD::ATOMIC_LOAD_CLR, VT: MVT::i64, Action: LibCall); |
936 | setOperationAction(Op: ISD::ATOMIC_LOAD_XOR, VT: MVT::i8, Action: LibCall); |
937 | setOperationAction(Op: ISD::ATOMIC_LOAD_XOR, VT: MVT::i16, Action: LibCall); |
938 | setOperationAction(Op: ISD::ATOMIC_LOAD_XOR, VT: MVT::i32, Action: LibCall); |
939 | setOperationAction(Op: ISD::ATOMIC_LOAD_XOR, VT: MVT::i64, Action: LibCall); |
940 | #define LCALLNAMES(A, B, N) \ |
941 | setLibcallName(A##N##_RELAX, #B #N "_relax"); \ |
942 | setLibcallName(A##N##_ACQ, #B #N "_acq"); \ |
943 | setLibcallName(A##N##_REL, #B #N "_rel"); \ |
944 | setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel"); |
945 | #define LCALLNAME4(A, B) \ |
946 | LCALLNAMES(A, B, 1) \ |
947 | LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) |
948 | #define LCALLNAME5(A, B) \ |
949 | LCALLNAMES(A, B, 1) \ |
950 | LCALLNAMES(A, B, 2) \ |
951 | LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16) |
952 | LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) |
953 | LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) |
954 | LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd) |
955 | LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset) |
956 | LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr) |
957 | LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor) |
958 | #undef LCALLNAMES |
959 | #undef LCALLNAME4 |
960 | #undef LCALLNAME5 |
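// For example, the macros above register libcall names such as
// __aarch64_cas16_acq_rel, __aarch64_swp4_rel and __aarch64_ldadd8_relax
// (size suffix from LCALLNAME4/LCALLNAME5, memory-order suffix from
// LCALLNAMES).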
961 | } |
962 | |
963 | if (Subtarget->hasLSE128()) { |
964 | // Custom lowering because i128 is not legal. Must be replaced by 2x64 |
965 | // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP. |
966 | setOperationAction(Op: ISD::ATOMIC_LOAD_AND, VT: MVT::i128, Action: Custom); |
967 | setOperationAction(Op: ISD::ATOMIC_LOAD_OR, VT: MVT::i128, Action: Custom); |
968 | setOperationAction(Op: ISD::ATOMIC_SWAP, VT: MVT::i128, Action: Custom); |
969 | } |
970 | |
971 | // 128-bit loads and stores can be done without expanding |
972 | setOperationAction(Op: ISD::LOAD, VT: MVT::i128, Action: Custom); |
973 | setOperationAction(Op: ISD::STORE, VT: MVT::i128, Action: Custom); |
974 | |
975 | // Aligned 128-bit loads and stores are single-copy atomic according to the |
976 | // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2. |
977 | if (Subtarget->hasLSE2()) { |
978 | setOperationAction(Op: ISD::ATOMIC_LOAD, VT: MVT::i128, Action: Custom); |
979 | setOperationAction(Op: ISD::ATOMIC_STORE, VT: MVT::i128, Action: Custom); |
980 | } |
981 | |
982 | // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the |
983 | // custom lowering, as there are no un-paired non-temporal stores and |
984 | // legalization will break up 256 bit inputs. |
985 | setOperationAction(Op: ISD::STORE, VT: MVT::v32i8, Action: Custom); |
986 | setOperationAction(Op: ISD::STORE, VT: MVT::v16i16, Action: Custom); |
987 | setOperationAction(Op: ISD::STORE, VT: MVT::v16f16, Action: Custom); |
988 | setOperationAction(Op: ISD::STORE, VT: MVT::v16bf16, Action: Custom); |
989 | setOperationAction(Op: ISD::STORE, VT: MVT::v8i32, Action: Custom); |
990 | setOperationAction(Op: ISD::STORE, VT: MVT::v8f32, Action: Custom); |
991 | setOperationAction(Op: ISD::STORE, VT: MVT::v4f64, Action: Custom); |
992 | setOperationAction(Op: ISD::STORE, VT: MVT::v4i64, Action: Custom); |
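// Illustrative sketch of the intent (the exact assembly depends on the
// operands): a single 256-bit non-temporal v8i32 store can be emitted as
//   stnp q0, q1, [x0]
// instead of being split into two unrelated 128-bit stores by legalization.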
993 | |
994 | // 256 bit non-temporal loads can be lowered to LDNP. This is done using |
995 | // custom lowering, as there are no un-paired non-temporal loads and |
996 | // legalization will break up 256 bit inputs. |
997 | setOperationAction(Op: ISD::LOAD, VT: MVT::v32i8, Action: Custom); |
998 | setOperationAction(Op: ISD::LOAD, VT: MVT::v16i16, Action: Custom); |
999 | setOperationAction(Op: ISD::LOAD, VT: MVT::v16f16, Action: Custom); |
1000 | setOperationAction(Op: ISD::LOAD, VT: MVT::v16bf16, Action: Custom); |
1001 | setOperationAction(Op: ISD::LOAD, VT: MVT::v8i32, Action: Custom); |
1002 | setOperationAction(Op: ISD::LOAD, VT: MVT::v8f32, Action: Custom); |
1003 | setOperationAction(Op: ISD::LOAD, VT: MVT::v4f64, Action: Custom); |
1004 | setOperationAction(Op: ISD::LOAD, VT: MVT::v4i64, Action: Custom); |
1005 | |
1006 | // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. |
1007 | setOperationAction(Op: ISD::READCYCLECOUNTER, VT: MVT::i64, Action: Legal); |
1008 | |
1009 | if (getLibcallName(Call: RTLIB::SINCOS_STRET_F32) != nullptr && |
1010 | getLibcallName(Call: RTLIB::SINCOS_STRET_F64) != nullptr) { |
1011 | // Issue __sincos_stret if available. |
1012 | setOperationAction(Op: ISD::FSINCOS, VT: MVT::f64, Action: Custom); |
1013 | setOperationAction(Op: ISD::FSINCOS, VT: MVT::f32, Action: Custom); |
1014 | } else { |
1015 | setOperationAction(Op: ISD::FSINCOS, VT: MVT::f64, Action: Expand); |
1016 | setOperationAction(Op: ISD::FSINCOS, VT: MVT::f32, Action: Expand); |
1017 | } |
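// For example (illustrative): when the libcall is available, computing both
// sin(x) and cos(x) of the same argument can be merged into one
// __sincos_stret call returning both results, rather than two libm calls.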
1018 | |
1019 | // Make floating-point constants legal for the large code model, so they don't |
1020 | // become loads from the constant pool. |
1021 | if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { |
1022 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f32, Action: Legal); |
1023 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f64, Action: Legal); |
1024 | } |
1025 | |
1026 | // AArch64 does not have floating-point extending loads, i1 sign-extending |
1027 | // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores. |
1028 | for (MVT VT : MVT::fp_valuetypes()) { |
1029 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: MVT::bf16, Action: Expand); |
1030 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: MVT::f16, Action: Expand); |
1031 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: MVT::f32, Action: Expand); |
1032 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: MVT::f64, Action: Expand); |
1033 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: MVT::f80, Action: Expand); |
1034 | } |
1035 | for (MVT VT : MVT::integer_valuetypes()) |
1036 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: MVT::i1, Action: Expand); |
1037 | |
1038 | for (MVT WideVT : MVT::fp_valuetypes()) { |
1039 | for (MVT NarrowVT : MVT::fp_valuetypes()) { |
1040 | if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) { |
1041 | setTruncStoreAction(ValVT: WideVT, MemVT: NarrowVT, Action: Expand); |
1042 | } |
1043 | } |
1044 | } |
1045 | |
1046 | if (Subtarget->hasFPARMv8()) { |
1047 | setOperationAction(Op: ISD::BITCAST, VT: MVT::i16, Action: Custom); |
1048 | setOperationAction(Op: ISD::BITCAST, VT: MVT::f16, Action: Custom); |
1049 | setOperationAction(Op: ISD::BITCAST, VT: MVT::bf16, Action: Custom); |
1050 | } |
1051 | |
1052 | // Indexed loads and stores are supported. |
1053 | for (unsigned im = (unsigned)ISD::PRE_INC; |
1054 | im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { |
1055 | setIndexedLoadAction(IdxModes: im, VT: MVT::i8, Action: Legal); |
1056 | setIndexedLoadAction(IdxModes: im, VT: MVT::i16, Action: Legal); |
1057 | setIndexedLoadAction(IdxModes: im, VT: MVT::i32, Action: Legal); |
1058 | setIndexedLoadAction(IdxModes: im, VT: MVT::i64, Action: Legal); |
1059 | setIndexedLoadAction(IdxModes: im, VT: MVT::f64, Action: Legal); |
1060 | setIndexedLoadAction(IdxModes: im, VT: MVT::f32, Action: Legal); |
1061 | setIndexedLoadAction(IdxModes: im, VT: MVT::f16, Action: Legal); |
1062 | setIndexedLoadAction(IdxModes: im, VT: MVT::bf16, Action: Legal); |
1063 | setIndexedStoreAction(IdxModes: im, VT: MVT::i8, Action: Legal); |
1064 | setIndexedStoreAction(IdxModes: im, VT: MVT::i16, Action: Legal); |
1065 | setIndexedStoreAction(IdxModes: im, VT: MVT::i32, Action: Legal); |
1066 | setIndexedStoreAction(IdxModes: im, VT: MVT::i64, Action: Legal); |
1067 | setIndexedStoreAction(IdxModes: im, VT: MVT::f64, Action: Legal); |
1068 | setIndexedStoreAction(IdxModes: im, VT: MVT::f32, Action: Legal); |
1069 | setIndexedStoreAction(IdxModes: im, VT: MVT::f16, Action: Legal); |
1070 | setIndexedStoreAction(IdxModes: im, VT: MVT::bf16, Action: Legal); |
1071 | } |
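// Illustrative examples of the indexed forms this enables (assembly shown
// only for intuition):
//   ldr x0, [x1, #16]!   // pre-indexed:  x1 += 16, then load
//   str w0, [x1], #4     // post-indexed: store, then x1 += 4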
1072 | |
1073 | // Trap. |
1074 | setOperationAction(Op: ISD::TRAP, VT: MVT::Other, Action: Legal); |
1075 | setOperationAction(Op: ISD::DEBUGTRAP, VT: MVT::Other, Action: Legal); |
1076 | setOperationAction(Op: ISD::UBSANTRAP, VT: MVT::Other, Action: Legal); |
1077 | |
1078 | // We combine OR nodes for bitfield operations. |
1079 | setTargetDAGCombine(ISD::OR); |
1080 | // Try to create BICs for vector ANDs. |
1081 | setTargetDAGCombine(ISD::AND); |
1082 | |
1083 | // llvm.init.trampoline and llvm.adjust.trampoline |
1084 | setOperationAction(Op: ISD::INIT_TRAMPOLINE, VT: MVT::Other, Action: Custom); |
1085 | setOperationAction(Op: ISD::ADJUST_TRAMPOLINE, VT: MVT::Other, Action: Custom); |
1086 | |
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV.
1089 | setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP, |
1090 | ISD::UINT_TO_FP}); |
1091 | |
1092 | setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, |
1093 | ISD::FP_TO_UINT_SAT, ISD::FADD}); |
1094 | |
// Try to combine setcc with csel.
1096 | setTargetDAGCombine(ISD::SETCC); |
1097 | |
1098 | setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); |
1099 | |
1100 | setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND, |
1101 | ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS, |
1102 | ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR, |
1103 | ISD::STORE, ISD::BUILD_VECTOR}); |
1104 | setTargetDAGCombine(ISD::TRUNCATE); |
1105 | setTargetDAGCombine(ISD::LOAD); |
1106 | |
1107 | setTargetDAGCombine(ISD::MSTORE); |
1108 | |
1109 | setTargetDAGCombine(ISD::MUL); |
1110 | |
1111 | setTargetDAGCombine({ISD::SELECT, ISD::VSELECT}); |
1112 | |
1113 | setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, |
1114 | ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, |
1115 | ISD::VECREDUCE_ADD, ISD::STEP_VECTOR}); |
1116 | |
1117 | setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER}); |
1118 | |
1119 | setTargetDAGCombine(ISD::FP_EXTEND); |
1120 | |
1121 | setTargetDAGCombine(ISD::GlobalAddress); |
1122 | |
1123 | setTargetDAGCombine(ISD::CTLZ); |
1124 | |
1125 | setTargetDAGCombine(ISD::VECREDUCE_AND); |
1126 | setTargetDAGCombine(ISD::VECREDUCE_OR); |
1127 | setTargetDAGCombine(ISD::VECREDUCE_XOR); |
1128 | |
1129 | setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); |
1130 | |
// In case of strict alignment, avoid an excessive number of byte-wide stores.
1132 | MaxStoresPerMemsetOptSize = 8; |
1133 | MaxStoresPerMemset = |
1134 | Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; |
1135 | |
1136 | MaxGluedStoresPerMemcpy = 4; |
1137 | MaxStoresPerMemcpyOptSize = 4; |
1138 | MaxStoresPerMemcpy = |
1139 | Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; |
1140 | |
1141 | MaxStoresPerMemmoveOptSize = 4; |
1142 | MaxStoresPerMemmove = 4; |
1143 | |
1144 | MaxLoadsPerMemcmpOptSize = 4; |
1145 | MaxLoadsPerMemcmp = |
1146 | Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8; |
1147 | |
1148 | setStackPointerRegisterToSaveRestore(AArch64::SP); |
1149 | |
1150 | setSchedulingPreference(Sched::Hybrid); |
1151 | |
1152 | EnableExtLdPromotion = true; |
1153 | |
1154 | // Set required alignment. |
1155 | setMinFunctionAlignment(Align(4)); |
1156 | // Set preferred alignments. |
1157 | |
1158 | // Don't align loops on Windows. The SEH unwind info generation needs to |
1159 | // know the exact length of functions before the alignments have been |
1160 | // expanded. |
1161 | if (!Subtarget->isTargetWindows()) |
1162 | setPrefLoopAlignment(STI.getPrefLoopAlignment()); |
1163 | setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment()); |
1164 | setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); |
1165 | |
// Only change the limit for entries in a jump table if specified by
// the subtarget, but not at the command line.
1168 | unsigned MaxJT = STI.getMaximumJumpTableSize(); |
1169 | if (MaxJT && getMaximumJumpTableSize() == UINT_MAX) |
1170 | setMaximumJumpTableSize(MaxJT); |
1171 | |
1172 | setHasExtractBitsInsn(true); |
1173 | |
1174 | setMaxDivRemBitWidthSupported(128); |
1175 | |
1176 | setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::Other, Action: Custom); |
1177 | |
1178 | if (Subtarget->isNeonAvailable()) { |
1179 | // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to |
1180 | // silliness like this: |
1181 | // clang-format off |
1182 | for (auto Op : |
1183 | {ISD::SELECT, ISD::SELECT_CC, |
1184 | ISD::BR_CC, ISD::FADD, ISD::FSUB, |
1185 | ISD::FMUL, ISD::FDIV, ISD::FMA, |
1186 | ISD::FNEG, ISD::FABS, ISD::FCEIL, |
1187 | ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT, |
1188 | ISD::FSIN, ISD::FCOS, ISD::FTAN, |
1189 | ISD::FASIN, ISD::FACOS, ISD::FATAN, |
1190 | ISD::FSINH, ISD::FCOSH, ISD::FTANH, |
1191 | ISD::FPOW, ISD::FLOG, ISD::FLOG2, |
1192 | ISD::FLOG10, ISD::FEXP, ISD::FEXP2, |
1193 | ISD::FEXP10, ISD::FRINT, ISD::FROUND, |
1194 | ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, |
1195 | ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM, |
1196 | ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL, |
1197 | ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FCEIL, |
1198 | ISD::STRICT_FFLOOR, ISD::STRICT_FSQRT, ISD::STRICT_FRINT, |
1199 | ISD::STRICT_FNEARBYINT, ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, |
1200 | ISD::STRICT_FROUNDEVEN, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, |
1201 | ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM}) |
1202 | setOperationAction(Op, VT: MVT::v1f64, Action: Expand); |
1203 | // clang-format on |
1204 | for (auto Op : |
1205 | {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, |
1206 | ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL, |
1207 | ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, |
1208 | ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND}) |
1209 | setOperationAction(Op, VT: MVT::v1i64, Action: Expand); |
1210 | |
// AArch64 doesn't have direct vector->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
1213 | setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32); |
1214 | setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i8, DestVT: MVT::v4i32); |
1215 | |
// Similarly, there is no direct i32 -> f64 vector conversion instruction,
// nor a direct i32 -> f16 one. Set these to Custom, so the conversion
// happens in two steps: v4i32 -> v4f32 -> v4f16.
1219 | for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, |
1220 | ISD::STRICT_UINT_TO_FP}) |
1221 | for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32}) |
1222 | setOperationAction(Op, VT, Action: Custom); |
1223 | |
1224 | if (Subtarget->hasFullFP16()) { |
1225 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::f16, Action: Legal); |
1226 | setOperationAction(Op: ISD::ConstantFP, VT: MVT::bf16, Action: Legal); |
1227 | |
1228 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i8, Action: Custom); |
1229 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i8, Action: Custom); |
1230 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v16i8, Action: Custom); |
1231 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v16i8, Action: Custom); |
1232 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v4i16, Action: Custom); |
1233 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v4i16, Action: Custom); |
1234 | setOperationAction(Op: ISD::SINT_TO_FP, VT: MVT::v8i16, Action: Custom); |
1235 | setOperationAction(Op: ISD::UINT_TO_FP, VT: MVT::v8i16, Action: Custom); |
1236 | } else { |
// When AArch64 doesn't have full fp16 support, promote the input
// to i32 first.
1239 | setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32); |
1240 | setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i8, DestVT: MVT::v8i32); |
1241 | setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32); |
1242 | setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v16i8, DestVT: MVT::v16i32); |
1243 | setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32); |
1244 | setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v4i16, DestVT: MVT::v4i32); |
1245 | setOperationPromotedToType(Opc: ISD::SINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32); |
1246 | setOperationPromotedToType(Opc: ISD::UINT_TO_FP, OrigVT: MVT::v8i16, DestVT: MVT::v8i32); |
1247 | } |
1248 | |
1249 | setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Expand); |
1250 | setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Expand); |
1251 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v8i8, Action: Legal); |
1252 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v16i8, Action: Legal); |
1253 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i32, Action: Custom); |
1254 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v4i32, Action: Custom); |
1255 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom); |
1256 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v2i64, Action: Custom); |
1257 | for (auto VT : {MVT::v1i64, MVT::v2i64}) { |
1258 | setOperationAction(Op: ISD::UMAX, VT, Action: Custom); |
1259 | setOperationAction(Op: ISD::SMAX, VT, Action: Custom); |
1260 | setOperationAction(Op: ISD::UMIN, VT, Action: Custom); |
1261 | setOperationAction(Op: ISD::SMIN, VT, Action: Custom); |
1262 | } |
1263 | |
1264 | // Custom handling for some quad-vector types to detect MULL. |
1265 | setOperationAction(Op: ISD::MUL, VT: MVT::v8i16, Action: Custom); |
1266 | setOperationAction(Op: ISD::MUL, VT: MVT::v4i32, Action: Custom); |
1267 | setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom); |
1268 | setOperationAction(Op: ISD::MUL, VT: MVT::v4i16, Action: Custom); |
1269 | setOperationAction(Op: ISD::MUL, VT: MVT::v2i32, Action: Custom); |
1270 | setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom); |
1271 | |
1272 | // Saturates |
1273 | for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, |
1274 | MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { |
1275 | setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal); |
1276 | setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal); |
1277 | setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal); |
1278 | setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal); |
1279 | } |
1280 | |
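// Halving adds and absolute differences map directly to NEON instructions
// (SHADD/UHADD, SRHADD/URHADD, SABD/UABD).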
1281 | for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, |
1282 | MVT::v4i32}) { |
1283 | setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Legal); |
1284 | setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Legal); |
1285 | setOperationAction(Op: ISD::AVGCEILS, VT, Action: Legal); |
1286 | setOperationAction(Op: ISD::AVGCEILU, VT, Action: Legal); |
1287 | setOperationAction(Op: ISD::ABDS, VT, Action: Legal); |
1288 | setOperationAction(Op: ISD::ABDU, VT, Action: Legal); |
1289 | } |
1290 | |
1291 | // Vector reductions |
1292 | for (MVT VT : { MVT::v4f16, MVT::v2f32, |
1293 | MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { |
1294 | if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) { |
1295 | setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Legal); |
1296 | setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Legal); |
1297 | setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Legal); |
1298 | setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Legal); |
1299 | |
1300 | setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Legal); |
1301 | } |
1302 | } |
1303 | for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, |
1304 | MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { |
1305 | setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom); |
1306 | setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom); |
1307 | setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom); |
1308 | setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom); |
1309 | setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom); |
1310 | setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom); |
1311 | setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom); |
1312 | setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom); |
1313 | } |
1314 | setOperationAction(Op: ISD::VECREDUCE_ADD, VT: MVT::v2i64, Action: Custom); |
1315 | setOperationAction(Op: ISD::VECREDUCE_AND, VT: MVT::v2i64, Action: Custom); |
1316 | setOperationAction(Op: ISD::VECREDUCE_OR, VT: MVT::v2i64, Action: Custom); |
1317 | setOperationAction(Op: ISD::VECREDUCE_XOR, VT: MVT::v2i64, Action: Custom); |
1318 | |
1319 | setOperationAction(Op: ISD::ANY_EXTEND, VT: MVT::v4i32, Action: Legal); |
1320 | setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Expand); |
1321 | // Likewise, narrowing and extending vector loads/stores aren't handled |
1322 | // directly. |
1323 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
1324 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Expand); |
1325 | |
1326 | if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { |
1327 | setOperationAction(Op: ISD::MULHS, VT, Action: Legal); |
1328 | setOperationAction(Op: ISD::MULHU, VT, Action: Legal); |
1329 | } else { |
1330 | setOperationAction(Op: ISD::MULHS, VT, Action: Expand); |
1331 | setOperationAction(Op: ISD::MULHU, VT, Action: Expand); |
1332 | } |
1333 | setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand); |
1334 | setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand); |
1335 | |
1336 | setOperationAction(Op: ISD::BSWAP, VT, Action: Expand); |
1337 | setOperationAction(Op: ISD::CTTZ, VT, Action: Expand); |
1338 | |
1339 | for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { |
1340 | setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand); |
1341 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1342 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1343 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1344 | } |
1345 | } |
1346 | |
1347 | // AArch64 has implementations of a lot of rounding-like FP operations. |
1348 | for (auto Op : |
1349 | {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, |
1350 | ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR, |
1351 | ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT, |
1352 | ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) { |
1353 | for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) |
1354 | setOperationAction(Op, VT: Ty, Action: Legal); |
1355 | if (Subtarget->hasFullFP16()) |
1356 | for (MVT Ty : {MVT::v4f16, MVT::v8f16}) |
1357 | setOperationAction(Op, VT: Ty, Action: Legal); |
1358 | } |
1359 | |
1360 | // LRINT and LLRINT. |
1361 | for (auto Op : {ISD::LRINT, ISD::LLRINT}) { |
1362 | for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) |
1363 | setOperationAction(Op, VT: Ty, Action: Custom); |
1364 | if (Subtarget->hasFullFP16()) |
1365 | for (MVT Ty : {MVT::v4f16, MVT::v8f16}) |
1366 | setOperationAction(Op, VT: Ty, Action: Custom); |
1367 | } |
1368 | |
1369 | setTruncStoreAction(ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom); |
1370 | |
1371 | setOperationAction(Op: ISD::BITCAST, VT: MVT::i2, Action: Custom); |
1372 | setOperationAction(Op: ISD::BITCAST, VT: MVT::i4, Action: Custom); |
1373 | setOperationAction(Op: ISD::BITCAST, VT: MVT::i8, Action: Custom); |
1374 | setOperationAction(Op: ISD::BITCAST, VT: MVT::i16, Action: Custom); |
1375 | |
1376 | setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i8, Action: Custom); |
1377 | setOperationAction(Op: ISD::BITCAST, VT: MVT::v2i16, Action: Custom); |
1378 | setOperationAction(Op: ISD::BITCAST, VT: MVT::v4i8, Action: Custom); |
1379 | |
1380 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom); |
1381 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom); |
1382 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i16, MemVT: MVT::v4i8, Action: Custom); |
1383 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom); |
1384 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom); |
1385 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: MVT::v4i32, MemVT: MVT::v4i8, Action: Custom); |
1386 | |
1387 | // ADDP custom lowering |
1388 | for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) |
1389 | setOperationAction(Op: ISD::ADD, VT, Action: Custom); |
1390 | // FADDP custom lowering |
1391 | for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) |
1392 | setOperationAction(Op: ISD::FADD, VT, Action: Custom); |
1393 | } else /* !isNeonAvailable */ { |
1394 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
1395 | for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) |
1396 | setOperationAction(Op, VT, Action: Expand); |
1397 | |
1398 | if (VT.is128BitVector() || VT.is64BitVector()) { |
1399 | setOperationAction(Op: ISD::LOAD, VT, Action: Legal); |
1400 | setOperationAction(Op: ISD::STORE, VT, Action: Legal); |
1401 | setOperationAction(Op: ISD::BITCAST, VT, |
1402 | Action: Subtarget->isLittleEndian() ? Legal : Expand); |
1403 | } |
1404 | for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { |
1405 | setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand); |
1406 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1407 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1408 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1409 | } |
1410 | } |
1411 | } |
1412 | |
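// SME needs custom lowering for some chained intrinsics.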
1413 | if (Subtarget->hasSME()) { |
1414 | setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::Other, Action: Custom); |
1415 | } |
1416 | |
1417 | // FIXME: Move lowering for more nodes here if those are common between |
1418 | // SVE and SME. |
1419 | if (Subtarget->isSVEorStreamingSVEAvailable()) { |
1420 | for (auto VT : |
1421 | {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { |
1422 | setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Custom); |
1423 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom); |
1424 | setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom); |
1425 | setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom); |
1426 | } |
1427 | } |
1428 | |
1429 | if (Subtarget->isSVEorStreamingSVEAvailable()) { |
1430 | for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) { |
1431 | setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom); |
1432 | setOperationAction(Op: ISD::BSWAP, VT, Action: Custom); |
1433 | setOperationAction(Op: ISD::CTLZ, VT, Action: Custom); |
1434 | setOperationAction(Op: ISD::CTPOP, VT, Action: Custom); |
1435 | setOperationAction(Op: ISD::CTTZ, VT, Action: Custom); |
1436 | setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom); |
1437 | setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom); |
1438 | setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom); |
1439 | setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Custom); |
1440 | setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Custom); |
1441 | setOperationAction(Op: ISD::MLOAD, VT, Action: Custom); |
1442 | setOperationAction(Op: ISD::MUL, VT, Action: Custom); |
1443 | setOperationAction(Op: ISD::MULHS, VT, Action: Custom); |
1444 | setOperationAction(Op: ISD::MULHU, VT, Action: Custom); |
1445 | setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal); |
1446 | setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom); |
1447 | setOperationAction(Op: ISD::SELECT, VT, Action: Custom); |
1448 | setOperationAction(Op: ISD::SETCC, VT, Action: Custom); |
1449 | setOperationAction(Op: ISD::SDIV, VT, Action: Custom); |
1450 | setOperationAction(Op: ISD::UDIV, VT, Action: Custom); |
1451 | setOperationAction(Op: ISD::SMIN, VT, Action: Custom); |
1452 | setOperationAction(Op: ISD::UMIN, VT, Action: Custom); |
1453 | setOperationAction(Op: ISD::SMAX, VT, Action: Custom); |
1454 | setOperationAction(Op: ISD::UMAX, VT, Action: Custom); |
1455 | setOperationAction(Op: ISD::SHL, VT, Action: Custom); |
1456 | setOperationAction(Op: ISD::SRL, VT, Action: Custom); |
1457 | setOperationAction(Op: ISD::SRA, VT, Action: Custom); |
1458 | setOperationAction(Op: ISD::ABS, VT, Action: Custom); |
1459 | setOperationAction(Op: ISD::ABDS, VT, Action: Custom); |
1460 | setOperationAction(Op: ISD::ABDU, VT, Action: Custom); |
1461 | setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Custom); |
1462 | setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom); |
1463 | setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom); |
1464 | setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom); |
1465 | setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Custom); |
1466 | setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Custom); |
1467 | setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Custom); |
1468 | setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Custom); |
1469 | setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom); |
1470 | setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom); |
1471 | |
1472 | setOperationAction(Op: ISD::UMUL_LOHI, VT, Action: Expand); |
1473 | setOperationAction(Op: ISD::SMUL_LOHI, VT, Action: Expand); |
1474 | setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand); |
1475 | setOperationAction(Op: ISD::ROTL, VT, Action: Expand); |
1476 | setOperationAction(Op: ISD::ROTR, VT, Action: Expand); |
1477 | |
1478 | setOperationAction(Op: ISD::SADDSAT, VT, Action: Legal); |
1479 | setOperationAction(Op: ISD::UADDSAT, VT, Action: Legal); |
1480 | setOperationAction(Op: ISD::SSUBSAT, VT, Action: Legal); |
1481 | setOperationAction(Op: ISD::USUBSAT, VT, Action: Legal); |
1482 | setOperationAction(Op: ISD::UREM, VT, Action: Expand); |
1483 | setOperationAction(Op: ISD::SREM, VT, Action: Expand); |
1484 | setOperationAction(Op: ISD::SDIVREM, VT, Action: Expand); |
1485 | setOperationAction(Op: ISD::UDIVREM, VT, Action: Expand); |
1486 | |
1487 | setOperationAction(Op: ISD::AVGFLOORS, VT, Action: Custom); |
1488 | setOperationAction(Op: ISD::AVGFLOORU, VT, Action: Custom); |
1489 | setOperationAction(Op: ISD::AVGCEILS, VT, Action: Custom); |
1490 | setOperationAction(Op: ISD::AVGCEILU, VT, Action: Custom); |
1491 | |
1492 | if (!Subtarget->isLittleEndian()) |
1493 | setOperationAction(Op: ISD::BITCAST, VT, Action: Expand); |
1494 | |
1495 | if (Subtarget->hasSVE2() || |
1496 | (Subtarget->hasSME() && Subtarget->isStreaming())) |
1497 | // For SLI/SRI. |
1498 | setOperationAction(Op: ISD::OR, VT, Action: Custom); |
1499 | } |
1500 | |
1501 | // Illegal unpacked integer vector types. |
1502 | for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) { |
1503 | setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom); |
1504 | setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom); |
1505 | } |
1506 | |
1507 | // Legalize unpacked bitcasts to REINTERPRET_CAST. |
1508 | for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16, |
1509 | MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32}) |
1510 | setOperationAction(Op: ISD::BITCAST, VT, Action: Custom); |
1511 | |
1512 | for (auto VT : |
1513 | { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, |
1514 | MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) |
1515 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Legal); |
1516 | |
1517 | for (auto VT : |
1518 | {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) { |
1519 | setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom); |
1520 | setOperationAction(Op: ISD::SELECT, VT, Action: Custom); |
1521 | setOperationAction(Op: ISD::SETCC, VT, Action: Custom); |
1522 | setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom); |
1523 | setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom); |
1524 | setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom); |
1525 | setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom); |
1526 | |
1527 | setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand); |
1528 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom); |
1529 | setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom); |
1530 | |
1531 | // There are no legal MVT::nxv16f## based types. |
1532 | if (VT != MVT::nxv16i1) { |
1533 | setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Custom); |
1534 | setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Custom); |
1535 | } |
1536 | } |
1537 | |
1538 | // NEON doesn't support masked loads/stores, but SME and SVE do. |
1539 | for (auto VT : |
1540 | {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, |
1541 | MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
1542 | MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { |
1543 | setOperationAction(Op: ISD::MLOAD, VT, Action: Custom); |
1544 | setOperationAction(Op: ISD::MSTORE, VT, Action: Custom); |
1545 | } |
1546 | |
// First, exclude all scalable-vector extending loads and truncating stores,
// for both integer and floating-point scalable vectors.
1549 | for (MVT VT : MVT::scalable_vector_valuetypes()) { |
1550 | for (MVT InnerVT : MVT::scalable_vector_valuetypes()) { |
1551 | setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Expand); |
1552 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1553 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1554 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Expand); |
1555 | } |
1556 | } |
1557 | |
1558 | // Then, selectively enable those which we directly support. |
1559 | setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal); |
1560 | setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal); |
1561 | setTruncStoreAction(ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal); |
1562 | setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal); |
1563 | setTruncStoreAction(ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal); |
1564 | setTruncStoreAction(ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal); |
1565 | for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { |
1566 | setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i8, Action: Legal); |
1567 | setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i16, Action: Legal); |
1568 | setLoadExtAction(ExtType: Op, ValVT: MVT::nxv2i64, MemVT: MVT::nxv2i32, Action: Legal); |
1569 | setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i8, Action: Legal); |
1570 | setLoadExtAction(ExtType: Op, ValVT: MVT::nxv4i32, MemVT: MVT::nxv4i16, Action: Legal); |
1571 | setLoadExtAction(ExtType: Op, ValVT: MVT::nxv8i16, MemVT: MVT::nxv8i8, Action: Legal); |
1572 | } |
1573 | |
// SVE supports truncating stores of 64- and 128-bit vectors.
1575 | setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i8, Action: Custom); |
1576 | setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i16, Action: Custom); |
1577 | setTruncStoreAction(ValVT: MVT::v2i64, MemVT: MVT::v2i32, Action: Custom); |
1578 | setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i8, Action: Custom); |
1579 | setTruncStoreAction(ValVT: MVT::v2i32, MemVT: MVT::v2i16, Action: Custom); |
1580 | |
1581 | for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, |
1582 | MVT::nxv4f32, MVT::nxv2f64}) { |
1583 | setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom); |
1584 | setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom); |
1585 | setOperationAction(Op: ISD::MLOAD, VT, Action: Custom); |
1586 | setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal); |
1587 | setOperationAction(Op: ISD::SELECT, VT, Action: Custom); |
1588 | setOperationAction(Op: ISD::SETCC, VT, Action: Custom); |
1589 | setOperationAction(Op: ISD::FADD, VT, Action: Custom); |
1590 | setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom); |
1591 | setOperationAction(Op: ISD::FDIV, VT, Action: Custom); |
1592 | setOperationAction(Op: ISD::FMA, VT, Action: Custom); |
1593 | setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Custom); |
1594 | setOperationAction(Op: ISD::FMAXNUM, VT, Action: Custom); |
1595 | setOperationAction(Op: ISD::FMINIMUM, VT, Action: Custom); |
1596 | setOperationAction(Op: ISD::FMINNUM, VT, Action: Custom); |
1597 | setOperationAction(Op: ISD::FMUL, VT, Action: Custom); |
1598 | setOperationAction(Op: ISD::FNEG, VT, Action: Custom); |
1599 | setOperationAction(Op: ISD::FSUB, VT, Action: Custom); |
1600 | setOperationAction(Op: ISD::FCEIL, VT, Action: Custom); |
1601 | setOperationAction(Op: ISD::FFLOOR, VT, Action: Custom); |
1602 | setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Custom); |
1603 | setOperationAction(Op: ISD::FRINT, VT, Action: Custom); |
1604 | setOperationAction(Op: ISD::LRINT, VT, Action: Custom); |
1605 | setOperationAction(Op: ISD::LLRINT, VT, Action: Custom); |
1606 | setOperationAction(Op: ISD::FROUND, VT, Action: Custom); |
1607 | setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Custom); |
1608 | setOperationAction(Op: ISD::FTRUNC, VT, Action: Custom); |
1609 | setOperationAction(Op: ISD::FSQRT, VT, Action: Custom); |
1610 | setOperationAction(Op: ISD::FABS, VT, Action: Custom); |
1611 | setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Custom); |
1612 | setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom); |
1613 | setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom); |
1614 | setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Custom); |
1615 | setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Custom); |
1616 | setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Custom); |
1617 | setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Custom); |
1618 | setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom); |
1619 | setOperationAction(Op: ISD::VECTOR_DEINTERLEAVE, VT, Action: Custom); |
1620 | setOperationAction(Op: ISD::VECTOR_INTERLEAVE, VT, Action: Custom); |
1621 | |
1622 | setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand); |
1623 | setOperationAction(Op: ISD::FREM, VT, Action: Expand); |
1624 | setOperationAction(Op: ISD::FPOW, VT, Action: Expand); |
1625 | setOperationAction(Op: ISD::FPOWI, VT, Action: Expand); |
1626 | setOperationAction(Op: ISD::FCOS, VT, Action: Expand); |
1627 | setOperationAction(Op: ISD::FSIN, VT, Action: Expand); |
1628 | setOperationAction(Op: ISD::FSINCOS, VT, Action: Expand); |
1629 | setOperationAction(Op: ISD::FTAN, VT, Action: Expand); |
1630 | setOperationAction(Op: ISD::FACOS, VT, Action: Expand); |
1631 | setOperationAction(Op: ISD::FASIN, VT, Action: Expand); |
1632 | setOperationAction(Op: ISD::FATAN, VT, Action: Expand); |
1633 | setOperationAction(Op: ISD::FCOSH, VT, Action: Expand); |
1634 | setOperationAction(Op: ISD::FSINH, VT, Action: Expand); |
1635 | setOperationAction(Op: ISD::FTANH, VT, Action: Expand); |
1636 | setOperationAction(Op: ISD::FEXP, VT, Action: Expand); |
1637 | setOperationAction(Op: ISD::FEXP2, VT, Action: Expand); |
1638 | setOperationAction(Op: ISD::FEXP10, VT, Action: Expand); |
1639 | setOperationAction(Op: ISD::FLOG, VT, Action: Expand); |
1640 | setOperationAction(Op: ISD::FLOG2, VT, Action: Expand); |
1641 | setOperationAction(Op: ISD::FLOG10, VT, Action: Expand); |
1642 | |
1643 | setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand); |
1644 | setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand); |
1645 | setCondCodeAction(CCs: ISD::SETLT, VT, Action: Expand); |
1646 | setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand); |
1647 | setCondCodeAction(CCs: ISD::SETLE, VT, Action: Expand); |
1648 | setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand); |
1649 | setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand); |
1650 | setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand); |
1651 | setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand); |
1652 | setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand); |
1653 | setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand); |
1654 | |
1655 | if (!Subtarget->isLittleEndian()) |
1656 | setOperationAction(Op: ISD::BITCAST, VT, Action: Expand); |
1657 | } |
1658 | |
1659 | for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) { |
1660 | setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Custom); |
1661 | setOperationAction(Op: ISD::MLOAD, VT, Action: Custom); |
1662 | setOperationAction(Op: ISD::INSERT_SUBVECTOR, VT, Action: Custom); |
1663 | setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Legal); |
1664 | setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Custom); |
1665 | |
1666 | if (!Subtarget->isLittleEndian()) |
1667 | setOperationAction(Op: ISD::BITCAST, VT, Action: Expand); |
1668 | } |
1669 | |
1670 | setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i8, Action: Custom); |
1671 | setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT: MVT::i16, Action: Custom); |
1672 | |
// NEON doesn't support integer divides, but SVE does.
1674 | for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
1675 | MVT::v4i32, MVT::v1i64, MVT::v2i64}) { |
1676 | setOperationAction(Op: ISD::SDIV, VT, Action: Custom); |
1677 | setOperationAction(Op: ISD::UDIV, VT, Action: Custom); |
1678 | } |
1679 | |
1680 | // NEON doesn't support 64-bit vector integer muls, but SVE does. |
1681 | setOperationAction(Op: ISD::MUL, VT: MVT::v1i64, Action: Custom); |
1682 | setOperationAction(Op: ISD::MUL, VT: MVT::v2i64, Action: Custom); |
1683 | |
1684 | // NOTE: Currently this has to happen after computeRegisterProperties rather |
1685 | // than the preferred option of combining it with the addRegisterClass call. |
1686 | if (Subtarget->useSVEForFixedLengthVectors()) { |
1687 | for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { |
1688 | if (useSVEForFixedLengthVectorVT( |
1689 | VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable())) |
1690 | addTypeForFixedLengthSVE(VT); |
1691 | } |
1692 | for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) { |
1693 | if (useSVEForFixedLengthVectorVT( |
1694 | VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable())) |
1695 | addTypeForFixedLengthSVE(VT); |
1696 | } |
1697 | |
// 64-bit results can come from an input that is wider than NEON supports.
1699 | for (auto VT : {MVT::v8i8, MVT::v4i16}) |
1700 | setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom); |
1701 | setOperationAction(Op: ISD::FP_ROUND, VT: MVT::v4f16, Action: Custom); |
1702 | |
// 128-bit results imply an input that is wider than NEON supports.
1704 | for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) |
1705 | setOperationAction(Op: ISD::TRUNCATE, VT, Action: Custom); |
1706 | for (auto VT : {MVT::v8f16, MVT::v4f32}) |
1707 | setOperationAction(Op: ISD::FP_ROUND, VT, Action: Custom); |
1708 | |
1709 | // These operations are not supported on NEON but SVE can do them. |
1710 | setOperationAction(Op: ISD::BITREVERSE, VT: MVT::v1i64, Action: Custom); |
1711 | setOperationAction(Op: ISD::CTLZ, VT: MVT::v1i64, Action: Custom); |
1712 | setOperationAction(Op: ISD::CTLZ, VT: MVT::v2i64, Action: Custom); |
1713 | setOperationAction(Op: ISD::CTTZ, VT: MVT::v1i64, Action: Custom); |
1714 | setOperationAction(Op: ISD::MULHS, VT: MVT::v1i64, Action: Custom); |
1715 | setOperationAction(Op: ISD::MULHS, VT: MVT::v2i64, Action: Custom); |
1716 | setOperationAction(Op: ISD::MULHU, VT: MVT::v1i64, Action: Custom); |
1717 | setOperationAction(Op: ISD::MULHU, VT: MVT::v2i64, Action: Custom); |
1718 | setOperationAction(Op: ISD::SMAX, VT: MVT::v1i64, Action: Custom); |
1719 | setOperationAction(Op: ISD::SMAX, VT: MVT::v2i64, Action: Custom); |
1720 | setOperationAction(Op: ISD::SMIN, VT: MVT::v1i64, Action: Custom); |
1721 | setOperationAction(Op: ISD::SMIN, VT: MVT::v2i64, Action: Custom); |
1722 | setOperationAction(Op: ISD::UMAX, VT: MVT::v1i64, Action: Custom); |
1723 | setOperationAction(Op: ISD::UMAX, VT: MVT::v2i64, Action: Custom); |
1724 | setOperationAction(Op: ISD::UMIN, VT: MVT::v1i64, Action: Custom); |
1725 | setOperationAction(Op: ISD::UMIN, VT: MVT::v2i64, Action: Custom); |
1726 | setOperationAction(Op: ISD::VECREDUCE_SMAX, VT: MVT::v2i64, Action: Custom); |
1727 | setOperationAction(Op: ISD::VECREDUCE_SMIN, VT: MVT::v2i64, Action: Custom); |
1728 | setOperationAction(Op: ISD::VECREDUCE_UMAX, VT: MVT::v2i64, Action: Custom); |
1729 | setOperationAction(Op: ISD::VECREDUCE_UMIN, VT: MVT::v2i64, Action: Custom); |
1730 | |
1731 | // Int operations with no NEON support. |
1732 | for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
1733 | MVT::v2i32, MVT::v4i32, MVT::v2i64}) { |
1734 | setOperationAction(Op: ISD::BITREVERSE, VT, Action: Custom); |
1735 | setOperationAction(Op: ISD::CTTZ, VT, Action: Custom); |
1736 | setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Custom); |
1737 | setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Custom); |
1738 | setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Custom); |
1739 | setOperationAction(Op: ISD::MULHS, VT, Action: Custom); |
1740 | setOperationAction(Op: ISD::MULHU, VT, Action: Custom); |
1741 | } |
1742 | |
1743 | // Use SVE for vectors with more than 2 elements. |
1744 | for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32}) |
1745 | setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Custom); |
1746 | } |
1747 | |
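// Splices of predicate vectors are performed on promoted integer vector
// types.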
1748 | setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv2i1, DestVT: MVT::nxv2i64); |
1749 | setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv4i1, DestVT: MVT::nxv4i32); |
1750 | setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv8i1, DestVT: MVT::nxv8i16); |
1751 | setOperationPromotedToType(Opc: ISD::VECTOR_SPLICE, OrigVT: MVT::nxv16i1, DestVT: MVT::nxv16i8); |
1752 | |
1753 | setOperationAction(Op: ISD::VSCALE, VT: MVT::i32, Action: Custom); |
1754 | |
1755 | for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}) |
1756 | setOperationAction(Op: ISD::INTRINSIC_WO_CHAIN, VT, Action: Custom); |
1757 | } |
1758 | |
1759 | // Handle operations that are only available in non-streaming SVE mode. |
1760 | if (Subtarget->isSVEAvailable()) { |
1761 | for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64, |
1762 | MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, |
1763 | MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16, |
1764 | MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32, |
1765 | MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8, |
1766 | MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
1767 | MVT::v4i32, MVT::v1i64, MVT::v2i64}) { |
1768 | setOperationAction(Op: ISD::MGATHER, VT, Action: Custom); |
1769 | setOperationAction(Op: ISD::MSCATTER, VT, Action: Custom); |
1770 | } |
1771 | |
1772 | for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, |
1773 | MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16, |
1774 | MVT::v2f32, MVT::v4f32, MVT::v2f64}) |
1775 | setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: Custom); |
1776 | |
// HISTCNT is SVE2-only.
1778 | if (Subtarget->hasSVE2()) |
1779 | setOperationAction(Op: ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VT: MVT::Other, |
1780 | Action: Custom); |
1781 | } |
1782 | |
1783 | |
1784 | if (Subtarget->hasMOPS() && Subtarget->hasMTE()) { |
1785 | // Only required for llvm.aarch64.mops.memset.tag |
1786 | setOperationAction(Op: ISD::INTRINSIC_W_CHAIN, VT: MVT::i8, Action: Custom); |
1787 | } |
1788 | |
1789 | setOperationAction(Op: ISD::INTRINSIC_VOID, VT: MVT::Other, Action: Custom); |
1790 | |
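// With SVE available, scalar FLDEXP is custom lowered (presumably so it can
// use the predicated FSCALE instruction).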
1791 | if (Subtarget->hasSVE()) { |
1792 | setOperationAction(Op: ISD::FLDEXP, VT: MVT::f64, Action: Custom); |
1793 | setOperationAction(Op: ISD::FLDEXP, VT: MVT::f32, Action: Custom); |
1794 | setOperationAction(Op: ISD::FLDEXP, VT: MVT::f16, Action: Custom); |
1795 | setOperationAction(Op: ISD::FLDEXP, VT: MVT::bf16, Action: Custom); |
1796 | } |
1797 | |
1798 | PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); |
1799 | |
1800 | IsStrictFPEnabled = true; |
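// Atomic operations up to 128 bits wide are handled inline; anything wider
// is lowered to __atomic_* libcalls.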
1801 | setMaxAtomicSizeInBitsSupported(128); |
1802 | |
1803 | // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has |
1804 | // it, but it's just a wrapper around ldexp. |
1805 | if (Subtarget->isTargetWindows()) { |
1806 | for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) |
1807 | if (isOperationExpand(Op, VT: MVT::f32)) |
1808 | setOperationAction(Op, VT: MVT::f32, Action: Promote); |
1809 | } |
1810 | |
1811 | // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16 |
1812 | // isn't legal. |
1813 | for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP}) |
1814 | if (isOperationExpand(Op, VT: MVT::f16)) |
1815 | setOperationAction(Op, VT: MVT::f16, Action: Promote); |
1816 | |
1817 | if (Subtarget->isWindowsArm64EC()) { |
1818 | // FIXME: are there intrinsics we need to exclude from this? |
1819 | for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) { |
1820 | auto code = static_cast<RTLIB::Libcall>(i); |
1821 | auto libcallName = getLibcallName(Call: code); |
1822 | if ((libcallName != nullptr) && (libcallName[0] != '#')) { |
setLibcallName(Call: code, Name: Saver.save(S: Twine("#") + libcallName).data());
1824 | } |
1825 | } |
1826 | } |
1827 | } |
1828 | |
1829 | void AArch64TargetLowering::addTypeForNEON(MVT VT) { |
assert(VT.isVector() && "VT should be a vector type");
1831 | |
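// Loads and stores of floating-point vectors are performed on the
// equivalent integer vector type.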
1832 | if (VT.isFloatingPoint()) { |
1833 | MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); |
1834 | setOperationPromotedToType(Opc: ISD::LOAD, OrigVT: VT, DestVT: PromoteTo); |
1835 | setOperationPromotedToType(Opc: ISD::STORE, OrigVT: VT, DestVT: PromoteTo); |
1836 | } |
1837 | |
1838 | // Mark vector float intrinsics as expand. |
1839 | if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { |
1840 | setOperationAction(Op: ISD::FSIN, VT, Action: Expand); |
1841 | setOperationAction(Op: ISD::FCOS, VT, Action: Expand); |
1842 | setOperationAction(Op: ISD::FTAN, VT, Action: Expand); |
1843 | setOperationAction(Op: ISD::FASIN, VT, Action: Expand); |
1844 | setOperationAction(Op: ISD::FACOS, VT, Action: Expand); |
1845 | setOperationAction(Op: ISD::FATAN, VT, Action: Expand); |
1846 | setOperationAction(Op: ISD::FSINH, VT, Action: Expand); |
1847 | setOperationAction(Op: ISD::FCOSH, VT, Action: Expand); |
1848 | setOperationAction(Op: ISD::FTANH, VT, Action: Expand); |
1849 | setOperationAction(Op: ISD::FPOW, VT, Action: Expand); |
1850 | setOperationAction(Op: ISD::FLOG, VT, Action: Expand); |
1851 | setOperationAction(Op: ISD::FLOG2, VT, Action: Expand); |
1852 | setOperationAction(Op: ISD::FLOG10, VT, Action: Expand); |
1853 | setOperationAction(Op: ISD::FEXP, VT, Action: Expand); |
1854 | setOperationAction(Op: ISD::FEXP2, VT, Action: Expand); |
1855 | setOperationAction(Op: ISD::FEXP10, VT, Action: Expand); |
1856 | } |
1857 | |
// But we do support custom lowering for FCOPYSIGN.
1859 | if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 || |
1860 | ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 || |
1861 | VT == MVT::v8f16) && |
1862 | Subtarget->hasFullFP16())) |
1863 | setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Custom); |
1864 | |
1865 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Custom); |
1866 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Custom); |
1867 | setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Custom); |
1868 | setOperationAction(Op: ISD::ZERO_EXTEND_VECTOR_INREG, VT, Action: Custom); |
1869 | setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Custom); |
1870 | setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Custom); |
1871 | setOperationAction(Op: ISD::SRA, VT, Action: Custom); |
1872 | setOperationAction(Op: ISD::SRL, VT, Action: Custom); |
1873 | setOperationAction(Op: ISD::SHL, VT, Action: Custom); |
1874 | setOperationAction(Op: ISD::OR, VT, Action: Custom); |
1875 | setOperationAction(Op: ISD::SETCC, VT, Action: Custom); |
1876 | setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Legal); |
1877 | |
1878 | setOperationAction(Op: ISD::SELECT, VT, Action: Expand); |
1879 | setOperationAction(Op: ISD::SELECT_CC, VT, Action: Expand); |
1880 | setOperationAction(Op: ISD::VSELECT, VT, Action: Expand); |
1881 | for (MVT InnerVT : MVT::all_valuetypes()) |
1882 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: InnerVT, MemVT: VT, Action: Expand); |
1883 | |
// CNT supports only B element sizes; wider element types use UADDLP to
// widen the result.
1885 | if (VT != MVT::v8i8 && VT != MVT::v16i8) |
1886 | setOperationAction(Op: ISD::CTPOP, VT, Action: Custom); |
1887 | |
1888 | setOperationAction(Op: ISD::UDIV, VT, Action: Expand); |
1889 | setOperationAction(Op: ISD::SDIV, VT, Action: Expand); |
1890 | setOperationAction(Op: ISD::UREM, VT, Action: Expand); |
1891 | setOperationAction(Op: ISD::SREM, VT, Action: Expand); |
1892 | setOperationAction(Op: ISD::FREM, VT, Action: Expand); |
1893 | |
1894 | for (unsigned Opcode : |
1895 | {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT, |
1896 | ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) |
1897 | setOperationAction(Op: Opcode, VT, Action: Custom); |
1898 | |
1899 | if (!VT.isFloatingPoint()) |
1900 | setOperationAction(Op: ISD::ABS, VT, Action: Legal); |
1901 | |
1902 | // [SU][MIN|MAX] are available for all NEON types apart from i64. |
1903 | if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) |
1904 | for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) |
1905 | setOperationAction(Op: Opcode, VT, Action: Legal); |
1906 | |
1907 | // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP |
1908 | // NEON types. |
1909 | if (VT.isFloatingPoint() && |
1910 | VT.getVectorElementType() != MVT::bf16 && |
1911 | (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) |
1912 | for (unsigned Opcode : |
1913 | {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM, |
1914 | ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM, |
1915 | ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB, |
1916 | ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA, |
1917 | ISD::STRICT_FSQRT}) |
1918 | setOperationAction(Op: Opcode, VT, Action: Legal); |
1919 | |
1920 | // Strict fp extend and trunc are legal |
1921 | if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16) |
1922 | setOperationAction(Op: ISD::STRICT_FP_EXTEND, VT, Action: Legal); |
1923 | if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64) |
1924 | setOperationAction(Op: ISD::STRICT_FP_ROUND, VT, Action: Legal); |
1925 | |
// FIXME: We could potentially make use of the vector comparison instructions
// for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
// complications:
1929 | // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons, |
1930 | // so we would need to expand when the condition code doesn't match the |
1931 | // kind of comparison. |
1932 | // * Some kinds of comparison require more than one FCMXY instruction so |
1933 | // would need to be expanded instead. |
1934 | // * The lowering of the non-strict versions involves target-specific ISD |
1935 | // nodes so we would likely need to add strict versions of all of them and |
1936 | // handle them appropriately. |
1937 | setOperationAction(Op: ISD::STRICT_FSETCC, VT, Action: Expand); |
1938 | setOperationAction(Op: ISD::STRICT_FSETCCS, VT, Action: Expand); |
1939 | |
1940 | if (Subtarget->isLittleEndian()) { |
1941 | for (unsigned im = (unsigned)ISD::PRE_INC; |
1942 | im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { |
1943 | setIndexedLoadAction(IdxModes: im, VT, Action: Legal); |
1944 | setIndexedStoreAction(IdxModes: im, VT, Action: Legal); |
1945 | } |
1946 | } |
1947 | |
1948 | if (Subtarget->hasD128()) { |
1949 | setOperationAction(Op: ISD::READ_REGISTER, VT: MVT::i128, Action: Custom); |
1950 | setOperationAction(Op: ISD::WRITE_REGISTER, VT: MVT::i128, Action: Custom); |
1951 | } |
1952 | } |
1953 | |
1954 | bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, |
1955 | EVT OpVT) const { |
1956 | // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo). |
1957 | if (!Subtarget->hasSVE()) |
1958 | return true; |
1959 | |
1960 | // We can only support legal predicate result types. We can use the SVE |
1961 | // whilelo instruction for generating fixed-width predicates too. |
1962 | if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 && |
1963 | ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 && |
1964 | ResVT != MVT::v8i1 && ResVT != MVT::v16i1) |
1965 | return true; |
1966 | |
1967 | // The whilelo instruction only works with i32 or i64 scalar inputs. |
1968 | if (OpVT != MVT::i32 && OpVT != MVT::i64) |
1969 | return true; |
1970 | |
1971 | return false; |
1972 | } |
1973 | |
1974 | bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { |
1975 | if (!Subtarget->isSVEorStreamingSVEAvailable()) |
1976 | return true; |
1977 | |
1978 | // We can only use the BRKB + CNTP sequence with legal predicate types. We can |
1979 | // also support fixed-width predicates. |
1980 | return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 && |
1981 | VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 && |
1982 | VT != MVT::v4i1 && VT != MVT::v2i1; |
1983 | } |
1984 | |
1985 | void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { |
assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1987 | |
1988 | // By default everything must be expanded. |
1989 | for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) |
1990 | setOperationAction(Op, VT, Action: Expand); |
1991 | |
1992 | if (VT.isFloatingPoint()) { |
1993 | setCondCodeAction(CCs: ISD::SETO, VT, Action: Expand); |
1994 | setCondCodeAction(CCs: ISD::SETOLT, VT, Action: Expand); |
1995 | setCondCodeAction(CCs: ISD::SETOLE, VT, Action: Expand); |
1996 | setCondCodeAction(CCs: ISD::SETULT, VT, Action: Expand); |
1997 | setCondCodeAction(CCs: ISD::SETULE, VT, Action: Expand); |
1998 | setCondCodeAction(CCs: ISD::SETUGE, VT, Action: Expand); |
1999 | setCondCodeAction(CCs: ISD::SETUGT, VT, Action: Expand); |
2000 | setCondCodeAction(CCs: ISD::SETUEQ, VT, Action: Expand); |
2001 | setCondCodeAction(CCs: ISD::SETONE, VT, Action: Expand); |
2002 | } |
2003 | |
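// v1f64 gets no SVE-based lowering here: its default action is Expand, while
// every other fixed-length type defaults to Custom (lowered via the scalable
// SVE equivalent).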
2004 | TargetLoweringBase::LegalizeAction Default = |
2005 | VT == MVT::v1f64 ? Expand : Custom; |
2006 | |
2007 | // Mark integer truncating stores/extending loads as having custom lowering |
2008 | if (VT.isInteger()) { |
2009 | MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::i8); |
2010 | while (InnerVT != VT) { |
2011 | setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Default); |
2012 | setLoadExtAction(ExtType: ISD::ZEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default); |
2013 | setLoadExtAction(ExtType: ISD::SEXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default); |
2014 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default); |
2015 | InnerVT = InnerVT.changeVectorElementType( |
2016 | EltVT: MVT::getIntegerVT(BitWidth: 2 * InnerVT.getScalarSizeInBits())); |
2017 | } |
2018 | } |
2019 | |
2020 | // Mark floating-point truncating stores/extending loads as having custom |
2021 | // lowering |
2022 | if (VT.isFloatingPoint()) { |
2023 | MVT InnerVT = VT.changeVectorElementType(EltVT: MVT::f16); |
2024 | while (InnerVT != VT) { |
2025 | setTruncStoreAction(ValVT: VT, MemVT: InnerVT, Action: Custom); |
2026 | setLoadExtAction(ExtType: ISD::EXTLOAD, ValVT: VT, MemVT: InnerVT, Action: Default); |
2027 | InnerVT = InnerVT.changeVectorElementType( |
2028 | EltVT: MVT::getFloatingPointVT(BitWidth: 2 * InnerVT.getScalarSizeInBits())); |
2029 | } |
2030 | } |
2031 | |
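// NEON-sized types keep their Legal load/store/bitcast lowering; gathers,
// scatters and ordered FP reductions are only lowered via SVE when full
// (non-streaming) SVE is available.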
2032 | bool PreferNEON = VT.is64BitVector() || VT.is128BitVector(); |
2033 | bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable(); |
2034 | |
2035 | // Lower fixed length vector operations to scalable equivalents. |
2036 | setOperationAction(Op: ISD::ABS, VT, Action: Default); |
2037 | setOperationAction(Op: ISD::ADD, VT, Action: Default); |
2038 | setOperationAction(Op: ISD::AND, VT, Action: Default); |
2039 | setOperationAction(Op: ISD::ANY_EXTEND, VT, Action: Default); |
2040 | setOperationAction(Op: ISD::BITCAST, VT, Action: PreferNEON ? Legal : Default); |
2041 | setOperationAction(Op: ISD::BITREVERSE, VT, Action: Default); |
2042 | setOperationAction(Op: ISD::BSWAP, VT, Action: Default); |
2043 | setOperationAction(Op: ISD::BUILD_VECTOR, VT, Action: Default); |
2044 | setOperationAction(Op: ISD::CONCAT_VECTORS, VT, Action: Default); |
2045 | setOperationAction(Op: ISD::CTLZ, VT, Action: Default); |
2046 | setOperationAction(Op: ISD::CTPOP, VT, Action: Default); |
2047 | setOperationAction(Op: ISD::CTTZ, VT, Action: Default); |
2048 | setOperationAction(Op: ISD::EXTRACT_SUBVECTOR, VT, Action: Default); |
2049 | setOperationAction(Op: ISD::EXTRACT_VECTOR_ELT, VT, Action: Default); |
2050 | setOperationAction(Op: ISD::FABS, VT, Action: Default); |
2051 | setOperationAction(Op: ISD::FADD, VT, Action: Default); |
2052 | setOperationAction(Op: ISD::FCEIL, VT, Action: Default); |
2053 | setOperationAction(Op: ISD::FCOPYSIGN, VT, Action: Default); |
2054 | setOperationAction(Op: ISD::FDIV, VT, Action: Default); |
2055 | setOperationAction(Op: ISD::FFLOOR, VT, Action: Default); |
2056 | setOperationAction(Op: ISD::FMA, VT, Action: Default); |
2057 | setOperationAction(Op: ISD::FMAXIMUM, VT, Action: Default); |
2058 | setOperationAction(Op: ISD::FMAXNUM, VT, Action: Default); |
2059 | setOperationAction(Op: ISD::FMINIMUM, VT, Action: Default); |
2060 | setOperationAction(Op: ISD::FMINNUM, VT, Action: Default); |
2061 | setOperationAction(Op: ISD::FMUL, VT, Action: Default); |
2062 | setOperationAction(Op: ISD::FNEARBYINT, VT, Action: Default); |
2063 | setOperationAction(Op: ISD::FNEG, VT, Action: Default); |
2064 | setOperationAction(Op: ISD::FP_EXTEND, VT, Action: Default); |
2065 | setOperationAction(Op: ISD::FP_ROUND, VT, Action: Default); |
2066 | setOperationAction(Op: ISD::FP_TO_SINT, VT, Action: Default); |
2067 | setOperationAction(Op: ISD::FP_TO_UINT, VT, Action: Default); |
2068 | setOperationAction(Op: ISD::FRINT, VT, Action: Default); |
2069 | setOperationAction(Op: ISD::LRINT, VT, Action: Default); |
2070 | setOperationAction(Op: ISD::LLRINT, VT, Action: Default); |
2071 | setOperationAction(Op: ISD::FROUND, VT, Action: Default); |
2072 | setOperationAction(Op: ISD::FROUNDEVEN, VT, Action: Default); |
2073 | setOperationAction(Op: ISD::FSQRT, VT, Action: Default); |
2074 | setOperationAction(Op: ISD::FSUB, VT, Action: Default); |
2075 | setOperationAction(Op: ISD::FTRUNC, VT, Action: Default); |
2076 | setOperationAction(Op: ISD::INSERT_VECTOR_ELT, VT, Action: Default); |
2077 | setOperationAction(Op: ISD::LOAD, VT, Action: PreferNEON ? Legal : Default); |
2078 | setOperationAction(Op: ISD::MGATHER, VT, Action: PreferSVE ? Default : Expand); |
2079 | setOperationAction(Op: ISD::MLOAD, VT, Action: Default); |
2080 | setOperationAction(Op: ISD::MSCATTER, VT, Action: PreferSVE ? Default : Expand); |
2081 | setOperationAction(Op: ISD::MSTORE, VT, Action: Default); |
2082 | setOperationAction(Op: ISD::MUL, VT, Action: Default); |
2083 | setOperationAction(Op: ISD::MULHS, VT, Action: Default); |
2084 | setOperationAction(Op: ISD::MULHU, VT, Action: Default); |
2085 | setOperationAction(Op: ISD::OR, VT, Action: Default); |
2086 | setOperationAction(Op: ISD::SCALAR_TO_VECTOR, VT, Action: PreferNEON ? Legal : Expand); |
2087 | setOperationAction(Op: ISD::SDIV, VT, Action: Default); |
2088 | setOperationAction(Op: ISD::SELECT, VT, Action: Default); |
2089 | setOperationAction(Op: ISD::SETCC, VT, Action: Default); |
2090 | setOperationAction(Op: ISD::SHL, VT, Action: Default); |
2091 | setOperationAction(Op: ISD::SIGN_EXTEND, VT, Action: Default); |
2092 | setOperationAction(Op: ISD::SIGN_EXTEND_INREG, VT, Action: Default); |
2093 | setOperationAction(Op: ISD::SINT_TO_FP, VT, Action: Default); |
2094 | setOperationAction(Op: ISD::SMAX, VT, Action: Default); |
2095 | setOperationAction(Op: ISD::SMIN, VT, Action: Default); |
2096 | setOperationAction(Op: ISD::SPLAT_VECTOR, VT, Action: Default); |
2097 | setOperationAction(Op: ISD::SRA, VT, Action: Default); |
2098 | setOperationAction(Op: ISD::SRL, VT, Action: Default); |
2099 | setOperationAction(Op: ISD::STORE, VT, Action: PreferNEON ? Legal : Default); |
2100 | setOperationAction(Op: ISD::SUB, VT, Action: Default); |
2101 | setOperationAction(Op: ISD::TRUNCATE, VT, Action: Default); |
2102 | setOperationAction(Op: ISD::UDIV, VT, Action: Default); |
2103 | setOperationAction(Op: ISD::UINT_TO_FP, VT, Action: Default); |
2104 | setOperationAction(Op: ISD::UMAX, VT, Action: Default); |
2105 | setOperationAction(Op: ISD::UMIN, VT, Action: Default); |
2106 | setOperationAction(Op: ISD::VECREDUCE_ADD, VT, Action: Default); |
2107 | setOperationAction(Op: ISD::VECREDUCE_AND, VT, Action: Default); |
2108 | setOperationAction(Op: ISD::VECREDUCE_FADD, VT, Action: Default); |
2109 | setOperationAction(Op: ISD::VECREDUCE_FMAX, VT, Action: Default); |
2110 | setOperationAction(Op: ISD::VECREDUCE_FMIN, VT, Action: Default); |
2111 | setOperationAction(Op: ISD::VECREDUCE_FMAXIMUM, VT, Action: Default); |
2112 | setOperationAction(Op: ISD::VECREDUCE_FMINIMUM, VT, Action: Default); |
2113 | setOperationAction(Op: ISD::VECREDUCE_OR, VT, Action: Default); |
2114 | setOperationAction(Op: ISD::VECREDUCE_SEQ_FADD, VT, Action: PreferSVE ? Default : Expand); |
2115 | setOperationAction(Op: ISD::VECREDUCE_SMAX, VT, Action: Default); |
2116 | setOperationAction(Op: ISD::VECREDUCE_SMIN, VT, Action: Default); |
2117 | setOperationAction(Op: ISD::VECREDUCE_UMAX, VT, Action: Default); |
2118 | setOperationAction(Op: ISD::VECREDUCE_UMIN, VT, Action: Default); |
2119 | setOperationAction(Op: ISD::VECREDUCE_XOR, VT, Action: Default); |
2120 | setOperationAction(Op: ISD::VECTOR_SHUFFLE, VT, Action: Default); |
2121 | setOperationAction(Op: ISD::VECTOR_SPLICE, VT, Action: Default); |
2122 | setOperationAction(Op: ISD::VSELECT, VT, Action: Default); |
2123 | setOperationAction(Op: ISD::XOR, VT, Action: Default); |
2124 | setOperationAction(Op: ISD::ZERO_EXTEND, VT, Action: Default); |
2125 | } |
2126 | |
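| // Register VT in the 64-bit "D" (FPR64) register class; when NEON is
| // available, also install the usual NEON operation actions for VT via
| // addTypeForNEON.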
2127 | void AArch64TargetLowering::addDRType(MVT VT) { |
2128 | addRegisterClass(VT, RC: &AArch64::FPR64RegClass); |
2129 | if (Subtarget->isNeonAvailable()) |
2130 | addTypeForNEON(VT); |
2131 | } |
2132 | |
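| // As addDRType, but for the 128-bit "Q" (FPR128) register class.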
2133 | void AArch64TargetLowering::addQRType(MVT VT) { |
2134 | addRegisterClass(VT, RC: &AArch64::FPR128RegClass); |
2135 | if (Subtarget->isNeonAvailable()) |
2136 | addTypeForNEON(VT); |
2137 | } |
2138 | |
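| // Compute the result type of a SETCC: scalar compares produce i32, fixed
| // vector compares produce an integer vector with the same element count and
| // width (e.g. v4f32 -> v4i32), and scalable vector compares produce an SVE
| // predicate of i1 elements (e.g. nxv4f32 -> nxv4i1).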
2139 | EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, |
2140 | LLVMContext &C, EVT VT) const { |
2141 | if (!VT.isVector()) |
2142 | return MVT::i32; |
2143 | if (VT.isScalableVector()) |
2144 | return EVT::getVectorVT(Context&: C, VT: MVT::i1, EC: VT.getVectorElementCount()); |
2145 | return VT.changeVectorElementTypeToInteger(); |
2146 | } |
2147 | |
2148 | // isIntImmediate - This method tests whether the node is a constant
2149 | // operand. If so, Imm receives the value.
2150 | static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { |
2151 | if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(Val: N)) { |
2152 | Imm = C->getZExtValue(); |
2153 | return true; |
2154 | } |
2155 | return false; |
2156 | } |
2157 | |
2158 | // isOpcWithIntImmediate - This method tests whether the node has the given
2159 | // opcode and an immediate integer right operand.
2160 | // If so, Imm receives the value.
2161 | static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, |
2162 | uint64_t &Imm) { |
2163 | return N->getOpcode() == Opc && |
2164 | isIntImmediate(N: N->getOperand(Num: 1).getNode(), Imm); |
2165 | } |
2166 | |
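| // Try to rewrite the immediate of a logical operation (AND/OR/XOR) by only
| // changing bits that are not demanded, so that the result is either
| // all-zeros/all-ones (left to the generic DAG combiner) or a value that can
| // be encoded as a bitmask immediate and emitted directly as NewOpc
| // (e.g. ANDWri/ANDXri). Returns false if no such rewrite is found.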
2167 | static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, |
2168 | const APInt &Demanded, |
2169 | TargetLowering::TargetLoweringOpt &TLO, |
2170 | unsigned NewOpc) { |
2171 | uint64_t OldImm = Imm, NewImm, Enc; |
2172 | uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; |
2173 | |
2174 | // Return if the immediate is already all zeros, all ones, a bimm32 or a |
2175 | // bimm64. |
2176 | if (Imm == 0 || Imm == Mask || |
2177 | AArch64_AM::isLogicalImmediate(imm: Imm & Mask, regSize: Size)) |
2178 | return false; |
2179 | |
2180 | unsigned EltSize = Size; |
2181 | uint64_t DemandedBits = Demanded.getZExtValue(); |
2182 | |
2183 | // Clear bits that are not demanded. |
2184 | Imm &= DemandedBits; |
2185 | |
2186 | while (true) { |
2187 | // The goal here is to set the non-demanded bits in a way that minimizes
2188 | // the number of transitions between 0 and 1. To achieve this goal,
2189 | // we set each non-demanded bit to the value of the preceding demanded bit.
2190 | // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a |
2191 | // non-demanded bit), we copy bit0 (1) to the least significant 'x', |
2192 | // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. |
2193 | // The final result is 0b11000011. |
2194 | uint64_t NonDemandedBits = ~DemandedBits; |
2195 | uint64_t InvertedImm = ~Imm & DemandedBits; |
2196 | uint64_t RotatedImm = |
2197 | ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & |
2198 | NonDemandedBits; |
2199 | uint64_t Sum = RotatedImm + NonDemandedBits; |
2200 | bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); |
2201 | uint64_t Ones = (Sum + Carry) & NonDemandedBits; |
2202 | NewImm = (Imm | Ones) & Mask; |
2203 | |
2204 | // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate |
2205 | // or all-ones or all-zeros, in which case we can stop searching. Otherwise, |
2206 | // we halve the element size and continue the search. |
2207 | if (isShiftedMask_64(Value: NewImm) || isShiftedMask_64(Value: ~(NewImm | ~Mask))) |
2208 | break; |
2209 | |
2210 | // We cannot shrink the element size any further if it is 2 bits.
2211 | if (EltSize == 2) |
2212 | return false; |
2213 | |
2214 | EltSize /= 2; |
2215 | Mask >>= EltSize; |
2216 | uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; |
2217 | |
2218 | // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2219 | if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) |
2220 | return false; |
2221 | |
2222 | // Merge the upper and lower halves of Imm and DemandedBits. |
2223 | Imm |= Hi; |
2224 | DemandedBits |= DemandedBitsHi; |
2225 | } |
2226 | |
2227 | ++NumOptimizedImms; |
2228 | |
2229 | // Replicate the element across the register width. |
2230 | while (EltSize < Size) { |
2231 | NewImm |= NewImm << EltSize; |
2232 | EltSize *= 2; |
2233 | } |
2234 | |
2235 | (void)OldImm; |
2236 | assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && |
2237 | "demanded bits should never be altered");
2238 | assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2239 | |
2240 | // Create the new constant immediate node. |
2241 | EVT VT = Op.getValueType(); |
2242 | SDLoc DL(Op); |
2243 | SDValue New; |
2244 | |
2245 | // If the new constant immediate is all-zeros or all-ones, let the target |
2246 | // independent DAG combine optimize this node. |
2247 | if (NewImm == 0 || NewImm == OrigMask) { |
2248 | New = TLO.DAG.getNode(Opcode: Op.getOpcode(), DL, VT, N1: Op.getOperand(i: 0), |
2249 | N2: TLO.DAG.getConstant(Val: NewImm, DL, VT)); |
2250 | // Otherwise, create a machine node so that target independent DAG combine |
2251 | // doesn't undo this optimization. |
2252 | } else { |
2253 | Enc = AArch64_AM::encodeLogicalImmediate(imm: NewImm, regSize: Size); |
2254 | SDValue EncConst = TLO.DAG.getTargetConstant(Val: Enc, DL, VT); |
2255 | New = SDValue( |
2256 | TLO.DAG.getMachineNode(Opcode: NewOpc, dl: DL, VT, Op1: Op.getOperand(i: 0), Op2: EncConst), 0); |
2257 | } |
2258 | |
2259 | return TLO.CombineTo(O: Op, N: New); |
2260 | } |
2261 | |
2262 | bool AArch64TargetLowering::targetShrinkDemandedConstant( |
2263 | SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, |
2264 | TargetLoweringOpt &TLO) const { |
2265 | // Delay this optimization to as late as possible. |
2266 | if (!TLO.LegalOps) |
2267 | return false; |
2268 | |
2269 | if (!EnableOptimizeLogicalImm) |
2270 | return false; |
2271 | |
2272 | EVT VT = Op.getValueType(); |
2273 | if (VT.isVector()) |
2274 | return false; |
2275 | |
2276 | unsigned Size = VT.getSizeInBits(); |
2277 | assert((Size == 32 || Size == 64) && |
2278 | "i32 or i64 is expected after legalization.");
2279 | |
2280 | // Exit early if we demand all bits. |
2281 | if (DemandedBits.popcount() == Size) |
2282 | return false; |
2283 | |
2284 | unsigned NewOpc; |
2285 | switch (Op.getOpcode()) { |
2286 | default: |
2287 | return false; |
2288 | case ISD::AND: |
2289 | NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; |
2290 | break; |
2291 | case ISD::OR: |
2292 | NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; |
2293 | break; |
2294 | case ISD::XOR: |
2295 | NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; |
2296 | break; |
2297 | } |
2298 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1)); |
2299 | if (!C) |
2300 | return false; |
2301 | uint64_t Imm = C->getZExtValue(); |
2302 | return optimizeLogicalImm(Op, Size, Imm, Demanded: DemandedBits, TLO, NewOpc); |
2303 | } |
2304 | |
2305 | /// computeKnownBitsForTargetNode - Determine which bits of Op are known to be
2306 | /// either zero or one and return them in Known.
2307 | void AArch64TargetLowering::computeKnownBitsForTargetNode( |
2308 | const SDValue Op, KnownBits &Known, const APInt &DemandedElts, |
2309 | const SelectionDAG &DAG, unsigned Depth) const { |
2310 | switch (Op.getOpcode()) { |
2311 | default: |
2312 | break; |
2313 | case AArch64ISD::DUP: { |
2314 | SDValue SrcOp = Op.getOperand(i: 0); |
2315 | Known = DAG.computeKnownBits(Op: SrcOp, Depth: Depth + 1); |
2316 | if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) { |
2317 | assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() && |
2318 | "Expected DUP implicit truncation");
2319 | Known = Known.trunc(BitWidth: Op.getScalarValueSizeInBits()); |
2320 | } |
2321 | break; |
2322 | } |
2323 | case AArch64ISD::CSEL: { |
2324 | KnownBits Known2; |
2325 | Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1); |
2326 | Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1); |
2327 | Known = Known.intersectWith(RHS: Known2); |
2328 | break; |
2329 | } |
2330 | case AArch64ISD::BICi: { |
2331 | // Compute the bit cleared value. |
2332 | uint64_t Mask = |
2333 | ~(Op->getConstantOperandVal(Num: 1) << Op->getConstantOperandVal(Num: 2)); |
2334 | Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1); |
2335 | Known &= KnownBits::makeConstant(C: APInt(Known.getBitWidth(), Mask)); |
2336 | break; |
2337 | } |
2338 | case AArch64ISD::VLSHR: { |
2339 | KnownBits Known2; |
2340 | Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1); |
2341 | Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1); |
2342 | Known = KnownBits::lshr(LHS: Known, RHS: Known2); |
2343 | break; |
2344 | } |
2345 | case AArch64ISD::VASHR: { |
2346 | KnownBits Known2; |
2347 | Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1); |
2348 | Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1); |
2349 | Known = KnownBits::ashr(LHS: Known, RHS: Known2); |
2350 | break; |
2351 | } |
2352 | case AArch64ISD::VSHL: { |
2353 | KnownBits Known2; |
2354 | Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1); |
2355 | Known2 = DAG.computeKnownBits(Op: Op->getOperand(Num: 1), Depth: Depth + 1); |
2356 | Known = KnownBits::shl(LHS: Known, RHS: Known2); |
2357 | break; |
2358 | } |
2359 | case AArch64ISD::MOVI: { |
2360 | Known = KnownBits::makeConstant( |
2361 | C: APInt(Known.getBitWidth(), Op->getConstantOperandVal(Num: 0))); |
2362 | break; |
2363 | } |
2364 | case AArch64ISD::LOADgot: |
2365 | case AArch64ISD::ADDlow: { |
2366 | if (!Subtarget->isTargetILP32()) |
2367 | break; |
2368 | // In ILP32 mode all valid pointers are in the low 4GB of the address-space. |
2369 | Known.Zero = APInt::getHighBitsSet(numBits: 64, hiBitsSet: 32); |
2370 | break; |
2371 | } |
2372 | case AArch64ISD::ASSERT_ZEXT_BOOL: { |
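| // The wrapped value is a zero-extended boolean (0 or 1), so bits 1-7 can
| // be marked known-zero on top of whatever is known about the operand.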
2373 | Known = DAG.computeKnownBits(Op: Op->getOperand(Num: 0), Depth: Depth + 1); |
2374 | Known.Zero |= APInt(Known.getBitWidth(), 0xFE); |
2375 | break; |
2376 | } |
2377 | case ISD::INTRINSIC_W_CHAIN: { |
2378 | Intrinsic::ID IntID = |
2379 | static_cast<Intrinsic::ID>(Op->getConstantOperandVal(Num: 1)); |
2380 | switch (IntID) { |
2381 | default: return; |
2382 | case Intrinsic::aarch64_ldaxr: |
2383 | case Intrinsic::aarch64_ldxr: { |
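| // These exclusive loads zero-extend the loaded value to the register
| // width, so every bit above the memory width is known to be zero.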
2384 | unsigned BitWidth = Known.getBitWidth(); |
2385 | EVT VT = cast<MemIntrinsicSDNode>(Val: Op)->getMemoryVT(); |
2386 | unsigned MemBits = VT.getScalarSizeInBits(); |
2387 | Known.Zero |= APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - MemBits); |
2388 | return; |
2389 | } |
2390 | } |
2391 | break; |
2392 | } |
2393 | case ISD::INTRINSIC_WO_CHAIN: |
2394 | case ISD::INTRINSIC_VOID: { |
2395 | unsigned IntNo = Op.getConstantOperandVal(i: 0); |
2396 | switch (IntNo) { |
2397 | default: |
2398 | break; |
2399 | case Intrinsic::aarch64_neon_uaddlv: { |
2400 | MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT(); |
2401 | unsigned BitWidth = Known.getBitWidth(); |
2402 | if (VT == MVT::v8i8 || VT == MVT::v16i8) { |
2403 | unsigned Bound = (VT == MVT::v8i8) ? 11 : 12; |
2404 | assert(BitWidth >= Bound && "Unexpected width!");
2405 | APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - Bound); |
2406 | Known.Zero |= Mask; |
2407 | } |
2408 | break; |
2409 | } |
2410 | case Intrinsic::aarch64_neon_umaxv: |
2411 | case Intrinsic::aarch64_neon_uminv: { |
2412 | // Figure out the datatype of the vector operand. The UMINV instruction
2413 | // will zero extend the result, so we can mark as known zero all the
2414 | // bits larger than the element datatype. 32-bit or larger doesn't need
2415 | // this as those are legal types and will be handled by isel directly.
2416 | MVT VT = Op.getOperand(i: 1).getValueType().getSimpleVT(); |
2417 | unsigned BitWidth = Known.getBitWidth(); |
2418 | if (VT == MVT::v8i8 || VT == MVT::v16i8) { |
2419 | assert(BitWidth >= 8 && "Unexpected width!");
2420 | APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 8); |
2421 | Known.Zero |= Mask; |
2422 | } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { |
2423 | assert(BitWidth >= 16 && "Unexpected width!");
2424 | APInt Mask = APInt::getHighBitsSet(numBits: BitWidth, hiBitsSet: BitWidth - 16); |
2425 | Known.Zero |= Mask; |
2426 | } |
2427 | break; |
2428 | }
2429 | } |
2430 | } |
2431 | } |
2432 | } |
2433 | |
2434 | unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode( |
2435 | SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, |
2436 | unsigned Depth) const { |
2437 | EVT VT = Op.getValueType(); |
2438 | unsigned VTBits = VT.getScalarSizeInBits(); |
2439 | unsigned Opcode = Op.getOpcode(); |
2440 | switch (Opcode) { |
2441 | case AArch64ISD::CMEQ: |
2442 | case AArch64ISD::CMGE: |
2443 | case AArch64ISD::CMGT: |
2444 | case AArch64ISD::CMHI: |
2445 | case AArch64ISD::CMHS: |
2446 | case AArch64ISD::FCMEQ: |
2447 | case AArch64ISD::FCMGE: |
2448 | case AArch64ISD::FCMGT: |
2449 | case AArch64ISD::CMEQz: |
2450 | case AArch64ISD::CMGEz: |
2451 | case AArch64ISD::CMGTz: |
2452 | case AArch64ISD::CMLEz: |
2453 | case AArch64ISD::CMLTz: |
2454 | case AArch64ISD::FCMEQz: |
2455 | case AArch64ISD::FCMGEz: |
2456 | case AArch64ISD::FCMGTz: |
2457 | case AArch64ISD::FCMLEz: |
2458 | case AArch64ISD::FCMLTz: |
2459 | // Compares return either 0 or all-ones |
2460 | return VTBits; |
2461 | } |
2462 | |
2463 | return 1; |
2464 | } |
2465 | |
2466 | MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, |
2467 | EVT) const { |
2468 | return MVT::i64; |
2469 | } |
2470 | |
2471 | bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( |
2472 | EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
2473 | unsigned *Fast) const { |
2474 | if (Subtarget->requiresStrictAlign()) |
2475 | return false; |
2476 | |
2477 | if (Fast) { |
2478 | // Some CPUs are fine with unaligned stores except for 128-bit ones. |
2479 | *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || |
2480 | // See comments in performSTORECombine() for more details about |
2481 | // these conditions. |
2482 | |
2483 | // Code that uses clang vector extensions can mark that it |
2484 | // wants unaligned accesses to be treated as fast by |
2485 | // underspecifying alignment to be 1 or 2. |
2486 | Alignment <= 2 || |
2487 | |
2488 | // Disregard v2i64. Memcpy lowering produces those and splitting |
2489 | // them regresses performance on micro-benchmarks and olden/bh. |
2490 | VT == MVT::v2i64; |
2491 | } |
2492 | return true; |
2493 | } |
2494 | |
2495 | // Same as above but handling LLTs instead. |
2496 | bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( |
2497 | LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, |
2498 | unsigned *Fast) const { |
2499 | if (Subtarget->requiresStrictAlign()) |
2500 | return false; |
2501 | |
2502 | if (Fast) { |
2503 | // Some CPUs are fine with unaligned stores except for 128-bit ones. |
2504 | *Fast = !Subtarget->isMisaligned128StoreSlow() || |
2505 | Ty.getSizeInBytes() != 16 || |
2506 | // See comments in performSTORECombine() for more details about |
2507 | // these conditions. |
2508 | |
2509 | // Code that uses clang vector extensions can mark that it |
2510 | // wants unaligned accesses to be treated as fast by |
2511 | // underspecifying alignment to be 1 or 2. |
2512 | Alignment <= 2 || |
2513 | |
2514 | // Disregard v2i64. Memcpy lowering produces those and splitting |
2515 | // them regresses performance on micro-benchmarks and olden/bh. |
2516 | Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64); |
2517 | } |
2518 | return true; |
2519 | } |
2520 | |
2521 | FastISel * |
2522 | AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, |
2523 | const TargetLibraryInfo *libInfo) const { |
2524 | return AArch64::createFastISel(funcInfo, libInfo); |
2525 | } |
2526 | |
2527 | const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { |
2528 | #define MAKE_CASE(V) \ |
2529 | case V: \ |
2530 | return #V; |
2531 | switch ((AArch64ISD::NodeType)Opcode) { |
2532 | case AArch64ISD::FIRST_NUMBER: |
2533 | break; |
2534 | MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER) |
2535 | MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ) |
2536 | MAKE_CASE(AArch64ISD::COALESCER_BARRIER) |
2537 | MAKE_CASE(AArch64ISD::VG_SAVE) |
2538 | MAKE_CASE(AArch64ISD::VG_RESTORE) |
2539 | MAKE_CASE(AArch64ISD::SMSTART) |
2540 | MAKE_CASE(AArch64ISD::SMSTOP) |
2541 | MAKE_CASE(AArch64ISD::RESTORE_ZA) |
2542 | MAKE_CASE(AArch64ISD::RESTORE_ZT) |
2543 | MAKE_CASE(AArch64ISD::SAVE_ZT) |
2544 | MAKE_CASE(AArch64ISD::CALL) |
2545 | MAKE_CASE(AArch64ISD::ADRP) |
2546 | MAKE_CASE(AArch64ISD::ADR) |
2547 | MAKE_CASE(AArch64ISD::ADDlow) |
2548 | MAKE_CASE(AArch64ISD::AUTH_CALL) |
2549 | MAKE_CASE(AArch64ISD::AUTH_TC_RETURN) |
2550 | MAKE_CASE(AArch64ISD::AUTH_CALL_RVMARKER) |
2551 | MAKE_CASE(AArch64ISD::LOADgot) |
2552 | MAKE_CASE(AArch64ISD::RET_GLUE) |
2553 | MAKE_CASE(AArch64ISD::BRCOND) |
2554 | MAKE_CASE(AArch64ISD::CSEL) |
2555 | MAKE_CASE(AArch64ISD::CSINV) |
2556 | MAKE_CASE(AArch64ISD::CSNEG) |
2557 | MAKE_CASE(AArch64ISD::CSINC) |
2558 | MAKE_CASE(AArch64ISD::THREAD_POINTER) |
2559 | MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) |
2560 | MAKE_CASE(AArch64ISD::PROBED_ALLOCA) |
2561 | MAKE_CASE(AArch64ISD::ABDS_PRED) |
2562 | MAKE_CASE(AArch64ISD::ABDU_PRED) |
2563 | MAKE_CASE(AArch64ISD::HADDS_PRED) |
2564 | MAKE_CASE(AArch64ISD::HADDU_PRED) |
2565 | MAKE_CASE(AArch64ISD::MUL_PRED) |
2566 | MAKE_CASE(AArch64ISD::MULHS_PRED) |
2567 | MAKE_CASE(AArch64ISD::MULHU_PRED) |
2568 | MAKE_CASE(AArch64ISD::RHADDS_PRED) |
2569 | MAKE_CASE(AArch64ISD::RHADDU_PRED) |
2570 | MAKE_CASE(AArch64ISD::SDIV_PRED) |
2571 | MAKE_CASE(AArch64ISD::SHL_PRED) |
2572 | MAKE_CASE(AArch64ISD::SMAX_PRED) |
2573 | MAKE_CASE(AArch64ISD::SMIN_PRED) |
2574 | MAKE_CASE(AArch64ISD::SRA_PRED) |
2575 | MAKE_CASE(AArch64ISD::SRL_PRED) |
2576 | MAKE_CASE(AArch64ISD::UDIV_PRED) |
2577 | MAKE_CASE(AArch64ISD::UMAX_PRED) |
2578 | MAKE_CASE(AArch64ISD::UMIN_PRED) |
2579 | MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1) |
2580 | MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU) |
2581 | MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU) |
2582 | MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU) |
2583 | MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU) |
2584 | MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU) |
2585 | MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU) |
2586 | MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU) |
2587 | MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU) |
2588 | MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU) |
2589 | MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU) |
2590 | MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU) |
2591 | MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU) |
2592 | MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU) |
2593 | MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU) |
2594 | MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU) |
2595 | MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU) |
2596 | MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU) |
2597 | MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU) |
2598 | MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU) |
2599 | MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU) |
2600 | MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU) |
2601 | MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO) |
2602 | MAKE_CASE(AArch64ISD::ADC) |
2603 | MAKE_CASE(AArch64ISD::SBC) |
2604 | MAKE_CASE(AArch64ISD::ADDS) |
2605 | MAKE_CASE(AArch64ISD::SUBS) |
2606 | MAKE_CASE(AArch64ISD::ADCS) |
2607 | MAKE_CASE(AArch64ISD::SBCS) |
2608 | MAKE_CASE(AArch64ISD::ANDS) |
2609 | MAKE_CASE(AArch64ISD::CCMP) |
2610 | MAKE_CASE(AArch64ISD::CCMN) |
2611 | MAKE_CASE(AArch64ISD::FCCMP) |
2612 | MAKE_CASE(AArch64ISD::FCMP) |
2613 | MAKE_CASE(AArch64ISD::STRICT_FCMP) |
2614 | MAKE_CASE(AArch64ISD::STRICT_FCMPE) |
2615 | MAKE_CASE(AArch64ISD::FCVTXN) |
2616 | MAKE_CASE(AArch64ISD::SME_ZA_LDR) |
2617 | MAKE_CASE(AArch64ISD::SME_ZA_STR) |
2618 | MAKE_CASE(AArch64ISD::DUP) |
2619 | MAKE_CASE(AArch64ISD::DUPLANE8) |
2620 | MAKE_CASE(AArch64ISD::DUPLANE16) |
2621 | MAKE_CASE(AArch64ISD::DUPLANE32) |
2622 | MAKE_CASE(AArch64ISD::DUPLANE64) |
2623 | MAKE_CASE(AArch64ISD::DUPLANE128) |
2624 | MAKE_CASE(AArch64ISD::MOVI) |
2625 | MAKE_CASE(AArch64ISD::MOVIshift) |
2626 | MAKE_CASE(AArch64ISD::MOVIedit) |
2627 | MAKE_CASE(AArch64ISD::MOVImsl) |
2628 | MAKE_CASE(AArch64ISD::FMOV) |
2629 | MAKE_CASE(AArch64ISD::MVNIshift) |
2630 | MAKE_CASE(AArch64ISD::MVNImsl) |
2631 | MAKE_CASE(AArch64ISD::BICi) |
2632 | MAKE_CASE(AArch64ISD::ORRi) |
2633 | MAKE_CASE(AArch64ISD::BSP) |
2634 | MAKE_CASE(AArch64ISD::ZIP1) |
2635 | MAKE_CASE(AArch64ISD::ZIP2) |
2636 | MAKE_CASE(AArch64ISD::UZP1) |
2637 | MAKE_CASE(AArch64ISD::UZP2) |
2638 | MAKE_CASE(AArch64ISD::TRN1) |
2639 | MAKE_CASE(AArch64ISD::TRN2) |
2640 | MAKE_CASE(AArch64ISD::REV16) |
2641 | MAKE_CASE(AArch64ISD::REV32) |
2642 | MAKE_CASE(AArch64ISD::REV64) |
2643 | MAKE_CASE(AArch64ISD::EXT) |
2644 | MAKE_CASE(AArch64ISD::SPLICE) |
2645 | MAKE_CASE(AArch64ISD::VSHL) |
2646 | MAKE_CASE(AArch64ISD::VLSHR) |
2647 | MAKE_CASE(AArch64ISD::VASHR) |
2648 | MAKE_CASE(AArch64ISD::VSLI) |
2649 | MAKE_CASE(AArch64ISD::VSRI) |
2650 | MAKE_CASE(AArch64ISD::CMEQ) |
2651 | MAKE_CASE(AArch64ISD::CMGE) |
2652 | MAKE_CASE(AArch64ISD::CMGT) |
2653 | MAKE_CASE(AArch64ISD::CMHI) |
2654 | MAKE_CASE(AArch64ISD::CMHS) |
2655 | MAKE_CASE(AArch64ISD::FCMEQ) |
2656 | MAKE_CASE(AArch64ISD::FCMGE) |
2657 | MAKE_CASE(AArch64ISD::FCMGT) |
2658 | MAKE_CASE(AArch64ISD::CMEQz) |
2659 | MAKE_CASE(AArch64ISD::CMGEz) |
2660 | MAKE_CASE(AArch64ISD::CMGTz) |
2661 | MAKE_CASE(AArch64ISD::CMLEz) |
2662 | MAKE_CASE(AArch64ISD::CMLTz) |
2663 | MAKE_CASE(AArch64ISD::FCMEQz) |
2664 | MAKE_CASE(AArch64ISD::FCMGEz) |
2665 | MAKE_CASE(AArch64ISD::FCMGTz) |
2666 | MAKE_CASE(AArch64ISD::FCMLEz) |
2667 | MAKE_CASE(AArch64ISD::FCMLTz) |
2668 | MAKE_CASE(AArch64ISD::SADDV) |
2669 | MAKE_CASE(AArch64ISD::UADDV) |
2670 | MAKE_CASE(AArch64ISD::UADDLV) |
2671 | MAKE_CASE(AArch64ISD::SADDLV) |
2672 | MAKE_CASE(AArch64ISD::SDOT) |
2673 | MAKE_CASE(AArch64ISD::UDOT) |
2674 | MAKE_CASE(AArch64ISD::SMINV) |
2675 | MAKE_CASE(AArch64ISD::UMINV) |
2676 | MAKE_CASE(AArch64ISD::SMAXV) |
2677 | MAKE_CASE(AArch64ISD::UMAXV) |
2678 | MAKE_CASE(AArch64ISD::SADDV_PRED) |
2679 | MAKE_CASE(AArch64ISD::UADDV_PRED) |
2680 | MAKE_CASE(AArch64ISD::SMAXV_PRED) |
2681 | MAKE_CASE(AArch64ISD::UMAXV_PRED) |
2682 | MAKE_CASE(AArch64ISD::SMINV_PRED) |
2683 | MAKE_CASE(AArch64ISD::UMINV_PRED) |
2684 | MAKE_CASE(AArch64ISD::ORV_PRED) |
2685 | MAKE_CASE(AArch64ISD::EORV_PRED) |
2686 | MAKE_CASE(AArch64ISD::ANDV_PRED) |
2687 | MAKE_CASE(AArch64ISD::CLASTA_N) |
2688 | MAKE_CASE(AArch64ISD::CLASTB_N) |
2689 | MAKE_CASE(AArch64ISD::LASTA) |
2690 | MAKE_CASE(AArch64ISD::LASTB) |
2691 | MAKE_CASE(AArch64ISD::REINTERPRET_CAST) |
2692 | MAKE_CASE(AArch64ISD::LS64_BUILD) |
2693 | MAKE_CASE(AArch64ISD::LS64_EXTRACT) |
2694 | MAKE_CASE(AArch64ISD::TBL) |
2695 | MAKE_CASE(AArch64ISD::FADD_PRED) |
2696 | MAKE_CASE(AArch64ISD::FADDA_PRED) |
2697 | MAKE_CASE(AArch64ISD::FADDV_PRED) |
2698 | MAKE_CASE(AArch64ISD::FDIV_PRED) |
2699 | MAKE_CASE(AArch64ISD::FMA_PRED) |
2700 | MAKE_CASE(AArch64ISD::FMAX_PRED) |
2701 | MAKE_CASE(AArch64ISD::FMAXV_PRED) |
2702 | MAKE_CASE(AArch64ISD::FMAXNM_PRED) |
2703 | MAKE_CASE(AArch64ISD::FMAXNMV_PRED) |
2704 | MAKE_CASE(AArch64ISD::FMIN_PRED) |
2705 | MAKE_CASE(AArch64ISD::FMINV_PRED) |
2706 | MAKE_CASE(AArch64ISD::FMINNM_PRED) |
2707 | MAKE_CASE(AArch64ISD::FMINNMV_PRED) |
2708 | MAKE_CASE(AArch64ISD::FMUL_PRED) |
2709 | MAKE_CASE(AArch64ISD::FSUB_PRED) |
2710 | MAKE_CASE(AArch64ISD::RDSVL) |
2711 | MAKE_CASE(AArch64ISD::BIC) |
2712 | MAKE_CASE(AArch64ISD::CBZ) |
2713 | MAKE_CASE(AArch64ISD::CBNZ) |
2714 | MAKE_CASE(AArch64ISD::TBZ) |
2715 | MAKE_CASE(AArch64ISD::TBNZ) |
2716 | MAKE_CASE(AArch64ISD::TC_RETURN) |
2717 | MAKE_CASE(AArch64ISD::PREFETCH) |
2718 | MAKE_CASE(AArch64ISD::SITOF) |
2719 | MAKE_CASE(AArch64ISD::UITOF) |
2720 | MAKE_CASE(AArch64ISD::NVCAST) |
2721 | MAKE_CASE(AArch64ISD::MRS) |
2722 | MAKE_CASE(AArch64ISD::SQSHL_I) |
2723 | MAKE_CASE(AArch64ISD::UQSHL_I) |
2724 | MAKE_CASE(AArch64ISD::SRSHR_I) |
2725 | MAKE_CASE(AArch64ISD::URSHR_I) |
2726 | MAKE_CASE(AArch64ISD::SQSHLU_I) |
2727 | MAKE_CASE(AArch64ISD::WrapperLarge) |
2728 | MAKE_CASE(AArch64ISD::LD2post) |
2729 | MAKE_CASE(AArch64ISD::LD3post) |
2730 | MAKE_CASE(AArch64ISD::LD4post) |
2731 | MAKE_CASE(AArch64ISD::ST2post) |
2732 | MAKE_CASE(AArch64ISD::ST3post) |
2733 | MAKE_CASE(AArch64ISD::ST4post) |
2734 | MAKE_CASE(AArch64ISD::LD1x2post) |
2735 | MAKE_CASE(AArch64ISD::LD1x3post) |
2736 | MAKE_CASE(AArch64ISD::LD1x4post) |
2737 | MAKE_CASE(AArch64ISD::ST1x2post) |
2738 | MAKE_CASE(AArch64ISD::ST1x3post) |
2739 | MAKE_CASE(AArch64ISD::ST1x4post) |
2740 | MAKE_CASE(AArch64ISD::LD1DUPpost) |
2741 | MAKE_CASE(AArch64ISD::LD2DUPpost) |
2742 | MAKE_CASE(AArch64ISD::LD3DUPpost) |
2743 | MAKE_CASE(AArch64ISD::LD4DUPpost) |
2744 | MAKE_CASE(AArch64ISD::LD1LANEpost) |
2745 | MAKE_CASE(AArch64ISD::LD2LANEpost) |
2746 | MAKE_CASE(AArch64ISD::LD3LANEpost) |
2747 | MAKE_CASE(AArch64ISD::LD4LANEpost) |
2748 | MAKE_CASE(AArch64ISD::ST2LANEpost) |
2749 | MAKE_CASE(AArch64ISD::ST3LANEpost) |
2750 | MAKE_CASE(AArch64ISD::ST4LANEpost) |
2751 | MAKE_CASE(AArch64ISD::SMULL) |
2752 | MAKE_CASE(AArch64ISD::UMULL) |
2753 | MAKE_CASE(AArch64ISD::PMULL) |
2754 | MAKE_CASE(AArch64ISD::FRECPE) |
2755 | MAKE_CASE(AArch64ISD::FRECPS) |
2756 | MAKE_CASE(AArch64ISD::FRSQRTE) |
2757 | MAKE_CASE(AArch64ISD::FRSQRTS) |
2758 | MAKE_CASE(AArch64ISD::STG) |
2759 | MAKE_CASE(AArch64ISD::STZG) |
2760 | MAKE_CASE(AArch64ISD::ST2G) |
2761 | MAKE_CASE(AArch64ISD::STZ2G) |
2762 | MAKE_CASE(AArch64ISD::SUNPKHI) |
2763 | MAKE_CASE(AArch64ISD::SUNPKLO) |
2764 | MAKE_CASE(AArch64ISD::UUNPKHI) |
2765 | MAKE_CASE(AArch64ISD::UUNPKLO) |
2766 | MAKE_CASE(AArch64ISD::INSR) |
2767 | MAKE_CASE(AArch64ISD::PTEST) |
2768 | MAKE_CASE(AArch64ISD::PTEST_ANY) |
2769 | MAKE_CASE(AArch64ISD::PTRUE) |
2770 | MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO) |
2771 | MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO) |
2772 | MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO) |
2773 | MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO) |
2774 | MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO) |
2775 | MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO) |
2776 | MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO) |
2777 | MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO) |
2778 | MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO) |
2779 | MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO) |
2780 | MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO) |
2781 | MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO) |
2782 | MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO) |
2783 | MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO) |
2784 | MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO) |
2785 | MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO) |
2786 | MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO) |
2787 | MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO) |
2788 | MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO) |
2789 | MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) |
2790 | MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO) |
2791 | MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO) |
2792 | MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO) |
2793 | MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO) |
2794 | MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO) |
2795 | MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO) |
2796 | MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO) |
2797 | MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO) |
2798 | MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO) |
2799 | MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO) |
2800 | MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO) |
2801 | MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO) |
2802 | MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO) |
2803 | MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO) |
2804 | MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO) |
2805 | MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO) |
2806 | MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO) |
2807 | MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO) |
2808 | MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO) |
2809 | MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO) |
2810 | MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO) |
2811 | MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO) |
2812 | MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) |
2813 | MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO) |
2814 | MAKE_CASE(AArch64ISD::SST1Q_PRED) |
2815 | MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED) |
2816 | MAKE_CASE(AArch64ISD::ST1_PRED) |
2817 | MAKE_CASE(AArch64ISD::SST1_PRED) |
2818 | MAKE_CASE(AArch64ISD::SST1_SCALED_PRED) |
2819 | MAKE_CASE(AArch64ISD::SST1_SXTW_PRED) |
2820 | MAKE_CASE(AArch64ISD::SST1_UXTW_PRED) |
2821 | MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED) |
2822 | MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED) |
2823 | MAKE_CASE(AArch64ISD::SST1_IMM_PRED) |
2824 | MAKE_CASE(AArch64ISD::SSTNT1_PRED) |
2825 | MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED) |
2826 | MAKE_CASE(AArch64ISD::LDP) |
2827 | MAKE_CASE(AArch64ISD::LDIAPP) |
2828 | MAKE_CASE(AArch64ISD::LDNP) |
2829 | MAKE_CASE(AArch64ISD::STP) |
2830 | MAKE_CASE(AArch64ISD::STILP) |
2831 | MAKE_CASE(AArch64ISD::STNP) |
2832 | MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) |
2833 | MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) |
2834 | MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) |
2835 | MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) |
2836 | MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU) |
2837 | MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) |
2838 | MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) |
2839 | MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) |
2840 | MAKE_CASE(AArch64ISD::INDEX_VECTOR) |
2841 | MAKE_CASE(AArch64ISD::ADDP) |
2842 | MAKE_CASE(AArch64ISD::SADDLP) |
2843 | MAKE_CASE(AArch64ISD::UADDLP) |
2844 | MAKE_CASE(AArch64ISD::CALL_RVMARKER) |
2845 | MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL) |
2846 | MAKE_CASE(AArch64ISD::MOPS_MEMSET) |
2847 | MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING) |
2848 | MAKE_CASE(AArch64ISD::MOPS_MEMCOPY) |
2849 | MAKE_CASE(AArch64ISD::MOPS_MEMMOVE) |
2850 | MAKE_CASE(AArch64ISD::CALL_BTI) |
2851 | MAKE_CASE(AArch64ISD::MRRS) |
2852 | MAKE_CASE(AArch64ISD::MSRR) |
2853 | MAKE_CASE(AArch64ISD::RSHRNB_I) |
2854 | MAKE_CASE(AArch64ISD::CTTZ_ELTS) |
2855 | MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64) |
2856 | MAKE_CASE(AArch64ISD::URSHR_I_PRED) |
2857 | } |
2858 | #undef MAKE_CASE |
2859 | return nullptr; |
2860 | } |
2861 | |
2862 | MachineBasicBlock * |
2863 | AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, |
2864 | MachineBasicBlock *MBB) const { |
2865 | // We materialise the F128CSEL pseudo-instruction as some control flow and a |
2866 | // phi node: |
2867 | |
2868 | // OrigBB: |
2869 | // [... previous instrs leading to comparison ...] |
2870 | // b.ne TrueBB |
2871 | // b EndBB |
2872 | // TrueBB: |
2873 | // ; Fallthrough |
2874 | // EndBB: |
2875 | // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] |
2876 | |
2877 | MachineFunction *MF = MBB->getParent(); |
2878 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
2879 | const BasicBlock *LLVM_BB = MBB->getBasicBlock(); |
2880 | DebugLoc DL = MI.getDebugLoc(); |
2881 | MachineFunction::iterator It = ++MBB->getIterator(); |
2882 | |
2883 | Register DestReg = MI.getOperand(i: 0).getReg(); |
2884 | Register IfTrueReg = MI.getOperand(i: 1).getReg(); |
2885 | Register IfFalseReg = MI.getOperand(i: 2).getReg(); |
2886 | unsigned CondCode = MI.getOperand(i: 3).getImm(); |
2887 | bool NZCVKilled = MI.getOperand(i: 4).isKill(); |
2888 | |
2889 | MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(BB: LLVM_BB); |
2890 | MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(BB: LLVM_BB); |
2891 | MF->insert(MBBI: It, MBB: TrueBB); |
2892 | MF->insert(MBBI: It, MBB: EndBB); |
2893 | |
2894 | // Transfer rest of current basic-block to EndBB |
2895 | EndBB->splice(Where: EndBB->begin(), Other: MBB, From: std::next(x: MachineBasicBlock::iterator(MI)), |
2896 | To: MBB->end()); |
2897 | EndBB->transferSuccessorsAndUpdatePHIs(FromMBB: MBB); |
2898 | |
2899 | BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::Bcc)).addImm(Val: CondCode).addMBB(MBB: TrueBB); |
2900 | BuildMI(BB: MBB, MIMD: DL, MCID: TII->get(Opcode: AArch64::B)).addMBB(MBB: EndBB); |
2901 | MBB->addSuccessor(Succ: TrueBB); |
2902 | MBB->addSuccessor(Succ: EndBB); |
2903 | |
2904 | // TrueBB falls through to the end. |
2905 | TrueBB->addSuccessor(Succ: EndBB); |
2906 | |
2907 | if (!NZCVKilled) { |
2908 | TrueBB->addLiveIn(PhysReg: AArch64::NZCV); |
2909 | EndBB->addLiveIn(PhysReg: AArch64::NZCV); |
2910 | } |
2911 | |
2912 | BuildMI(BB&: *EndBB, I: EndBB->begin(), MIMD: DL, MCID: TII->get(Opcode: AArch64::PHI), DestReg) |
2913 | .addReg(RegNo: IfTrueReg) |
2914 | .addMBB(MBB: TrueBB) |
2915 | .addReg(RegNo: IfFalseReg) |
2916 | .addMBB(MBB); |
2917 | |
2918 | MI.eraseFromParent(); |
2919 | return EndBB; |
2920 | } |
2921 | |
2922 | MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet( |
2923 | MachineInstr &MI, MachineBasicBlock *BB) const { |
2924 | assert(!isAsynchronousEHPersonality(classifyEHPersonality( |
2925 | BB->getParent()->getFunction().getPersonalityFn())) && |
2926 | "SEH does not use catchret!");
2927 | return BB; |
2928 | } |
2929 | |
2930 | MachineBasicBlock * |
2931 | AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI, |
2932 | MachineBasicBlock *MBB) const { |
2933 | MachineFunction &MF = *MBB->getParent(); |
2934 | MachineBasicBlock::iterator MBBI = MI.getIterator(); |
2935 | DebugLoc DL = MBB->findDebugLoc(MBBI); |
2936 | const AArch64InstrInfo &TII = |
2937 | *MF.getSubtarget<AArch64Subtarget>().getInstrInfo(); |
2938 | Register TargetReg = MI.getOperand(i: 0).getReg(); |
2939 | MachineBasicBlock::iterator NextInst = |
2940 | TII.probedStackAlloc(MBBI, TargetReg, FrameSetup: false); |
2941 | |
2942 | MI.eraseFromParent(); |
2943 | return NextInst->getParent(); |
2944 | } |
2945 | |
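| // Expand an SME tile-load pseudo into the real instruction Opc: the tile
| // index immediate is folded into a concrete ZA tile register (BaseReg plus
| // the immediate), which becomes the defined operand, and the remaining
| // operands are copied through unchanged.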
2946 | MachineBasicBlock * |
2947 | AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg, |
2948 | MachineInstr &MI, |
2949 | MachineBasicBlock *BB) const { |
2950 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
2951 | MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc)); |
2952 | |
2953 | MIB.addReg(RegNo: BaseReg + MI.getOperand(i: 0).getImm(), flags: RegState::Define); |
2954 | MIB.add(MO: MI.getOperand(i: 1)); // slice index register |
2955 | MIB.add(MO: MI.getOperand(i: 2)); // slice index offset |
2956 | MIB.add(MO: MI.getOperand(i: 3)); // pg |
2957 | MIB.add(MO: MI.getOperand(i: 4)); // base |
2958 | MIB.add(MO: MI.getOperand(i: 5)); // offset |
2959 | |
2960 | MI.eraseFromParent(); // The pseudo is gone now. |
2961 | return BB; |
2962 | } |
2963 | |
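| // Expand LDR_ZA_PSEUDO into LDR_ZA, which fills a ZA array vector from
| // memory. ZA is an implicit def, and the vector-select offset immediate is
| // reused as the memory offset (the instruction uses a single immediate for
| // both).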
2964 | MachineBasicBlock * |
2965 | AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const { |
2966 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
2967 | MachineInstrBuilder MIB = |
2968 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::LDR_ZA)); |
2969 | |
2970 | MIB.addReg(RegNo: AArch64::ZA, flags: RegState::Define); |
2971 | MIB.add(MO: MI.getOperand(i: 0)); // Vector select register |
2972 | MIB.add(MO: MI.getOperand(i: 1)); // Vector select offset |
2973 | MIB.add(MO: MI.getOperand(i: 2)); // Base |
2974 | MIB.add(MO: MI.getOperand(i: 1)); // Offset, same as vector select offset |
2975 | |
2976 | MI.eraseFromParent(); // The pseudo is gone now. |
2977 | return BB; |
2978 | } |
2979 | |
2980 | MachineBasicBlock *AArch64TargetLowering::EmitZTInstr(MachineInstr &MI, |
2981 | MachineBasicBlock *BB, |
2982 | unsigned Opcode, |
2983 | bool Op0IsDef) const { |
2984 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
2985 | MachineInstrBuilder MIB; |
2986 | |
2987 | MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode)) |
2988 | .addReg(RegNo: MI.getOperand(i: 0).getReg(), flags: Op0IsDef ? RegState::Define : 0); |
2989 | for (unsigned I = 1; I < MI.getNumOperands(); ++I) |
2990 | MIB.add(MO: MI.getOperand(i: I)); |
2991 | |
2992 | MI.eraseFromParent(); // The pseudo is gone now. |
2993 | return BB; |
2994 | } |
2995 | |
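| // Expand an SME ZA pseudo into the real instruction Opc. For tile forms the
| // tile immediate is translated into a concrete ZA tile register that is both
| // defined and read; for ZA-array forms, ZA itself is added as a def and a
| // use, with an optional leading ZPR result operand copied through first.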
2996 | MachineBasicBlock * |
2997 | AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg, |
2998 | MachineInstr &MI, |
2999 | MachineBasicBlock *BB) const { |
3000 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
3001 | MachineInstrBuilder MIB = BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: Opc)); |
3002 | unsigned StartIdx = 0; |
3003 | |
3004 | bool HasTile = BaseReg != AArch64::ZA; |
3005 | bool HasZPROut = HasTile && MI.getOperand(i: 0).isReg(); |
3006 | if (HasZPROut) { |
3007 | MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR |
3008 | ++StartIdx; |
3009 | } |
3010 | if (HasTile) { |
3011 | MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm(), |
3012 | flags: RegState::Define); // Output ZA Tile |
3013 | MIB.addReg(RegNo: BaseReg + MI.getOperand(i: StartIdx).getImm()); // Input Za Tile |
3014 | StartIdx++; |
3015 | } else { |
3016 | // Avoids all instructions with mnemonic za.<sz>[Reg, Imm].
3017 | if (MI.getOperand(i: 0).isReg() && !MI.getOperand(i: 1).isImm()) { |
3018 | MIB.add(MO: MI.getOperand(i: StartIdx)); // Output ZPR |
3019 | ++StartIdx; |
3020 | } |
3021 | MIB.addReg(RegNo: BaseReg, flags: RegState::Define).addReg(RegNo: BaseReg); |
3022 | } |
3023 | for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I) |
3024 | MIB.add(MO: MI.getOperand(i: I)); |
3025 | |
3026 | MI.eraseFromParent(); // The pseudo is gone now. |
3027 | return BB; |
3028 | } |
3029 | |
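| // Expand ZERO_M_PSEUDO into ZERO_M. Each bit set in the mask selects one
| // 64-bit tile (ZAD0..ZAD7), which is added as an implicit def so later
| // passes know it is written.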
3030 | MachineBasicBlock * |
3031 | AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { |
3032 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
3033 | MachineInstrBuilder MIB = |
3034 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::ZERO_M)); |
3035 | MIB.add(MO: MI.getOperand(i: 0)); // Mask |
3036 | |
3037 | unsigned Mask = MI.getOperand(i: 0).getImm(); |
3038 | for (unsigned I = 0; I < 8; I++) { |
3039 | if (Mask & (1 << I)) |
3040 | MIB.addDef(RegNo: AArch64::ZAD0 + I, Flags: RegState::ImplicitDefine); |
3041 | } |
3042 | |
3043 | MI.eraseFromParent(); // The pseudo is gone now. |
3044 | return BB; |
3045 | } |
3046 | |
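| // Initialise the on-stack TPIDR2 block for the lazy ZA save: store the save
| // buffer pointer at offset 0 and zero the reserved bytes 10-15. If the lazy
| // save is never used, drop the stack object instead.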
3047 | MachineBasicBlock * |
3048 | AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, |
3049 | MachineBasicBlock *BB) const { |
3050 | MachineFunction *MF = BB->getParent(); |
3051 | MachineFrameInfo &MFI = MF->getFrameInfo(); |
3052 | AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); |
3053 | TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); |
3054 | if (TPIDR2.Uses > 0) { |
3055 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
3056 | // Store the buffer pointer to the TPIDR2 stack object. |
3057 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRXui)) |
3058 | .addReg(RegNo: MI.getOperand(i: 0).getReg()) |
3059 | .addFrameIndex(Idx: TPIDR2.FrameIndex) |
3060 | .addImm(Val: 0); |
3061 | // Set the reserved bytes (10-15) to zero |
3062 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRHHui)) |
3063 | .addReg(RegNo: AArch64::WZR) |
3064 | .addFrameIndex(Idx: TPIDR2.FrameIndex) |
3065 | .addImm(Val: 5); |
3066 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::STRWui)) |
3067 | .addReg(RegNo: AArch64::WZR) |
3068 | .addFrameIndex(Idx: TPIDR2.FrameIndex) |
3069 | .addImm(Val: 3); |
3070 | } else |
3071 | MFI.RemoveStackObject(ObjectIdx: TPIDR2.FrameIndex); |
3072 | |
3073 | BB->remove_instr(I: &MI); |
3074 | return BB; |
3075 | } |
3076 | |
3077 | MachineBasicBlock * |
3078 | AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI, |
3079 | MachineBasicBlock *BB) const { |
3080 | MachineFunction *MF = BB->getParent(); |
3081 | MachineFrameInfo &MFI = MF->getFrameInfo(); |
3082 | AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); |
3083 | // TODO: This function grows the stack with a subtraction, which doesn't work
3084 | // on Windows. Some refactoring to share the functionality in
3085 | // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3086 | // supports SME.
3087 | assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3088 | "Lazy ZA save is not yet supported on Windows");
3089 | |
3090 | TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); |
3091 | |
3092 | if (TPIDR2.Uses > 0) { |
3093 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
3094 | MachineRegisterInfo &MRI = MF->getRegInfo(); |
3095 | |
3096 | // The MSUBXrrr below cannot take SP directly as a source operand, so copy
3097 | // SP into a virtual register first.
3098 | Register SP = MRI.createVirtualRegister(RegClass: &AArch64::GPR64RegClass); |
3099 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: SP) |
3100 | .addReg(RegNo: AArch64::SP); |
3101 | |
3102 | // Allocate a lazy-save buffer object of the size given, normally SVL * SVL |
3103 | auto Size = MI.getOperand(i: 1).getReg(); |
3104 | auto Dest = MI.getOperand(i: 0).getReg(); |
3105 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: AArch64::MSUBXrrr), DestReg: Dest) |
3106 | .addReg(RegNo: Size) |
3107 | .addReg(RegNo: Size) |
3108 | .addReg(RegNo: SP); |
3109 | BuildMI(BB&: *BB, I&: MI, MIMD: MI.getDebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), |
3110 | DestReg: AArch64::SP) |
3111 | .addReg(RegNo: Dest); |
3112 | |
3113 | // We have just allocated a variable sized object, tell this to PEI. |
3114 | MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr); |
3115 | } |
3116 | |
3117 | BB->remove_instr(I: &MI); |
3118 | return BB; |
3119 | } |
3120 | |
3121 | MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( |
3122 | MachineInstr &MI, MachineBasicBlock *BB) const { |
3123 | |
3124 | int SMEOrigInstr = AArch64::getSMEPseudoMap(Opcode: MI.getOpcode()); |
3125 | if (SMEOrigInstr != -1) { |
3126 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
3127 | uint64_t SMEMatrixType = |
3128 | TII->get(Opcode: MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask; |
3129 | switch (SMEMatrixType) { |
3130 | case (AArch64::SMEMatrixArray): |
3131 | return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZA, MI, BB); |
3132 | case (AArch64::SMEMatrixTileB): |
3133 | return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAB0, MI, BB); |
3134 | case (AArch64::SMEMatrixTileH): |
3135 | return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAH0, MI, BB); |
3136 | case (AArch64::SMEMatrixTileS): |
3137 | return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAS0, MI, BB); |
3138 | case (AArch64::SMEMatrixTileD): |
3139 | return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAD0, MI, BB); |
3140 | case (AArch64::SMEMatrixTileQ): |
3141 | return EmitZAInstr(Opc: SMEOrigInstr, BaseReg: AArch64::ZAQ0, MI, BB); |
3142 | } |
3143 | } |
3144 | |
3145 | switch (MI.getOpcode()) { |
3146 | default: |
3147 | #ifndef NDEBUG |
3148 | MI.dump(); |
3149 | #endif |
3150 | llvm_unreachable("Unexpected instruction for custom inserter!");
3151 | case AArch64::InitTPIDR2Obj: |
3152 | return EmitInitTPIDR2Object(MI, BB); |
3153 | case AArch64::AllocateZABuffer: |
3154 | return EmitAllocateZABuffer(MI, BB); |
3155 | case AArch64::F128CSEL: |
3156 | return EmitF128CSEL(MI, MBB: BB); |
3157 | case TargetOpcode::STATEPOINT: |
3158 | // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
3159 | // while the BL call instruction (to which the statepoint is lowered in
3160 | // the end) has an implicit def of LR. This def is early-clobber as it is
3161 | // set at the moment of the call, earlier than any use is read.
3162 | // Add this implicit dead def here as a workaround.
3163 | MI.addOperand(MF&: *MI.getMF(), |
3164 | Op: MachineOperand::CreateReg( |
3165 | Reg: AArch64::LR, /*isDef*/ true, |
3166 | /*isImp*/ true, /*isKill*/ false, /*isDead*/ true, |
3167 | /*isUndef*/ false, /*isEarlyClobber*/ true)); |
3168 | [[fallthrough]]; |
3169 | case TargetOpcode::STACKMAP: |
3170 | case TargetOpcode::PATCHPOINT: |
3171 | return emitPatchPoint(MI, MBB: BB); |
3172 | |
3173 | case TargetOpcode::PATCHABLE_EVENT_CALL: |
3174 | case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: |
3175 | return BB; |
3176 | |
3177 | case AArch64::CATCHRET: |
3178 | return EmitLoweredCatchRet(MI, BB); |
3179 | |
3180 | case AArch64::PROBED_STACKALLOC_DYN: |
3181 | return EmitDynamicProbedAlloc(MI, MBB: BB); |
3182 | |
3183 | case AArch64::LD1_MXIPXX_H_PSEUDO_B: |
3184 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_B, BaseReg: AArch64::ZAB0, MI, BB); |
3185 | case AArch64::LD1_MXIPXX_H_PSEUDO_H: |
3186 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_H, BaseReg: AArch64::ZAH0, MI, BB); |
3187 | case AArch64::LD1_MXIPXX_H_PSEUDO_S: |
3188 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_S, BaseReg: AArch64::ZAS0, MI, BB); |
3189 | case AArch64::LD1_MXIPXX_H_PSEUDO_D: |
3190 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_D, BaseReg: AArch64::ZAD0, MI, BB); |
3191 | case AArch64::LD1_MXIPXX_H_PSEUDO_Q: |
3192 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_H_Q, BaseReg: AArch64::ZAQ0, MI, BB); |
3193 | case AArch64::LD1_MXIPXX_V_PSEUDO_B: |
3194 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_B, BaseReg: AArch64::ZAB0, MI, BB); |
3195 | case AArch64::LD1_MXIPXX_V_PSEUDO_H: |
3196 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_H, BaseReg: AArch64::ZAH0, MI, BB); |
3197 | case AArch64::LD1_MXIPXX_V_PSEUDO_S: |
3198 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_S, BaseReg: AArch64::ZAS0, MI, BB); |
3199 | case AArch64::LD1_MXIPXX_V_PSEUDO_D: |
3200 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_D, BaseReg: AArch64::ZAD0, MI, BB); |
3201 | case AArch64::LD1_MXIPXX_V_PSEUDO_Q: |
3202 | return EmitTileLoad(Opc: AArch64::LD1_MXIPXX_V_Q, BaseReg: AArch64::ZAQ0, MI, BB); |
3203 | case AArch64::LDR_ZA_PSEUDO: |
3204 | return EmitFill(MI, BB); |
3205 | case AArch64::LDR_TX_PSEUDO: |
3206 | return EmitZTInstr(MI, BB, Opcode: AArch64::LDR_TX, /*Op0IsDef=*/true); |
3207 | case AArch64::STR_TX_PSEUDO: |
3208 | return EmitZTInstr(MI, BB, Opcode: AArch64::STR_TX, /*Op0IsDef=*/false); |
3209 | case AArch64::ZERO_M_PSEUDO: |
3210 | return EmitZero(MI, BB); |
3211 | case AArch64::ZERO_T_PSEUDO: |
3212 | return EmitZTInstr(MI, BB, Opcode: AArch64::ZERO_T, /*Op0IsDef=*/true); |
3213 | } |
3214 | } |
3215 | |
3216 | //===----------------------------------------------------------------------===// |
3217 | // AArch64 Lowering private implementation. |
3218 | //===----------------------------------------------------------------------===// |
3219 | |
3220 | //===----------------------------------------------------------------------===// |
3221 | // Lowering Code |
3222 | //===----------------------------------------------------------------------===// |
3223 | |
3224 | // Forward declarations of SVE fixed length lowering helpers |
3225 | static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT); |
3226 | static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); |
3227 | static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V); |
3228 | static SDValue convertFixedMaskToScalableVector(SDValue Mask, |
3229 | SelectionDAG &DAG); |
3230 | static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT); |
3231 | static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, |
3232 | EVT VT); |
3233 | |
3234 | /// isZerosVector - Check whether SDNode N is a zero-filled vector. |
3235 | static bool isZerosVector(const SDNode *N) { |
3236 | // Look through a bit convert. |
3237 | while (N->getOpcode() == ISD::BITCAST) |
3238 | N = N->getOperand(Num: 0).getNode(); |
3239 | |
3240 | if (ISD::isConstantSplatVectorAllZeros(N)) |
3241 | return true; |
3242 | |
3243 | if (N->getOpcode() != AArch64ISD::DUP) |
3244 | return false; |
3245 | |
3246 | auto Opnd0 = N->getOperand(Num: 0); |
3247 | return isNullConstant(V: Opnd0) || isNullFPConstant(V: Opnd0); |
3248 | } |
3249 | |
3250 | /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 |
3251 | /// CC |
3252 | static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { |
3253 | switch (CC) { |
3254 | default: |
3255 | llvm_unreachable("Unknown condition code!");
3256 | case ISD::SETNE: |
3257 | return AArch64CC::NE; |
3258 | case ISD::SETEQ: |
3259 | return AArch64CC::EQ; |
3260 | case ISD::SETGT: |
3261 | return AArch64CC::GT; |
3262 | case ISD::SETGE: |
3263 | return AArch64CC::GE; |
3264 | case ISD::SETLT: |
3265 | return AArch64CC::LT; |
3266 | case ISD::SETLE: |
3267 | return AArch64CC::LE; |
3268 | case ISD::SETUGT: |
3269 | return AArch64CC::HI; |
3270 | case ISD::SETUGE: |
3271 | return AArch64CC::HS; |
3272 | case ISD::SETULT: |
3273 | return AArch64CC::LO; |
3274 | case ISD::SETULE: |
3275 | return AArch64CC::LS; |
3276 | } |
3277 | } |
3278 | |
3279 | /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. |
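| /// An AArch64 FCMP sets NZCV as follows: equal -> Z (and C), less than -> N,
| /// greater than -> C, unordered -> C and V. The mappings below follow from
| /// which of those flag patterns each condition code accepts; when a single
| /// condition code cannot cover the predicate, CondCode2 supplies a second
| /// test whose result is OR'ed with the first.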
3280 | static void changeFPCCToAArch64CC(ISD::CondCode CC, |
3281 | AArch64CC::CondCode &CondCode, |
3282 | AArch64CC::CondCode &CondCode2) { |
3283 | CondCode2 = AArch64CC::AL; |
3284 | switch (CC) { |
3285 | default: |
3286 | llvm_unreachable("Unknown FP condition!");
3287 | case ISD::SETEQ: |
3288 | case ISD::SETOEQ: |
3289 | CondCode = AArch64CC::EQ; |
3290 | break; |
3291 | case ISD::SETGT: |
3292 | case ISD::SETOGT: |
3293 | CondCode = AArch64CC::GT; |
3294 | break; |
3295 | case ISD::SETGE: |
3296 | case ISD::SETOGE: |
3297 | CondCode = AArch64CC::GE; |
3298 | break; |
3299 | case ISD::SETOLT: |
3300 | CondCode = AArch64CC::MI; |
3301 | break; |
3302 | case ISD::SETOLE: |
3303 | CondCode = AArch64CC::LS; |
3304 | break; |
3305 | case ISD::SETONE: |
3306 | CondCode = AArch64CC::MI; |
3307 | CondCode2 = AArch64CC::GT; |
3308 | break; |
3309 | case ISD::SETO: |
3310 | CondCode = AArch64CC::VC; |
3311 | break; |
3312 | case ISD::SETUO: |
3313 | CondCode = AArch64CC::VS; |
3314 | break; |
3315 | case ISD::SETUEQ: |
3316 | CondCode = AArch64CC::EQ; |
3317 | CondCode2 = AArch64CC::VS; |
3318 | break; |
3319 | case ISD::SETUGT: |
3320 | CondCode = AArch64CC::HI; |
3321 | break; |
3322 | case ISD::SETUGE: |
3323 | CondCode = AArch64CC::PL; |
3324 | break; |
3325 | case ISD::SETLT: |
3326 | case ISD::SETULT: |
3327 | CondCode = AArch64CC::LT; |
3328 | break; |
3329 | case ISD::SETLE: |
3330 | case ISD::SETULE: |
3331 | CondCode = AArch64CC::LE; |
3332 | break; |
3333 | case ISD::SETNE: |
3334 | case ISD::SETUNE: |
3335 | CondCode = AArch64CC::NE; |
3336 | break; |
3337 | } |
3338 | } |
3339 | |
3340 | /// Convert a DAG fp condition code to an AArch64 CC. |
3341 | /// This differs from changeFPCCToAArch64CC in that it returns cond codes that |
3342 | /// should be AND'ed instead of OR'ed. |
3343 | static void changeFPCCToANDAArch64CC(ISD::CondCode CC, |
3344 | AArch64CC::CondCode &CondCode, |
3345 | AArch64CC::CondCode &CondCode2) { |
3346 | CondCode2 = AArch64CC::AL; |
3347 | switch (CC) { |
3348 | default: |
3349 | changeFPCCToAArch64CC(CC, CondCode, CondCode2); |
3350 | assert(CondCode2 == AArch64CC::AL); |
3351 | break; |
3352 | case ISD::SETONE: |
3353 | // (a one b) |
3354 | // == ((a olt b) || (a ogt b)) |
3355 | // == ((a ord b) && (a une b)) |
3356 | CondCode = AArch64CC::VC; |
3357 | CondCode2 = AArch64CC::NE; |
3358 | break; |
3359 | case ISD::SETUEQ: |
3360 | // (a ueq b) |
3361 | // == ((a uno b) || (a oeq b)) |
3362 | // == ((a ule b) && (a uge b)) |
3363 | CondCode = AArch64CC::PL; |
3364 | CondCode2 = AArch64CC::LE; |
3365 | break; |
3366 | } |
3367 | } |
3368 | |
3369 | /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 |
3370 | /// CC usable with the vector instructions. Fewer operations are available |
3371 | /// without a real NZCV register, so we have to use less efficient combinations |
3372 | /// to get the same effect. |
3373 | static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, |
3374 | AArch64CC::CondCode &CondCode, |
3375 | AArch64CC::CondCode &CondCode2, |
3376 | bool &Invert) { |
3377 | Invert = false; |
3378 | switch (CC) { |
3379 | default: |
3380 | // Mostly the scalar mappings work fine. |
3381 | changeFPCCToAArch64CC(CC, CondCode, CondCode2); |
3382 | break; |
3383 | case ISD::SETUO: |
3384 | Invert = true; |
3385 | [[fallthrough]]; |
3386 | case ISD::SETO: |
3387 | CondCode = AArch64CC::MI; |
3388 | CondCode2 = AArch64CC::GE; |
3389 | break; |
3390 | case ISD::SETUEQ: |
3391 | case ISD::SETULT: |
3392 | case ISD::SETULE: |
3393 | case ISD::SETUGT: |
3394 | case ISD::SETUGE: |
3395 | // All of the compare-mask comparisons are ordered, but we can switch |
3396 | // between the two by a double inversion. E.g. ULE == !OGT. |
3397 | Invert = true; |
3398 | changeFPCCToAArch64CC(CC: getSetCCInverse(Operation: CC, /* FP inverse */ Type: MVT::f32), |
3399 | CondCode, CondCode2); |
3400 | break; |
3401 | } |
3402 | } |
3403 | |
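| // A legal arithmetic immediate for ADD/SUB/CMP is a 12-bit unsigned value,
| // optionally shifted left by 12 bits: e.g. 0xFFF and 0xFFF000 are legal,
| // while 0x1001 is not.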
3404 | static bool isLegalArithImmed(uint64_t C) { |
3405 | // Matches AArch64DAGToDAGISel::SelectArithImmed(). |
3406 | bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); |
3407 | LLVM_DEBUG(dbgs() << "Is imm " << C |
3408 | << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3409 | return IsLegal; |
3410 | } |
3411 | |
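| // Use known bits to prove that CheckedVal can never be the minimum signed
| // integer. Negating INT_MIN wraps back to INT_MIN, so the CMN rewrite below
| // is only safe for signed comparisons when this holds.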
3412 | static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) { |
3413 | KnownBits KnownSrc = DAG.computeKnownBits(Op: CheckedVal); |
3414 | return !KnownSrc.getSignedMinValue().isMinSignedValue(); |
3415 | } |
3416 | |
3417 | // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3418 | // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags |
3419 | // can be set differently by this operation. It comes down to whether |
3420 | // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then |
3421 | // everything is fine. If not then the optimization is wrong. Thus general |
3422 | // comparisons are only valid if op2 != 0. |
3423 | // |
3424 | // So, finally, the only LLVM-native comparisons that don't mention C or V |
3425 | // are the ones that aren't unsigned comparisons. They're the only ones we can |
3426 | // safely use CMN for in the absence of information about op2. |
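| //
| // For example, (CMP x, (sub 0, y)) with an EQ/NE condition can always use
| // CMN x, y, whereas with an unsigned condition it is only safe when y is
| // known to be non-zero, and with a signed condition when y cannot be
| // INT_MIN.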
3427 | static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) { |
3428 | return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)) && |
3429 | (isIntEqualitySetCC(Code: CC) || |
3430 | (isUnsignedIntSetCC(Code: CC) && DAG.isKnownNeverZero(Op: Op.getOperand(i: 1))) || |
3431 | (isSignedIntSetCC(Code: CC) && cannotBeIntMin(CheckedVal: Op.getOperand(i: 1), DAG))); |
3432 | } |
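// For example, for an EQ/NE compare the fold above is always safe; roughly:
//   neg w8, w1
//   cmp w0, w8
// becomes
//   cmn w0, w1
// For the ordered comparisons the extra checks rule out op2 == 0 and
// op2 == INT_MIN, the cases where the C or V flag could differ.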
3433 | |
3434 | static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, |
3435 | SelectionDAG &DAG, SDValue Chain, |
3436 | bool IsSignaling) { |
3437 | EVT VT = LHS.getValueType(); |
3438 | assert(VT != MVT::f128); |
3439 | |
3440 | const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
3441 | |
3442 | if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) { |
3443 | LHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {MVT::f32, MVT::Other}, |
3444 | Ops: {Chain, LHS}); |
3445 | RHS = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {MVT::f32, MVT::Other}, |
3446 | Ops: {LHS.getValue(R: 1), RHS}); |
3447 | Chain = RHS.getValue(R: 1); |
3448 | VT = MVT::f32; |
3449 | } |
3450 | unsigned Opcode = |
3451 | IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP; |
3452 | return DAG.getNode(Opcode, DL: dl, ResultTys: {VT, MVT::Other}, Ops: {Chain, LHS, RHS}); |
3453 | } |
3454 | |
3455 | static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, |
3456 | const SDLoc &dl, SelectionDAG &DAG) { |
3457 | EVT VT = LHS.getValueType(); |
3458 | const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
3459 | |
3460 | if (VT.isFloatingPoint()) { |
3461 | assert(VT != MVT::f128); |
3462 | if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) { |
3463 | LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: LHS); |
3464 | RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: RHS); |
3465 | VT = MVT::f32; |
3466 | } |
3467 | return DAG.getNode(Opcode: AArch64ISD::FCMP, DL: dl, VT, N1: LHS, N2: RHS); |
3468 | } |
3469 | |
3470 | // The CMP instruction is just an alias for SUBS, and representing it as |
3471 | // SUBS means that it's possible to get CSE with subtract operations. |
3472 | // A later phase can perform the optimization of setting the destination |
3473 | // register to WZR/XZR if it ends up being unused. |
3474 | unsigned Opcode = AArch64ISD::SUBS; |
3475 | |
3476 | if (isCMN(Op: RHS, CC, DAG)) { |
3477 | // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction? |
3478 | Opcode = AArch64ISD::ADDS; |
3479 | RHS = RHS.getOperand(i: 1); |
3480 | } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) && |
3481 | isIntEqualitySetCC(Code: CC)) { |
3482 | // As we are looking for EQ/NE compares, the operands can be commuted; can |
3483 | // we combine a (CMP (sub 0, op1), op2) into a CMN instruction? |
3484 | Opcode = AArch64ISD::ADDS; |
3485 | LHS = LHS.getOperand(i: 1); |
3486 | } else if (isNullConstant(V: RHS) && !isUnsignedIntSetCC(Code: CC)) { |
3487 | if (LHS.getOpcode() == ISD::AND) { |
3488 | // Similarly, (CMP (and X, Y), 0) can be implemented with a TST |
3489 | // (a.k.a. ANDS) except that the flags are only guaranteed to work for one |
3490 | // of the signed comparisons. |
3491 | const SDValue ANDSNode = DAG.getNode(Opcode: AArch64ISD::ANDS, DL: dl, |
3492 | VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), |
3493 | N1: LHS.getOperand(i: 0), |
3494 | N2: LHS.getOperand(i: 1)); |
3495 | // Replace all users of (and X, Y) with newly generated (ands X, Y) |
3496 | DAG.ReplaceAllUsesWith(From: LHS, To: ANDSNode); |
3497 | return ANDSNode.getValue(R: 1); |
3498 | } else if (LHS.getOpcode() == AArch64ISD::ANDS) { |
3499 | // Use result of ANDS |
3500 | return LHS.getValue(R: 1); |
3501 | } |
3502 | } |
3503 | |
3504 | return DAG.getNode(Opcode, DL: dl, VTList: DAG.getVTList(VT1: VT, VT2: MVT_CC), N1: LHS, N2: RHS) |
3505 | .getValue(R: 1); |
3506 | } |
3507 | |
3508 | /// \defgroup AArch64CCMP CMP;CCMP matching |
3509 | /// |
3510 | /// These functions deal with the formation of CMP;CCMP;... sequences. |
3511 | /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of |
3512 | /// a comparison. They set the NZCV flags to a predefined value if their |
3513 | /// predicate is false. This allows us to express arbitrary conjunctions, for |
3514 | /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" |
3515 | /// expressed as: |
3516 | /// cmp A |
3517 | /// ccmp B, inv(CB), CA |
3518 | /// check for CB flags |
3519 | /// |
3520 | /// This naturally lets us implement chains of AND operations with SETCC |
3521 | /// operands. And we can even implement some other situations by transforming |
3522 | /// them: |
3523 | /// - We can implement (NEG SETCC) i.e. negating a single comparison by |
3524 | ///    negating the flags used in a CCMP/FCCMP operation. |
3525 | /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations |
3526 | /// by negating the flags we test for afterwards. i.e. |
3527 | /// NEG (CMP CCMP CCCMP ...) can be implemented. |
3528 | /// - Note that we can only ever negate all previously processed results. |
3529 | ///   What we cannot implement by flipping the flags to test is a negation |
3530 | /// of two sub-trees (because the negation affects all sub-trees emitted so |
3531 | /// far, so the 2nd sub-tree we emit would also affect the first). |
3532 | /// With those tools we can implement some OR operations: |
3533 | /// - (OR (SETCC A) (SETCC B)) can be implemented via: |
3534 | /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) |
3535 | /// - After transforming OR to NEG/AND combinations we may be able to use NEG |
3536 | /// elimination rules from earlier to implement the whole thing as a |
3537 | /// CCMP/FCCMP chain. |
3538 | /// |
3539 | /// As complete example: |
3540 | ///   or (or (setCA (cmp A)) (setCB (cmp B))) |
3541 | ///      (and (setCC (cmp C)) (setCD (cmp D))) |
3542 | /// can be reassociated to: |
3543 | ///   or (and (setCC (cmp C)) (setCD (cmp D))) |
3544 | ///      (or (setCA (cmp A)) (setCB (cmp B))) |
3545 | /// can be transformed to: |
3546 | ///   not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) |
3547 | ///            (and (not (setCA (cmp A))) (not (setCB (cmp B))))) |
3548 | /// which can be implemented as: |
3549 | /// cmp C |
3550 | /// ccmp D, inv(CD), CC |
3551 | /// ccmp A, CA, inv(CD) |
3552 | /// ccmp B, CB, inv(CA) |
3553 | /// check for CB flags |
3554 | /// |
3555 | /// A counterexample is "or (and A B) (and C D)" which translates to |
3556 | /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we |
3557 | /// can only implement one of the inner (not) operations, but not both! |
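///
/// As a small concrete illustration (a sketch of the typical selection, not a
/// verbatim compiler output), "a == 0 && b == 0" becomes a single CMP;CCMP
/// pair:
///    cmp  w0, #0
///    ccmp w1, #0, #0, eq   // if a == 0 compare b, otherwise force Z = 0
///    cset w0, eq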
3558 | /// @{ |
3559 | |
3560 | /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. |
3561 | static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, |
3562 | ISD::CondCode CC, SDValue CCOp, |
3563 | AArch64CC::CondCode Predicate, |
3564 | AArch64CC::CondCode OutCC, |
3565 | const SDLoc &DL, SelectionDAG &DAG) { |
3566 | unsigned Opcode = 0; |
3567 | const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
3568 | |
3569 | if (LHS.getValueType().isFloatingPoint()) { |
3570 | assert(LHS.getValueType() != MVT::f128); |
3571 | if ((LHS.getValueType() == MVT::f16 && !FullFP16) || |
3572 | LHS.getValueType() == MVT::bf16) { |
3573 | LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: LHS); |
3574 | RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: RHS); |
3575 | } |
3576 | Opcode = AArch64ISD::FCCMP; |
3577 | } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val&: RHS)) { |
3578 | APInt Imm = Const->getAPIntValue(); |
3579 | if (Imm.isNegative() && Imm.sgt(RHS: -32)) { |
3580 | Opcode = AArch64ISD::CCMN; |
3581 | RHS = DAG.getConstant(Val: Imm.abs(), DL, VT: Const->getValueType(ResNo: 0)); |
3582 | } |
3583 | } else if (isCMN(Op: RHS, CC, DAG)) { |
3584 | Opcode = AArch64ISD::CCMN; |
3585 | RHS = RHS.getOperand(i: 1); |
3586 | } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(V: LHS.getOperand(i: 0)) && |
3587 | isIntEqualitySetCC(Code: CC)) { |
3588 | // As we are looking for EQ/NE compares, the operands can be commuted; can |
3589 | // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction? |
3590 | Opcode = AArch64ISD::CCMN; |
3591 | LHS = LHS.getOperand(i: 1); |
3592 | } |
3593 | if (Opcode == 0) |
3594 | Opcode = AArch64ISD::CCMP; |
3595 | |
3596 | SDValue Condition = DAG.getConstant(Val: Predicate, DL, VT: MVT_CC); |
3597 | AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC); |
3598 | unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC); |
3599 | SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32); |
3600 | return DAG.getNode(Opcode, DL, VT: MVT_CC, N1: LHS, N2: RHS, N3: NZCVOp, N4: Condition, N5: CCOp); |
3601 | } |
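// Note on the negative-immediate case above: the CCMP/CCMN immediate field is
// an unsigned 5-bit value, so a conditional compare against e.g. -7 is emitted
// as a CCMN with the absolute value, i.e. "ccmn w0, #7, #nzcv, cond".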
3602 | |
3603 | /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be |
3604 | /// expressed as a conjunction. See \ref AArch64CCMP. |
3605 | /// \param CanNegate Set to true if we can negate the whole sub-tree just by |
3606 | /// changing the conditions on the SETCC tests. |
3607 | /// (this means we can call emitConjunctionRec() with |
3608 | /// Negate==true on this sub-tree) |
3609 | /// \param MustBeFirst Set to true if this subtree needs to be negated and we |
3610 | /// cannot do the negation naturally. We are required to |
3611 | /// emit the subtree first in this case. |
3612 | /// \param WillNegate Is true if we are called when the result of this |
3613 | /// subexpression must be negated. This happens when the |
3614 | /// outer expression is an OR. We can use this fact to know |
3615 | /// that we have a double negation (or (or ...) ...) that |
3616 | /// can be implemented for free. |
3617 | static bool canEmitConjunction(const SDValue Val, bool &CanNegate, |
3618 | bool &MustBeFirst, bool WillNegate, |
3619 | unsigned Depth = 0) { |
3620 | if (!Val.hasOneUse()) |
3621 | return false; |
3622 | unsigned Opcode = Val->getOpcode(); |
3623 | if (Opcode == ISD::SETCC) { |
3624 | if (Val->getOperand(Num: 0).getValueType() == MVT::f128) |
3625 | return false; |
3626 | CanNegate = true; |
3627 | MustBeFirst = false; |
3628 | return true; |
3629 | } |
3630 | // Protect against exponential runtime and stack overflow. |
3631 | if (Depth > 6) |
3632 | return false; |
3633 | if (Opcode == ISD::AND || Opcode == ISD::OR) { |
3634 | bool IsOR = Opcode == ISD::OR; |
3635 | SDValue O0 = Val->getOperand(Num: 0); |
3636 | SDValue O1 = Val->getOperand(Num: 1); |
3637 | bool CanNegateL; |
3638 | bool MustBeFirstL; |
3639 | if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, Depth: Depth+1)) |
3640 | return false; |
3641 | bool CanNegateR; |
3642 | bool MustBeFirstR; |
3643 | if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, Depth: Depth+1)) |
3644 | return false; |
3645 | |
3646 | if (MustBeFirstL && MustBeFirstR) |
3647 | return false; |
3648 | |
3649 | if (IsOR) { |
3650 | // For an OR expression we need to be able to naturally negate at least |
3651 | // one side or we cannot do the transformation at all. |
3652 | if (!CanNegateL && !CanNegateR) |
3653 | return false; |
3654 | // If the result of the OR will be negated and we can naturally negate |
3655 | // the leaves, then this sub-tree as a whole negates naturally. |
3656 | CanNegate = WillNegate && CanNegateL && CanNegateR; |
3657 | // If we cannot naturally negate the whole sub-tree, then this must be |
3658 | // emitted first. |
3659 | MustBeFirst = !CanNegate; |
3660 | } else { |
3661 | assert(Opcode == ISD::AND && "Must be OR or AND" ); |
3662 | // We cannot naturally negate an AND operation. |
3663 | CanNegate = false; |
3664 | MustBeFirst = MustBeFirstL || MustBeFirstR; |
3665 | } |
3666 | return true; |
3667 | } |
3668 | return false; |
3669 | } |
3670 | |
3671 | /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain |
3672 | /// of CCMP/FCCMP ops. See @ref AArch64CCMP. |
3673 | /// Tries to transform the given i1 producing node @p Val to a series of |
3674 | /// compare and conditional compare operations. @returns an NZCV flags |
3675 | /// producing node and sets @p OutCC to the flags that should be tested or |
3676 | /// returns SDValue() if the transformation was not possible. |
3677 | /// \p Negate is true if we want this sub-tree to be negated just by changing |
3678 | /// SETCC conditions. |
3679 | static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, |
3680 | AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, |
3681 | AArch64CC::CondCode Predicate) { |
3682 | // We're at a tree leaf, produce a conditional comparison operation. |
3683 | unsigned Opcode = Val->getOpcode(); |
3684 | if (Opcode == ISD::SETCC) { |
3685 | SDValue LHS = Val->getOperand(Num: 0); |
3686 | SDValue RHS = Val->getOperand(Num: 1); |
3687 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Val->getOperand(Num: 2))->get(); |
3688 | bool isInteger = LHS.getValueType().isInteger(); |
3689 | if (Negate) |
3690 | CC = getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
3691 | SDLoc DL(Val); |
3692 | // Determine OutCC and handle FP special case. |
3693 | if (isInteger) { |
3694 | OutCC = changeIntCCToAArch64CC(CC); |
3695 | } else { |
3696 | assert(LHS.getValueType().isFloatingPoint()); |
3697 | AArch64CC::CondCode ExtraCC; |
3698 | changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC); |
3699 | // Some floating point conditions can't be tested with a single condition |
3700 | // code. Construct an additional comparison in this case. |
3701 | if (ExtraCC != AArch64CC::AL) { |
3702 | SDValue ExtraCmp; |
3703 | if (!CCOp.getNode()) |
3704 | ExtraCmp = emitComparison(LHS, RHS, CC, dl: DL, DAG); |
3705 | else |
3706 | ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, |
3707 | OutCC: ExtraCC, DL, DAG); |
3708 | CCOp = ExtraCmp; |
3709 | Predicate = ExtraCC; |
3710 | } |
3711 | } |
3712 | |
3713 | // Produce a normal comparison if we are first in the chain |
3714 | if (!CCOp) |
3715 | return emitComparison(LHS, RHS, CC, dl: DL, DAG); |
3716 | // Otherwise produce a ccmp. |
3717 | return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, |
3718 | DAG); |
3719 | } |
3720 | assert(Val->hasOneUse() && "Valid conjunction/disjunction tree" ); |
3721 | |
3722 | bool IsOR = Opcode == ISD::OR; |
3723 | |
3724 | SDValue LHS = Val->getOperand(Num: 0); |
3725 | bool CanNegateL; |
3726 | bool MustBeFirstL; |
3727 | bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR); |
3728 | assert(ValidL && "Valid conjunction/disjunction tree" ); |
3729 | (void)ValidL; |
3730 | |
3731 | SDValue RHS = Val->getOperand(Num: 1); |
3732 | bool CanNegateR; |
3733 | bool MustBeFirstR; |
3734 | bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR); |
3735 | assert(ValidR && "Valid conjunction/disjunction tree" ); |
3736 | (void)ValidR; |
3737 | |
3738 | // Swap sub-tree that must come first to the right side. |
3739 | if (MustBeFirstL) { |
3740 | assert(!MustBeFirstR && "Valid conjunction/disjunction tree" ); |
3741 | std::swap(a&: LHS, b&: RHS); |
3742 | std::swap(a&: CanNegateL, b&: CanNegateR); |
3743 | std::swap(a&: MustBeFirstL, b&: MustBeFirstR); |
3744 | } |
3745 | |
3746 | bool NegateR; |
3747 | bool NegateAfterR; |
3748 | bool NegateL; |
3749 | bool NegateAfterAll; |
3750 | if (Opcode == ISD::OR) { |
3751 | // Swap the sub-tree that we can negate naturally to the left. |
3752 | if (!CanNegateL) { |
3753 | assert(CanNegateR && "at least one side must be negatable" ); |
3754 | assert(!MustBeFirstR && "invalid conjunction/disjunction tree" ); |
3755 | assert(!Negate); |
3756 | std::swap(a&: LHS, b&: RHS); |
3757 | NegateR = false; |
3758 | NegateAfterR = true; |
3759 | } else { |
3760 | // Negate the left sub-tree if possible, otherwise negate the result. |
3761 | NegateR = CanNegateR; |
3762 | NegateAfterR = !CanNegateR; |
3763 | } |
3764 | NegateL = true; |
3765 | NegateAfterAll = !Negate; |
3766 | } else { |
3767 | assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree" ); |
3768 | assert(!Negate && "Valid conjunction/disjunction tree" ); |
3769 | |
3770 | NegateL = false; |
3771 | NegateR = false; |
3772 | NegateAfterR = false; |
3773 | NegateAfterAll = false; |
3774 | } |
3775 | |
3776 | // Emit sub-trees. |
3777 | AArch64CC::CondCode RHSCC; |
3778 | SDValue CmpR = emitConjunctionRec(DAG, Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate); |
3779 | if (NegateAfterR) |
3780 | RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC); |
3781 | SDValue CmpL = emitConjunctionRec(DAG, Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR, Predicate: RHSCC); |
3782 | if (NegateAfterAll) |
3783 | OutCC = AArch64CC::getInvertedCondCode(Code: OutCC); |
3784 | return CmpL; |
3785 | } |
3786 | |
3787 | /// Emit expression as a conjunction (a series of CCMP/FCCMP ops). |
3788 | /// In some cases this is even possible with OR operations in the expression. |
3789 | /// See \ref AArch64CCMP. |
3790 | /// \see emitConjunctionRec(). |
3791 | static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, |
3792 | AArch64CC::CondCode &OutCC) { |
3793 | bool DummyCanNegate; |
3794 | bool DummyMustBeFirst; |
3795 | if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false)) |
3796 | return SDValue(); |
3797 | |
3798 | return emitConjunctionRec(DAG, Val, OutCC, Negate: false, CCOp: SDValue(), Predicate: AArch64CC::AL); |
3799 | } |
3800 | |
3801 | /// @} |
3802 | |
3803 | /// Returns how profitable it is to fold a comparison's operand's shift and/or |
3804 | /// extension operations. |
3805 | static unsigned getCmpOperandFoldingProfit(SDValue Op) { |
3806 | auto isSupportedExtend = [&](SDValue V) { |
3807 | if (V.getOpcode() == ISD::SIGN_EXTEND_INREG) |
3808 | return true; |
3809 | |
3810 | if (V.getOpcode() == ISD::AND) |
3811 | if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(Val: V.getOperand(i: 1))) { |
3812 | uint64_t Mask = MaskCst->getZExtValue(); |
3813 | return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF); |
3814 | } |
3815 | |
3816 | return false; |
3817 | }; |
3818 | |
3819 | if (!Op.hasOneUse()) |
3820 | return 0; |
3821 | |
3822 | if (isSupportedExtend(Op)) |
3823 | return 1; |
3824 | |
3825 | unsigned Opc = Op.getOpcode(); |
3826 | if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) |
3827 | if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1))) { |
3828 | uint64_t Shift = ShiftCst->getZExtValue(); |
3829 | if (isSupportedExtend(Op.getOperand(i: 0))) |
3830 | return (Shift <= 4) ? 2 : 1; |
3831 | EVT VT = Op.getValueType(); |
3832 | if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63)) |
3833 | return 1; |
3834 | } |
3835 | |
3836 | return 0; |
3837 | } |
3838 | |
3839 | static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, |
3840 | SDValue &AArch64cc, SelectionDAG &DAG, |
3841 | const SDLoc &dl) { |
3842 | if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val: RHS.getNode())) { |
3843 | EVT VT = RHS.getValueType(); |
3844 | uint64_t C = RHSC->getZExtValue(); |
3845 | if (!isLegalArithImmed(C)) { |
3846 | // Constant does not fit, try adjusting it by one? |
3847 | switch (CC) { |
3848 | default: |
3849 | break; |
3850 | case ISD::SETLT: |
3851 | case ISD::SETGE: |
3852 | if ((VT == MVT::i32 && C != 0x80000000 && |
3853 | isLegalArithImmed(C: (uint32_t)(C - 1))) || |
3854 | (VT == MVT::i64 && C != 0x80000000ULL && |
3855 | isLegalArithImmed(C: C - 1ULL))) { |
3856 | CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; |
3857 | C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; |
3858 | RHS = DAG.getConstant(Val: C, DL: dl, VT); |
3859 | } |
3860 | break; |
3861 | case ISD::SETULT: |
3862 | case ISD::SETUGE: |
3863 | if ((VT == MVT::i32 && C != 0 && |
3864 | isLegalArithImmed(C: (uint32_t)(C - 1))) || |
3865 | (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C: C - 1ULL))) { |
3866 | CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; |
3867 | C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; |
3868 | RHS = DAG.getConstant(Val: C, DL: dl, VT); |
3869 | } |
3870 | break; |
3871 | case ISD::SETLE: |
3872 | case ISD::SETGT: |
3873 | if ((VT == MVT::i32 && C != INT32_MAX && |
3874 | isLegalArithImmed(C: (uint32_t)(C + 1))) || |
3875 | (VT == MVT::i64 && C != INT64_MAX && |
3876 | isLegalArithImmed(C: C + 1ULL))) { |
3877 | CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; |
3878 | C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; |
3879 | RHS = DAG.getConstant(Val: C, DL: dl, VT); |
3880 | } |
3881 | break; |
3882 | case ISD::SETULE: |
3883 | case ISD::SETUGT: |
3884 | if ((VT == MVT::i32 && C != UINT32_MAX && |
3885 | isLegalArithImmed(C: (uint32_t)(C + 1))) || |
3886 | (VT == MVT::i64 && C != UINT64_MAX && |
3887 | isLegalArithImmed(C: C + 1ULL))) { |
3888 | CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; |
3889 | C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; |
3890 | RHS = DAG.getConstant(Val: C, DL: dl, VT); |
3891 | } |
3892 | break; |
3893 | } |
3894 | } |
3895 | } |
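// For example, "x s< 4097" cannot encode 4097 directly, but the adjustment
// above turns it into "x s<= 4096", and 4096 is encodable as #1, LSL #12.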
3896 | |
3897 | // Comparisons are canonicalized so that the RHS operand is simpler than the |
3898 | // LHS one, the extreme case being when RHS is an immediate. However, AArch64 |
3899 | // can fold some shift+extend operations on the RHS operand, so swap the |
3900 | // operands if that can be done. |
3901 | // |
3902 | // For example: |
3903 | // lsl w13, w11, #1 |
3904 | // cmp w13, w12 |
3905 | // can be turned into: |
3906 | // cmp w12, w11, lsl #1 |
3907 | if (!isa<ConstantSDNode>(Val: RHS) || |
3908 | !isLegalArithImmed(C: RHS->getAsAPIntVal().abs().getZExtValue())) { |
3909 | bool LHSIsCMN = isCMN(Op: LHS, CC, DAG); |
3910 | bool RHSIsCMN = isCMN(Op: RHS, CC, DAG); |
3911 | SDValue TheLHS = LHSIsCMN ? LHS.getOperand(i: 1) : LHS; |
3912 | SDValue TheRHS = RHSIsCMN ? RHS.getOperand(i: 1) : RHS; |
3913 | |
3914 | if (getCmpOperandFoldingProfit(Op: TheLHS) + (LHSIsCMN ? 1 : 0) > |
3915 | getCmpOperandFoldingProfit(Op: TheRHS) + (RHSIsCMN ? 1 : 0)) { |
3916 | std::swap(a&: LHS, b&: RHS); |
3917 | CC = ISD::getSetCCSwappedOperands(Operation: CC); |
3918 | } |
3919 | } |
3920 | |
3921 | SDValue Cmp; |
3922 | AArch64CC::CondCode AArch64CC; |
3923 | if (isIntEqualitySetCC(Code: CC) && isa<ConstantSDNode>(Val: RHS)) { |
3924 | const ConstantSDNode *RHSC = cast<ConstantSDNode>(Val&: RHS); |
3925 | |
3926 | // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. |
3927 | // For the i8 operand, the largest immediate is 255, so this can be easily |
3928 | // encoded in the compare instruction. For the i16 operand, however, the |
3929 | // largest immediate cannot be encoded in the compare. |
3930 | // Therefore, use a sign extending load and cmn to avoid materializing the |
3931 | // -1 constant. For example, |
3932 | // movz w1, #65535 |
3933 | // ldrh w0, [x0, #0] |
3934 | // cmp w0, w1 |
3935 | // > |
3936 | // ldrsh w0, [x0, #0] |
3937 | // cmn w0, #1 |
3938 | // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) |
3939 | // if and only if (sext LHS) == (sext RHS). The checks are in place to |
3940 | // ensure both the LHS and RHS are truly zero extended and to make sure the |
3941 | // transformation is profitable. |
3942 | if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(Val: LHS) && |
3943 | cast<LoadSDNode>(Val&: LHS)->getExtensionType() == ISD::ZEXTLOAD && |
3944 | cast<LoadSDNode>(Val&: LHS)->getMemoryVT() == MVT::i16 && |
3945 | LHS.getNode()->hasNUsesOfValue(NUses: 1, Value: 0)) { |
3946 | int16_t ValueofRHS = RHS->getAsZExtVal(); |
3947 | if (ValueofRHS < 0 && isLegalArithImmed(C: -ValueofRHS)) { |
3948 | SDValue SExt = |
3949 | DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL: dl, VT: LHS.getValueType(), N1: LHS, |
3950 | N2: DAG.getValueType(MVT::i16)); |
3951 | Cmp = emitComparison(LHS: SExt, RHS: DAG.getConstant(Val: ValueofRHS, DL: dl, |
3952 | VT: RHS.getValueType()), |
3953 | CC, dl, DAG); |
3954 | AArch64CC = changeIntCCToAArch64CC(CC); |
3955 | } |
3956 | } |
3957 | |
3958 | if (!Cmp && (RHSC->isZero() || RHSC->isOne())) { |
3959 | if ((Cmp = emitConjunction(DAG, Val: LHS, OutCC&: AArch64CC))) { |
3960 | if ((CC == ISD::SETNE) ^ RHSC->isZero()) |
3961 | AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC); |
3962 | } |
3963 | } |
3964 | } |
3965 | |
3966 | if (!Cmp) { |
3967 | Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
3968 | AArch64CC = changeIntCCToAArch64CC(CC); |
3969 | } |
3970 | AArch64cc = DAG.getConstant(Val: AArch64CC, DL: dl, VT: MVT_CC); |
3971 | return Cmp; |
3972 | } |
3973 | |
3974 | static std::pair<SDValue, SDValue> |
3975 | getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { |
3976 | assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && |
3977 | "Unsupported value type" ); |
3978 | SDValue Value, Overflow; |
3979 | SDLoc DL(Op); |
3980 | SDValue LHS = Op.getOperand(i: 0); |
3981 | SDValue RHS = Op.getOperand(i: 1); |
3982 | unsigned Opc = 0; |
3983 | switch (Op.getOpcode()) { |
3984 | default: |
3985 | llvm_unreachable("Unknown overflow instruction!" ); |
3986 | case ISD::SADDO: |
3987 | Opc = AArch64ISD::ADDS; |
3988 | CC = AArch64CC::VS; |
3989 | break; |
3990 | case ISD::UADDO: |
3991 | Opc = AArch64ISD::ADDS; |
3992 | CC = AArch64CC::HS; |
3993 | break; |
3994 | case ISD::SSUBO: |
3995 | Opc = AArch64ISD::SUBS; |
3996 | CC = AArch64CC::VS; |
3997 | break; |
3998 | case ISD::USUBO: |
3999 | Opc = AArch64ISD::SUBS; |
4000 | CC = AArch64CC::LO; |
4001 | break; |
4002 | // Multiply needs a little bit of extra work. |
4003 | case ISD::SMULO: |
4004 | case ISD::UMULO: { |
4005 | CC = AArch64CC::NE; |
4006 | bool IsSigned = Op.getOpcode() == ISD::SMULO; |
4007 | if (Op.getValueType() == MVT::i32) { |
4008 | // Extend to 64-bits, then perform a 64-bit multiply. |
4009 | unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
4010 | LHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: LHS); |
4011 | RHS = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::i64, Operand: RHS); |
4012 | SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS); |
4013 | Value = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: Mul); |
4014 | |
4015 | // Check that the result fits into a 32-bit integer. |
4016 | SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT_CC); |
4017 | if (IsSigned) { |
4018 | // cmp xreg, wreg, sxtw |
4019 | SDValue SExtMul = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Value); |
4020 | Overflow = |
4021 | DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Mul, N2: SExtMul).getValue(R: 1); |
4022 | } else { |
4023 | // tst xreg, #0xffffffff00000000 |
4024 | SDValue UpperBits = DAG.getConstant(Val: 0xFFFFFFFF00000000, DL, VT: MVT::i64); |
4025 | Overflow = |
4026 | DAG.getNode(Opcode: AArch64ISD::ANDS, DL, VTList: VTs, N1: Mul, N2: UpperBits).getValue(R: 1); |
4027 | } |
4028 | break; |
4029 | } |
4030 | assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type" ); |
4031 | // For the 64-bit multiply, check overflow using the high half of the product. |
4032 | Value = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: LHS, N2: RHS); |
4033 | if (IsSigned) { |
4034 | SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHS, DL, VT: MVT::i64, N1: LHS, N2: RHS); |
4035 | SDValue LowerBits = DAG.getNode(Opcode: ISD::SRA, DL, VT: MVT::i64, N1: Value, |
4036 | N2: DAG.getConstant(Val: 63, DL, VT: MVT::i64)); |
4037 | // It is important that LowerBits is last, otherwise the arithmetic |
4038 | // shift will not be folded into the compare (SUBS). |
4039 | SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32); |
4040 | Overflow = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: UpperBits, N2: LowerBits) |
4041 | .getValue(R: 1); |
4042 | } else { |
4043 | SDValue UpperBits = DAG.getNode(Opcode: ISD::MULHU, DL, VT: MVT::i64, N1: LHS, N2: RHS); |
4044 | SDVTList VTs = DAG.getVTList(VT1: MVT::i64, VT2: MVT::i32); |
4045 | Overflow = |
4046 | DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, |
4047 | N1: DAG.getConstant(Val: 0, DL, VT: MVT::i64), |
4048 | N2: UpperBits).getValue(R: 1); |
4049 | } |
4050 | break; |
4051 | } |
4052 | } // switch (...) |
4053 | |
4054 | if (Opc) { |
4055 | SDVTList VTs = DAG.getVTList(VT1: Op->getValueType(ResNo: 0), VT2: MVT::i32); |
4056 | |
4057 | // Emit the AArch64 operation with overflow check. |
4058 | Value = DAG.getNode(Opcode: Opc, DL, VTList: VTs, N1: LHS, N2: RHS); |
4059 | Overflow = Value.getValue(R: 1); |
4060 | } |
4061 | return std::make_pair(x&: Value, y&: Overflow); |
4062 | } |
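// A sketch of the i32 UMULO path above: for llvm.umul.with.overflow.i32 the
// nodes built are
//   Mul      = mul i64 (zext a), (zext b)
//   Value    = trunc Mul to i32
//   Overflow = flags of (ANDS Mul, 0xFFFFFFFF00000000), tested with NE
// which instruction selection typically turns into a umull followed by a tst
// of the upper 32 bits.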
4063 | |
4064 | SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { |
4065 | if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(), |
4066 | OverrideNEON: !Subtarget->isNeonAvailable())) |
4067 | return LowerToScalableOp(Op, DAG); |
4068 | |
4069 | SDValue Sel = Op.getOperand(i: 0); |
4070 | SDValue Other = Op.getOperand(i: 1); |
4071 | SDLoc dl(Sel); |
4072 | |
4073 | // If the operand is an overflow checking operation, invert the condition |
4074 | // code and kill the Not operation. I.e., transform: |
4075 | // (xor overflow_op_bool, 1) |
4076 | // --> |
4077 | // (csel 1, 0, invert(cc), overflow_op_bool) |
4078 | // ... which later gets transformed to just a cset instruction with an |
4079 | // inverted condition code, rather than a cset + eor sequence. |
4080 | if (isOneConstant(V: Other) && ISD::isOverflowIntrOpRes(Op: Sel)) { |
4081 | // Only lower legal XALUO ops. |
4082 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Sel->getValueType(ResNo: 0))) |
4083 | return SDValue(); |
4084 | |
4085 | SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32); |
4086 | SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32); |
4087 | AArch64CC::CondCode CC; |
4088 | SDValue Value, Overflow; |
4089 | std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op: Sel.getValue(R: 0), DAG); |
4090 | SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL: dl, VT: MVT::i32); |
4091 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Op.getValueType(), N1: TVal, N2: FVal, |
4092 | N3: CCVal, N4: Overflow); |
4093 | } |
4094 | // If neither operand is a SELECT_CC, give up. |
4095 | if (Sel.getOpcode() != ISD::SELECT_CC) |
4096 | std::swap(a&: Sel, b&: Other); |
4097 | if (Sel.getOpcode() != ISD::SELECT_CC) |
4098 | return Op; |
4099 | |
4100 | // The folding we want to perform is: |
4101 | // (xor x, (select_cc a, b, cc, 0, -1) ) |
4102 | // --> |
4103 | // (csel x, (xor x, -1), cc ...) |
4104 | // |
4105 | // The latter will get matched to a CSINV instruction. |
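// For example (a sketch of the expected selection), "x ^ (a == b ? 0 : -1)"
// becomes a compare plus a single CSINV instead of a cset/eor pair:
//   cmp   w0, w1
//   csinv w2, w2, w2, eq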
4106 | |
4107 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Sel.getOperand(i: 4))->get(); |
4108 | SDValue LHS = Sel.getOperand(i: 0); |
4109 | SDValue RHS = Sel.getOperand(i: 1); |
4110 | SDValue TVal = Sel.getOperand(i: 2); |
4111 | SDValue FVal = Sel.getOperand(i: 3); |
4112 | |
4113 | // FIXME: This could be generalized to non-integer comparisons. |
4114 | if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) |
4115 | return Op; |
4116 | |
4117 | ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal); |
4118 | ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal); |
4119 | |
4120 | // The values aren't constants, this isn't the pattern we're looking for. |
4121 | if (!CFVal || !CTVal) |
4122 | return Op; |
4123 | |
4124 | // We can commute the SELECT_CC by inverting the condition. This |
4125 | // might be needed to make this fit into a CSINV pattern. |
4126 | if (CTVal->isAllOnes() && CFVal->isZero()) { |
4127 | std::swap(a&: TVal, b&: FVal); |
4128 | std::swap(a&: CTVal, b&: CFVal); |
4129 | CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
4130 | } |
4131 | |
4132 | // If the constants line up, perform the transform! |
4133 | if (CTVal->isZero() && CFVal->isAllOnes()) { |
4134 | SDValue CCVal; |
4135 | SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl); |
4136 | |
4137 | FVal = Other; |
4138 | TVal = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: Other.getValueType(), N1: Other, |
4139 | N2: DAG.getConstant(Val: -1ULL, DL: dl, VT: Other.getValueType())); |
4140 | |
4141 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: Sel.getValueType(), N1: FVal, N2: TVal, |
4142 | N3: CCVal, N4: Cmp); |
4143 | } |
4144 | |
4145 | return Op; |
4146 | } |
4147 | |
4148 | // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C' |
4149 | // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else |
4150 | // sets 'C' bit to 0. |
4151 | static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) { |
4152 | SDLoc DL(Value); |
4153 | EVT VT = Value.getValueType(); |
4154 | SDValue Op0 = Invert ? DAG.getConstant(Val: 0, DL, VT) : Value; |
4155 | SDValue Op1 = Invert ? Value : DAG.getConstant(Val: 1, DL, VT); |
4156 | SDValue Cmp = |
4157 | DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue), N1: Op0, N2: Op1); |
4158 | return Cmp.getValue(R: 1); |
4159 | } |
4160 | |
4161 | // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0. |
4162 | // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1. |
4163 | static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, |
4164 | bool Invert) { |
4165 | assert(Glue.getResNo() == 1); |
4166 | SDLoc DL(Glue); |
4167 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT); |
4168 | SDValue One = DAG.getConstant(Val: 1, DL, VT); |
4169 | unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS; |
4170 | SDValue CC = DAG.getConstant(Val: Cond, DL, VT: MVT::i32); |
4171 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue); |
4172 | } |
4173 | |
4174 | // Value is 1 if 'V' bit of NZCV is 1, else 0 |
4175 | static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) { |
4176 | assert(Glue.getResNo() == 1); |
4177 | SDLoc DL(Glue); |
4178 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT); |
4179 | SDValue One = DAG.getConstant(Val: 1, DL, VT); |
4180 | SDValue CC = DAG.getConstant(Val: AArch64CC::VS, DL, VT: MVT::i32); |
4181 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: One, N2: Zero, N3: CC, N4: Glue); |
4182 | } |
4183 | |
4184 | // This lowering is inefficient, but it will get cleaned up by |
4185 | // `foldOverflowCheck` |
4186 | static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, |
4187 | unsigned Opcode, bool IsSigned) { |
4188 | EVT VT0 = Op.getValue(R: 0).getValueType(); |
4189 | EVT VT1 = Op.getValue(R: 1).getValueType(); |
4190 | |
4191 | if (VT0 != MVT::i32 && VT0 != MVT::i64) |
4192 | return SDValue(); |
4193 | |
4194 | bool InvertCarry = Opcode == AArch64ISD::SBCS; |
4195 | SDValue OpLHS = Op.getOperand(i: 0); |
4196 | SDValue OpRHS = Op.getOperand(i: 1); |
4197 | SDValue OpCarryIn = valueToCarryFlag(Value: Op.getOperand(i: 2), DAG, Invert: InvertCarry); |
4198 | |
4199 | SDLoc DL(Op); |
4200 | SDVTList VTs = DAG.getVTList(VT1: VT0, VT2: VT1); |
4201 | |
4202 | SDValue Sum = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: VT0, VT2: MVT::Glue), N1: OpLHS, |
4203 | N2: OpRHS, N3: OpCarryIn); |
4204 | |
4205 | SDValue OutFlag = |
4206 | IsSigned ? overflowFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG) |
4207 | : carryFlagToValue(Glue: Sum.getValue(R: 1), VT: VT1, DAG, Invert: InvertCarry); |
4208 | |
4209 | return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL, VTList: VTs, N1: Sum, N2: OutFlag); |
4210 | } |
4211 | |
4212 | static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { |
4213 | // Let legalize expand this if it isn't a legal type yet. |
4214 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: Op.getValueType())) |
4215 | return SDValue(); |
4216 | |
4217 | SDLoc dl(Op); |
4218 | AArch64CC::CondCode CC; |
4219 | // The actual operation that sets the overflow or carry flag. |
4220 | SDValue Value, Overflow; |
4221 | std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC, Op, DAG); |
4222 | |
4223 | // We use 0 and 1 as false and true values. |
4224 | SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32); |
4225 | SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT: MVT::i32); |
4226 | |
4227 | // We use an inverted condition, because the conditional select is inverted |
4228 | // too. This will allow it to be selected to a single instruction: |
4229 | // CSINC Wd, WZR, WZR, invert(cond). |
4230 | SDValue CCVal = DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL: dl, VT: MVT::i32); |
4231 | Overflow = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT: MVT::i32, N1: FVal, N2: TVal, |
4232 | N3: CCVal, N4: Overflow); |
4233 | |
4234 | SDVTList VTs = DAG.getVTList(VT1: Op.getValueType(), VT2: MVT::i32); |
4235 | return DAG.getNode(Opcode: ISD::MERGE_VALUES, DL: dl, VTList: VTs, N1: Value, N2: Overflow); |
4236 | } |
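// For example (sketch), llvm.sadd.with.overflow.i32 becomes an ADDS whose
// overflow bit is materialized with a single conditional set:
//   adds w8, w0, w1
//   cset w9, vs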
4237 | |
4238 | // Prefetch operands are: |
4239 | // 1: Address to prefetch |
4240 | // 2: bool isWrite |
4241 | // 3: int locality (0 = no locality ... 3 = extreme locality) |
4242 | // 4: bool isDataCache |
4243 | static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { |
4244 | SDLoc DL(Op); |
4245 | unsigned IsWrite = Op.getConstantOperandVal(i: 2); |
4246 | unsigned Locality = Op.getConstantOperandVal(i: 3); |
4247 | unsigned IsData = Op.getConstantOperandVal(i: 4); |
4248 | |
4249 | bool IsStream = !Locality; |
4250 | // When the locality number is set |
4251 | if (Locality) { |
4252 | // The front-end should have filtered out the out-of-range values |
4253 | assert(Locality <= 3 && "Prefetch locality out-of-range" ); |
4254 | // The locality degree is the opposite of the cache speed. |
4255 | // Put the number the other way around. |
4256 | // The encoding starts at 0 for level 1 |
4257 | Locality = 3 - Locality; |
4258 | } |
4259 | |
4260 | // Build the mask value encoding the expected behavior. |
4261 | unsigned PrfOp = (IsWrite << 4) | // Load/Store bit |
4262 | (!IsData << 3) | // IsDataCache bit |
4263 | (Locality << 1) | // Cache level bits |
4264 | (unsigned)IsStream; // Stream bit |
4265 | return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Op.getOperand(i: 0), |
4266 | N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32), |
4267 | N3: Op.getOperand(i: 1)); |
4268 | } |
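// For example, a default data prefetch (isWrite = 0, locality = 3,
// isDataCache = 1) encodes to PrfOp = 0b00000, i.e. "prfm pldl1keep, [addr]",
// while locality 0 sets the stream bit instead and yields "prfm pldl1strm".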
4269 | |
4270 | SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, |
4271 | SelectionDAG &DAG) const { |
4272 | EVT VT = Op.getValueType(); |
4273 | if (VT.isScalableVector()) |
4274 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); |
4275 | |
4276 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
4277 | return LowerFixedLengthFPExtendToSVE(Op, DAG); |
4278 | |
4279 | assert(Op.getValueType() == MVT::f128 && "Unexpected lowering" ); |
4280 | return SDValue(); |
4281 | } |
4282 | |
4283 | SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, |
4284 | SelectionDAG &DAG) const { |
4285 | EVT VT = Op.getValueType(); |
4286 | if (VT.isScalableVector()) |
4287 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FP_ROUND_MERGE_PASSTHRU); |
4288 | |
4289 | bool IsStrict = Op->isStrictFPOpcode(); |
4290 | SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0); |
4291 | EVT SrcVT = SrcVal.getValueType(); |
4292 | bool Trunc = Op.getConstantOperandVal(i: IsStrict ? 2 : 1) == 1; |
4293 | |
4294 | if (useSVEForFixedLengthVectorVT(VT: SrcVT, OverrideNEON: !Subtarget->isNeonAvailable())) |
4295 | return LowerFixedLengthFPRoundToSVE(Op, DAG); |
4296 | |
4297 | // Expand cases where the result type is BF16 but we don't have hardware |
4298 | // instructions to lower it. |
4299 | if (VT.getScalarType() == MVT::bf16 && |
4300 | !((Subtarget->hasNEON() || Subtarget->hasSME()) && |
4301 | Subtarget->hasBF16())) { |
4302 | SDLoc dl(Op); |
4303 | SDValue Narrow = SrcVal; |
4304 | SDValue NaN; |
4305 | EVT I32 = SrcVT.changeElementType(EltVT: MVT::i32); |
4306 | EVT F32 = SrcVT.changeElementType(EltVT: MVT::f32); |
4307 | if (SrcVT.getScalarType() == MVT::f32) { |
4308 | bool NeverSNaN = DAG.isKnownNeverSNaN(Op: Narrow); |
4309 | Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow); |
4310 | if (!NeverSNaN) { |
4311 | // Set the quiet bit. |
4312 | NaN = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: I32, N1: Narrow, |
4313 | N2: DAG.getConstant(Val: 0x400000, DL: dl, VT: I32)); |
4314 | } |
4315 | } else if (SrcVT.getScalarType() == MVT::f64) { |
4316 | Narrow = DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL: dl, VT: F32, Operand: Narrow); |
4317 | Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: I32, Operand: Narrow); |
4318 | } else { |
4319 | return SDValue(); |
4320 | } |
4321 | if (!Trunc) { |
4322 | SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: I32); |
4323 | SDValue Lsb = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow, |
4324 | N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl)); |
4325 | Lsb = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: I32, N1: Lsb, N2: One); |
4326 | SDValue RoundingBias = |
4327 | DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: DAG.getConstant(Val: 0x7fff, DL: dl, VT: I32), N2: Lsb); |
4328 | Narrow = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: I32, N1: Narrow, N2: RoundingBias); |
4329 | } |
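// A worked example of the bias arithmetic above (round to nearest, ties to
// even): the f32 pattern 0x3F818000 has its kept lsb set, so the bias is
// 0x8000 and 0x3F818000 + 0x8000 = 0x3F820000, which truncates to the even
// bf16 0x3F82; for 0x3F808000 the lsb is clear, the bias is 0x7FFF, and the
// value truncates back down to 0x3F80.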
4330 | |
4331 | // Don't round if we had a NaN, we don't want to turn 0x7fffffff into |
4332 | // 0x80000000. |
4333 | if (NaN) { |
4334 | SDValue IsNaN = DAG.getSetCC( |
4335 | DL: dl, VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: SrcVT), |
4336 | LHS: SrcVal, RHS: SrcVal, Cond: ISD::SETUO); |
4337 | Narrow = DAG.getSelect(DL: dl, VT: I32, Cond: IsNaN, LHS: NaN, RHS: Narrow); |
4338 | } |
4339 | |
4340 | // Now that we have rounded, shift the bits into position. |
4341 | Narrow = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: I32, N1: Narrow, |
4342 | N2: DAG.getShiftAmountConstant(Val: 16, VT: I32, DL: dl)); |
4343 | if (VT.isVector()) { |
4344 | EVT I16 = I32.changeVectorElementType(EltVT: MVT::i16); |
4345 | Narrow = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: I16, Operand: Narrow); |
4346 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Narrow); |
4347 | } |
4348 | Narrow = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: F32, Operand: Narrow); |
4349 | SDValue Result = DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL: dl, VT, Operand: Narrow); |
4350 | return IsStrict ? DAG.getMergeValues(Ops: {Result, Op.getOperand(i: 0)}, dl) |
4351 | : Result; |
4352 | } |
4353 | |
4354 | if (SrcVT != MVT::f128) { |
4355 | // Expand cases where the input is a vector bigger than NEON. |
4356 | if (useSVEForFixedLengthVectorVT(VT: SrcVT)) |
4357 | return SDValue(); |
4358 | |
4359 | // It's legal except when f128 is involved |
4360 | return Op; |
4361 | } |
4362 | |
4363 | return SDValue(); |
4364 | } |
4365 | |
4366 | SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, |
4367 | SelectionDAG &DAG) const { |
4368 | // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. |
4369 | // Any additional optimization in this function should be recorded |
4370 | // in the cost tables. |
4371 | bool IsStrict = Op->isStrictFPOpcode(); |
4372 | EVT InVT = Op.getOperand(i: IsStrict ? 1 : 0).getValueType(); |
4373 | EVT VT = Op.getValueType(); |
4374 | |
4375 | if (VT.isScalableVector()) { |
4376 | unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT |
4377 | ? AArch64ISD::FCVTZU_MERGE_PASSTHRU |
4378 | : AArch64ISD::FCVTZS_MERGE_PASSTHRU; |
4379 | return LowerToPredicatedOp(Op, DAG, NewOp: Opcode); |
4380 | } |
4381 | |
4382 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) || |
4383 | useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) |
4384 | return LowerFixedLengthFPToIntToSVE(Op, DAG); |
4385 | |
4386 | unsigned NumElts = InVT.getVectorNumElements(); |
4387 | |
4388 | // f16 conversions are promoted to f32 when full fp16 is not supported. |
4389 | if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) || |
4390 | InVT.getVectorElementType() == MVT::bf16) { |
4391 | MVT NewVT = MVT::getVectorVT(VT: MVT::f32, NumElements: NumElts); |
4392 | SDLoc dl(Op); |
4393 | if (IsStrict) { |
4394 | SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {NewVT, MVT::Other}, |
4395 | Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)}); |
4396 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {VT, MVT::Other}, |
4397 | Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)}); |
4398 | } |
4399 | return DAG.getNode( |
4400 | Opcode: Op.getOpcode(), DL: dl, VT: Op.getValueType(), |
4401 | Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: NewVT, Operand: Op.getOperand(i: 0))); |
4402 | } |
4403 | |
4404 | uint64_t VTSize = VT.getFixedSizeInBits(); |
4405 | uint64_t InVTSize = InVT.getFixedSizeInBits(); |
4406 | if (VTSize < InVTSize) { |
4407 | SDLoc dl(Op); |
4408 | if (IsStrict) { |
4409 | InVT = InVT.changeVectorElementTypeToInteger(); |
4410 | SDValue Cv = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {InVT, MVT::Other}, |
4411 | Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)}); |
4412 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv); |
4413 | return DAG.getMergeValues(Ops: {Trunc, Cv.getValue(R: 1)}, dl); |
4414 | } |
4415 | SDValue Cv = |
4416 | DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: InVT.changeVectorElementTypeToInteger(), |
4417 | Operand: Op.getOperand(i: 0)); |
4418 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Cv); |
4419 | } |
4420 | |
4421 | if (VTSize > InVTSize) { |
4422 | SDLoc dl(Op); |
4423 | MVT ExtVT = |
4424 | MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: VT.getScalarSizeInBits()), |
4425 | NumElements: VT.getVectorNumElements()); |
4426 | if (IsStrict) { |
4427 | SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {ExtVT, MVT::Other}, |
4428 | Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)}); |
4429 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {VT, MVT::Other}, |
4430 | Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)}); |
4431 | } |
4432 | SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: ExtVT, Operand: Op.getOperand(i: 0)); |
4433 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT, Operand: Ext); |
4434 | } |
4435 | |
4436 | // Use a scalar operation for conversions between single-element vectors of |
4437 | // the same size. |
4438 | if (NumElts == 1) { |
4439 | SDLoc dl(Op); |
4440 | SDValue Extract = DAG.getNode( |
4441 | Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: InVT.getScalarType(), |
4442 | N1: Op.getOperand(i: IsStrict ? 1 : 0), N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
4443 | EVT ScalarVT = VT.getScalarType(); |
4444 | if (IsStrict) |
4445 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {ScalarVT, MVT::Other}, |
4446 | Ops: {Op.getOperand(i: 0), Extract}); |
4447 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract); |
4448 | } |
4449 | |
4450 | // Type changing conversions are illegal. |
4451 | return Op; |
4452 | } |
4453 | |
4454 | SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, |
4455 | SelectionDAG &DAG) const { |
4456 | bool IsStrict = Op->isStrictFPOpcode(); |
4457 | SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0); |
4458 | |
4459 | if (SrcVal.getValueType().isVector()) |
4460 | return LowerVectorFP_TO_INT(Op, DAG); |
4461 | |
4462 | // f16 conversions are promoted to f32 when full fp16 is not supported. |
4463 | if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) || |
4464 | SrcVal.getValueType() == MVT::bf16) { |
4465 | SDLoc dl(Op); |
4466 | if (IsStrict) { |
4467 | SDValue Ext = |
4468 | DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL: dl, ResultTys: {MVT::f32, MVT::Other}, |
4469 | Ops: {Op.getOperand(i: 0), SrcVal}); |
4470 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {Op.getValueType(), MVT::Other}, |
4471 | Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)}); |
4472 | } |
4473 | return DAG.getNode( |
4474 | Opcode: Op.getOpcode(), DL: dl, VT: Op.getValueType(), |
4475 | Operand: DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: SrcVal)); |
4476 | } |
4477 | |
4478 | if (SrcVal.getValueType() != MVT::f128) { |
4479 | // It's legal except when f128 is involved |
4480 | return Op; |
4481 | } |
4482 | |
4483 | return SDValue(); |
4484 | } |
4485 | |
4486 | SDValue |
4487 | AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op, |
4488 | SelectionDAG &DAG) const { |
4489 | // AArch64 FP-to-int conversions saturate to the destination element size, so |
4490 | // we can lower common saturating conversions to simple instructions. |
4491 | SDValue SrcVal = Op.getOperand(i: 0); |
4492 | EVT SrcVT = SrcVal.getValueType(); |
4493 | EVT DstVT = Op.getValueType(); |
4494 | EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT(); |
4495 | |
4496 | uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits(); |
4497 | uint64_t DstElementWidth = DstVT.getScalarSizeInBits(); |
4498 | uint64_t SatWidth = SatVT.getScalarSizeInBits(); |
4499 | assert(SatWidth <= DstElementWidth && |
4500 | "Saturation width cannot exceed result width" ); |
4501 | |
4502 | // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT. |
4503 | // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable |
4504 | // types, so this is hard to reach. |
4505 | if (DstVT.isScalableVector()) |
4506 | return SDValue(); |
4507 | |
4508 | EVT SrcElementVT = SrcVT.getVectorElementType(); |
4509 | |
4510 | // In the absence of FP16 support, promote f16 to f32 and saturate the result. |
4511 | if ((SrcElementVT == MVT::f16 && |
4512 | (!Subtarget->hasFullFP16() || DstElementWidth > 16)) || |
4513 | SrcElementVT == MVT::bf16) { |
4514 | MVT F32VT = MVT::getVectorVT(VT: MVT::f32, NumElements: SrcVT.getVectorNumElements()); |
4515 | SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: F32VT, Operand: SrcVal); |
4516 | SrcVT = F32VT; |
4517 | SrcElementVT = MVT::f32; |
4518 | SrcElementWidth = 32; |
4519 | } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 && |
4520 | SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16) |
4521 | return SDValue(); |
4522 | |
4523 | SDLoc DL(Op); |
4524 | // Expand to f64 if we are saturating to i64, to help keep the lanes the |
4525 | // same width and produce a fcvtzu. |
4526 | if (SatWidth == 64 && SrcElementWidth < 64) { |
4527 | MVT F64VT = MVT::getVectorVT(VT: MVT::f64, NumElements: SrcVT.getVectorNumElements()); |
4528 | SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: F64VT, Operand: SrcVal); |
4529 | SrcVT = F64VT; |
4530 | SrcElementVT = MVT::f64; |
4531 | SrcElementWidth = 64; |
4532 | } |
4533 | // Cases that we can emit directly. |
4534 | if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) |
4535 | return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, |
4536 | N2: DAG.getValueType(DstVT.getScalarType())); |
4537 | |
4538 | // Otherwise we emit a cvt that saturates to a higher BW, and saturate the |
4539 | // result. This is only valid if the legal cvt is larger than the saturate |
4540 | // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize |
4541 | // (at least until sqxtn is selected). |
4542 | if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64) |
4543 | return SDValue(); |
4544 | |
4545 | EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); |
4546 | SDValue NativeCvt = DAG.getNode(Opcode: Op.getOpcode(), DL, VT: IntVT, N1: SrcVal, |
4547 | N2: DAG.getValueType(IntVT.getScalarType())); |
4548 | SDValue Sat; |
4549 | if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { |
4550 | SDValue MinC = DAG.getConstant( |
4551 | Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT); |
4552 | SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC); |
4553 | SDValue MaxC = DAG.getConstant( |
4554 | Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: SrcElementWidth), DL, VT: IntVT); |
4555 | Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: IntVT, N1: Min, N2: MaxC); |
4556 | } else { |
4557 | SDValue MinC = DAG.getConstant( |
4558 | Val: APInt::getAllOnes(numBits: SatWidth).zext(width: SrcElementWidth), DL, VT: IntVT); |
4559 | Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: IntVT, N1: NativeCvt, N2: MinC); |
4560 | } |
4561 | |
4562 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat); |
4563 | } |
4564 | |
4565 | SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, |
4566 | SelectionDAG &DAG) const { |
4567 | // AArch64 FP-to-int conversions saturate to the destination register size, so |
4568 | // we can lower common saturating conversions to simple instructions. |
4569 | SDValue SrcVal = Op.getOperand(i: 0); |
4570 | EVT SrcVT = SrcVal.getValueType(); |
4571 | |
4572 | if (SrcVT.isVector()) |
4573 | return LowerVectorFP_TO_INT_SAT(Op, DAG); |
4574 | |
4575 | EVT DstVT = Op.getValueType(); |
4576 | EVT SatVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT(); |
4577 | uint64_t SatWidth = SatVT.getScalarSizeInBits(); |
4578 | uint64_t DstWidth = DstVT.getScalarSizeInBits(); |
4579 | assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width" ); |
4580 | |
4581 | // In the absence of FP16 support, promote f16 to f32 and saturate the result. |
4582 | if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) { |
4583 | SrcVal = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: SDLoc(Op), VT: MVT::f32, Operand: SrcVal); |
4584 | SrcVT = MVT::f32; |
4585 | } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 && |
4586 | SrcVT != MVT::bf16) |
4587 | return SDValue(); |
4588 | |
4589 | SDLoc DL(Op); |
4590 | // Cases that we can emit directly. |
4591 | if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 || |
4592 | (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) && |
4593 | DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32)) |
4594 | return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, |
4595 | N2: DAG.getValueType(DstVT)); |
4596 | |
4597 | // Otherwise we emit a cvt that saturates to a higher BW, and saturate the |
4598 | // result. This is only valid if the legal cvt is larger than the saturate |
4599 | // width. |
4600 | if (DstWidth < SatWidth) |
4601 | return SDValue(); |
4602 | |
4603 | SDValue NativeCvt = |
4604 | DAG.getNode(Opcode: Op.getOpcode(), DL, VT: DstVT, N1: SrcVal, N2: DAG.getValueType(DstVT)); |
4605 | SDValue Sat; |
4606 | if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { |
4607 | SDValue MinC = DAG.getConstant( |
4608 | Val: APInt::getSignedMaxValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT); |
4609 | SDValue Min = DAG.getNode(Opcode: ISD::SMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC); |
4610 | SDValue MaxC = DAG.getConstant( |
4611 | Val: APInt::getSignedMinValue(numBits: SatWidth).sext(width: DstWidth), DL, VT: DstVT); |
4612 | Sat = DAG.getNode(Opcode: ISD::SMAX, DL, VT: DstVT, N1: Min, N2: MaxC); |
4613 | } else { |
4614 | SDValue MinC = DAG.getConstant( |
4615 | Val: APInt::getAllOnes(numBits: SatWidth).zext(width: DstWidth), DL, VT: DstVT); |
4616 | Sat = DAG.getNode(Opcode: ISD::UMIN, DL, VT: DstVT, N1: NativeCvt, N2: MinC); |
4617 | } |
4618 | |
4619 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DstVT, Operand: Sat); |
4620 | } |
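// For example (sketch), a scalar llvm.fptosi.sat.i32.f32 hits the direct case
// above and becomes a single saturating convert, "fcvtzs w0, s0", while an i8
// saturation width takes the wider-convert path with the SMIN/SMAX clamp.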
4621 | |
4622 | SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op, |
4623 | SelectionDAG &DAG) const { |
4624 | EVT VT = Op.getValueType(); |
4625 | SDValue Src = Op.getOperand(i: 0); |
4626 | SDLoc DL(Op); |
4627 | |
4628 | assert(VT.isVector() && "Expected vector type" ); |
4629 | |
4630 | EVT CastVT = |
4631 | VT.changeVectorElementType(EltVT: Src.getValueType().getVectorElementType()); |
4632 | |
4633 | // Round the floating-point value into a floating-point register with the |
4634 | // current rounding mode. |
4635 | SDValue FOp = DAG.getNode(Opcode: ISD::FRINT, DL, VT: CastVT, Operand: Src); |
4636 | |
4637 | // Truncate the rounded floating point to an integer. |
4638 | return DAG.getNode(Opcode: ISD::FP_TO_SINT_SAT, DL, VT, N1: FOp, |
4639 | N2: DAG.getValueType(VT.getVectorElementType())); |
4640 | } |
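// For example (a sketch of the typical selection), llvm.lrint.v4i32.v4f32
// becomes the FRINT plus saturating convert built above, roughly:
//   frintx v0.4s, v0.4s
//   fcvtzs v0.4s, v0.4s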
4641 | |
4642 | SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, |
4643 | SelectionDAG &DAG) const { |
4644 | // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. |
4645 | // Any additional optimization in this function should be recorded |
4646 | // in the cost tables. |
4647 | bool IsStrict = Op->isStrictFPOpcode(); |
4648 | EVT VT = Op.getValueType(); |
4649 | SDLoc dl(Op); |
4650 | SDValue In = Op.getOperand(i: IsStrict ? 1 : 0); |
4651 | EVT InVT = In.getValueType(); |
4652 | unsigned Opc = Op.getOpcode(); |
4653 | bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP; |
4654 | |
4655 | if (VT.isScalableVector()) { |
4656 | if (InVT.getVectorElementType() == MVT::i1) { |
4657 | // We can't convert directly from an SVE predicate; extend it to an integer vector first. |
4658 | unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
4659 | EVT CastVT = getPromotedVTForPredicate(VT: InVT); |
4660 | In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In); |
4661 | return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In); |
4662 | } |
4663 | |
4664 | unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU |
4665 | : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; |
4666 | return LowerToPredicatedOp(Op, DAG, NewOp: Opcode); |
4667 | } |
4668 | |
4669 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable()) || |
4670 | useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) |
4671 | return LowerFixedLengthIntToFPToSVE(Op, DAG); |
4672 | |
4673 | // Promote bf16 conversions to f32. |
4674 | if (VT.getVectorElementType() == MVT::bf16) { |
4675 | EVT F32 = VT.changeElementType(EltVT: MVT::f32); |
4676 | if (IsStrict) { |
4677 | SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {F32, MVT::Other}, |
4678 | Ops: {Op.getOperand(i: 0), In}); |
4679 | return DAG.getNode( |
4680 | Opcode: ISD::STRICT_FP_ROUND, DL: dl, ResultTys: {Op.getValueType(), MVT::Other}, |
4681 | Ops: {Val.getValue(R: 1), Val.getValue(R: 0), DAG.getIntPtrConstant(Val: 0, DL: dl)}); |
4682 | } |
4683 | return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(), |
4684 | N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: F32, Operand: In), |
4685 | N2: DAG.getIntPtrConstant(Val: 0, DL: dl)); |
4686 | } |
4687 | |
4688 | uint64_t VTSize = VT.getFixedSizeInBits(); |
4689 | uint64_t InVTSize = InVT.getFixedSizeInBits(); |
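// For example (illustrative): a v2i64 -> v2f32 conversion is performed as
// v2i64 -> v2f64 followed by an FP_ROUND, while a v4i16 -> v4f32 conversion
// first extends the input to v4i32 and then converts.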
4690 | if (VTSize < InVTSize) { |
4691 | MVT CastVT = |
4692 | MVT::getVectorVT(VT: MVT::getFloatingPointVT(BitWidth: InVT.getScalarSizeInBits()), |
4693 | NumElements: InVT.getVectorNumElements()); |
4694 | if (IsStrict) { |
4695 | In = DAG.getNode(Opcode: Opc, DL: dl, ResultTys: {CastVT, MVT::Other}, |
4696 | Ops: {Op.getOperand(i: 0), In}); |
4697 | return DAG.getNode( |
4698 | Opcode: ISD::STRICT_FP_ROUND, DL: dl, ResultTys: {VT, MVT::Other}, |
4699 | Ops: {In.getValue(R: 1), In.getValue(R: 0), DAG.getIntPtrConstant(Val: 0, DL: dl)}); |
4700 | } |
4701 | In = DAG.getNode(Opcode: Opc, DL: dl, VT: CastVT, Operand: In); |
4702 | return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT, N1: In, |
4703 | N2: DAG.getIntPtrConstant(Val: 0, DL: dl, /*isTarget=*/true)); |
4704 | } |
4705 | |
4706 | if (VTSize > InVTSize) { |
4707 | unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
4708 | EVT CastVT = VT.changeVectorElementTypeToInteger(); |
4709 | In = DAG.getNode(Opcode: CastOpc, DL: dl, VT: CastVT, Operand: In); |
4710 | if (IsStrict) |
4711 | return DAG.getNode(Opcode: Opc, DL: dl, ResultTys: {VT, MVT::Other}, Ops: {Op.getOperand(i: 0), In}); |
4712 | return DAG.getNode(Opcode: Opc, DL: dl, VT, Operand: In); |
4713 | } |
4714 | |
4715 | // Use a scalar operation for conversions between single-element vectors of |
4716 | // the same size. |
4717 | if (VT.getVectorNumElements() == 1) { |
4718 | SDValue Extract = DAG.getNode( |
4719 | Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: InVT.getScalarType(), |
4720 | N1: In, N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
4721 | EVT ScalarVT = VT.getScalarType(); |
4722 | if (IsStrict) |
4723 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {ScalarVT, MVT::Other}, |
4724 | Ops: {Op.getOperand(i: 0), Extract}); |
4725 | return DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: ScalarVT, Operand: Extract); |
4726 | } |
4727 | |
4728 | return Op; |
4729 | } |
4730 | |
4731 | SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, |
4732 | SelectionDAG &DAG) const { |
4733 | if (Op.getValueType().isVector()) |
4734 | return LowerVectorINT_TO_FP(Op, DAG); |
4735 | |
4736 | bool IsStrict = Op->isStrictFPOpcode(); |
4737 | SDValue SrcVal = Op.getOperand(i: IsStrict ? 1 : 0); |
4738 | |
4739 | bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP || |
4740 | Op->getOpcode() == ISD::SINT_TO_FP; |
4741 | |
4742 | auto IntToFpViaPromotion = [&](EVT PromoteVT) { |
4743 | SDLoc dl(Op); |
4744 | if (IsStrict) { |
4745 | SDValue Val = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, ResultTys: {PromoteVT, MVT::Other}, |
4746 | Ops: {Op.getOperand(i: 0), SrcVal}); |
4747 | return DAG.getNode( |
4748 | Opcode: ISD::STRICT_FP_ROUND, DL: dl, ResultTys: {Op.getValueType(), MVT::Other}, |
4749 | Ops: {Val.getValue(R: 1), Val.getValue(R: 0), DAG.getIntPtrConstant(Val: 0, DL: dl)}); |
4750 | } |
4751 | return DAG.getNode(Opcode: ISD::FP_ROUND, DL: dl, VT: Op.getValueType(), |
4752 | N1: DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromoteVT, Operand: SrcVal), |
4753 | N2: DAG.getIntPtrConstant(Val: 0, DL: dl)); |
4754 | }; |
4755 | |
4756 | if (Op.getValueType() == MVT::bf16) { |
4757 | unsigned MaxWidth = IsSigned |
4758 | ? DAG.ComputeMaxSignificantBits(Op: SrcVal) |
4759 | : DAG.computeKnownBits(Op: SrcVal).countMaxActiveBits(); |
4760 | // Promote to f32 when the value has at most 24 significant bits (e.g. i16); the f32 conversion is then exact. |
4761 | if (MaxWidth <= 24) { |
4762 | return IntToFpViaPromotion(MVT::f32); |
4763 | } |
4764 | |
4765 | // Promote to f64 when the value has at most 53 significant bits (e.g. i32); the f64 conversion is then exact. |
4766 | if (MaxWidth <= 53) { |
4767 | return IntToFpViaPromotion(MVT::f64); |
4768 | } |
4769 | |
4770 | // We need to be careful about i64 -> bf16. |
4771 | // Consider an i32 22216703. |
4772 | // This number cannot be represented exactly as an f32, so an itofp will |
4773 | // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0. |
4774 | // However, the correctly rounded bf16 result is 22151168.0. |
4775 | // We need to use sticky rounding to get this correct. |
4776 | if (SrcVal.getValueType() == MVT::i64) { |
4777 | SDLoc DL(Op); |
4778 | // This algorithm is equivalent to the following: |
4779 | // uint64_t SrcHi = SrcVal & ~0xfffull; |
4780 | // uint64_t SrcLo = SrcVal & 0xfffull; |
4781 | // uint64_t Highest = SrcVal >> 53; |
4782 | // bool HasHighest = Highest != 0; |
4783 | // uint64_t ToRound = HasHighest ? SrcHi : SrcVal; |
4784 | // double Rounded = static_cast<double>(ToRound); |
4785 | // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded); |
4786 | // bool HasLo = SrcLo != 0; |
4787 | // bool NeedsAdjustment = HasHighest & HasLo; |
4788 | // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment}; |
4789 | // double Adjusted = std::bit_cast<double>(AdjustedBits); |
4790 | // return static_cast<__bf16>(Adjusted); |
4791 | // |
4792 | // Essentially, what happens is that SrcVal either fits perfectly in a |
4793 | // double-precision value or it is too big. If it is sufficiently small, |
4794 | // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we |
4795 | // ensure that u64 -> double has no rounding error by only using the 52 |
4796 | // MSB of the input. The low order bits will get merged into a sticky bit |
4797 | // which will avoid issues incurred by double rounding. |
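// (Why OR-ing a 1 into the rounded double is enough: the discarded low twelve
// bits can only influence the final bf16 result when the converted SrcHi lands
// exactly on a halfway point between two bf16 values; setting the lowest
// mantissa bit when SrcLo != 0 nudges such a value just past the halfway
// point, so the final rounding breaks the tie the same way a direct,
// correctly-rounded i64 -> bf16 conversion would.)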
4798 | |
4799 | // Signed conversion is more or less like so: |
4800 | // copysign((__bf16)abs(SrcVal), SrcVal) |
4801 | SDValue SignBit; |
4802 | if (IsSigned) { |
4803 | SignBit = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal, |
4804 | N2: DAG.getConstant(Val: 1ull << 63, DL, VT: MVT::i64)); |
4805 | SrcVal = DAG.getNode(Opcode: ISD::ABS, DL, VT: MVT::i64, Operand: SrcVal); |
4806 | } |
4807 | SDValue SrcHi = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal, |
4808 | N2: DAG.getConstant(Val: ~0xfffull, DL, VT: MVT::i64)); |
4809 | SDValue SrcLo = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: SrcVal, |
4810 | N2: DAG.getConstant(Val: 0xfffull, DL, VT: MVT::i64)); |
4811 | SDValue Highest = |
4812 | DAG.getNode(Opcode: ISD::SRL, DL, VT: MVT::i64, N1: SrcVal, |
4813 | N2: DAG.getShiftAmountConstant(Val: 53, VT: MVT::i64, DL)); |
4814 | SDValue Zero64 = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
4815 | SDValue ToRound = |
4816 | DAG.getSelectCC(DL, LHS: Highest, RHS: Zero64, True: SrcHi, False: SrcVal, Cond: ISD::SETNE); |
4817 | SDValue Rounded = |
4818 | IsStrict ? DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {MVT::f64, MVT::Other}, |
4819 | Ops: {Op.getOperand(i: 0), ToRound}) |
4820 | : DAG.getNode(Opcode: Op.getOpcode(), DL, VT: MVT::f64, Operand: ToRound); |
4821 | |
4822 | SDValue RoundedBits = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i64, Operand: Rounded); |
4823 | if (SignBit) { |
4824 | RoundedBits = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: SignBit); |
4825 | } |
4826 | |
4827 | SDValue HasHighest = DAG.getSetCC( |
4828 | DL, |
4829 | VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64), |
4830 | LHS: Highest, RHS: Zero64, Cond: ISD::SETNE); |
4831 | |
4832 | SDValue HasLo = DAG.getSetCC( |
4833 | DL, |
4834 | VT: getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT: MVT::i64), |
4835 | LHS: SrcLo, RHS: Zero64, Cond: ISD::SETNE); |
4836 | |
4837 | SDValue NeedsAdjustment = |
4838 | DAG.getNode(Opcode: ISD::AND, DL, VT: HasLo.getValueType(), N1: HasHighest, N2: HasLo); |
4839 | NeedsAdjustment = DAG.getZExtOrTrunc(Op: NeedsAdjustment, DL, VT: MVT::i64); |
4840 | |
4841 | SDValue AdjustedBits = |
4842 | DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: RoundedBits, N2: NeedsAdjustment); |
4843 | SDValue Adjusted = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f64, Operand: AdjustedBits); |
4844 | return IsStrict |
4845 | ? DAG.getNode(Opcode: ISD::STRICT_FP_ROUND, DL, |
4846 | ResultTys: {Op.getValueType(), MVT::Other}, |
4847 | Ops: {Rounded.getValue(R: 1), Adjusted, |
4848 | DAG.getIntPtrConstant(Val: 0, DL)}) |
4849 | : DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: Op.getValueType(), N1: Adjusted, |
4850 | N2: DAG.getIntPtrConstant(Val: 0, DL, isTarget: true)); |
4851 | } |
4852 | } |
4853 | |
4854 | // f16 conversions are promoted to f32 when full fp16 is not supported. |
4855 | if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { |
4856 | return IntToFpViaPromotion(MVT::f32); |
4857 | } |
4858 | |
4859 | // i128 conversions are libcalls. |
4860 | if (SrcVal.getValueType() == MVT::i128) |
4861 | return SDValue(); |
4862 | |
4863 | // Other conversions are legal, unless the destination is the completely |
4864 | // software-based fp128. |
4865 | if (Op.getValueType() != MVT::f128) |
4866 | return Op; |
4867 | return SDValue(); |
4868 | } |
4869 | |
4870 | SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, |
4871 | SelectionDAG &DAG) const { |
4872 | // For iOS, we want to call an alternative entry point: __sincos_stret, |
4873 | // which returns the values in two S / D registers. |
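// Conceptually the callee behaves like (illustrative prototype, not declared
// anywhere here):
//   struct { double Sin, Cos; } __sincos_stret(double);
// with the float variant used for f32 arguments.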
4874 | SDLoc dl(Op); |
4875 | SDValue Arg = Op.getOperand(i: 0); |
4876 | EVT ArgVT = Arg.getValueType(); |
4877 | Type *ArgTy = ArgVT.getTypeForEVT(Context&: *DAG.getContext()); |
4878 | |
4879 | ArgListTy Args; |
4880 | ArgListEntry Entry; |
4881 | |
4882 | Entry.Node = Arg; |
4883 | Entry.Ty = ArgTy; |
4884 | Entry.IsSExt = false; |
4885 | Entry.IsZExt = false; |
4886 | Args.push_back(x: Entry); |
4887 | |
4888 | RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 |
4889 | : RTLIB::SINCOS_STRET_F32; |
4890 | const char *LibcallName = getLibcallName(Call: LC); |
4891 | SDValue Callee = |
4892 | DAG.getExternalSymbol(Sym: LibcallName, VT: getPointerTy(DL: DAG.getDataLayout())); |
4893 | |
4894 | StructType *RetTy = StructType::get(elt1: ArgTy, elts: ArgTy); |
4895 | TargetLowering::CallLoweringInfo CLI(DAG); |
4896 | CLI.setDebugLoc(dl) |
4897 | .setChain(DAG.getEntryNode()) |
4898 | .setLibCallee(CC: CallingConv::Fast, ResultType: RetTy, Target: Callee, ArgsList: std::move(Args)); |
4899 | |
4900 | std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
4901 | return CallResult.first; |
4902 | } |
4903 | |
4904 | static MVT getSVEContainerType(EVT ContentTy); |
4905 | |
4906 | SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, |
4907 | SelectionDAG &DAG) const { |
4908 | EVT OpVT = Op.getValueType(); |
4909 | EVT ArgVT = Op.getOperand(i: 0).getValueType(); |
4910 | |
4911 | if (useSVEForFixedLengthVectorVT(VT: OpVT)) |
4912 | return LowerFixedLengthBitcastToSVE(Op, DAG); |
4913 | |
4914 | if (OpVT.isScalableVector()) { |
4915 | // Bitcasting between unpacked vector types of different element counts is |
4916 | // not a NOP because the live elements are laid out differently. |
4917 | // 01234567 |
4918 | // e.g. nxv2i32 = XX??XX?? |
4919 | // nxv4f16 = X?X?X?X? |
4920 | if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount()) |
4921 | return SDValue(); |
4922 | |
4923 | if (isTypeLegal(VT: OpVT) && !isTypeLegal(VT: ArgVT)) { |
4924 | assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() && |
4925 | "Expected int->fp bitcast!" ); |
4926 | SDValue ExtResult = |
4927 | DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: SDLoc(Op), VT: getSVEContainerType(ContentTy: ArgVT), |
4928 | Operand: Op.getOperand(i: 0)); |
4929 | return getSVESafeBitCast(VT: OpVT, Op: ExtResult, DAG); |
4930 | } |
4931 | return getSVESafeBitCast(VT: OpVT, Op: Op.getOperand(i: 0), DAG); |
4932 | } |
4933 | |
4934 | if (OpVT != MVT::f16 && OpVT != MVT::bf16) |
4935 | return SDValue(); |
4936 | |
4937 | // Bitcasts between f16 and bf16 are legal. |
4938 | if (ArgVT == MVT::f16 || ArgVT == MVT::bf16) |
4939 | return Op; |
4940 | |
4941 | assert(ArgVT == MVT::i16); |
4942 | SDLoc DL(Op); |
4943 | |
4944 | Op = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: Op.getOperand(i: 0)); |
4945 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::f32, Operand: Op); |
4946 | return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: OpVT, Operand: Op); |
4947 | } |
4948 | |
4949 | static EVT getExtensionTo64Bits(const EVT &OrigVT) { |
4950 | if (OrigVT.getSizeInBits() >= 64) |
4951 | return OrigVT; |
4952 | |
4953 | assert(OrigVT.isSimple() && "Expecting a simple value type" ); |
4954 | |
4955 | MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; |
4956 | switch (OrigSimpleTy) { |
4957 | default: llvm_unreachable("Unexpected Vector Type" ); |
4958 | case MVT::v2i8: |
4959 | case MVT::v2i16: |
4960 | return MVT::v2i32; |
4961 | case MVT::v4i8: |
4962 | return MVT::v4i16; |
4963 | } |
4964 | } |
4965 | |
4966 | static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, |
4967 | const EVT &OrigTy, |
4968 | const EVT &ExtTy, |
4969 | unsigned ExtOpcode) { |
4970 | // The vector originally had a size of OrigTy. It was then extended to ExtTy. |
4971 | // We expect the ExtTy to be 128-bits total. If the OrigTy is less than |
4972 | // 64-bits we need to insert a new extension so that it will be 64-bits. |
4973 | assert(ExtTy.is128BitVector() && "Unexpected extension size" ); |
4974 | if (OrigTy.getSizeInBits() >= 64) |
4975 | return N; |
4976 | |
4977 | // Must extend size to at least 64 bits to be used as an operand for VMULL. |
4978 | EVT NewVT = getExtensionTo64Bits(OrigVT: OrigTy); |
4979 | |
4980 | return DAG.getNode(Opcode: ExtOpcode, DL: SDLoc(N), VT: NewVT, Operand: N); |
4981 | } |
4982 | |
4983 | // Returns lane if Op extracts from a two-element vector and lane is constant |
4984 | // (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise. |
4985 | static std::optional<uint64_t> |
4986 | getConstantLaneNumOfExtractHalfOperand(SDValue &Op) { |
4987 | SDNode *OpNode = Op.getNode(); |
4988 | if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
4989 | return std::nullopt; |
4990 | |
4991 | EVT VT = OpNode->getOperand(Num: 0).getValueType(); |
4992 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: OpNode->getOperand(Num: 1)); |
4993 | if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C) |
4994 | return std::nullopt; |
4995 | |
4996 | return C->getZExtValue(); |
4997 | } |
4998 | |
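// Returns true if N is a BUILD_VECTOR of constants where every element fits in
// half of the vector's element width (signed or unsigned as requested), i.e.
// the vector could equally have been produced by extending a narrower one.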
4999 | static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, |
5000 | bool isSigned) { |
5001 | EVT VT = N.getValueType(); |
5002 | |
5003 | if (N.getOpcode() != ISD::BUILD_VECTOR) |
5004 | return false; |
5005 | |
5006 | for (const SDValue &Elt : N->op_values()) { |
5007 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Elt)) { |
5008 | unsigned EltSize = VT.getScalarSizeInBits(); |
5009 | unsigned HalfSize = EltSize / 2; |
5010 | if (isSigned) { |
5011 | if (!isIntN(N: HalfSize, x: C->getSExtValue())) |
5012 | return false; |
5013 | } else { |
5014 | if (!isUIntN(N: HalfSize, x: C->getZExtValue())) |
5015 | return false; |
5016 | } |
5017 | continue; |
5018 | } |
5019 | return false; |
5020 | } |
5021 | |
5022 | return true; |
5023 | } |
5024 | |
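// Rewrites N as a 64-bit-wide operand suitable for a [SU]MULL node: either by
// truncating when the high half of each element is known to be zero, by
// stripping an existing extend, or by rebuilding a constant BUILD_VECTOR with
// narrower elements.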
5025 | static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) { |
5026 | EVT VT = N.getValueType(); |
5027 | assert(VT.is128BitVector() && "Unexpected vector MULL size" ); |
5028 | |
5029 | unsigned NumElts = VT.getVectorNumElements(); |
5030 | unsigned OrigEltSize = VT.getScalarSizeInBits(); |
5031 | unsigned EltSize = OrigEltSize / 2; |
5032 | MVT TruncVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts); |
5033 | |
5034 | APInt HiBits = APInt::getHighBitsSet(numBits: OrigEltSize, hiBitsSet: EltSize); |
5035 | if (DAG.MaskedValueIsZero(Op: N, Mask: HiBits)) |
5036 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: TruncVT, Operand: N); |
5037 | |
5038 | if (ISD::isExtOpcode(Opcode: N.getOpcode())) |
5039 | return addRequiredExtensionForVectorMULL(N: N.getOperand(i: 0), DAG, |
5040 | OrigTy: N.getOperand(i: 0).getValueType(), ExtTy: VT, |
5041 | ExtOpcode: N.getOpcode()); |
5042 | |
5043 | assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR" ); |
5044 | SDLoc dl(N); |
5045 | SmallVector<SDValue, 8> Ops; |
5046 | for (unsigned i = 0; i != NumElts; ++i) { |
5047 | const APInt &CInt = N.getConstantOperandAPInt(i); |
5048 | // Element types smaller than 32 bits are not legal, so use i32 elements. |
5049 | // The values are implicitly truncated so sext vs. zext doesn't matter. |
5050 | Ops.push_back(Elt: DAG.getConstant(Val: CInt.zextOrTrunc(width: 32), DL: dl, VT: MVT::i32)); |
5051 | } |
5052 | return DAG.getBuildVector(VT: TruncVT, DL: dl, Ops); |
5053 | } |
5054 | |
5055 | static bool isSignExtended(SDValue N, SelectionDAG &DAG) { |
5056 | return N.getOpcode() == ISD::SIGN_EXTEND || |
5057 | N.getOpcode() == ISD::ANY_EXTEND || |
5058 | isExtendedBUILD_VECTOR(N, DAG, isSigned: true); |
5059 | } |
5060 | |
5061 | static bool isZeroExtended(SDValue N, SelectionDAG &DAG) { |
5062 | return N.getOpcode() == ISD::ZERO_EXTEND || |
5063 | N.getOpcode() == ISD::ANY_EXTEND || |
5064 | isExtendedBUILD_VECTOR(N, DAG, isSigned: false); |
5065 | } |
5066 | |
5067 | static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) { |
5068 | unsigned Opcode = N.getOpcode(); |
5069 | if (Opcode == ISD::ADD || Opcode == ISD::SUB) { |
5070 | SDValue N0 = N.getOperand(i: 0); |
5071 | SDValue N1 = N.getOperand(i: 1); |
5072 | return N0->hasOneUse() && N1->hasOneUse() && |
5073 | isSignExtended(N: N0, DAG) && isSignExtended(N: N1, DAG); |
5074 | } |
5075 | return false; |
5076 | } |
5077 | |
5078 | static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) { |
5079 | unsigned Opcode = N.getOpcode(); |
5080 | if (Opcode == ISD::ADD || Opcode == ISD::SUB) { |
5081 | SDValue N0 = N.getOperand(i: 0); |
5082 | SDValue N1 = N.getOperand(i: 1); |
5083 | return N0->hasOneUse() && N1->hasOneUse() && |
5084 | isZeroExtended(N: N0, DAG) && isZeroExtended(N: N1, DAG); |
5085 | } |
5086 | return false; |
5087 | } |
5088 | |
5089 | SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op, |
5090 | SelectionDAG &DAG) const { |
5091 | // The rounding mode is in bits 23:22 of the FPCR. |
5092 | // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 |
5093 | // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3) |
5094 | // so that the shift and the AND get folded into a bitfield extract. |
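// e.g. an FPCR rounding mode of 2 (round toward minus infinity) maps to
// ((2 + 1) & 3) = 3, the FLT_ROUNDS value for round-downward.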
5095 | SDLoc dl(Op); |
5096 | |
5097 | SDValue Chain = Op.getOperand(i: 0); |
5098 | SDValue FPCR_64 = DAG.getNode( |
5099 | Opcode: ISD::INTRINSIC_W_CHAIN, DL: dl, ResultTys: {MVT::i64, MVT::Other}, |
5100 | Ops: {Chain, DAG.getConstant(Val: Intrinsic::aarch64_get_fpcr, DL: dl, VT: MVT::i64)}); |
5101 | Chain = FPCR_64.getValue(R: 1); |
5102 | SDValue FPCR_32 = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: MVT::i32, Operand: FPCR_64); |
5103 | SDValue FltRounds = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: MVT::i32, N1: FPCR_32, |
5104 | N2: DAG.getConstant(Val: 1U << 22, DL: dl, VT: MVT::i32)); |
5105 | SDValue RMODE = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i32, N1: FltRounds, |
5106 | N2: DAG.getConstant(Val: 22, DL: dl, VT: MVT::i32)); |
5107 | SDValue AND = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MVT::i32, N1: RMODE, |
5108 | N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)); |
5109 | return DAG.getMergeValues(Ops: {AND, Chain}, dl); |
5110 | } |
5111 | |
5112 | SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op, |
5113 | SelectionDAG &DAG) const { |
5114 | SDLoc DL(Op); |
5115 | SDValue Chain = Op->getOperand(Num: 0); |
5116 | SDValue RMValue = Op->getOperand(Num: 1); |
5117 | |
5118 | // The rounding mode is in bits 23:22 of the FPCR. |
5119 | // The llvm.set.rounding argument value to the rounding mode in FPCR mapping |
5120 | // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is |
5121 | // (((arg - 1) & 3) << 22). |
5122 | // |
5123 | // The argument of llvm.set.rounding must be within the segment [0, 3], so |
5124 | // NearestTiesToAway (4) is not handled here. It is the responsibility of the |
5125 | // code that generates llvm.set.rounding to ensure this condition. |
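// For example, llvm.set.rounding(2) (round upward) maps to ((2 - 1) & 3) = 1,
// which is the FPCR encoding for round toward plus infinity, and is then
// shifted into bits 23:22.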
5126 | |
5127 | // Calculate new value of FPCR[23:22]. |
5128 | RMValue = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i32, N1: RMValue, |
5129 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
5130 | RMValue = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: RMValue, |
5131 | N2: DAG.getConstant(Val: 0x3, DL, VT: MVT::i32)); |
5132 | RMValue = |
5133 | DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: RMValue, |
5134 | N2: DAG.getConstant(Val: AArch64::RoundingBitsPos, DL, VT: MVT::i32)); |
5135 | RMValue = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: RMValue); |
5136 | |
5137 | // Get current value of FPCR. |
5138 | SDValue Ops[] = { |
5139 | Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)}; |
5140 | SDValue FPCR = |
5141 | DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops); |
5142 | Chain = FPCR.getValue(R: 1); |
5143 | FPCR = FPCR.getValue(R: 0); |
5144 | |
5145 | // Put the new rounding mode into FPCR[23:22]. |
5146 | const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos); |
5147 | FPCR = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR, |
5148 | N2: DAG.getConstant(Val: RMMask, DL, VT: MVT::i64)); |
5149 | FPCR = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i64, N1: FPCR, N2: RMValue); |
5150 | SDValue Ops2[] = { |
5151 | Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64), |
5152 | FPCR}; |
5153 | return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2); |
5154 | } |
5155 | |
5156 | SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op, |
5157 | SelectionDAG &DAG) const { |
5158 | SDLoc DL(Op); |
5159 | SDValue Chain = Op->getOperand(Num: 0); |
5160 | |
5161 | // Get current value of FPCR. |
5162 | SDValue Ops[] = { |
5163 | Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)}; |
5164 | SDValue FPCR = |
5165 | DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops); |
5166 | Chain = FPCR.getValue(R: 1); |
5167 | FPCR = FPCR.getValue(R: 0); |
5168 | |
5169 | // Truncate FPCR to 32 bits. |
5170 | SDValue Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i32, Operand: FPCR); |
5171 | |
5172 | return DAG.getMergeValues(Ops: {Result, Chain}, dl: DL); |
5173 | } |
5174 | |
5175 | SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op, |
5176 | SelectionDAG &DAG) const { |
5177 | SDLoc DL(Op); |
5178 | SDValue Chain = Op->getOperand(Num: 0); |
5179 | SDValue Mode = Op->getOperand(Num: 1); |
5180 | |
5181 | // Extend the specified value to 64 bits. |
5182 | SDValue FPCR = DAG.getZExtOrTrunc(Op: Mode, DL, VT: MVT::i64); |
5183 | |
5184 | // Set new value of FPCR. |
5185 | SDValue Ops2[] = { |
5186 | Chain, DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64), FPCR}; |
5187 | return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2); |
5188 | } |
5189 | |
5190 | SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op, |
5191 | SelectionDAG &DAG) const { |
5192 | SDLoc DL(Op); |
5193 | SDValue Chain = Op->getOperand(Num: 0); |
5194 | |
5195 | // Get current value of FPCR. |
5196 | SDValue Ops[] = { |
5197 | Chain, DAG.getTargetConstant(Val: Intrinsic::aarch64_get_fpcr, DL, VT: MVT::i64)}; |
5198 | SDValue FPCR = |
5199 | DAG.getNode(Opcode: ISD::INTRINSIC_W_CHAIN, DL, ResultTys: {MVT::i64, MVT::Other}, Ops); |
5200 | Chain = FPCR.getValue(R: 1); |
5201 | FPCR = FPCR.getValue(R: 0); |
5202 | |
5203 | // Clear bits that are not reserved. |
5204 | SDValue FPSCRMasked = DAG.getNode( |
5205 | Opcode: ISD::AND, DL, VT: MVT::i64, N1: FPCR, |
5206 | N2: DAG.getConstant(Val: AArch64::ReservedFPControlBits, DL, VT: MVT::i64)); |
5207 | |
5208 | // Set new value of FPCR. |
5209 | SDValue Ops2[] = {Chain, |
5210 | DAG.getConstant(Val: Intrinsic::aarch64_set_fpcr, DL, VT: MVT::i64), |
5211 | FPSCRMasked}; |
5212 | return DAG.getNode(Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, Ops: Ops2); |
5213 | } |
5214 | |
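// Selects the AArch64ISD::[SU]MULL opcode (if any) that can implement
// mul(N0, N1), rewriting N0/N1 in place where profitable. IsMLA is set when
// the multiply should instead be split across an add/sub of two MULLs.
// Returns 0 if no MULL form applies.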
5215 | static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, |
5216 | SDLoc DL, bool &IsMLA) { |
5217 | bool IsN0SExt = isSignExtended(N: N0, DAG); |
5218 | bool IsN1SExt = isSignExtended(N: N1, DAG); |
5219 | if (IsN0SExt && IsN1SExt) |
5220 | return AArch64ISD::SMULL; |
5221 | |
5222 | bool IsN0ZExt = isZeroExtended(N: N0, DAG); |
5223 | bool IsN1ZExt = isZeroExtended(N: N1, DAG); |
5224 | |
5225 | if (IsN0ZExt && IsN1ZExt) |
5226 | return AArch64ISD::UMULL; |
5227 | |
5228 | // Select SMULL if we can replace zext with sext. |
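// (For instance, mul(zext(%a), sext(%b)) where %a is known to be non-negative
// can be rewritten as mul(sext(%a), sext(%b)) and use SMULL.)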
5229 | if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) && |
5230 | !isExtendedBUILD_VECTOR(N: N0, DAG, isSigned: false) && |
5231 | !isExtendedBUILD_VECTOR(N: N1, DAG, isSigned: false)) { |
5232 | SDValue ZextOperand; |
5233 | if (IsN0ZExt) |
5234 | ZextOperand = N0.getOperand(i: 0); |
5235 | else |
5236 | ZextOperand = N1.getOperand(i: 0); |
5237 | if (DAG.SignBitIsZero(Op: ZextOperand)) { |
5238 | SDValue NewSext = |
5239 | DAG.getSExtOrTrunc(Op: ZextOperand, DL, VT: N0.getValueType()); |
5240 | if (IsN0ZExt) |
5241 | N0 = NewSext; |
5242 | else |
5243 | N1 = NewSext; |
5244 | return AArch64ISD::SMULL; |
5245 | } |
5246 | } |
5247 | |
5248 | // Select UMULL if we can replace the other operand with an extend. |
5249 | if (IsN0ZExt || IsN1ZExt) { |
5250 | EVT VT = N0.getValueType(); |
5251 | APInt Mask = APInt::getHighBitsSet(numBits: VT.getScalarSizeInBits(), |
5252 | hiBitsSet: VT.getScalarSizeInBits() / 2); |
5253 | if (DAG.MaskedValueIsZero(Op: IsN0ZExt ? N1 : N0, Mask)) |
5254 | return AArch64ISD::UMULL; |
5255 | } |
5256 | |
5257 | if (!IsN1SExt && !IsN1ZExt) |
5258 | return 0; |
5259 | |
5260 | // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these |
5261 | // into (s/zext A * s/zext C) + (s/zext B * s/zext C) |
5262 | if (IsN1SExt && isAddSubSExt(N: N0, DAG)) { |
5263 | IsMLA = true; |
5264 | return AArch64ISD::SMULL; |
5265 | } |
5266 | if (IsN1ZExt && isAddSubZExt(N: N0, DAG)) { |
5267 | IsMLA = true; |
5268 | return AArch64ISD::UMULL; |
5269 | } |
5270 | if (IsN0ZExt && isAddSubZExt(N: N1, DAG)) { |
5271 | std::swap(a&: N0, b&: N1); |
5272 | IsMLA = true; |
5273 | return AArch64ISD::UMULL; |
5274 | } |
5275 | return 0; |
5276 | } |
5277 | |
5278 | SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { |
5279 | EVT VT = Op.getValueType(); |
5280 | |
5281 | bool OverrideNEON = !Subtarget->isNeonAvailable(); |
5282 | if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON)) |
5283 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED); |
5284 | |
5285 | // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so |
5286 | // that VMULL can be detected. Otherwise v2i64 multiplications are not legal. |
5287 | assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() && |
5288 | "unexpected type for custom-lowering ISD::MUL" ); |
5289 | SDValue N0 = Op.getOperand(i: 0); |
5290 | SDValue N1 = Op.getOperand(i: 1); |
5291 | bool isMLA = false; |
5292 | EVT OVT = VT; |
5293 | if (VT.is64BitVector()) { |
5294 | if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
5295 | isNullConstant(V: N0.getOperand(i: 1)) && |
5296 | N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
5297 | isNullConstant(V: N1.getOperand(i: 1))) { |
5298 | N0 = N0.getOperand(i: 0); |
5299 | N1 = N1.getOperand(i: 0); |
5300 | VT = N0.getValueType(); |
5301 | } else { |
5302 | if (VT == MVT::v1i64) { |
5303 | if (Subtarget->hasSVE()) |
5304 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED); |
5305 | // Fall through to expand this. It is not legal. |
5306 | return SDValue(); |
5307 | } else |
5308 | // Other vector multiplications are legal. |
5309 | return Op; |
5310 | } |
5311 | } |
5312 | |
5313 | SDLoc DL(Op); |
5314 | unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, IsMLA&: isMLA); |
5315 | |
5316 | if (!NewOpc) { |
5317 | if (VT.getVectorElementType() == MVT::i64) { |
5318 | // If SVE is available then i64 vector multiplications can also be made |
5319 | // legal. |
5320 | if (Subtarget->hasSVE()) |
5321 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MUL_PRED); |
5322 | // Fall through to expand this. It is not legal. |
5323 | return SDValue(); |
5324 | } else |
5325 | // Other vector multiplications are legal. |
5326 | return Op; |
5327 | } |
5328 | |
5329 | // Legalize to a S/UMULL instruction |
5330 | SDValue Op0; |
5331 | SDValue Op1 = skipExtensionForVectorMULL(N: N1, DAG); |
5332 | if (!isMLA) { |
5333 | Op0 = skipExtensionForVectorMULL(N: N0, DAG); |
5334 | assert(Op0.getValueType().is64BitVector() && |
5335 | Op1.getValueType().is64BitVector() && |
5336 | "unexpected types for extended operands to VMULL" ); |
5337 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT, |
5338 | N1: DAG.getNode(Opcode: NewOpc, DL, VT, N1: Op0, N2: Op1), |
5339 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
5340 | } |
5341 | // Optimizing (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during |
5342 | // isel lowering takes advantage of no-stall back-to-back s/umul + s/umla. |
5343 | // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57. |
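// e.g. (illustrative) mul(add(zext(%a), zext(%b)), zext(%c)) becomes
// add(umull(%a, %c), umull(%b, %c)).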
5344 | SDValue N00 = skipExtensionForVectorMULL(N: N0.getOperand(i: 0), DAG); |
5345 | SDValue N01 = skipExtensionForVectorMULL(N: N0.getOperand(i: 1), DAG); |
5346 | EVT Op1VT = Op1.getValueType(); |
5347 | return DAG.getNode( |
5348 | Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: OVT, |
5349 | N1: DAG.getNode(Opcode: N0.getOpcode(), DL, VT, |
5350 | N1: DAG.getNode(Opcode: NewOpc, DL, VT, |
5351 | N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N00), N2: Op1), |
5352 | N2: DAG.getNode(Opcode: NewOpc, DL, VT, |
5353 | N1: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op1VT, Operand: N01), N2: Op1)), |
5354 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
5355 | } |
5356 | |
5357 | static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, |
5358 | int Pattern) { |
5359 | if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all) |
5360 | return DAG.getConstant(Val: 1, DL, VT: MVT::nxv1i1); |
5361 | return DAG.getNode(Opcode: AArch64ISD::PTRUE, DL, VT, |
5362 | Operand: DAG.getTargetConstant(Val: Pattern, DL, VT: MVT::i32)); |
5363 | } |
5364 | |
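// Attempts to fold a WHILE{LO,LT,LS,LE} intrinsic with constant bounds into a
// PTRUE with a fixed vl pattern. For example (illustrative), whilelo(0, 4)
// producing an nxv4i1 can become ptrue vl4, provided the minimum SVE vector
// length guarantees at least four 32-bit elements.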
5365 | static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, |
5366 | bool IsSigned, bool IsEqual) { |
5367 | if (!isa<ConstantSDNode>(Val: Op.getOperand(i: 1)) || |
5368 | !isa<ConstantSDNode>(Val: Op.getOperand(i: 2))) |
5369 | return SDValue(); |
5370 | |
5371 | SDLoc dl(Op); |
5372 | APInt X = Op.getConstantOperandAPInt(i: 1); |
5373 | APInt Y = Op.getConstantOperandAPInt(i: 2); |
5374 | bool Overflow; |
5375 | APInt NumActiveElems = |
5376 | IsSigned ? Y.ssub_ov(RHS: X, Overflow) : Y.usub_ov(RHS: X, Overflow); |
5377 | |
5378 | if (Overflow) |
5379 | return SDValue(); |
5380 | |
5381 | if (IsEqual) { |
5382 | APInt One(NumActiveElems.getBitWidth(), 1, IsSigned); |
5383 | NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(RHS: One, Overflow) |
5384 | : NumActiveElems.uadd_ov(RHS: One, Overflow); |
5385 | if (Overflow) |
5386 | return SDValue(); |
5387 | } |
5388 | |
5389 | std::optional<unsigned> PredPattern = |
5390 | getSVEPredPatternFromNumElements(MinNumElts: NumActiveElems.getZExtValue()); |
5391 | unsigned MinSVEVectorSize = std::max( |
5392 | a: DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), b: 128u); |
5393 | unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements(); |
5394 | if (PredPattern != std::nullopt && |
5395 | NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize)) |
5396 | return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: *PredPattern); |
5397 | |
5398 | return SDValue(); |
5399 | } |
5400 | |
5401 | // Returns a safe bitcast between two scalable vector predicates, where |
5402 | // any newly created lanes from a widening bitcast are defined as zero. |
5403 | static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { |
5404 | SDLoc DL(Op); |
5405 | EVT InVT = Op.getValueType(); |
5406 | |
5407 | assert(InVT.getVectorElementType() == MVT::i1 && |
5408 | VT.getVectorElementType() == MVT::i1 && |
5409 | "Expected a predicate-to-predicate bitcast" ); |
5410 | assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
5411 | InVT.isScalableVector() && |
5412 | DAG.getTargetLoweringInfo().isTypeLegal(InVT) && |
5413 | "Only expect to cast between legal scalable predicate types!" ); |
5414 | |
5415 | // Return the operand if the cast isn't changing type, |
5416 | // e.g. <n x 16 x i1> -> <n x 16 x i1> |
5417 | if (InVT == VT) |
5418 | return Op; |
5419 | |
5420 | SDValue Reinterpret = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op); |
5421 | |
5422 | // We only have to zero the lanes if new lanes are being defined, e.g. when |
5423 | // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the |
5424 | // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then |
5425 | // we can return here. |
5426 | if (InVT.bitsGT(VT)) |
5427 | return Reinterpret; |
5428 | |
5429 | // Check if the other lanes are already known to be zeroed by |
5430 | // construction. |
5431 | if (isZeroingInactiveLanes(Op)) |
5432 | return Reinterpret; |
5433 | |
5434 | // Zero the newly introduced lanes. |
5435 | SDValue Mask = DAG.getConstant(Val: 1, DL, VT: InVT); |
5436 | Mask = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Mask); |
5437 | return DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Reinterpret, N2: Mask); |
5438 | } |
5439 | |
5440 | SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG, |
5441 | SDValue Chain, SDLoc DL, |
5442 | EVT VT) const { |
5443 | SDValue Callee = DAG.getExternalSymbol(Sym: "__arm_sme_state" , |
5444 | VT: getPointerTy(DL: DAG.getDataLayout())); |
5445 | Type *Int64Ty = Type::getInt64Ty(C&: *DAG.getContext()); |
5446 | Type *RetTy = StructType::get(elt1: Int64Ty, elts: Int64Ty); |
5447 | TargetLowering::CallLoweringInfo CLI(DAG); |
5448 | ArgListTy Args; |
5449 | CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( |
5450 | CC: CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, |
5451 | ResultType: RetTy, Target: Callee, ArgsList: std::move(Args)); |
5452 | std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
5453 | SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ Val: 1, DL, VT: MVT::i64); |
5454 | return DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i64, N1: CallResult.first.getOperand(i: 0), |
5455 | N2: Mask); |
5456 | } |
5457 | |
5458 | // Lower an SME LDR/STR ZA intrinsic |
5459 | // Case 1: If the vector number (vecnum) is an immediate in range, it gets |
5460 | // folded into the instruction |
5461 | // ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11] |
5462 | // Case 2: If the vecnum is not an immediate, then it is used to modify the base |
5463 | // and tile slice registers |
5464 | // ldr(%tileslice, %ptr, %vecnum) |
5465 | // -> |
5466 | // %svl = rdsvl |
5467 | // %ptr2 = %ptr + %svl * %vecnum |
5468 | // %tileslice2 = %tileslice + %vecnum |
5469 | // ldr [%tileslice2, 0], [%ptr2, 0] |
5470 | // Case 3: If the vecnum is an immediate out of range, then the same is done as |
5471 | // case 2, but the base and slice registers are advanced by the largest |
5472 | // multiple of 16 not exceeding the vecnum, and the remainder is folded into |
5473 | // the instruction. This means that successive loads and stores that are offset |
5474 | // from each other can share the same base and slice register updates. |
5475 | // ldr(%tileslice, %ptr, 22) |
5476 | // ldr(%tileslice, %ptr, 23) |
5477 | // -> |
5478 | // %svl = rdsvl |
5479 | // %ptr2 = %ptr + %svl * 16 |
5480 | // %tileslice2 = %tileslice + 16 |
5481 | // ldr [%tileslice2, 6], [%ptr2, 6] |
5482 | // ldr [%tileslice2, 7], [%ptr2, 7] |
5483 | // Case 4: If the vecnum is an add of an immediate, then the non-immediate |
5484 | // operand and the immediate can be folded into the instruction, like case 2. |
5485 | // ldr(%tileslice, %ptr, %vecnum + 7) |
5486 | // ldr(%tileslice, %ptr, %vecnum + 8) |
5487 | // -> |
5488 | // %svl = rdsvl |
5489 | // %ptr2 = %ptr + %svl * %vecnum |
5490 | // %tileslice2 = %tileslice + %vecnum |
5491 | // ldr [%tileslice2, 7], [%ptr2, 7] |
5492 | // ldr [%tileslice2, 8], [%ptr2, 8] |
5493 | // Case 5: The vecnum being an add of an immediate out of range is also handled, |
5494 | // in which case the same remainder logic as case 3 is used. |
5495 | SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { |
5496 | SDLoc DL(N); |
5497 | |
5498 | SDValue TileSlice = N->getOperand(Num: 2); |
5499 | SDValue Base = N->getOperand(Num: 3); |
5500 | SDValue VecNum = N->getOperand(Num: 4); |
5501 | int32_t ConstAddend = 0; |
5502 | SDValue VarAddend = VecNum; |
5503 | |
5504 | // If the vnum is an add of an immediate, we can fold it into the instruction |
5505 | if (VecNum.getOpcode() == ISD::ADD && |
5506 | isa<ConstantSDNode>(Val: VecNum.getOperand(i: 1))) { |
5507 | ConstAddend = cast<ConstantSDNode>(Val: VecNum.getOperand(i: 1))->getSExtValue(); |
5508 | VarAddend = VecNum.getOperand(i: 0); |
5509 | } else if (auto ImmNode = dyn_cast<ConstantSDNode>(Val&: VecNum)) { |
5510 | ConstAddend = ImmNode->getSExtValue(); |
5511 | VarAddend = SDValue(); |
5512 | } |
5513 | |
5514 | int32_t ImmAddend = ConstAddend % 16; |
5515 | if (int32_t C = (ConstAddend - ImmAddend)) { |
5516 | SDValue CVal = DAG.getTargetConstant(Val: C, DL, VT: MVT::i32); |
5517 | VarAddend = VarAddend |
5518 | ? DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {VarAddend, CVal}) |
5519 | : CVal; |
5520 | } |
5521 | |
5522 | if (VarAddend) { |
5523 | // Get the vector length that will be multiplied by vnum |
5524 | auto SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64, |
5525 | Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
5526 | |
5527 | // Multiply SVL and vnum then add it to the base |
5528 | SDValue Mul = DAG.getNode( |
5529 | Opcode: ISD::MUL, DL, VT: MVT::i64, |
5530 | Ops: {SVL, DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: VarAddend)}); |
5531 | Base = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, Ops: {Base, Mul}); |
5532 | // Just add vnum to the tileslice |
5533 | TileSlice = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i32, Ops: {TileSlice, VarAddend}); |
5534 | } |
5535 | |
5536 | return DAG.getNode(Opcode: IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, |
5537 | DL, VT: MVT::Other, |
5538 | Ops: {/*Chain=*/N.getOperand(i: 0), TileSlice, Base, |
5539 | DAG.getTargetConstant(Val: ImmAddend, DL, VT: MVT::i32)}); |
5540 | } |
5541 | |
5542 | SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, |
5543 | SelectionDAG &DAG) const { |
5544 | unsigned IntNo = Op.getConstantOperandVal(i: 1); |
5545 | SDLoc DL(Op); |
5546 | switch (IntNo) { |
5547 | default: |
5548 | return SDValue(); // Don't custom lower most intrinsics. |
5549 | case Intrinsic::aarch64_prefetch: { |
5550 | SDValue Chain = Op.getOperand(i: 0); |
5551 | SDValue Addr = Op.getOperand(i: 2); |
5552 | |
5553 | unsigned IsWrite = Op.getConstantOperandVal(i: 3); |
5554 | unsigned Locality = Op.getConstantOperandVal(i: 4); |
5555 | unsigned IsStream = Op.getConstantOperandVal(i: 5); |
5556 | unsigned IsData = Op.getConstantOperandVal(i: 6); |
5557 | unsigned PrfOp = (IsWrite << 4) | // Load/Store bit |
5558 | (!IsData << 3) | // IsDataCache bit |
5559 | (Locality << 1) | // Cache level bits |
5560 | (unsigned)IsStream; // Stream bit |
5561 | |
5562 | return DAG.getNode(Opcode: AArch64ISD::PREFETCH, DL, VT: MVT::Other, N1: Chain, |
5563 | N2: DAG.getTargetConstant(Val: PrfOp, DL, VT: MVT::i32), N3: Addr); |
5564 | } |
5565 | case Intrinsic::aarch64_sme_str: |
5566 | case Intrinsic::aarch64_sme_ldr: { |
5567 | return LowerSMELdrStr(N: Op, DAG, IsLoad: IntNo == Intrinsic::aarch64_sme_ldr); |
5568 | } |
5569 | case Intrinsic::aarch64_sme_za_enable: |
5570 | return DAG.getNode( |
5571 | Opcode: AArch64ISD::SMSTART, DL, VT: MVT::Other, |
5572 | N1: Op->getOperand(Num: 0), // Chain |
5573 | N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32), |
5574 | N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64)); |
5575 | case Intrinsic::aarch64_sme_za_disable: |
5576 | return DAG.getNode( |
5577 | Opcode: AArch64ISD::SMSTOP, DL, VT: MVT::Other, |
5578 | N1: Op->getOperand(Num: 0), // Chain |
5579 | N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32), |
5580 | N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64)); |
5581 | } |
5582 | } |
5583 | |
5584 | SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, |
5585 | SelectionDAG &DAG) const { |
5586 | unsigned IntNo = Op.getConstantOperandVal(i: 1); |
5587 | SDLoc DL(Op); |
5588 | switch (IntNo) { |
5589 | default: |
5590 | return SDValue(); // Don't custom lower most intrinsics. |
5591 | case Intrinsic::aarch64_mops_memset_tag: { |
5592 | auto Node = cast<MemIntrinsicSDNode>(Val: Op.getNode()); |
5593 | SDValue Chain = Node->getChain(); |
5594 | SDValue Dst = Op.getOperand(i: 2); |
5595 | SDValue Val = Op.getOperand(i: 3); |
5596 | Val = DAG.getAnyExtOrTrunc(Op: Val, DL, VT: MVT::i64); |
5597 | SDValue Size = Op.getOperand(i: 4); |
5598 | auto Alignment = Node->getMemOperand()->getAlign(); |
5599 | bool IsVol = Node->isVolatile(); |
5600 | auto DstPtrInfo = Node->getPointerInfo(); |
5601 | |
5602 | const auto &SDI = |
5603 | static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo()); |
5604 | SDValue MS = |
5605 | SDI.EmitMOPS(SDOpcode: AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, SrcOrValue: Val, |
5606 | Size, Alignment, isVolatile: IsVol, DstPtrInfo, SrcPtrInfo: MachinePointerInfo{}); |
5607 | |
5608 | // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the |
5609 | // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise |
5610 | // LowerOperationWrapper will complain that the number of results has |
5611 | // changed. |
5612 | return DAG.getMergeValues(Ops: {MS.getValue(R: 0), MS.getValue(R: 2)}, dl: DL); |
5613 | } |
5614 | } |
5615 | } |
5616 | |
5617 | SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, |
5618 | SelectionDAG &DAG) const { |
5619 | unsigned IntNo = Op.getConstantOperandVal(i: 0); |
5620 | SDLoc dl(Op); |
5621 | switch (IntNo) { |
5622 | default: return SDValue(); // Don't custom lower most intrinsics. |
5623 | case Intrinsic::thread_pointer: { |
5624 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
5625 | return DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL: dl, VT: PtrVT); |
5626 | } |
5627 | case Intrinsic::aarch64_neon_abs: { |
5628 | EVT Ty = Op.getValueType(); |
5629 | if (Ty == MVT::i64) { |
5630 | SDValue Result = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::v1i64, |
5631 | Operand: Op.getOperand(i: 1)); |
5632 | Result = DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: MVT::v1i64, Operand: Result); |
5633 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i64, Operand: Result); |
5634 | } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(VT: Ty)) { |
5635 | return DAG.getNode(Opcode: ISD::ABS, DL: dl, VT: Ty, Operand: Op.getOperand(i: 1)); |
5636 | } else { |
5637 | report_fatal_error(reason: "Unexpected type for AArch64 NEON intrinsic"); |
5638 | } |
5639 | } |
5640 | case Intrinsic::aarch64_neon_pmull64: { |
5641 | SDValue LHS = Op.getOperand(i: 1); |
5642 | SDValue RHS = Op.getOperand(i: 2); |
5643 | |
5644 | std::optional<uint64_t> LHSLane = |
5645 | getConstantLaneNumOfExtractHalfOperand(Op&: LHS); |
5646 | std::optional<uint64_t> RHSLane = |
5647 | getConstantLaneNumOfExtractHalfOperand(Op&: RHS); |
5648 | |
5649 | assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1" ); |
5650 | assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1" ); |
5651 | |
5652 | // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2 |
5653 | // instructions execute on SIMD registers. So canonicalize i64 to v1i64, |
5654 | // which ISel recognizes better. For example, generate a ldr into d* |
5655 | // registers as opposed to a GPR load followed by a fmov. |
5656 | auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane, |
5657 | std::optional<uint64_t> OtherLane, |
5658 | const SDLoc &dl, |
5659 | SelectionDAG &DAG) -> SDValue { |
5660 | // If the operand is a higher half itself, rewrite it to |
5661 | // extract_high_v2i64; this way aarch64_neon_pmull64 could |
5662 | // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}. |
5663 | if (NLane && *NLane == 1) |
5664 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: MVT::v1i64, |
5665 | N1: N.getOperand(i: 0), N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64)); |
5666 | |
5667 | // Operand N is not a higher half but the other operand is. |
5668 | if (OtherLane && *OtherLane == 1) { |
5669 | // If this operand is a lower half, rewrite it to |
5670 | // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to |
5671 | // align lanes of two operands. A roundtrip sequence (to move from lane |
5672 | // 1 to lane 0) is like this: |
5673 | // mov x8, v0.d[1] |
5674 | // fmov d0, x8 |
5675 | if (NLane && *NLane == 0) |
5676 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: MVT::v1i64, |
5677 | N1: DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL: dl, VT: MVT::v2i64, |
5678 | N1: N.getOperand(i: 0), |
5679 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)), |
5680 | N2: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i64)); |
5681 | |
5682 | // Otherwise just dup from main to all lanes. |
5683 | return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT: MVT::v1i64, Operand: N); |
5684 | } |
5685 | |
5686 | // Neither operand is an extract of higher half, so codegen may just use |
5687 | // the non-high version of PMULL instruction. Use v1i64 to represent i64. |
5688 | assert(N.getValueType() == MVT::i64 && |
5689 | "Intrinsic aarch64_neon_pmull64 requires i64 parameters" ); |
5690 | return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v1i64, Operand: N); |
5691 | }; |
5692 | |
5693 | LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG); |
5694 | RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG); |
5695 | |
5696 | return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: dl, VT: Op.getValueType(), N1: LHS, N2: RHS); |
5697 | } |
5698 | case Intrinsic::aarch64_neon_smax: |
5699 | return DAG.getNode(Opcode: ISD::SMAX, DL: dl, VT: Op.getValueType(), |
5700 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5701 | case Intrinsic::aarch64_neon_umax: |
5702 | return DAG.getNode(Opcode: ISD::UMAX, DL: dl, VT: Op.getValueType(), |
5703 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5704 | case Intrinsic::aarch64_neon_smin: |
5705 | return DAG.getNode(Opcode: ISD::SMIN, DL: dl, VT: Op.getValueType(), |
5706 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5707 | case Intrinsic::aarch64_neon_umin: |
5708 | return DAG.getNode(Opcode: ISD::UMIN, DL: dl, VT: Op.getValueType(), |
5709 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5710 | case Intrinsic::aarch64_neon_scalar_sqxtn: |
5711 | case Intrinsic::aarch64_neon_scalar_sqxtun: |
5712 | case Intrinsic::aarch64_neon_scalar_uqxtn: { |
5713 | assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32); |
5714 | if (Op.getValueType() == MVT::i32) |
5715 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::i32, |
5716 | Operand: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::f32, |
5717 | N1: Op.getOperand(i: 0), |
5718 | N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::f64, |
5719 | Operand: Op.getOperand(i: 1)))); |
5720 | return SDValue(); |
5721 | } |
5722 | case Intrinsic::aarch64_sve_whilelo: |
5723 | return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, |
5724 | /*IsEqual=*/false); |
5725 | case Intrinsic::aarch64_sve_whilelt: |
5726 | return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true, |
5727 | /*IsEqual=*/false); |
5728 | case Intrinsic::aarch64_sve_whilels: |
5729 | return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false, |
5730 | /*IsEqual=*/true); |
5731 | case Intrinsic::aarch64_sve_whilele: |
5732 | return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true, |
5733 | /*IsEqual=*/true); |
5734 | case Intrinsic::aarch64_sve_sunpkhi: |
5735 | return DAG.getNode(Opcode: AArch64ISD::SUNPKHI, DL: dl, VT: Op.getValueType(), |
5736 | Operand: Op.getOperand(i: 1)); |
5737 | case Intrinsic::aarch64_sve_sunpklo: |
5738 | return DAG.getNode(Opcode: AArch64ISD::SUNPKLO, DL: dl, VT: Op.getValueType(), |
5739 | Operand: Op.getOperand(i: 1)); |
5740 | case Intrinsic::aarch64_sve_uunpkhi: |
5741 | return DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL: dl, VT: Op.getValueType(), |
5742 | Operand: Op.getOperand(i: 1)); |
5743 | case Intrinsic::aarch64_sve_uunpklo: |
5744 | return DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL: dl, VT: Op.getValueType(), |
5745 | Operand: Op.getOperand(i: 1)); |
5746 | case Intrinsic::aarch64_sve_clasta_n: |
5747 | return DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL: dl, VT: Op.getValueType(), |
5748 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
5749 | case Intrinsic::aarch64_sve_clastb_n: |
5750 | return DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL: dl, VT: Op.getValueType(), |
5751 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
5752 | case Intrinsic::aarch64_sve_lasta: |
5753 | return DAG.getNode(Opcode: AArch64ISD::LASTA, DL: dl, VT: Op.getValueType(), |
5754 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5755 | case Intrinsic::aarch64_sve_lastb: |
5756 | return DAG.getNode(Opcode: AArch64ISD::LASTB, DL: dl, VT: Op.getValueType(), |
5757 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5758 | case Intrinsic::aarch64_sve_rev: |
5759 | return DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL: dl, VT: Op.getValueType(), |
5760 | Operand: Op.getOperand(i: 1)); |
5761 | case Intrinsic::aarch64_sve_tbl: |
5762 | return DAG.getNode(Opcode: AArch64ISD::TBL, DL: dl, VT: Op.getValueType(), |
5763 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5764 | case Intrinsic::aarch64_sve_trn1: |
5765 | return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT: Op.getValueType(), |
5766 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5767 | case Intrinsic::aarch64_sve_trn2: |
5768 | return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT: Op.getValueType(), |
5769 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5770 | case Intrinsic::aarch64_sve_uzp1: |
5771 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT: Op.getValueType(), |
5772 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5773 | case Intrinsic::aarch64_sve_uzp2: |
5774 | return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT: Op.getValueType(), |
5775 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5776 | case Intrinsic::aarch64_sve_zip1: |
5777 | return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: Op.getValueType(), |
5778 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5779 | case Intrinsic::aarch64_sve_zip2: |
5780 | return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT: Op.getValueType(), |
5781 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5782 | case Intrinsic::aarch64_sve_splice: |
5783 | return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL: dl, VT: Op.getValueType(), |
5784 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
5785 | case Intrinsic::aarch64_sve_ptrue: |
5786 | return getPTrue(DAG, DL: dl, VT: Op.getValueType(), Pattern: Op.getConstantOperandVal(i: 1)); |
5787 | case Intrinsic::aarch64_sve_clz: |
5788 | return DAG.getNode(Opcode: AArch64ISD::CTLZ_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5789 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5790 | case Intrinsic::aarch64_sme_cntsb: |
5791 | return DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), |
5792 | Operand: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32)); |
5793 | case Intrinsic::aarch64_sme_cntsh: { |
5794 | SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32); |
5795 | SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), Operand: One); |
5796 | return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes, N2: One); |
5797 | } |
5798 | case Intrinsic::aarch64_sme_cntsw: { |
5799 | SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), |
5800 | Operand: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32)); |
5801 | return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes, |
5802 | N2: DAG.getConstant(Val: 2, DL: dl, VT: MVT::i32)); |
5803 | } |
5804 | case Intrinsic::aarch64_sme_cntsd: { |
5805 | SDValue Bytes = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL: dl, VT: Op.getValueType(), |
5806 | Operand: DAG.getConstant(Val: 1, DL: dl, VT: MVT::i32)); |
5807 | return DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: Op.getValueType(), N1: Bytes, |
5808 | N2: DAG.getConstant(Val: 3, DL: dl, VT: MVT::i32)); |
5809 | } |
5810 | case Intrinsic::aarch64_sve_cnt: { |
5811 | SDValue Data = Op.getOperand(i: 3); |
5812 | // CTPOP only supports integer operands. |
5813 | if (Data.getValueType().isFloatingPoint()) |
5814 | Data = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Data); |
5815 | return DAG.getNode(Opcode: AArch64ISD::CTPOP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5816 | N1: Op.getOperand(i: 2), N2: Data, N3: Op.getOperand(i: 1)); |
5817 | } |
5818 | case Intrinsic::aarch64_sve_dupq_lane: |
5819 | return LowerDUPQLane(Op, DAG); |
5820 | case Intrinsic::aarch64_sve_convert_from_svbool: |
5821 | if (Op.getValueType() == MVT::aarch64svcount) |
5822 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1)); |
5823 | return getSVEPredicateBitCast(VT: Op.getValueType(), Op: Op.getOperand(i: 1), DAG); |
5824 | case Intrinsic::aarch64_sve_convert_to_svbool: |
5825 | if (Op.getOperand(i: 1).getValueType() == MVT::aarch64svcount) |
5826 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MVT::nxv16i1, Operand: Op.getOperand(i: 1)); |
5827 | return getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Op.getOperand(i: 1), DAG); |
5828 | case Intrinsic::aarch64_sve_fneg: |
5829 | return DAG.getNode(Opcode: AArch64ISD::FNEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5830 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5831 | case Intrinsic::aarch64_sve_frintp: |
5832 | return DAG.getNode(Opcode: AArch64ISD::FCEIL_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5833 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5834 | case Intrinsic::aarch64_sve_frintm: |
5835 | return DAG.getNode(Opcode: AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5836 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5837 | case Intrinsic::aarch64_sve_frinti: |
5838 | return DAG.getNode(Opcode: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5839 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5840 | case Intrinsic::aarch64_sve_frintx: |
5841 | return DAG.getNode(Opcode: AArch64ISD::FRINT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5842 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5843 | case Intrinsic::aarch64_sve_frinta: |
5844 | return DAG.getNode(Opcode: AArch64ISD::FROUND_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5845 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5846 | case Intrinsic::aarch64_sve_frintn: |
5847 | return DAG.getNode(Opcode: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5848 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5849 | case Intrinsic::aarch64_sve_frintz: |
5850 | return DAG.getNode(Opcode: AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5851 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5852 | case Intrinsic::aarch64_sve_ucvtf: |
5853 | return DAG.getNode(Opcode: AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL: dl, |
5854 | VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5855 | N3: Op.getOperand(i: 1)); |
5856 | case Intrinsic::aarch64_sve_scvtf: |
5857 | return DAG.getNode(Opcode: AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL: dl, |
5858 | VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5859 | N3: Op.getOperand(i: 1)); |
5860 | case Intrinsic::aarch64_sve_fcvtzu: |
5861 | return DAG.getNode(Opcode: AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL: dl, |
5862 | VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5863 | N3: Op.getOperand(i: 1)); |
5864 | case Intrinsic::aarch64_sve_fcvtzs: |
5865 | return DAG.getNode(Opcode: AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL: dl, |
5866 | VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5867 | N3: Op.getOperand(i: 1)); |
5868 | case Intrinsic::aarch64_sve_fsqrt: |
5869 | return DAG.getNode(Opcode: AArch64ISD::FSQRT_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5870 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5871 | case Intrinsic::aarch64_sve_frecpx: |
5872 | return DAG.getNode(Opcode: AArch64ISD::FRECPX_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5873 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5874 | case Intrinsic::aarch64_sve_frecpe_x: |
5875 | return DAG.getNode(Opcode: AArch64ISD::FRECPE, DL: dl, VT: Op.getValueType(), |
5876 | Operand: Op.getOperand(i: 1)); |
5877 | case Intrinsic::aarch64_sve_frecps_x: |
5878 | return DAG.getNode(Opcode: AArch64ISD::FRECPS, DL: dl, VT: Op.getValueType(), |
5879 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5880 | case Intrinsic::aarch64_sve_frsqrte_x: |
5881 | return DAG.getNode(Opcode: AArch64ISD::FRSQRTE, DL: dl, VT: Op.getValueType(), |
5882 | Operand: Op.getOperand(i: 1)); |
5883 | case Intrinsic::aarch64_sve_frsqrts_x: |
5884 | return DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL: dl, VT: Op.getValueType(), |
5885 | N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2)); |
5886 | case Intrinsic::aarch64_sve_fabs: |
5887 | return DAG.getNode(Opcode: AArch64ISD::FABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5888 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5889 | case Intrinsic::aarch64_sve_abs: |
5890 | return DAG.getNode(Opcode: AArch64ISD::ABS_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5891 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5892 | case Intrinsic::aarch64_sve_neg: |
5893 | return DAG.getNode(Opcode: AArch64ISD::NEG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5894 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5895 | case Intrinsic::aarch64_sve_insr: { |
5896 | SDValue Scalar = Op.getOperand(i: 2); |
5897 | EVT ScalarTy = Scalar.getValueType(); |
5898 | if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) |
5899 | Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: Scalar); |
5900 | |
5901 | return DAG.getNode(Opcode: AArch64ISD::INSR, DL: dl, VT: Op.getValueType(), |
5902 | N1: Op.getOperand(i: 1), N2: Scalar); |
5903 | } |
5904 | case Intrinsic::aarch64_sve_rbit: |
5905 | return DAG.getNode(Opcode: AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL: dl, |
5906 | VT: Op.getValueType(), N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5907 | N3: Op.getOperand(i: 1)); |
5908 | case Intrinsic::aarch64_sve_revb: |
5909 | return DAG.getNode(Opcode: AArch64ISD::BSWAP_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5910 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5911 | case Intrinsic::aarch64_sve_revh: |
5912 | return DAG.getNode(Opcode: AArch64ISD::REVH_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5913 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5914 | case Intrinsic::aarch64_sve_revw: |
5915 | return DAG.getNode(Opcode: AArch64ISD::REVW_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5916 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5917 | case Intrinsic::aarch64_sve_revd: |
5918 | return DAG.getNode(Opcode: AArch64ISD::REVD_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5919 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), N3: Op.getOperand(i: 1)); |
5920 | case Intrinsic::aarch64_sve_sxtb: |
5921 | return DAG.getNode( |
5922 | Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5923 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5924 | N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)), |
5925 | N4: Op.getOperand(i: 1)); |
5926 | case Intrinsic::aarch64_sve_sxth: |
5927 | return DAG.getNode( |
5928 | Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5929 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5930 | N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)), |
5931 | N4: Op.getOperand(i: 1)); |
5932 | case Intrinsic::aarch64_sve_sxtw: |
5933 | return DAG.getNode( |
5934 | Opcode: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5935 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5936 | N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)), |
5937 | N4: Op.getOperand(i: 1)); |
5938 | case Intrinsic::aarch64_sve_uxtb: |
5939 | return DAG.getNode( |
5940 | Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5941 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5942 | N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i8)), |
5943 | N4: Op.getOperand(i: 1)); |
5944 | case Intrinsic::aarch64_sve_uxth: |
5945 | return DAG.getNode( |
5946 | Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5947 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5948 | N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i16)), |
5949 | N4: Op.getOperand(i: 1)); |
5950 | case Intrinsic::aarch64_sve_uxtw: |
5951 | return DAG.getNode( |
5952 | Opcode: AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL: dl, VT: Op.getValueType(), |
5953 | N1: Op.getOperand(i: 2), N2: Op.getOperand(i: 3), |
5954 | N3: DAG.getValueType(Op.getValueType().changeVectorElementType(EltVT: MVT::i32)), |
5955 | N4: Op.getOperand(i: 1)); |
5956 | case Intrinsic::localaddress: { |
5957 | const auto &MF = DAG.getMachineFunction(); |
5958 | const auto *RegInfo = Subtarget->getRegisterInfo(); |
5959 | unsigned Reg = RegInfo->getLocalAddressRegister(MF); |
5960 | return DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl, Reg, |
5961 | VT: Op.getSimpleValueType()); |
5962 | } |
5963 | |
5964 | case Intrinsic::eh_recoverfp: { |
5965 | // FIXME: This needs to be implemented to correctly handle highly aligned |
5966 | // stack objects. For now we simply return the incoming FP. Refer to
5967 | // D53541 for more details.
5968 | SDValue FnOp = Op.getOperand(i: 1); |
5969 | SDValue IncomingFPOp = Op.getOperand(i: 2); |
5970 | GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Val&: FnOp); |
5971 | auto *Fn = dyn_cast_or_null<Function>(Val: GSD ? GSD->getGlobal() : nullptr); |
5972 | if (!Fn) |
5973 | report_fatal_error( |
5974 | reason: "llvm.eh.recoverfp must take a function as the first argument" ); |
5975 | return IncomingFPOp; |
5976 | } |
5977 | |
5978 | case Intrinsic::aarch64_neon_vsri: |
5979 | case Intrinsic::aarch64_neon_vsli: |
5980 | case Intrinsic::aarch64_sve_sri: |
5981 | case Intrinsic::aarch64_sve_sli: { |
5982 | EVT Ty = Op.getValueType(); |
5983 | |
5984 | if (!Ty.isVector()) |
5985 | report_fatal_error(reason: "Unexpected type for aarch64_neon_vsli" ); |
5986 | |
5987 | assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits()); |
5988 | |
5989 | bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri || |
5990 | IntNo == Intrinsic::aarch64_sve_sri; |
5991 | unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; |
5992 | return DAG.getNode(Opcode, DL: dl, VT: Ty, N1: Op.getOperand(i: 1), N2: Op.getOperand(i: 2), |
5993 | N3: Op.getOperand(i: 3)); |
5994 | } |
5995 | |
5996 | case Intrinsic::aarch64_neon_srhadd: |
5997 | case Intrinsic::aarch64_neon_urhadd: |
5998 | case Intrinsic::aarch64_neon_shadd: |
5999 | case Intrinsic::aarch64_neon_uhadd: { |
6000 | bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || |
6001 | IntNo == Intrinsic::aarch64_neon_shadd); |
6002 | bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd || |
6003 | IntNo == Intrinsic::aarch64_neon_urhadd); |
6004 | unsigned Opcode = IsSignedAdd |
6005 | ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS) |
6006 | : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU); |
6007 | return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1), |
6008 | N2: Op.getOperand(i: 2)); |
6009 | } |
6010 | case Intrinsic::aarch64_neon_saddlp: |
6011 | case Intrinsic::aarch64_neon_uaddlp: { |
6012 | unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp |
6013 | ? AArch64ISD::UADDLP |
6014 | : AArch64ISD::SADDLP; |
6015 | return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), Operand: Op.getOperand(i: 1)); |
6016 | } |
6017 | case Intrinsic::aarch64_neon_sdot: |
6018 | case Intrinsic::aarch64_neon_udot: |
6019 | case Intrinsic::aarch64_sve_sdot: |
6020 | case Intrinsic::aarch64_sve_udot: { |
6021 | unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot || |
6022 | IntNo == Intrinsic::aarch64_sve_udot) |
6023 | ? AArch64ISD::UDOT |
6024 | : AArch64ISD::SDOT; |
6025 | return DAG.getNode(Opcode, DL: dl, VT: Op.getValueType(), N1: Op.getOperand(i: 1), |
6026 | N2: Op.getOperand(i: 2), N3: Op.getOperand(i: 3)); |
6027 | } |
6028 | case Intrinsic::get_active_lane_mask: { |
6029 | SDValue ID = |
6030 | DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo, DL: dl, VT: MVT::i64); |
6031 | |
6032 | EVT VT = Op.getValueType(); |
6033 | if (VT.isScalableVector()) |
6034 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, N1: ID, N2: Op.getOperand(i: 1), |
6035 | N3: Op.getOperand(i: 2)); |
6036 | |
6037 | // We can use the SVE whilelo instruction to lower this intrinsic by |
6038 | // creating the appropriate sequence of scalable vector operations and |
6039 | // then extracting a fixed-width subvector from the scalable vector. |
6040 | |
6041 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
6042 | EVT WhileVT = ContainerVT.changeElementType(EltVT: MVT::i1); |
6043 | |
6044 | SDValue Mask = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: WhileVT, N1: ID, |
6045 | N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2)); |
6046 | SDValue MaskAsInt = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: ContainerVT, Operand: Mask); |
6047 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT, N1: MaskAsInt, |
6048 | N2: DAG.getVectorIdxConstant(Val: 0, DL: dl)); |
6049 | } |
6050 | case Intrinsic::aarch64_neon_uaddlv: { |
6051 | EVT OpVT = Op.getOperand(i: 1).getValueType(); |
6052 | EVT ResVT = Op.getValueType(); |
6053 | if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 || |
6054 | OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) { |
6055 | // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6056 | SDValue UADDLV = |
6057 | DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: dl, VT: MVT::v4i32, Operand: Op.getOperand(i: 1)); |
6058 | SDValue EXTRACT_VEC_ELT =
6059 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i32, N1: UADDLV, |
6060 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
6061 | return EXTRACT_VEC_ELT; |
6062 | } |
6063 | return SDValue(); |
6064 | } |
6065 | case Intrinsic::experimental_cttz_elts: { |
6066 | SDValue CttzOp = Op.getOperand(i: 1); |
6067 | EVT VT = CttzOp.getValueType(); |
6068 | assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1" ); |
6069 | |
6070 | if (VT.isFixedLengthVector()) { |
6071 | // We can use SVE instructions to lower this intrinsic by first creating |
6072 | // an SVE predicate register mask from the fixed-width vector. |
6073 | EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT); |
6074 | SDValue Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: dl, VT: NewVT, Operand: CttzOp); |
6075 | CttzOp = convertFixedMaskToScalableVector(Mask, DAG); |
6076 | } |
6077 | |
6078 | SDValue NewCttzElts = |
6079 | DAG.getNode(Opcode: AArch64ISD::CTTZ_ELTS, DL: dl, VT: MVT::i64, Operand: CttzOp); |
6080 | return DAG.getZExtOrTrunc(Op: NewCttzElts, DL: dl, VT: Op.getValueType()); |
6081 | } |
6082 | } |
6083 | } |
6084 | |
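     | // Gather/scatter index vectors with i8 or i16 elements are widened to i32
     | // before they can be used.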
6085 | bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { |
6086 | if (VT.getVectorElementType() == MVT::i8 || |
6087 | VT.getVectorElementType() == MVT::i16) { |
6088 | EltTy = MVT::i32; |
6089 | return true; |
6090 | } |
6091 | return false; |
6092 | } |
6093 | |
6094 | bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, |
6095 | EVT DataVT) const { |
6096 | const EVT IndexVT = Extend.getOperand(i: 0).getValueType(); |
6097 | // SVE only supports implicit extension of 32-bit indices. |
6098 | if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32) |
6099 | return false; |
6100 | |
6101 | // Indices cannot be smaller than the main data type. |
6102 | if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits()) |
6103 | return false; |
6104 | |
6105 | // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit |
6106 | // element container type, which would violate the previous clause. |
6107 | return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; |
6108 | } |
6109 | |
6110 | bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { |
6111 | EVT ExtVT = ExtVal.getValueType(); |
6112 | if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors()) |
6113 | return false; |
6114 | |
6115 | // It may be worth creating extending masked loads if there are multiple |
6116 | // masked loads using the same predicate. That way we'll end up creating |
6117 | // extending masked loads that may then get split by the legaliser. This |
6118 | // results in just one set of predicate unpacks at the start, instead of |
6119 | // multiple sets of vector unpacks after each load. |
6120 | if (auto *Ld = dyn_cast<MaskedLoadSDNode>(Val: ExtVal->getOperand(Num: 0))) { |
6121 | if (!isLoadExtLegalOrCustom(ExtType: ISD::ZEXTLOAD, ValVT: ExtVT, MemVT: Ld->getValueType(ResNo: 0))) { |
6122 | // Disable extending masked loads for fixed-width for now, since the code |
6123 | // quality doesn't look great. |
6124 | if (!ExtVT.isScalableVector()) |
6125 | return false; |
6126 | |
6127 | unsigned NumExtMaskedLoads = 0; |
6128 | for (auto *U : Ld->getMask()->uses()) |
6129 | if (isa<MaskedLoadSDNode>(Val: U)) |
6130 | NumExtMaskedLoads++; |
6131 | |
6132 | if (NumExtMaskedLoads <= 1) |
6133 | return false; |
6134 | } |
6135 | } |
6136 | |
6137 | return true; |
6138 | } |
6139 | |
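     | // Select the GLD1 gather opcode for the given addressing mode: whether the
     | // index is scaled by the element size, whether it is signed, and whether it
     | // must be extended from 32 bits (the UXTW/SXTW forms). Signedness only
     | // matters when the index needs extending.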
6140 | unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { |
6141 | std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { |
6142 | {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: false), |
6143 | AArch64ISD::GLD1_MERGE_ZERO}, |
6144 | {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: false, /*Extend*/ args: true), |
6145 | AArch64ISD::GLD1_UXTW_MERGE_ZERO}, |
6146 | {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: false), |
6147 | AArch64ISD::GLD1_MERGE_ZERO}, |
6148 | {std::make_tuple(/*Scaled*/ args: false, /*Signed*/ args: true, /*Extend*/ args: true), |
6149 | AArch64ISD::GLD1_SXTW_MERGE_ZERO}, |
6150 | {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: false), |
6151 | AArch64ISD::GLD1_SCALED_MERGE_ZERO}, |
6152 | {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: false, /*Extend*/ args: true), |
6153 | AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO}, |
6154 | {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: false), |
6155 | AArch64ISD::GLD1_SCALED_MERGE_ZERO}, |
6156 | {std::make_tuple(/*Scaled*/ args: true, /*Signed*/ args: true, /*Extend*/ args: true), |
6157 | AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO}, |
6158 | }; |
6159 | auto Key = std::make_tuple(args&: IsScaled, args&: IsSigned, args&: NeedsExtend); |
6160 | return AddrModes.find(x: Key)->second; |
6161 | } |
6162 | |
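     | // Map a zero-extending gather opcode to its sign-extending (GLD1S) equivalent.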
6163 | unsigned getSignExtendedGatherOpcode(unsigned Opcode) { |
6164 | switch (Opcode) { |
6165 | default: |
6166 | llvm_unreachable("unimplemented opcode" ); |
6167 | return Opcode; |
6168 | case AArch64ISD::GLD1_MERGE_ZERO: |
6169 | return AArch64ISD::GLD1S_MERGE_ZERO; |
6170 | case AArch64ISD::GLD1_IMM_MERGE_ZERO: |
6171 | return AArch64ISD::GLD1S_IMM_MERGE_ZERO; |
6172 | case AArch64ISD::GLD1_UXTW_MERGE_ZERO: |
6173 | return AArch64ISD::GLD1S_UXTW_MERGE_ZERO; |
6174 | case AArch64ISD::GLD1_SXTW_MERGE_ZERO: |
6175 | return AArch64ISD::GLD1S_SXTW_MERGE_ZERO; |
6176 | case AArch64ISD::GLD1_SCALED_MERGE_ZERO: |
6177 | return AArch64ISD::GLD1S_SCALED_MERGE_ZERO; |
6178 | case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: |
6179 | return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; |
6180 | case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: |
6181 | return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; |
6182 | } |
6183 | } |
6184 | |
6185 | SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, |
6186 | SelectionDAG &DAG) const { |
6187 | MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Val&: Op); |
6188 | |
6189 | SDLoc DL(Op); |
6190 | SDValue Chain = MGT->getChain(); |
6191 | SDValue PassThru = MGT->getPassThru(); |
6192 | SDValue Mask = MGT->getMask(); |
6193 | SDValue BasePtr = MGT->getBasePtr(); |
6194 | SDValue Index = MGT->getIndex(); |
6195 | SDValue Scale = MGT->getScale(); |
6196 | EVT VT = Op.getValueType(); |
6197 | EVT MemVT = MGT->getMemoryVT(); |
6198 | ISD::LoadExtType ExtType = MGT->getExtensionType(); |
6199 | ISD::MemIndexType IndexType = MGT->getIndexType(); |
6200 | |
6201 | // SVE supports zero (and so undef) passthrough values only; everything else
6202 | // must be handled manually by an explicit select on the load's output.
6203 | if (!PassThru->isUndef() && !isZerosVector(N: PassThru.getNode())) { |
6204 | SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale}; |
6205 | SDValue Load = |
6206 | DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops, |
6207 | MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType); |
6208 | SDValue Select = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru); |
6209 | return DAG.getMergeValues(Ops: {Select, Load.getValue(R: 1)}, dl: DL); |
6210 | } |
6211 | |
6212 | bool IsScaled = MGT->isIndexScaled(); |
6213 | bool IsSigned = MGT->isIndexSigned(); |
6214 | |
6215 | // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6216 | // must be calculated beforehand.
6217 | uint64_t ScaleVal = Scale->getAsZExtVal(); |
6218 | if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { |
6219 | assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types" ); |
6220 | EVT IndexVT = Index.getValueType(); |
6221 | Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index, |
6222 | N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT)); |
6223 | Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType()); |
6224 | |
6225 | SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; |
6226 | return DAG.getMaskedGather(VTs: MGT->getVTList(), MemVT, dl: DL, Ops, |
6227 | MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType); |
6228 | } |
6229 | |
6230 | // Lower fixed length gather to a scalable equivalent. |
6231 | if (VT.isFixedLengthVector()) { |
6232 | assert(Subtarget->useSVEForFixedLengthVectors() && |
6233 | "Cannot lower when not using SVE for fixed vectors!" ); |
6234 | |
6235 | // NOTE: Handle floating-point as if integer then bitcast the result. |
6236 | EVT DataVT = VT.changeVectorElementTypeToInteger(); |
6237 | MemVT = MemVT.changeVectorElementTypeToInteger(); |
6238 | |
6239 | // Find the smallest integer fixed length vector we can use for the gather. |
6240 | EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32); |
6241 | if (DataVT.getVectorElementType() == MVT::i64 || |
6242 | Index.getValueType().getVectorElementType() == MVT::i64 || |
6243 | Mask.getValueType().getVectorElementType() == MVT::i64) |
6244 | PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64); |
6245 | |
6246 | // Promote vector operands except for passthrough, which we know is either |
6247 | // undef or zero, and thus best constructed directly. |
6248 | unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
6249 | Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index); |
6250 | Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask); |
6251 | |
6252 | // A promoted result type forces the need for an extending load. |
6253 | if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD) |
6254 | ExtType = ISD::EXTLOAD; |
6255 | |
6256 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT); |
6257 | |
6258 | // Convert fixed length vector operands to scalable. |
6259 | MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType()); |
6260 | Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index); |
6261 | Mask = convertFixedMaskToScalableVector(Mask, DAG); |
6262 | PassThru = PassThru->isUndef() ? DAG.getUNDEF(VT: ContainerVT) |
6263 | : DAG.getConstant(Val: 0, DL, VT: ContainerVT); |
6264 | |
6265 | // Emit equivalent scalable vector gather. |
6266 | SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; |
6267 | SDValue Load = |
6268 | DAG.getMaskedGather(VTs: DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other), MemVT, dl: DL, |
6269 | Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: ExtType); |
6270 | |
6271 | // Extract fixed length data then convert to the required result type. |
6272 | SDValue Result = convertFromScalableVector(DAG, VT: PromotedVT, V: Load); |
6273 | Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DataVT, Operand: Result); |
6274 | if (VT.isFloatingPoint()) |
6275 | Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Result); |
6276 | |
6277 | return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL); |
6278 | } |
6279 | |
6280 | // Everything else is legal. |
6281 | return Op; |
6282 | } |
6283 | |
6284 | SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, |
6285 | SelectionDAG &DAG) const { |
6286 | MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Val&: Op); |
6287 | |
6288 | SDLoc DL(Op); |
6289 | SDValue Chain = MSC->getChain(); |
6290 | SDValue StoreVal = MSC->getValue(); |
6291 | SDValue Mask = MSC->getMask(); |
6292 | SDValue BasePtr = MSC->getBasePtr(); |
6293 | SDValue Index = MSC->getIndex(); |
6294 | SDValue Scale = MSC->getScale(); |
6295 | EVT VT = StoreVal.getValueType(); |
6296 | EVT MemVT = MSC->getMemoryVT(); |
6297 | ISD::MemIndexType IndexType = MSC->getIndexType(); |
6298 | bool Truncating = MSC->isTruncatingStore(); |
6299 | |
6300 | bool IsScaled = MSC->isIndexScaled(); |
6301 | bool IsSigned = MSC->isIndexSigned(); |
6302 | |
6303 | // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6304 | // must be calculated beforehand.
6305 | uint64_t ScaleVal = Scale->getAsZExtVal(); |
6306 | if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { |
6307 | assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types" ); |
6308 | EVT IndexVT = Index.getValueType(); |
6309 | Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: IndexVT, N1: Index, |
6310 | N2: DAG.getConstant(Val: Log2_32(Value: ScaleVal), DL, VT: IndexVT)); |
6311 | Scale = DAG.getTargetConstant(Val: 1, DL, VT: Scale.getValueType()); |
6312 | |
6313 | SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; |
6314 | return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops, |
6315 | MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating); |
6316 | } |
6317 | |
6318 | // Lower fixed length scatter to a scalable equivalent. |
6319 | if (VT.isFixedLengthVector()) { |
6320 | assert(Subtarget->useSVEForFixedLengthVectors() && |
6321 | "Cannot lower when not using SVE for fixed vectors!" ); |
6322 | |
6323 | // Once bitcast we treat floating-point scatters as if integer. |
6324 | if (VT.isFloatingPoint()) { |
6325 | VT = VT.changeVectorElementTypeToInteger(); |
6326 | MemVT = MemVT.changeVectorElementTypeToInteger(); |
6327 | StoreVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: StoreVal); |
6328 | } |
6329 | |
6330 | // Find the smallest integer fixed length vector we can use for the scatter. |
6331 | EVT PromotedVT = VT.changeVectorElementType(EltVT: MVT::i32); |
6332 | if (VT.getVectorElementType() == MVT::i64 || |
6333 | Index.getValueType().getVectorElementType() == MVT::i64 || |
6334 | Mask.getValueType().getVectorElementType() == MVT::i64) |
6335 | PromotedVT = VT.changeVectorElementType(EltVT: MVT::i64); |
6336 | |
6337 | // Promote vector operands. |
6338 | unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
6339 | Index = DAG.getNode(Opcode: ExtOpcode, DL, VT: PromotedVT, Operand: Index); |
6340 | Mask = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: PromotedVT, Operand: Mask); |
6341 | StoreVal = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: PromotedVT, Operand: StoreVal); |
6342 | |
6343 | // A promoted value type forces the need for a truncating store. |
6344 | if (PromotedVT != VT) |
6345 | Truncating = true; |
6346 | |
6347 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: PromotedVT); |
6348 | |
6349 | // Convert fixed length vector operands to scalable. |
6350 | MemVT = ContainerVT.changeVectorElementType(EltVT: MemVT.getVectorElementType()); |
6351 | Index = convertToScalableVector(DAG, VT: ContainerVT, V: Index); |
6352 | Mask = convertFixedMaskToScalableVector(Mask, DAG); |
6353 | StoreVal = convertToScalableVector(DAG, VT: ContainerVT, V: StoreVal); |
6354 | |
6355 | // Emit equivalent scalable vector scatter. |
6356 | SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; |
6357 | return DAG.getMaskedScatter(VTs: MSC->getVTList(), MemVT, dl: DL, Ops, |
6358 | MMO: MSC->getMemOperand(), IndexType, IsTruncating: Truncating); |
6359 | } |
6360 | |
6361 | // Everything else is legal. |
6362 | return Op; |
6363 | } |
6364 | |
6365 | SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const { |
6366 | SDLoc DL(Op); |
6367 | MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Val&: Op); |
6368 | assert(LoadNode && "Expected custom lowering of a masked load node" ); |
6369 | EVT VT = Op->getValueType(ResNo: 0); |
6370 | |
6371 | if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) |
6372 | return LowerFixedLengthVectorMLoadToSVE(Op, DAG); |
6373 | |
6374 | SDValue PassThru = LoadNode->getPassThru(); |
6375 | SDValue Mask = LoadNode->getMask(); |
6376 | |
6377 | if (PassThru->isUndef() || isZerosVector(N: PassThru.getNode())) |
6378 | return Op; |
6379 | |
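     |   // The passthru is neither undef nor zero, so emit the masked load with an
     |   // undef passthru and select between the loaded value and the original
     |   // passthru afterwards.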
6380 | SDValue Load = DAG.getMaskedLoad( |
6381 | VT, dl: DL, Chain: LoadNode->getChain(), Base: LoadNode->getBasePtr(), |
6382 | Offset: LoadNode->getOffset(), Mask, Src0: DAG.getUNDEF(VT), MemVT: LoadNode->getMemoryVT(), |
6383 | MMO: LoadNode->getMemOperand(), AM: LoadNode->getAddressingMode(), |
6384 | LoadNode->getExtensionType()); |
6385 | |
6386 | SDValue Result = DAG.getSelect(DL, VT, Cond: Mask, LHS: Load, RHS: PassThru); |
6387 | |
6388 | return DAG.getMergeValues(Ops: {Result, Load.getValue(R: 1)}, dl: DL); |
6389 | } |
6390 | |
6391 | // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. |
6392 | static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, |
6393 | EVT VT, EVT MemVT, |
6394 | SelectionDAG &DAG) { |
6395 | assert(VT.isVector() && "VT should be a vector type" ); |
6396 | assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); |
6397 | |
6398 | SDValue Value = ST->getValue(); |
6399 | |
6400 | // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6401 | // extracts the word lane that represents the v4i8 subvector. It optimizes
6402 | // the store to:
6403 | // |
6404 | // xtn v0.8b, v0.8h |
6405 | // str s0, [x0] |
6406 | |
6407 | SDValue Undef = DAG.getUNDEF(VT: MVT::i16); |
6408 | SDValue UndefVec = DAG.getBuildVector(VT: MVT::v4i16, DL, |
6409 | Ops: {Undef, Undef, Undef, Undef}); |
6410 | |
6411 | SDValue TruncExt = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, |
6412 | N1: Value, N2: UndefVec); |
6413 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: TruncExt); |
6414 | |
6415 | Trunc = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i32, Operand: Trunc); |
6416 | SDValue ExtractTrunc = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32,
6417 | N1: Trunc, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
6418 | |
6419 | return DAG.getStore(Chain: ST->getChain(), dl: DL, Val: ExtractTrunc, |
6420 | Ptr: ST->getBasePtr(), MMO: ST->getMemOperand()); |
6421 | } |
6422 | |
6423 | // Custom lowering for any store, vector or scalar, and/or default or
6424 | // truncating. Currently we only custom lower truncating stores from
6425 | // v4i16 to v4i8 and volatile stores of i128.
6426 | SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, |
6427 | SelectionDAG &DAG) const { |
6428 | SDLoc Dl(Op); |
6429 | StoreSDNode *StoreNode = cast<StoreSDNode>(Val&: Op); |
6430 | assert(StoreNode && "Can only custom lower store nodes" );
6431 | |
6432 | SDValue Value = StoreNode->getValue(); |
6433 | |
6434 | EVT VT = Value.getValueType(); |
6435 | EVT MemVT = StoreNode->getMemoryVT(); |
6436 | |
6437 | if (VT.isVector()) { |
6438 | if (useSVEForFixedLengthVectorVT( |
6439 | VT, |
6440 | /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) |
6441 | return LowerFixedLengthVectorStoreToSVE(Op, DAG); |
6442 | |
6443 | unsigned AS = StoreNode->getAddressSpace(); |
6444 | Align Alignment = StoreNode->getAlign(); |
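     |     // Scalarize vector stores that are not sufficiently aligned when
     |     // misaligned accesses are not allowed for this type.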
6445 | if (Alignment < MemVT.getStoreSize() && |
6446 | !allowsMisalignedMemoryAccesses(VT: MemVT, AddrSpace: AS, Alignment, |
6447 | Flags: StoreNode->getMemOperand()->getFlags(), |
6448 | Fast: nullptr)) { |
6449 | return scalarizeVectorStore(ST: StoreNode, DAG); |
6450 | } |
6451 | |
6452 | if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 && |
6453 | MemVT == MVT::v4i8) { |
6454 | return LowerTruncateVectorStore(DL: Dl, ST: StoreNode, VT, MemVT, DAG); |
6455 | } |
6456 | // 256 bit non-temporal stores can be lowered to STNP. Do this as part of |
6457 | // the custom lowering, as there are no un-paired non-temporal stores and |
6458 | // legalization will break up 256 bit inputs. |
6459 | ElementCount EC = MemVT.getVectorElementCount(); |
6460 | if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u && |
6461 | EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() && |
6462 | (MemVT.getScalarSizeInBits() == 8u || |
6463 | MemVT.getScalarSizeInBits() == 16u || |
6464 | MemVT.getScalarSizeInBits() == 32u || |
6465 | MemVT.getScalarSizeInBits() == 64u)) { |
6466 | SDValue Lo = |
6467 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl, |
6468 | VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()), |
6469 | N1: StoreNode->getValue(), N2: DAG.getConstant(Val: 0, DL: Dl, VT: MVT::i64)); |
6470 | SDValue Hi = |
6471 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: Dl, |
6472 | VT: MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()), |
6473 | N1: StoreNode->getValue(), |
6474 | N2: DAG.getConstant(Val: EC.getKnownMinValue() / 2, DL: Dl, VT: MVT::i64)); |
6475 | SDValue Result = DAG.getMemIntrinsicNode( |
6476 | Opcode: AArch64ISD::STNP, dl: Dl, VTList: DAG.getVTList(VT: MVT::Other), |
6477 | Ops: {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, |
6478 | MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand()); |
6479 | return Result; |
6480 | } |
6481 | } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { |
6482 | return LowerStore128(Op, DAG); |
6483 | } else if (MemVT == MVT::i64x8) { |
6484 | SDValue Value = StoreNode->getValue(); |
6485 | assert(Value->getValueType(0) == MVT::i64x8); |
6486 | SDValue Chain = StoreNode->getChain(); |
6487 | SDValue Base = StoreNode->getBasePtr(); |
6488 | EVT PtrVT = Base.getValueType(); |
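     |     // Split the 512-bit (i64x8) value into eight i64 parts and store each at
     |     // a consecutive 8-byte offset from the base pointer.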
6489 | for (unsigned i = 0; i < 8; i++) { |
6490 | SDValue Part = DAG.getNode(Opcode: AArch64ISD::LS64_EXTRACT, DL: Dl, VT: MVT::i64, |
6491 | N1: Value, N2: DAG.getConstant(Val: i, DL: Dl, VT: MVT::i32)); |
6492 | SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL: Dl, VT: PtrVT, N1: Base, |
6493 | N2: DAG.getConstant(Val: i * 8, DL: Dl, VT: PtrVT)); |
6494 | Chain = DAG.getStore(Chain, dl: Dl, Val: Part, Ptr, PtrInfo: StoreNode->getPointerInfo(), |
6495 | Alignment: StoreNode->getOriginalAlign()); |
6496 | } |
6497 | return Chain; |
6498 | } |
6499 | |
6500 | return SDValue(); |
6501 | } |
6502 | |
6503 | /// Lower atomic or volatile 128-bit stores to a single STP instruction. |
6504 | SDValue AArch64TargetLowering::LowerStore128(SDValue Op, |
6505 | SelectionDAG &DAG) const { |
6506 | MemSDNode *StoreNode = cast<MemSDNode>(Val&: Op); |
6507 | assert(StoreNode->getMemoryVT() == MVT::i128); |
6508 | assert(StoreNode->isVolatile() || StoreNode->isAtomic()); |
6509 | |
6510 | bool IsStoreRelease = |
6511 | StoreNode->getMergedOrdering() == AtomicOrdering::Release; |
6512 | if (StoreNode->isAtomic()) |
6513 | assert((Subtarget->hasFeature(AArch64::FeatureLSE2) && |
6514 | Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) || |
6515 | StoreNode->getMergedOrdering() == AtomicOrdering::Unordered || |
6516 | StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic); |
6517 | |
6518 | SDValue Value = (StoreNode->getOpcode() == ISD::STORE || |
6519 | StoreNode->getOpcode() == ISD::ATOMIC_STORE) |
6520 | ? StoreNode->getOperand(Num: 1) |
6521 | : StoreNode->getOperand(Num: 2); |
6522 | SDLoc DL(Op); |
6523 | auto StoreValue = DAG.SplitScalar(N: Value, DL, LoVT: MVT::i64, HiVT: MVT::i64); |
6524 | unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP; |
6525 | if (DAG.getDataLayout().isBigEndian()) |
6526 | std::swap(a&: StoreValue.first, b&: StoreValue.second); |
6527 | SDValue Result = DAG.getMemIntrinsicNode( |
6528 | Opcode, dl: DL, VTList: DAG.getVTList(VT: MVT::Other), |
6529 | Ops: {StoreNode->getChain(), StoreValue.first, StoreValue.second, |
6530 | StoreNode->getBasePtr()}, |
6531 | MemVT: StoreNode->getMemoryVT(), MMO: StoreNode->getMemOperand()); |
6532 | return Result; |
6533 | } |
6534 | |
6535 | SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, |
6536 | SelectionDAG &DAG) const { |
6537 | SDLoc DL(Op); |
6538 | LoadSDNode *LoadNode = cast<LoadSDNode>(Val&: Op); |
6539 | assert(LoadNode && "Expected custom lowering of a load node" ); |
6540 | |
6541 | if (LoadNode->getMemoryVT() == MVT::i64x8) { |
6542 | SmallVector<SDValue, 8> Ops; |
6543 | SDValue Base = LoadNode->getBasePtr(); |
6544 | SDValue Chain = LoadNode->getChain(); |
6545 | EVT PtrVT = Base.getValueType(); |
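     |     // Load eight consecutive i64 values and recombine them into a single
     |     // i64x8 result.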
6546 | for (unsigned i = 0; i < 8; i++) { |
6547 | SDValue Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: Base, |
6548 | N2: DAG.getConstant(Val: i * 8, DL, VT: PtrVT)); |
6549 | SDValue Part = DAG.getLoad(VT: MVT::i64, dl: DL, Chain, Ptr, |
6550 | PtrInfo: LoadNode->getPointerInfo(), |
6551 | Alignment: LoadNode->getOriginalAlign()); |
6552 | Ops.push_back(Elt: Part); |
6553 | Chain = SDValue(Part.getNode(), 1); |
6554 | } |
6555 | SDValue Loaded = DAG.getNode(Opcode: AArch64ISD::LS64_BUILD, DL, VT: MVT::i64x8, Ops); |
6556 | return DAG.getMergeValues(Ops: {Loaded, Chain}, dl: DL); |
6557 | } |
6558 | |
6559 | // Custom lowering for extending v4i8 vector loads. |
6560 | EVT VT = Op->getValueType(ResNo: 0); |
6561 | assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32" ); |
6562 | |
6563 | if (LoadNode->getMemoryVT() != MVT::v4i8) |
6564 | return SDValue(); |
6565 | |
6566 | // Avoid generating unaligned loads. |
6567 | if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) |
6568 | return SDValue(); |
6569 | |
6570 | unsigned ExtType; |
6571 | if (LoadNode->getExtensionType() == ISD::SEXTLOAD) |
6572 | ExtType = ISD::SIGN_EXTEND; |
6573 | else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || |
6574 | LoadNode->getExtensionType() == ISD::EXTLOAD) |
6575 | ExtType = ISD::ZERO_EXTEND; |
6576 | else |
6577 | return SDValue(); |
6578 | |
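     |   // Load the four bytes as a single f32 scalar, move it into a vector
     |   // register, bitcast to v8i8 and extend, then extract the low v4i16 half
     |   // (extending once more when a v4i32 result is required).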
6579 | SDValue Load = DAG.getLoad(VT: MVT::f32, dl: DL, Chain: LoadNode->getChain(), |
6580 | Ptr: LoadNode->getBasePtr(), PtrInfo: MachinePointerInfo()); |
6581 | SDValue Chain = Load.getValue(R: 1); |
6582 | SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v2f32, Operand: Load); |
6583 | SDValue BC = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Vec); |
6584 | SDValue Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v8i16, Operand: BC); |
6585 | Ext = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v4i16, N1: Ext, |
6586 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
6587 | if (VT == MVT::v4i32) |
6588 | Ext = DAG.getNode(Opcode: ExtType, DL, VT: MVT::v4i32, Operand: Ext); |
6589 | return DAG.getMergeValues(Ops: {Ext, Chain}, dl: DL); |
6590 | } |
6591 | |
6592 | // Generate SUBS and CSEL for integer abs. |
6593 | SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const { |
6594 | MVT VT = Op.getSimpleValueType(); |
6595 | |
6596 | if (VT.isVector()) |
6597 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABS_MERGE_PASSTHRU); |
6598 | |
6599 | SDLoc DL(Op); |
6600 | SDValue Neg = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), |
6601 | N2: Op.getOperand(i: 0)); |
6602 | // Generate SUBS & CSEL. |
6603 | SDValue Cmp = |
6604 | DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::i32), |
6605 | N1: Op.getOperand(i: 0), N2: DAG.getConstant(Val: 0, DL, VT)); |
6606 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: Op.getOperand(i: 0), N2: Neg, |
6607 | N3: DAG.getConstant(Val: AArch64CC::PL, DL, VT: MVT::i32), |
6608 | N4: Cmp.getValue(R: 1)); |
6609 | } |
6610 | |
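     | // Try to lower BRCOND by emitting its condition as a conjunction of compares
     | // and branching on the resulting condition code.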
6611 | static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) { |
6612 | SDValue Chain = Op.getOperand(i: 0); |
6613 | SDValue Cond = Op.getOperand(i: 1); |
6614 | SDValue Dest = Op.getOperand(i: 2); |
6615 | |
6616 | AArch64CC::CondCode CC; |
6617 | if (SDValue Cmp = emitConjunction(DAG, Val: Cond, OutCC&: CC)) { |
6618 | SDLoc dl(Op); |
6619 | SDValue CCVal = DAG.getConstant(Val: CC, DL: dl, VT: MVT::i32); |
6620 | return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal, |
6621 | N4: Cmp); |
6622 | } |
6623 | |
6624 | return SDValue(); |
6625 | } |
6626 | |
6627 | // Treat FSHR with constant shifts as a legal operation; otherwise it is
6628 | // expanded. FSHL is converted to FSHR before deciding what to do with it.
6629 | static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) { |
6630 | SDValue Shifts = Op.getOperand(i: 2); |
6631 | // Check if the shift amount is a constant |
6632 | // If opcode is FSHL, convert it to FSHR |
6633 | if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Val&: Shifts)) { |
6634 | SDLoc DL(Op); |
6635 | MVT VT = Op.getSimpleValueType(); |
6636 | |
6637 | if (Op.getOpcode() == ISD::FSHL) { |
6638 | unsigned int NewShiftNo = |
6639 | VT.getFixedSizeInBits() - ShiftNo->getZExtValue(); |
6640 | return DAG.getNode( |
6641 | Opcode: ISD::FSHR, DL, VT, N1: Op.getOperand(i: 0), N2: Op.getOperand(i: 1), |
6642 | N3: DAG.getConstant(Val: NewShiftNo, DL, VT: Shifts.getValueType())); |
6643 | } else if (Op.getOpcode() == ISD::FSHR) { |
6644 | return Op; |
6645 | } |
6646 | } |
6647 | |
6648 | return SDValue(); |
6649 | } |
6650 | |
6651 | static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { |
6652 | SDValue X = Op.getOperand(i: 0); |
6653 | EVT XScalarTy = X.getValueType(); |
6654 | SDValue Exp = Op.getOperand(i: 1); |
6655 | |
6656 | SDLoc DL(Op); |
6657 | EVT XVT, ExpVT; |
6658 | switch (Op.getSimpleValueType().SimpleTy) { |
6659 | default: |
6660 | return SDValue(); |
6661 | case MVT::bf16: |
6662 | case MVT::f16: |
6663 | X = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: X); |
6664 | [[fallthrough]]; |
6665 | case MVT::f32: |
6666 | XVT = MVT::nxv4f32; |
6667 | ExpVT = MVT::nxv4i32; |
6668 | break; |
6669 | case MVT::f64: |
6670 | XVT = MVT::nxv2f64; |
6671 | ExpVT = MVT::nxv2i64; |
6672 | Exp = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: MVT::i64, Operand: Exp); |
6673 | break; |
6674 | } |
6675 | |
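     |   // Insert the scalar operands into lane 0 of scalable vectors, apply the SVE
     |   // fscale intrinsic under an all-true predicate, and extract lane 0 of the
     |   // result, rounding back to the original type if the input was extended.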
6676 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
6677 | SDValue VX = |
6678 | DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: XVT, N1: DAG.getUNDEF(VT: XVT), N2: X, N3: Zero); |
6679 | SDValue VExp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ExpVT, |
6680 | N1: DAG.getUNDEF(VT: ExpVT), N2: Exp, N3: Zero); |
6681 | SDValue VPg = getPTrue(DAG, DL, VT: XVT.changeVectorElementType(EltVT: MVT::i1), |
6682 | Pattern: AArch64SVEPredPattern::all); |
6683 | SDValue FScale = |
6684 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: XVT, |
6685 | N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_fscale, DL, VT: MVT::i64), |
6686 | N2: VPg, N3: VX, N4: VExp); |
6687 | SDValue Final = |
6688 | DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: X.getValueType(), N1: FScale, N2: Zero); |
6689 | if (X.getValueType() != XScalarTy) |
6690 | Final = DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT: XScalarTy, N1: Final, |
6691 | N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(Op))); |
6692 | return Final; |
6693 | } |
6694 | |
6695 | SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, |
6696 | SelectionDAG &DAG) const { |
6697 | // Note: x18 cannot be used for the Nest parameter on Windows and macOS. |
6698 | if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) |
6699 | report_fatal_error( |
6700 | reason: "ADJUST_TRAMPOLINE operation is only supported on Linux." ); |
6701 | |
6702 | return Op.getOperand(i: 0); |
6703 | } |
6704 | |
6705 | SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, |
6706 | SelectionDAG &DAG) const { |
6707 | |
6708 | // Note: x18 cannot be used for the Nest parameter on Windows and macOS. |
6709 | if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) |
6710 | report_fatal_error(reason: "INIT_TRAMPOLINE operation is only supported on Linux." ); |
6711 | |
6712 | SDValue Chain = Op.getOperand(i: 0); |
6713 | SDValue Trmp = Op.getOperand(i: 1); // trampoline |
6714 | SDValue FPtr = Op.getOperand(i: 2); // nested function |
6715 | SDValue Nest = Op.getOperand(i: 3); // 'nest' parameter value |
6716 | SDLoc dl(Op); |
6717 | |
6718 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
6719 | Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(C&: *DAG.getContext()); |
6720 | |
6721 | TargetLowering::ArgListTy Args; |
6722 | TargetLowering::ArgListEntry Entry; |
6723 | |
6724 | Entry.Ty = IntPtrTy; |
6725 | Entry.Node = Trmp; |
6726 | Args.push_back(x: Entry); |
6727 | Entry.Node = DAG.getConstant(Val: 20, DL: dl, VT: MVT::i64); |
6728 | Args.push_back(x: Entry); |
6729 | |
6730 | Entry.Node = FPtr; |
6731 | Args.push_back(x: Entry); |
6732 | Entry.Node = Nest; |
6733 | Args.push_back(x: Entry); |
6734 | |
6735 | // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) |
6736 | TargetLowering::CallLoweringInfo CLI(DAG); |
6737 | CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( |
6738 | CC: CallingConv::C, ResultType: Type::getVoidTy(C&: *DAG.getContext()), |
6739 | Target: DAG.getExternalSymbol(Sym: "__trampoline_setup" , VT: PtrVT), ArgsList: std::move(Args)); |
6740 | |
6741 | std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
6742 | return CallResult.second; |
6743 | } |
6744 | |
6745 | SDValue AArch64TargetLowering::LowerOperation(SDValue Op, |
6746 | SelectionDAG &DAG) const { |
6747 | LLVM_DEBUG(dbgs() << "Custom lowering: " ); |
6748 | LLVM_DEBUG(Op.dump()); |
6749 | |
6750 | switch (Op.getOpcode()) { |
6751 | default: |
6752 | llvm_unreachable("unimplemented operand" ); |
6753 | return SDValue(); |
6754 | case ISD::BITCAST: |
6755 | return LowerBITCAST(Op, DAG); |
6756 | case ISD::GlobalAddress: |
6757 | return LowerGlobalAddress(Op, DAG); |
6758 | case ISD::GlobalTLSAddress: |
6759 | return LowerGlobalTLSAddress(Op, DAG); |
6760 | case ISD::PtrAuthGlobalAddress: |
6761 | return LowerPtrAuthGlobalAddress(Op, DAG); |
6762 | case ISD::ADJUST_TRAMPOLINE: |
6763 | return LowerADJUST_TRAMPOLINE(Op, DAG); |
6764 | case ISD::INIT_TRAMPOLINE: |
6765 | return LowerINIT_TRAMPOLINE(Op, DAG); |
6766 | case ISD::SETCC: |
6767 | case ISD::STRICT_FSETCC: |
6768 | case ISD::STRICT_FSETCCS: |
6769 | return LowerSETCC(Op, DAG); |
6770 | case ISD::SETCCCARRY: |
6771 | return LowerSETCCCARRY(Op, DAG); |
6772 | case ISD::BRCOND: |
6773 | return LowerBRCOND(Op, DAG); |
6774 | case ISD::BR_CC: |
6775 | return LowerBR_CC(Op, DAG); |
6776 | case ISD::SELECT: |
6777 | return LowerSELECT(Op, DAG); |
6778 | case ISD::SELECT_CC: |
6779 | return LowerSELECT_CC(Op, DAG); |
6780 | case ISD::JumpTable: |
6781 | return LowerJumpTable(Op, DAG); |
6782 | case ISD::BR_JT: |
6783 | return LowerBR_JT(Op, DAG); |
6784 | case ISD::BRIND: |
6785 | return LowerBRIND(Op, DAG); |
6786 | case ISD::ConstantPool: |
6787 | return LowerConstantPool(Op, DAG); |
6788 | case ISD::BlockAddress: |
6789 | return LowerBlockAddress(Op, DAG); |
6790 | case ISD::VASTART: |
6791 | return LowerVASTART(Op, DAG); |
6792 | case ISD::VACOPY: |
6793 | return LowerVACOPY(Op, DAG); |
6794 | case ISD::VAARG: |
6795 | return LowerVAARG(Op, DAG); |
6796 | case ISD::UADDO_CARRY: |
6797 | return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: false /*unsigned*/); |
6798 | case ISD::USUBO_CARRY: |
6799 | return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: false /*unsigned*/); |
6800 | case ISD::SADDO_CARRY: |
6801 | return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::ADCS, IsSigned: true /*signed*/); |
6802 | case ISD::SSUBO_CARRY: |
6803 | return lowerADDSUBO_CARRY(Op, DAG, Opcode: AArch64ISD::SBCS, IsSigned: true /*signed*/); |
6804 | case ISD::SADDO: |
6805 | case ISD::UADDO: |
6806 | case ISD::SSUBO: |
6807 | case ISD::USUBO: |
6808 | case ISD::SMULO: |
6809 | case ISD::UMULO: |
6810 | return LowerXALUO(Op, DAG); |
6811 | case ISD::FADD: |
6812 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FADD_PRED); |
6813 | case ISD::FSUB: |
6814 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSUB_PRED); |
6815 | case ISD::FMUL: |
6816 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMUL_PRED); |
6817 | case ISD::FMA: |
6818 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMA_PRED); |
6819 | case ISD::FDIV: |
6820 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FDIV_PRED); |
6821 | case ISD::FNEG: |
6822 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEG_MERGE_PASSTHRU); |
6823 | case ISD::FCEIL: |
6824 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FCEIL_MERGE_PASSTHRU); |
6825 | case ISD::FFLOOR: |
6826 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FFLOOR_MERGE_PASSTHRU); |
6827 | case ISD::FNEARBYINT: |
6828 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FNEARBYINT_MERGE_PASSTHRU); |
6829 | case ISD::FRINT: |
6830 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FRINT_MERGE_PASSTHRU); |
6831 | case ISD::FROUND: |
6832 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUND_MERGE_PASSTHRU); |
6833 | case ISD::FROUNDEVEN: |
6834 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU); |
6835 | case ISD::FTRUNC: |
6836 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FTRUNC_MERGE_PASSTHRU); |
6837 | case ISD::FSQRT: |
6838 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FSQRT_MERGE_PASSTHRU); |
6839 | case ISD::FABS: |
6840 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FABS_MERGE_PASSTHRU); |
6841 | case ISD::FP_ROUND: |
6842 | case ISD::STRICT_FP_ROUND: |
6843 | return LowerFP_ROUND(Op, DAG); |
6844 | case ISD::FP_EXTEND: |
6845 | return LowerFP_EXTEND(Op, DAG); |
6846 | case ISD::FRAMEADDR: |
6847 | return LowerFRAMEADDR(Op, DAG); |
6848 | case ISD::SPONENTRY: |
6849 | return LowerSPONENTRY(Op, DAG); |
6850 | case ISD::RETURNADDR: |
6851 | return LowerRETURNADDR(Op, DAG); |
6852 | case ISD::ADDROFRETURNADDR: |
6853 | return LowerADDROFRETURNADDR(Op, DAG); |
6854 | case ISD::CONCAT_VECTORS: |
6855 | return LowerCONCAT_VECTORS(Op, DAG); |
6856 | case ISD::INSERT_VECTOR_ELT: |
6857 | return LowerINSERT_VECTOR_ELT(Op, DAG); |
6858 | case ISD::EXTRACT_VECTOR_ELT: |
6859 | return LowerEXTRACT_VECTOR_ELT(Op, DAG); |
6860 | case ISD::BUILD_VECTOR: |
6861 | return LowerBUILD_VECTOR(Op, DAG); |
6862 | case ISD::ZERO_EXTEND_VECTOR_INREG: |
6863 | return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG); |
6864 | case ISD::VECTOR_SHUFFLE: |
6865 | return LowerVECTOR_SHUFFLE(Op, DAG); |
6866 | case ISD::SPLAT_VECTOR: |
6867 | return LowerSPLAT_VECTOR(Op, DAG); |
6868 | case ISD::EXTRACT_SUBVECTOR: |
6869 | return LowerEXTRACT_SUBVECTOR(Op, DAG); |
6870 | case ISD::INSERT_SUBVECTOR: |
6871 | return LowerINSERT_SUBVECTOR(Op, DAG); |
6872 | case ISD::SDIV: |
6873 | case ISD::UDIV: |
6874 | return LowerDIV(Op, DAG); |
6875 | case ISD::SMIN: |
6876 | case ISD::UMIN: |
6877 | case ISD::SMAX: |
6878 | case ISD::UMAX: |
6879 | return LowerMinMax(Op, DAG); |
6880 | case ISD::SRA: |
6881 | case ISD::SRL: |
6882 | case ISD::SHL: |
6883 | return LowerVectorSRA_SRL_SHL(Op, DAG); |
6884 | case ISD::SHL_PARTS: |
6885 | case ISD::SRL_PARTS: |
6886 | case ISD::SRA_PARTS: |
6887 | return LowerShiftParts(Op, DAG); |
6888 | case ISD::CTPOP: |
6889 | case ISD::PARITY: |
6890 | return LowerCTPOP_PARITY(Op, DAG); |
6891 | case ISD::FCOPYSIGN: |
6892 | return LowerFCOPYSIGN(Op, DAG); |
6893 | case ISD::OR: |
6894 | return LowerVectorOR(Op, DAG); |
6895 | case ISD::XOR: |
6896 | return LowerXOR(Op, DAG); |
6897 | case ISD::PREFETCH: |
6898 | return LowerPREFETCH(Op, DAG); |
6899 | case ISD::SINT_TO_FP: |
6900 | case ISD::UINT_TO_FP: |
6901 | case ISD::STRICT_SINT_TO_FP: |
6902 | case ISD::STRICT_UINT_TO_FP: |
6903 | return LowerINT_TO_FP(Op, DAG); |
6904 | case ISD::FP_TO_SINT: |
6905 | case ISD::FP_TO_UINT: |
6906 | case ISD::STRICT_FP_TO_SINT: |
6907 | case ISD::STRICT_FP_TO_UINT: |
6908 | return LowerFP_TO_INT(Op, DAG); |
6909 | case ISD::FP_TO_SINT_SAT: |
6910 | case ISD::FP_TO_UINT_SAT: |
6911 | return LowerFP_TO_INT_SAT(Op, DAG); |
6912 | case ISD::FSINCOS: |
6913 | return LowerFSINCOS(Op, DAG); |
6914 | case ISD::GET_ROUNDING: |
6915 | return LowerGET_ROUNDING(Op, DAG); |
6916 | case ISD::SET_ROUNDING: |
6917 | return LowerSET_ROUNDING(Op, DAG); |
6918 | case ISD::GET_FPMODE: |
6919 | return LowerGET_FPMODE(Op, DAG); |
6920 | case ISD::SET_FPMODE: |
6921 | return LowerSET_FPMODE(Op, DAG); |
6922 | case ISD::RESET_FPMODE: |
6923 | return LowerRESET_FPMODE(Op, DAG); |
6924 | case ISD::MUL: |
6925 | return LowerMUL(Op, DAG); |
6926 | case ISD::MULHS: |
6927 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHS_PRED); |
6928 | case ISD::MULHU: |
6929 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::MULHU_PRED); |
6930 | case ISD::INTRINSIC_W_CHAIN: |
6931 | return LowerINTRINSIC_W_CHAIN(Op, DAG); |
6932 | case ISD::INTRINSIC_WO_CHAIN: |
6933 | return LowerINTRINSIC_WO_CHAIN(Op, DAG); |
6934 | case ISD::INTRINSIC_VOID: |
6935 | return LowerINTRINSIC_VOID(Op, DAG); |
6936 | case ISD::ATOMIC_STORE: |
6937 | if (cast<MemSDNode>(Val&: Op)->getMemoryVT() == MVT::i128) { |
6938 | assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3()); |
6939 | return LowerStore128(Op, DAG); |
6940 | } |
6941 | return SDValue(); |
6942 | case ISD::STORE: |
6943 | return LowerSTORE(Op, DAG); |
6944 | case ISD::MSTORE: |
6945 | return LowerFixedLengthVectorMStoreToSVE(Op, DAG); |
6946 | case ISD::MGATHER: |
6947 | return LowerMGATHER(Op, DAG); |
6948 | case ISD::MSCATTER: |
6949 | return LowerMSCATTER(Op, DAG); |
6950 | case ISD::VECREDUCE_SEQ_FADD: |
6951 | return LowerVECREDUCE_SEQ_FADD(ScalarOp: Op, DAG); |
6952 | case ISD::VECREDUCE_ADD: |
6953 | case ISD::VECREDUCE_AND: |
6954 | case ISD::VECREDUCE_OR: |
6955 | case ISD::VECREDUCE_XOR: |
6956 | case ISD::VECREDUCE_SMAX: |
6957 | case ISD::VECREDUCE_SMIN: |
6958 | case ISD::VECREDUCE_UMAX: |
6959 | case ISD::VECREDUCE_UMIN: |
6960 | case ISD::VECREDUCE_FADD: |
6961 | case ISD::VECREDUCE_FMAX: |
6962 | case ISD::VECREDUCE_FMIN: |
6963 | case ISD::VECREDUCE_FMAXIMUM: |
6964 | case ISD::VECREDUCE_FMINIMUM: |
6965 | return LowerVECREDUCE(Op, DAG); |
6966 | case ISD::ATOMIC_LOAD_AND: |
6967 | return LowerATOMIC_LOAD_AND(Op, DAG); |
6968 | case ISD::DYNAMIC_STACKALLOC: |
6969 | return LowerDYNAMIC_STACKALLOC(Op, DAG); |
6970 | case ISD::VSCALE: |
6971 | return LowerVSCALE(Op, DAG); |
6972 | case ISD::ANY_EXTEND: |
6973 | case ISD::SIGN_EXTEND: |
6974 | case ISD::ZERO_EXTEND: |
6975 | return LowerFixedLengthVectorIntExtendToSVE(Op, DAG); |
6976 | case ISD::SIGN_EXTEND_INREG: { |
6977 | // Only custom lower when ExtraVT has a legal byte based element type. |
6978 | EVT ExtraVT = cast<VTSDNode>(Val: Op.getOperand(i: 1))->getVT();
6979 | EVT ExtraEltVT = ExtraVT.getVectorElementType();
6980 | if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) && |
6981 | (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64)) |
6982 | return SDValue(); |
6983 | |
6984 | return LowerToPredicatedOp(Op, DAG, |
6985 | NewOp: AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU); |
6986 | } |
6987 | case ISD::TRUNCATE: |
6988 | return LowerTRUNCATE(Op, DAG); |
6989 | case ISD::MLOAD: |
6990 | return LowerMLOAD(Op, DAG); |
6991 | case ISD::LOAD: |
6992 | if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(), |
6993 | OverrideNEON: !Subtarget->isNeonAvailable())) |
6994 | return LowerFixedLengthVectorLoadToSVE(Op, DAG); |
6995 | return LowerLOAD(Op, DAG); |
6996 | case ISD::ADD: |
6997 | case ISD::AND: |
6998 | case ISD::SUB: |
6999 | return LowerToScalableOp(Op, DAG); |
7000 | case ISD::FMAXIMUM: |
7001 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAX_PRED); |
7002 | case ISD::FMAXNUM: |
7003 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMAXNM_PRED); |
7004 | case ISD::FMINIMUM: |
7005 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMIN_PRED); |
7006 | case ISD::FMINNUM: |
7007 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::FMINNM_PRED); |
7008 | case ISD::VSELECT: |
7009 | return LowerFixedLengthVectorSelectToSVE(Op, DAG); |
7010 | case ISD::ABS: |
7011 | return LowerABS(Op, DAG); |
7012 | case ISD::ABDS: |
7013 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDS_PRED); |
7014 | case ISD::ABDU: |
7015 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::ABDU_PRED); |
7016 | case ISD::AVGFLOORS: |
7017 | return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDS_PRED); |
7018 | case ISD::AVGFLOORU: |
7019 | return LowerAVG(Op, DAG, NewOp: AArch64ISD::HADDU_PRED); |
7020 | case ISD::AVGCEILS: |
7021 | return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDS_PRED); |
7022 | case ISD::AVGCEILU: |
7023 | return LowerAVG(Op, DAG, NewOp: AArch64ISD::RHADDU_PRED); |
7024 | case ISD::BITREVERSE: |
7025 | return LowerBitreverse(Op, DAG); |
7026 | case ISD::BSWAP: |
7027 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BSWAP_MERGE_PASSTHRU); |
7028 | case ISD::CTLZ: |
7029 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTLZ_MERGE_PASSTHRU); |
7030 | case ISD::CTTZ: |
7031 | return LowerCTTZ(Op, DAG); |
7032 | case ISD::VECTOR_SPLICE: |
7033 | return LowerVECTOR_SPLICE(Op, DAG); |
7034 | case ISD::VECTOR_DEINTERLEAVE: |
7035 | return LowerVECTOR_DEINTERLEAVE(Op, DAG); |
7036 | case ISD::VECTOR_INTERLEAVE: |
7037 | return LowerVECTOR_INTERLEAVE(Op, DAG); |
7038 | case ISD::LRINT: |
7039 | case ISD::LLRINT: |
7040 | if (Op.getValueType().isVector()) |
7041 | return LowerVectorXRINT(Op, DAG); |
7042 | [[fallthrough]]; |
7043 | case ISD::LROUND: |
7044 | case ISD::LLROUND: { |
7045 | assert((Op.getOperand(0).getValueType() == MVT::f16 || |
7046 | Op.getOperand(0).getValueType() == MVT::bf16) && |
"Expected custom lowering of rounding operations only for f16");
7048 | SDLoc DL(Op); |
7049 | SDValue Ext = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::f32, Operand: Op.getOperand(i: 0)); |
7050 | return DAG.getNode(Opcode: Op.getOpcode(), DL, VT: Op.getValueType(), Operand: Ext); |
7051 | } |
7052 | case ISD::STRICT_LROUND: |
7053 | case ISD::STRICT_LLROUND: |
7054 | case ISD::STRICT_LRINT: |
7055 | case ISD::STRICT_LLRINT: { |
7056 | assert((Op.getOperand(1).getValueType() == MVT::f16 || |
7057 | Op.getOperand(1).getValueType() == MVT::bf16) && |
"Expected custom lowering of rounding operations only for f16");
7059 | SDLoc DL(Op); |
7060 | SDValue Ext = DAG.getNode(Opcode: ISD::STRICT_FP_EXTEND, DL, ResultTys: {MVT::f32, MVT::Other}, |
7061 | Ops: {Op.getOperand(i: 0), Op.getOperand(i: 1)}); |
7062 | return DAG.getNode(Opcode: Op.getOpcode(), DL, ResultTys: {Op.getValueType(), MVT::Other}, |
7063 | Ops: {Ext.getValue(R: 1), Ext.getValue(R: 0)}); |
7064 | } |
7065 | case ISD::WRITE_REGISTER: { |
7066 | assert(Op.getOperand(2).getValueType() == MVT::i128 && |
"WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7068 | SDLoc DL(Op); |
7069 | |
7070 | SDValue Chain = Op.getOperand(i: 0); |
7071 | SDValue SysRegName = Op.getOperand(i: 1); |
7072 | std::pair<SDValue, SDValue> Pair = |
7073 | DAG.SplitScalar(N: Op.getOperand(i: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64); |
7074 | |
7075 | // chain = MSRR(chain, sysregname, lo, hi) |
7076 | SDValue Result = DAG.getNode(Opcode: AArch64ISD::MSRR, DL, VT: MVT::Other, N1: Chain, |
7077 | N2: SysRegName, N3: Pair.first, N4: Pair.second); |
7078 | |
7079 | return Result; |
7080 | } |
7081 | case ISD::FSHL: |
7082 | case ISD::FSHR: |
7083 | return LowerFunnelShift(Op, DAG); |
7084 | case ISD::FLDEXP: |
7085 | return LowerFLDEXP(Op, DAG); |
7086 | case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: |
7087 | return LowerVECTOR_HISTOGRAM(Op, DAG); |
7088 | } |
7089 | } |
7090 | |
7091 | bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const { |
7092 | return !Subtarget->useSVEForFixedLengthVectors(); |
7093 | } |
7094 | |
7095 | bool AArch64TargetLowering::useSVEForFixedLengthVectorVT( |
7096 | EVT VT, bool OverrideNEON) const { |
7097 | if (!VT.isFixedLengthVector() || !VT.isSimple()) |
7098 | return false; |
7099 | |
7100 | // Don't use SVE for vectors we cannot scalarize if required. |
7101 | switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { |
7102 | // Fixed length predicates should be promoted to i8. |
7103 | // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work. |
7104 | case MVT::i1: |
7105 | default: |
7106 | return false; |
7107 | case MVT::i8: |
7108 | case MVT::i16: |
7109 | case MVT::i32: |
7110 | case MVT::i64: |
7111 | case MVT::f16: |
7112 | case MVT::f32: |
7113 | case MVT::f64: |
7114 | break; |
7115 | } |
7116 | |
7117 | // NEON-sized vectors can be emulated using SVE instructions. |
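// For example, a 128-bit v4i32 would normally be a NEON type, but when NEON
// is unavailable (e.g. in streaming SVE mode) callers pass OverrideNEON and
// the type is lowered with SVE instructions instead.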
7118 | if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector())) |
7119 | return Subtarget->isSVEorStreamingSVEAvailable(); |
7120 | |
7121 | // Ensure NEON MVTs only belong to a single register class. |
7122 | if (VT.getFixedSizeInBits() <= 128) |
7123 | return false; |
7124 | |
7125 | // Ensure wider than NEON code generation is enabled. |
7126 | if (!Subtarget->useSVEForFixedLengthVectors()) |
7127 | return false; |
7128 | |
7129 | // Don't use SVE for types that don't fit. |
7130 | if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits()) |
7131 | return false; |
7132 | |
7133 | // TODO: Perhaps an artificial restriction, but worth having whilst getting |
7134 | // the base fixed length SVE support in place. |
7135 | if (!VT.isPow2VectorType()) |
7136 | return false; |
7137 | |
7138 | return true; |
7139 | } |
7140 | |
7141 | //===----------------------------------------------------------------------===// |
7142 | // Calling Convention Implementation |
7143 | //===----------------------------------------------------------------------===// |
7144 | |
7145 | static unsigned getIntrinsicID(const SDNode *N) { |
7146 | unsigned Opcode = N->getOpcode(); |
7147 | switch (Opcode) { |
7148 | default: |
7149 | return Intrinsic::not_intrinsic; |
7150 | case ISD::INTRINSIC_WO_CHAIN: { |
7151 | unsigned IID = N->getConstantOperandVal(Num: 0); |
7152 | if (IID < Intrinsic::num_intrinsics) |
7153 | return IID; |
7154 | return Intrinsic::not_intrinsic; |
7155 | } |
7156 | } |
7157 | } |
7158 | |
7159 | bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, |
7160 | SDValue N1) const { |
7161 | if (!N0.hasOneUse()) |
7162 | return false; |
7163 | |
7164 | unsigned IID = getIntrinsicID(N: N1.getNode()); |
7165 | // Avoid reassociating expressions that can be lowered to smlal/umlal. |
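// e.g. given add(add(a, b), umull(c, d)) we return false, keeping the DAG in
// a shape that can still select umlal for the multiply-accumulate.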
7166 | if (IID == Intrinsic::aarch64_neon_umull || |
7167 | N1.getOpcode() == AArch64ISD::UMULL || |
7168 | IID == Intrinsic::aarch64_neon_smull || |
7169 | N1.getOpcode() == AArch64ISD::SMULL) |
7170 | return N0.getOpcode() != ISD::ADD; |
7171 | |
7172 | return true; |
7173 | } |
7174 | |
7175 | /// Selects the correct CCAssignFn for a given CallingConvention value. |
7176 | CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, |
7177 | bool IsVarArg) const { |
7178 | switch (CC) { |
7179 | default: |
report_fatal_error(reason: "Unsupported calling convention.");
7181 | case CallingConv::GHC: |
7182 | return CC_AArch64_GHC; |
7183 | case CallingConv::PreserveNone: |
7184 | // The VarArg implementation makes assumptions about register |
7185 | // argument passing that do not hold for preserve_none, so we |
7186 | // instead fall back to C argument passing. |
7187 | // The non-vararg case is handled in the CC function itself. |
7188 | if (!IsVarArg) |
7189 | return CC_AArch64_Preserve_None; |
7190 | [[fallthrough]]; |
7191 | case CallingConv::C: |
7192 | case CallingConv::Fast: |
7193 | case CallingConv::PreserveMost: |
7194 | case CallingConv::PreserveAll: |
7195 | case CallingConv::CXX_FAST_TLS: |
7196 | case CallingConv::Swift: |
7197 | case CallingConv::SwiftTail: |
7198 | case CallingConv::Tail: |
7199 | case CallingConv::GRAAL: |
7200 | if (Subtarget->isTargetWindows()) { |
7201 | if (IsVarArg) { |
7202 | if (Subtarget->isWindowsArm64EC()) |
7203 | return CC_AArch64_Arm64EC_VarArg; |
7204 | return CC_AArch64_Win64_VarArg; |
7205 | } |
7206 | return CC_AArch64_Win64PCS; |
7207 | } |
7208 | if (!Subtarget->isTargetDarwin()) |
7209 | return CC_AArch64_AAPCS; |
7210 | if (!IsVarArg) |
7211 | return CC_AArch64_DarwinPCS; |
7212 | return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg |
7213 | : CC_AArch64_DarwinPCS_VarArg; |
7214 | case CallingConv::Win64: |
7215 | if (IsVarArg) { |
7216 | if (Subtarget->isWindowsArm64EC()) |
7217 | return CC_AArch64_Arm64EC_VarArg; |
7218 | return CC_AArch64_Win64_VarArg; |
7219 | } |
7220 | return CC_AArch64_Win64PCS; |
7221 | case CallingConv::CFGuard_Check: |
7222 | if (Subtarget->isWindowsArm64EC()) |
7223 | return CC_AArch64_Arm64EC_CFGuard_Check; |
7224 | return CC_AArch64_Win64_CFGuard_Check; |
7225 | case CallingConv::AArch64_VectorCall: |
7226 | case CallingConv::AArch64_SVE_VectorCall: |
7227 | case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: |
7228 | case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: |
7229 | return CC_AArch64_AAPCS; |
7230 | case CallingConv::ARM64EC_Thunk_X64: |
7231 | return CC_AArch64_Arm64EC_Thunk; |
7232 | case CallingConv::ARM64EC_Thunk_Native: |
7233 | return CC_AArch64_Arm64EC_Thunk_Native; |
7234 | } |
7235 | } |
7236 | |
7237 | CCAssignFn * |
7238 | AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { |
7239 | switch (CC) { |
7240 | default: |
7241 | return RetCC_AArch64_AAPCS; |
7242 | case CallingConv::ARM64EC_Thunk_X64: |
7243 | return RetCC_AArch64_Arm64EC_Thunk; |
7244 | case CallingConv::CFGuard_Check: |
7245 | if (Subtarget->isWindowsArm64EC()) |
7246 | return RetCC_AArch64_Arm64EC_CFGuard_Check; |
7247 | return RetCC_AArch64_AAPCS; |
7248 | } |
7249 | } |
7250 | |
7251 | static bool isPassedInFPR(EVT VT) { |
7252 | return VT.isFixedLengthVector() || |
7253 | (VT.isFloatingPoint() && !VT.isScalableVector()); |
7254 | } |
7255 | |
7256 | SDValue AArch64TargetLowering::LowerFormalArguments( |
7257 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
7258 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, |
7259 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
7260 | MachineFunction &MF = DAG.getMachineFunction(); |
7261 | const Function &F = MF.getFunction(); |
7262 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
7263 | bool IsWin64 = |
7264 | Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg()); |
7265 | bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 || |
7266 | (isVarArg && Subtarget->isWindowsArm64EC()); |
7267 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
7268 | |
7269 | SmallVector<ISD::OutputArg, 4> Outs; |
7270 | GetReturnInfo(CC: CallConv, ReturnType: F.getReturnType(), attr: F.getAttributes(), Outs, |
7271 | TLI: DAG.getTargetLoweringInfo(), DL: MF.getDataLayout()); |
7272 | if (any_of(Range&: Outs, P: [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); })) |
7273 | FuncInfo->setIsSVECC(true); |
7274 | |
7275 | // Assign locations to all of the incoming arguments. |
7276 | SmallVector<CCValAssign, 16> ArgLocs; |
7277 | DenseMap<unsigned, SDValue> CopiedRegs; |
7278 | CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); |
7279 | |
7280 | // At this point, Ins[].VT may already be promoted to i32. To correctly |
7281 | // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and |
7282 | // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. |
7283 | // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here |
7284 | // we use a special version of AnalyzeFormalArguments to pass in ValVT and |
7285 | // LocVT. |
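// For example, an i8 argument reaches this point with Ins[i].VT promoted to
// MVT::i32; recovering the original IR type below lets the calling convention
// treat it as i8 and give it a 1-byte stack slot rather than a 4-byte one.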
7286 | unsigned NumArgs = Ins.size(); |
7287 | Function::const_arg_iterator CurOrigArg = F.arg_begin(); |
7288 | unsigned CurArgIdx = 0; |
7289 | for (unsigned i = 0; i != NumArgs; ++i) { |
7290 | MVT ValVT = Ins[i].VT; |
7291 | if (Ins[i].isOrigArg()) { |
7292 | std::advance(i&: CurOrigArg, n: Ins[i].getOrigArgIndex() - CurArgIdx); |
7293 | CurArgIdx = Ins[i].getOrigArgIndex(); |
7294 | |
7295 | // Get type of the original argument. |
7296 | EVT ActualVT = getValueType(DL: DAG.getDataLayout(), Ty: CurOrigArg->getType(), |
7297 | /*AllowUnknown*/ true); |
7298 | MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; |
7299 | // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. |
7300 | if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) |
7301 | ValVT = MVT::i8; |
7302 | else if (ActualMVT == MVT::i16) |
7303 | ValVT = MVT::i16; |
7304 | } |
7305 | bool UseVarArgCC = false; |
7306 | if (IsWin64) |
7307 | UseVarArgCC = isVarArg; |
7308 | CCAssignFn *AssignFn = CCAssignFnForCall(CC: CallConv, IsVarArg: UseVarArgCC); |
7309 | bool Res = |
7310 | AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); |
assert(!Res && "Call operand has unhandled type");
7312 | (void)Res; |
7313 | } |
7314 | |
7315 | SMEAttrs Attrs(MF.getFunction()); |
7316 | bool IsLocallyStreaming = |
7317 | !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody(); |
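// A locally streaming function exposes a non-streaming interface to callers
// but runs its body in streaming mode, so an SMSTART must be emitted once the
// incoming register arguments have been copied out (see below).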
assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7319 | SDValue Glue = Chain.getValue(R: 1); |
7320 | |
7321 | SmallVector<SDValue, 16> ArgValues; |
unsigned ExtraArgLocs = 0;
7323 | for (unsigned i = 0, e = Ins.size(); i != e; ++i) { |
7324 | CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; |
7325 | |
7326 | if (Ins[i].Flags.isByVal()) { |
7327 | // Byval is used for HFAs in the PCS, but the system should work in a |
7328 | // non-compliant manner for larger structs. |
7329 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
7330 | int Size = Ins[i].Flags.getByValSize(); |
7331 | unsigned NumRegs = (Size + 7) / 8; |
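// e.g. a 12-byte byval aggregate rounds up to NumRegs == 2, i.e. a 16-byte
// fixed frame object.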
7332 | |
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should work for fundamental types too.
7335 | unsigned FrameIdx = |
7336 | MFI.CreateFixedObject(Size: 8 * NumRegs, SPOffset: VA.getLocMemOffset(), IsImmutable: false); |
7337 | SDValue FrameIdxN = DAG.getFrameIndex(FI: FrameIdx, VT: PtrVT); |
7338 | InVals.push_back(Elt: FrameIdxN); |
7339 | |
7340 | continue; |
7341 | } |
7342 | |
7343 | if (Ins[i].Flags.isSwiftAsync()) |
7344 | MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true); |
7345 | |
7346 | SDValue ArgValue; |
7347 | if (VA.isRegLoc()) { |
7348 | // Arguments stored in registers. |
7349 | EVT RegVT = VA.getLocVT(); |
7350 | const TargetRegisterClass *RC; |
7351 | |
7352 | if (RegVT == MVT::i32) |
7353 | RC = &AArch64::GPR32RegClass; |
7354 | else if (RegVT == MVT::i64) |
7355 | RC = &AArch64::GPR64RegClass; |
7356 | else if (RegVT == MVT::f16 || RegVT == MVT::bf16) |
7357 | RC = &AArch64::FPR16RegClass; |
7358 | else if (RegVT == MVT::f32) |
7359 | RC = &AArch64::FPR32RegClass; |
7360 | else if (RegVT == MVT::f64 || RegVT.is64BitVector()) |
7361 | RC = &AArch64::FPR64RegClass; |
7362 | else if (RegVT == MVT::f128 || RegVT.is128BitVector()) |
7363 | RC = &AArch64::FPR128RegClass; |
7364 | else if (RegVT.isScalableVector() && |
7365 | RegVT.getVectorElementType() == MVT::i1) { |
7366 | FuncInfo->setIsSVECC(true); |
7367 | RC = &AArch64::PPRRegClass; |
7368 | } else if (RegVT == MVT::aarch64svcount) { |
7369 | FuncInfo->setIsSVECC(true); |
7370 | RC = &AArch64::PPRRegClass; |
7371 | } else if (RegVT.isScalableVector()) { |
7372 | FuncInfo->setIsSVECC(true); |
7373 | RC = &AArch64::ZPRRegClass; |
7374 | } else |
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7376 | |
7377 | // Transform the arguments in physical registers into virtual ones. |
7378 | Register Reg = MF.addLiveIn(PReg: VA.getLocReg(), RC); |
7379 | |
7380 | if (IsLocallyStreaming) { |
7381 | // LocallyStreamingFunctions must insert the SMSTART in the correct |
7382 | // position, so we use Glue to ensure no instructions can be scheduled |
7383 | // between the chain of: |
7384 | // t0: ch,glue = EntryNode |
7385 | // t1: res,ch,glue = CopyFromReg |
7386 | // ... |
7387 | // tn: res,ch,glue = CopyFromReg t(n-1), .. |
7388 | // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2 |
7389 | // ^^^^^^ |
7390 | // This will be the new Chain/Root node. |
7391 | ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT, Glue); |
7392 | Glue = ArgValue.getValue(R: 2); |
7393 | if (isPassedInFPR(VT: ArgValue.getValueType())) { |
7394 | ArgValue = |
7395 | DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, |
7396 | VTList: DAG.getVTList(VT1: ArgValue.getValueType(), VT2: MVT::Glue), |
7397 | Ops: {ArgValue, Glue}); |
7398 | Glue = ArgValue.getValue(R: 1); |
7399 | } |
7400 | } else |
7401 | ArgValue = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: RegVT); |
7402 | |
7403 | // If this is an 8, 16 or 32-bit value, it is really passed promoted |
7404 | // to 64 bits. Insert an assert[sz]ext to capture this, then |
7405 | // truncate to the right size. |
7406 | switch (VA.getLocInfo()) { |
7407 | default: |
llvm_unreachable("Unknown loc info!");
7409 | case CCValAssign::Full: |
7410 | break; |
7411 | case CCValAssign::Indirect: |
7412 | assert( |
7413 | (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) && |
"Indirect arguments should be scalable on most subtargets");
7415 | break; |
7416 | case CCValAssign::BCvt: |
7417 | ArgValue = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: ArgValue); |
7418 | break; |
7419 | case CCValAssign::AExt: |
7420 | case CCValAssign::SExt: |
7421 | case CCValAssign::ZExt: |
7422 | break; |
7423 | case CCValAssign::AExtUpper: |
7424 | ArgValue = DAG.getNode(Opcode: ISD::SRL, DL, VT: RegVT, N1: ArgValue, |
7425 | N2: DAG.getConstant(Val: 32, DL, VT: RegVT)); |
7426 | ArgValue = DAG.getZExtOrTrunc(Op: ArgValue, DL, VT: VA.getValVT()); |
7427 | break; |
7428 | } |
7429 | } else { // VA.isRegLoc() |
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7431 | unsigned ArgOffset = VA.getLocMemOffset(); |
7432 | unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect |
7433 | ? VA.getLocVT().getSizeInBits() |
7434 | : VA.getValVT().getSizeInBits()) / 8; |
7435 | |
7436 | uint32_t BEAlign = 0; |
7437 | if (!Subtarget->isLittleEndian() && ArgSize < 8 && |
7438 | !Ins[i].Flags.isInConsecutiveRegs()) |
7439 | BEAlign = 8 - ArgSize; |
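// e.g. a 2-byte argument gets BEAlign == 6, so on big-endian targets the load
// below reads from the end of the 8-byte slot, where the value was placed.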
7440 | |
7441 | SDValue FIN; |
7442 | MachinePointerInfo PtrInfo; |
7443 | if (StackViaX4) { |
7444 | // In both the ARM64EC varargs convention and the thunk convention, |
7445 | // arguments on the stack are accessed relative to x4, not sp. In |
7446 | // the thunk convention, there's an additional offset of 32 bytes |
7447 | // to account for the shadow store. |
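// e.g. under the x64 thunk convention the first stack argument is read from
// x4 + 32 (past the 32-byte shadow store), with subsequent 8-byte slots
// following it.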
7448 | unsigned ObjOffset = ArgOffset + BEAlign; |
7449 | if (CallConv == CallingConv::ARM64EC_Thunk_X64) |
7450 | ObjOffset += 32; |
7451 | Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass); |
7452 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64); |
7453 | FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val, |
7454 | N2: DAG.getConstant(Val: ObjOffset, DL, VT: MVT::i64)); |
7455 | PtrInfo = MachinePointerInfo::getUnknownStack(MF); |
7456 | } else { |
7457 | int FI = MFI.CreateFixedObject(Size: ArgSize, SPOffset: ArgOffset + BEAlign, IsImmutable: true); |
7458 | |
7459 | // Create load nodes to retrieve arguments from the stack. |
7460 | FIN = DAG.getFrameIndex(FI, VT: getPointerTy(DL: DAG.getDataLayout())); |
7461 | PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); |
7462 | } |
7463 | |
// For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
7465 | ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; |
7466 | MVT MemVT = VA.getValVT(); |
7467 | |
7468 | switch (VA.getLocInfo()) { |
7469 | default: |
7470 | break; |
7471 | case CCValAssign::Trunc: |
7472 | case CCValAssign::BCvt: |
7473 | MemVT = VA.getLocVT(); |
7474 | break; |
7475 | case CCValAssign::Indirect: |
7476 | assert((VA.getValVT().isScalableVector() || |
7477 | Subtarget->isWindowsArm64EC()) && |
"Indirect arguments should be scalable on most subtargets");
7479 | MemVT = VA.getLocVT(); |
7480 | break; |
7481 | case CCValAssign::SExt: |
7482 | ExtType = ISD::SEXTLOAD; |
7483 | break; |
7484 | case CCValAssign::ZExt: |
7485 | ExtType = ISD::ZEXTLOAD; |
7486 | break; |
7487 | case CCValAssign::AExt: |
7488 | ExtType = ISD::EXTLOAD; |
7489 | break; |
7490 | } |
7491 | |
7492 | ArgValue = DAG.getExtLoad(ExtType, dl: DL, VT: VA.getLocVT(), Chain, Ptr: FIN, PtrInfo, |
7493 | MemVT); |
7494 | } |
7495 | |
7496 | if (VA.getLocInfo() == CCValAssign::Indirect) { |
7497 | assert((VA.getValVT().isScalableVT() || |
7498 | Subtarget->isWindowsArm64EC()) && |
"Indirect arguments should be scalable on most subtargets");
7500 | |
7501 | uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue(); |
7502 | unsigned NumParts = 1; |
7503 | if (Ins[i].Flags.isInConsecutiveRegs()) { |
7504 | while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) |
7505 | ++NumParts; |
7506 | } |
7507 | |
7508 | MVT PartLoad = VA.getValVT(); |
7509 | SDValue Ptr = ArgValue; |
7510 | |
7511 | // Ensure we generate all loads for each tuple part, whilst updating the |
7512 | // pointer after each load correctly using vscale. |
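// e.g. a two-part SVE tuple is loaded with two part loads, advancing Ptr by
// vscale * PartSize bytes in between.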
7513 | while (NumParts > 0) { |
7514 | ArgValue = DAG.getLoad(VT: PartLoad, dl: DL, Chain, Ptr, PtrInfo: MachinePointerInfo()); |
7515 | InVals.push_back(Elt: ArgValue); |
7516 | NumParts--; |
7517 | if (NumParts > 0) { |
7518 | SDValue BytesIncrement; |
7519 | if (PartLoad.isScalableVector()) { |
7520 | BytesIncrement = DAG.getVScale( |
7521 | DL, VT: Ptr.getValueType(), |
7522 | MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); |
7523 | } else { |
7524 | BytesIncrement = DAG.getConstant( |
7525 | Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, |
7526 | VT: Ptr.getValueType()); |
7527 | } |
7528 | SDNodeFlags Flags; |
7529 | Flags.setNoUnsignedWrap(true); |
7530 | Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr, |
7531 | N2: BytesIncrement, Flags); |
7532 | ExtraArgLocs++; |
7533 | i++; |
7534 | } |
7535 | } |
7536 | } else { |
7537 | if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) |
7538 | ArgValue = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: ArgValue.getValueType(), |
7539 | N1: ArgValue, N2: DAG.getValueType(MVT::i32)); |
7540 | |
7541 | // i1 arguments are zero-extended to i8 by the caller. Emit a |
7542 | // hint to reflect this. |
7543 | if (Ins[i].isOrigArg()) { |
7544 | Argument *OrigArg = F.getArg(i: Ins[i].getOrigArgIndex()); |
7545 | if (OrigArg->getType()->isIntegerTy(Bitwidth: 1)) { |
7546 | if (!Ins[i].Flags.isZExt()) { |
7547 | ArgValue = DAG.getNode(Opcode: AArch64ISD::ASSERT_ZEXT_BOOL, DL, |
7548 | VT: ArgValue.getValueType(), Operand: ArgValue); |
7549 | } |
7550 | } |
7551 | } |
7552 | |
7553 | InVals.push_back(Elt: ArgValue); |
7554 | } |
7555 | } |
7556 | assert((ArgLocs.size() + ExtraArgLocs) == Ins.size()); |
7557 | |
7558 | // Insert the SMSTART if this is a locally streaming function and |
7559 | // make sure it is Glued to the last CopyFromReg value. |
7560 | if (IsLocallyStreaming) { |
7561 | SDValue PStateSM; |
7562 | if (Attrs.hasStreamingCompatibleInterface()) { |
7563 | PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64); |
7564 | Register Reg = MF.getRegInfo().createVirtualRegister( |
7565 | RegClass: getRegClassFor(VT: PStateSM.getValueType().getSimpleVT())); |
7566 | FuncInfo->setPStateSMReg(Reg); |
7567 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: PStateSM); |
7568 | Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue, |
7569 | Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM); |
7570 | } else |
7571 | Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, InGlue: Glue, |
7572 | Condition: AArch64SME::Always); |
7573 | |
7574 | // Ensure that the SMSTART happens after the CopyWithChain such that its |
7575 | // chain result is used. |
for (unsigned I = 0; I < InVals.size(); ++I) {
7577 | Register Reg = MF.getRegInfo().createVirtualRegister( |
7578 | RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT())); |
7579 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg, N: InVals[I]); |
7580 | InVals[I] = DAG.getCopyFromReg(Chain, dl: DL, Reg, |
7581 | VT: InVals[I].getValueType()); |
7582 | } |
7583 | } |
7584 | |
7585 | // varargs |
7586 | if (isVarArg) { |
7587 | if (!Subtarget->isTargetDarwin() || IsWin64) { |
7588 | // The AAPCS variadic function ABI is identical to the non-variadic |
7589 | // one. As a result there may be more arguments in registers and we should |
7590 | // save them for future reference. |
7591 | // Win64 variadic functions also pass arguments in registers, but all float |
7592 | // arguments are passed in integer registers. |
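// e.g. for f(int, ...) only x0 is consumed by the fixed argument, so x1-x7
// (and, outside Win64, q0-q7) are saved below for later retrieval by va_arg.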
7593 | saveVarArgRegisters(CCInfo, DAG, DL, Chain); |
7594 | } |
7595 | |
7596 | // This will point to the next argument passed via stack. |
7597 | unsigned VarArgsOffset = CCInfo.getStackSize(); |
7598 | // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 |
7599 | VarArgsOffset = alignTo(Value: VarArgsOffset, Align: Subtarget->isTargetILP32() ? 4 : 8); |
7600 | FuncInfo->setVarArgsStackOffset(VarArgsOffset); |
7601 | FuncInfo->setVarArgsStackIndex( |
7602 | MFI.CreateFixedObject(Size: 4, SPOffset: VarArgsOffset, IsImmutable: true)); |
7603 | |
7604 | if (MFI.hasMustTailInVarArgFunc()) { |
7605 | SmallVector<MVT, 2> RegParmTypes; |
7606 | RegParmTypes.push_back(Elt: MVT::i64); |
7607 | RegParmTypes.push_back(Elt: MVT::f128); |
7608 | // Compute the set of forwarded registers. The rest are scratch. |
7609 | SmallVectorImpl<ForwardedRegister> &Forwards = |
7610 | FuncInfo->getForwardedMustTailRegParms(); |
7611 | CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, |
7612 | Fn: CC_AArch64_AAPCS); |
7613 | |
7614 | // Conservatively forward X8, since it might be used for aggregate return. |
7615 | if (!CCInfo.isAllocated(Reg: AArch64::X8)) { |
7616 | Register X8VReg = MF.addLiveIn(PReg: AArch64::X8, RC: &AArch64::GPR64RegClass); |
7617 | Forwards.push_back(Elt: ForwardedRegister(X8VReg, AArch64::X8, MVT::i64)); |
7618 | } |
7619 | } |
7620 | } |
7621 | |
7622 | // On Windows, InReg pointers must be returned, so record the pointer in a |
7623 | // virtual register at the start of the function so it can be returned in the |
7624 | // epilogue. |
7625 | if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) { |
7626 | for (unsigned I = 0, E = Ins.size(); I != E; ++I) { |
7627 | if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 || |
7628 | Ins[I].Flags.isInReg()) && |
7629 | Ins[I].Flags.isSRet()) { |
7630 | assert(!FuncInfo->getSRetReturnReg()); |
7631 | |
7632 | MVT PtrTy = getPointerTy(DL: DAG.getDataLayout()); |
7633 | Register Reg = |
7634 | MF.getRegInfo().createVirtualRegister(RegClass: getRegClassFor(VT: PtrTy)); |
7635 | FuncInfo->setSRetReturnReg(Reg); |
7636 | |
7637 | SDValue Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg, N: InVals[I]); |
7638 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, N1: Copy, N2: Chain); |
7639 | break; |
7640 | } |
7641 | } |
7642 | } |
7643 | |
7644 | unsigned StackArgSize = CCInfo.getStackSize(); |
7645 | bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
7646 | if (DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt)) { |
7647 | // This is a non-standard ABI so by fiat I say we're allowed to make full |
7648 | // use of the stack area to be popped, which must be aligned to 16 bytes in |
7649 | // any case: |
7650 | StackArgSize = alignTo(Value: StackArgSize, Align: 16); |
7651 | |
7652 | // If we're expected to restore the stack (e.g. fastcc) then we'll be adding |
7653 | // a multiple of 16. |
7654 | FuncInfo->setArgumentStackToRestore(StackArgSize); |
7655 | |
7656 | // This realignment carries over to the available bytes below. Our own |
7657 | // callers will guarantee the space is free by giving an aligned value to |
7658 | // CALLSEQ_START. |
7659 | } |
7660 | // Even if we're not expected to free up the space, it's useful to know how |
7661 | // much is there while considering tail calls (because we can reuse it). |
7662 | FuncInfo->setBytesInStackArgArea(StackArgSize); |
7663 | |
7664 | if (Subtarget->hasCustomCallingConv()) |
7665 | Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); |
7666 | |
// Create a 16-byte TPIDR2 object. The dynamic buffer will be expanded and
// stored in the static object later using a pseudo node.
7669 | if (SMEAttrs(MF.getFunction()).hasZAState()) { |
7670 | TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); |
7671 | TPIDR2.FrameIndex = MFI.CreateStackObject(Size: 16, Alignment: Align(16), isSpillSlot: false); |
7672 | SDValue SVL = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64, |
7673 | Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
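// The lazy-save buffer must hold all of ZA, i.e. SVL * SVL bytes (SVL being
// the streaming vector length in bytes read via RDSVL above); it is allocated
// below either by the ALLOCATE_ZA_BUFFER pseudo or, when stack probing is
// required, by a dynamic stack allocation of that size.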
7674 | |
7675 | SDValue Buffer; |
7676 | if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { |
7677 | Buffer = DAG.getNode(Opcode: AArch64ISD::ALLOCATE_ZA_BUFFER, DL, |
7678 | VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), Ops: {Chain, SVL}); |
7679 | } else { |
7680 | SDValue Size = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: SVL, N2: SVL); |
7681 | Buffer = DAG.getNode(Opcode: ISD::DYNAMIC_STACKALLOC, DL, |
7682 | VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Other), |
7683 | Ops: {Chain, Size, DAG.getConstant(Val: 1, DL, VT: MVT::i64)}); |
7684 | MFI.CreateVariableSizedObject(Alignment: Align(16), Alloca: nullptr); |
7685 | } |
7686 | Chain = DAG.getNode( |
7687 | Opcode: AArch64ISD::INIT_TPIDR2OBJ, DL, VTList: DAG.getVTList(VT: MVT::Other), |
7688 | Ops: {/*Chain*/ Buffer.getValue(R: 1), /*Buffer ptr*/ Buffer.getValue(R: 0)}); |
7689 | } |
7690 | |
7691 | if (CallConv == CallingConv::PreserveNone) { |
7692 | for (const ISD::InputArg &I : Ins) { |
7693 | if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() || |
7694 | I.Flags.isSwiftAsync()) { |
7695 | MachineFunction &MF = DAG.getMachineFunction(); |
7696 | DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported( |
7697 | MF.getFunction(), |
"Swift attributes can't be used with preserve_none",
7699 | DL.getDebugLoc())); |
7700 | break; |
7701 | } |
7702 | } |
7703 | } |
7704 | |
7705 | return Chain; |
7706 | } |
7707 | |
7708 | void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, |
7709 | SelectionDAG &DAG, |
7710 | const SDLoc &DL, |
7711 | SDValue &Chain) const { |
7712 | MachineFunction &MF = DAG.getMachineFunction(); |
7713 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
7714 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
7715 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
7716 | Function &F = MF.getFunction(); |
7717 | bool IsWin64 = |
7718 | Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg()); |
7719 | |
7720 | SmallVector<SDValue, 8> MemOps; |
7721 | |
7722 | auto GPRArgRegs = AArch64::getGPRArgRegs(); |
7723 | unsigned NumGPRArgRegs = GPRArgRegs.size(); |
7724 | if (Subtarget->isWindowsArm64EC()) { |
7725 | // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs |
7726 | // functions. |
7727 | NumGPRArgRegs = 4; |
7728 | } |
7729 | unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(Regs: GPRArgRegs); |
7730 | |
7731 | unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); |
7732 | int GPRIdx = 0; |
7733 | if (GPRSaveSize != 0) { |
7734 | if (IsWin64) { |
7735 | GPRIdx = MFI.CreateFixedObject(Size: GPRSaveSize, SPOffset: -(int)GPRSaveSize, IsImmutable: false); |
7736 | if (GPRSaveSize & 15) |
7737 | // The extra size here, if triggered, will always be 8. |
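// e.g. three unallocated GPRs give GPRSaveSize == 24, so an extra 8-byte
// object pads the save area to the next 16-byte boundary.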
7738 | MFI.CreateFixedObject(Size: 16 - (GPRSaveSize & 15), SPOffset: -(int)alignTo(Value: GPRSaveSize, Align: 16), IsImmutable: false); |
7739 | } else |
7740 | GPRIdx = MFI.CreateStackObject(Size: GPRSaveSize, Alignment: Align(8), isSpillSlot: false); |
7741 | |
7742 | SDValue FIN; |
7743 | if (Subtarget->isWindowsArm64EC()) { |
7744 | // With the Arm64EC ABI, we reserve the save area as usual, but we |
7745 | // compute its address relative to x4. For a normal AArch64->AArch64 |
7746 | // call, x4 == sp on entry, but calls from an entry thunk can pass in a |
7747 | // different address. |
7748 | Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass); |
7749 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64); |
7750 | FIN = DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Val, |
7751 | N2: DAG.getConstant(Val: GPRSaveSize, DL, VT: MVT::i64)); |
7752 | } else { |
7753 | FIN = DAG.getFrameIndex(FI: GPRIdx, VT: PtrVT); |
7754 | } |
7755 | |
7756 | for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { |
7757 | Register VReg = MF.addLiveIn(PReg: GPRArgRegs[i], RC: &AArch64::GPR64RegClass); |
7758 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::i64); |
7759 | SDValue Store = |
7760 | DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN, |
7761 | PtrInfo: IsWin64 ? MachinePointerInfo::getFixedStack( |
7762 | MF, FI: GPRIdx, Offset: (i - FirstVariadicGPR) * 8) |
7763 | : MachinePointerInfo::getStack(MF, Offset: i * 8)); |
7764 | MemOps.push_back(Elt: Store); |
7765 | FIN = |
7766 | DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, N2: DAG.getConstant(Val: 8, DL, VT: PtrVT)); |
7767 | } |
7768 | } |
7769 | FuncInfo->setVarArgsGPRIndex(GPRIdx); |
7770 | FuncInfo->setVarArgsGPRSize(GPRSaveSize); |
7771 | |
7772 | if (Subtarget->hasFPARMv8() && !IsWin64) { |
7773 | auto FPRArgRegs = AArch64::getFPRArgRegs(); |
7774 | const unsigned NumFPRArgRegs = FPRArgRegs.size(); |
7775 | unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(Regs: FPRArgRegs); |
7776 | |
7777 | unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); |
7778 | int FPRIdx = 0; |
7779 | if (FPRSaveSize != 0) { |
7780 | FPRIdx = MFI.CreateStackObject(Size: FPRSaveSize, Alignment: Align(16), isSpillSlot: false); |
7781 | |
7782 | SDValue FIN = DAG.getFrameIndex(FI: FPRIdx, VT: PtrVT); |
7783 | |
7784 | for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { |
7785 | Register VReg = MF.addLiveIn(PReg: FPRArgRegs[i], RC: &AArch64::FPR128RegClass); |
7786 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: VReg, VT: MVT::f128); |
7787 | |
7788 | SDValue Store = DAG.getStore(Chain: Val.getValue(R: 1), dl: DL, Val, Ptr: FIN, |
7789 | PtrInfo: MachinePointerInfo::getStack(MF, Offset: i * 16)); |
7790 | MemOps.push_back(Elt: Store); |
7791 | FIN = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: FIN, |
7792 | N2: DAG.getConstant(Val: 16, DL, VT: PtrVT)); |
7793 | } |
7794 | } |
7795 | FuncInfo->setVarArgsFPRIndex(FPRIdx); |
7796 | FuncInfo->setVarArgsFPRSize(FPRSaveSize); |
7797 | } |
7798 | |
7799 | if (!MemOps.empty()) { |
7800 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps); |
7801 | } |
7802 | } |
7803 | |
7804 | /// LowerCallResult - Lower the result values of a call into the |
7805 | /// appropriate copies out of appropriate physical registers. |
7806 | SDValue AArch64TargetLowering::LowerCallResult( |
7807 | SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, |
7808 | const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL, |
7809 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, |
7810 | SDValue ThisVal, bool RequiresSMChange) const { |
7811 | DenseMap<unsigned, SDValue> CopiedRegs; |
7812 | // Copy all of the result registers out of their specified physreg. |
7813 | for (unsigned i = 0; i != RVLocs.size(); ++i) { |
7814 | CCValAssign VA = RVLocs[i]; |
7815 | |
7816 | // Pass 'this' value directly from the argument to return value, to avoid |
7817 | // reg unit interference |
7818 | if (i == 0 && isThisReturn) { |
7819 | assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && |
"unexpected return calling convention register assignment");
7821 | InVals.push_back(Elt: ThisVal); |
7822 | continue; |
7823 | } |
7824 | |
7825 | // Avoid copying a physreg twice since RegAllocFast is incompetent and only |
7826 | // allows one use of a physreg per block. |
7827 | SDValue Val = CopiedRegs.lookup(Val: VA.getLocReg()); |
7828 | if (!Val) { |
7829 | Val = |
7830 | DAG.getCopyFromReg(Chain, dl: DL, Reg: VA.getLocReg(), VT: VA.getLocVT(), Glue: InGlue); |
7831 | Chain = Val.getValue(R: 1); |
7832 | InGlue = Val.getValue(R: 2); |
7833 | CopiedRegs[VA.getLocReg()] = Val; |
7834 | } |
7835 | |
7836 | switch (VA.getLocInfo()) { |
7837 | default: |
llvm_unreachable("Unknown loc info!");
7839 | case CCValAssign::Full: |
7840 | break; |
7841 | case CCValAssign::BCvt: |
7842 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getValVT(), Operand: Val); |
7843 | break; |
7844 | case CCValAssign::AExtUpper: |
7845 | Val = DAG.getNode(Opcode: ISD::SRL, DL, VT: VA.getLocVT(), N1: Val, |
7846 | N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT())); |
7847 | [[fallthrough]]; |
7848 | case CCValAssign::AExt: |
7849 | [[fallthrough]]; |
7850 | case CCValAssign::ZExt: |
7851 | Val = DAG.getZExtOrTrunc(Op: Val, DL, VT: VA.getValVT()); |
7852 | break; |
7853 | } |
7854 | |
7855 | if (RequiresSMChange && isPassedInFPR(VT: VA.getValVT())) |
7856 | Val = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, VT: Val.getValueType(), |
7857 | Operand: Val); |
7858 | |
7859 | InVals.push_back(Elt: Val); |
7860 | } |
7861 | |
7862 | return Chain; |
7863 | } |
7864 | |
7865 | /// Return true if the calling convention is one that we can guarantee TCO for. |
7866 | static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) { |
7867 | return (CC == CallingConv::Fast && GuaranteeTailCalls) || |
7868 | CC == CallingConv::Tail || CC == CallingConv::SwiftTail; |
7869 | } |
7870 | |
7871 | /// Return true if we might ever do TCO for calls with this calling convention. |
7872 | static bool mayTailCallThisCC(CallingConv::ID CC) { |
7873 | switch (CC) { |
7874 | case CallingConv::C: |
7875 | case CallingConv::AArch64_SVE_VectorCall: |
7876 | case CallingConv::PreserveMost: |
7877 | case CallingConv::PreserveAll: |
7878 | case CallingConv::PreserveNone: |
7879 | case CallingConv::Swift: |
7880 | case CallingConv::SwiftTail: |
7881 | case CallingConv::Tail: |
7882 | case CallingConv::Fast: |
7883 | return true; |
7884 | default: |
7885 | return false; |
7886 | } |
7887 | } |
7888 | |
/// Return true if the calling convention supports varargs.
/// Currently only conventions that pass varargs the way the C calling
/// convention does are eligible.
/// Calling conventions listed in this function must also be properly handled
/// in AArch64Subtarget::isCallingConvWin64.
7894 | static bool callConvSupportsVarArgs(CallingConv::ID CC) { |
7895 | switch (CC) { |
7896 | case CallingConv::C: |
7897 | case CallingConv::PreserveNone: |
7898 | return true; |
7899 | default: |
7900 | return false; |
7901 | } |
7902 | } |
7903 | |
7904 | static void analyzeCallOperands(const AArch64TargetLowering &TLI, |
7905 | const AArch64Subtarget *Subtarget, |
7906 | const TargetLowering::CallLoweringInfo &CLI, |
7907 | CCState &CCInfo) { |
7908 | const SelectionDAG &DAG = CLI.DAG; |
7909 | CallingConv::ID CalleeCC = CLI.CallConv; |
7910 | bool IsVarArg = CLI.IsVarArg; |
7911 | const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
7912 | bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CC: CalleeCC, IsVarArg); |
7913 | |
7914 | // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack |
7915 | // for the shadow store. |
7916 | if (CalleeCC == CallingConv::ARM64EC_Thunk_X64) |
7917 | CCInfo.AllocateStack(Size: 32, Alignment: Align(16)); |
7918 | |
7919 | unsigned NumArgs = Outs.size(); |
7920 | for (unsigned i = 0; i != NumArgs; ++i) { |
7921 | MVT ArgVT = Outs[i].VT; |
7922 | ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; |
7923 | |
7924 | bool UseVarArgCC = false; |
7925 | if (IsVarArg) { |
7926 | // On Windows, the fixed arguments in a vararg call are passed in GPRs |
7927 | // too, so use the vararg CC to force them to integer registers. |
7928 | if (IsCalleeWin64) { |
7929 | UseVarArgCC = true; |
7930 | } else { |
7931 | UseVarArgCC = !Outs[i].IsFixed; |
7932 | } |
7933 | } |
7934 | |
7935 | if (!UseVarArgCC) { |
7936 | // Get type of the original argument. |
7937 | EVT ActualVT = |
7938 | TLI.getValueType(DL: DAG.getDataLayout(), Ty: CLI.Args[Outs[i].OrigArgIndex].Ty, |
7939 | /*AllowUnknown*/ true); |
7940 | MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT; |
7941 | // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. |
7942 | if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) |
7943 | ArgVT = MVT::i8; |
7944 | else if (ActualMVT == MVT::i16) |
7945 | ArgVT = MVT::i16; |
7946 | } |
7947 | |
7948 | CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC: CalleeCC, IsVarArg: UseVarArgCC); |
7949 | bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); |
assert(!Res && "Call operand has unhandled type");
7951 | (void)Res; |
7952 | } |
7953 | } |
7954 | |
7955 | bool AArch64TargetLowering::isEligibleForTailCallOptimization( |
7956 | const CallLoweringInfo &CLI) const { |
7957 | CallingConv::ID CalleeCC = CLI.CallConv; |
7958 | if (!mayTailCallThisCC(CC: CalleeCC)) |
7959 | return false; |
7960 | |
7961 | SDValue Callee = CLI.Callee; |
7962 | bool IsVarArg = CLI.IsVarArg; |
7963 | const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
7964 | const SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
7965 | const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
7966 | const SelectionDAG &DAG = CLI.DAG; |
7967 | MachineFunction &MF = DAG.getMachineFunction(); |
7968 | const Function &CallerF = MF.getFunction(); |
7969 | CallingConv::ID CallerCC = CallerF.getCallingConv(); |
7970 | |
7971 | // SME Streaming functions are not eligible for TCO as they may require |
7972 | // the streaming mode or ZA to be restored after returning from the call. |
7973 | SMEAttrs CallerAttrs(MF.getFunction()); |
7974 | auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal); |
7975 | if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) || |
7976 | CallerAttrs.requiresLazySave(Callee: CalleeAttrs) || |
7977 | CallerAttrs.hasStreamingBody()) |
7978 | return false; |
7979 | |
7980 | // Functions using the C or Fast calling convention that have an SVE signature |
7981 | // preserve more registers and should assume the SVE_VectorCall CC. |
7982 | // The check for matching callee-saved regs will determine whether it is |
7983 | // eligible for TCO. |
7984 | if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) && |
7985 | MF.getInfo<AArch64FunctionInfo>()->isSVECC()) |
7986 | CallerCC = CallingConv::AArch64_SVE_VectorCall; |
7987 | |
7988 | bool CCMatch = CallerCC == CalleeCC; |
7989 | |
7990 | // When using the Windows calling convention on a non-windows OS, we want |
7991 | // to back up and restore X18 in such functions; we can't do a tail call |
7992 | // from those functions. |
7993 | if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() && |
7994 | CalleeCC != CallingConv::Win64) |
7995 | return false; |
7996 | |
7997 | // Byval parameters hand the function a pointer directly into the stack area |
7998 | // we want to reuse during a tail call. Working around this *is* possible (see |
7999 | // X86) but less efficient and uglier in LowerCall. |
8000 | for (Function::const_arg_iterator i = CallerF.arg_begin(), |
8001 | e = CallerF.arg_end(); |
8002 | i != e; ++i) { |
8003 | if (i->hasByValAttr()) |
8004 | return false; |
8005 | |
8006 | // On Windows, "inreg" attributes signify non-aggregate indirect returns. |
8007 | // In this case, it is necessary to save/restore X0 in the callee. Tail |
8008 | // call opt interferes with this. So we disable tail call opt when the |
8009 | // caller has an argument with "inreg" attribute. |
8010 | |
8011 | // FIXME: Check whether the callee also has an "inreg" argument. |
8012 | if (i->hasInRegAttr()) |
8013 | return false; |
8014 | } |
8015 | |
8016 | if (canGuaranteeTCO(CC: CalleeCC, GuaranteeTailCalls: getTargetMachine().Options.GuaranteedTailCallOpt)) |
8017 | return CCMatch; |
8018 | |
8019 | // Externally-defined functions with weak linkage should not be |
8020 | // tail-called on AArch64 when the OS does not support dynamic |
8021 | // pre-emption of symbols, as the AAELF spec requires normal calls |
8022 | // to undefined weak functions to be replaced with a NOP or jump to the |
8023 | // next instruction. The behaviour of branch instructions in this |
8024 | // situation (as used for tail calls) is implementation-defined, so we |
8025 | // cannot rely on the linker replacing the tail call with a return. |
8026 | if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) { |
8027 | const GlobalValue *GV = G->getGlobal(); |
8028 | const Triple &TT = getTargetMachine().getTargetTriple(); |
8029 | if (GV->hasExternalWeakLinkage() && |
8030 | (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) |
8031 | return false; |
8032 | } |
8033 | |
8034 | // Now we search for cases where we can use a tail call without changing the |
8035 | // ABI. Sibcall is used in some places (particularly gcc) to refer to this |
8036 | // concept. |
8037 | |
// I want anyone implementing a new calling convention to think long and hard
// about this check.
8040 | if (IsVarArg && !callConvSupportsVarArgs(CC: CalleeCC)) |
report_fatal_error(reason: "Unsupported variadic calling convention");
8042 | |
8043 | LLVMContext &C = *DAG.getContext(); |
8044 | // Check that the call results are passed in the same way. |
8045 | if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, |
8046 | CalleeFn: CCAssignFnForCall(CC: CalleeCC, IsVarArg), |
8047 | CallerFn: CCAssignFnForCall(CC: CallerCC, IsVarArg))) |
8048 | return false; |
8049 | // The callee has to preserve all registers the caller needs to preserve. |
8050 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
8051 | const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); |
8052 | if (!CCMatch) { |
8053 | const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); |
8054 | if (Subtarget->hasCustomCallingConv()) { |
8055 | TRI->UpdateCustomCallPreservedMask(MF, Mask: &CallerPreserved); |
8056 | TRI->UpdateCustomCallPreservedMask(MF, Mask: &CalleePreserved); |
8057 | } |
8058 | if (!TRI->regmaskSubsetEqual(mask0: CallerPreserved, mask1: CalleePreserved)) |
8059 | return false; |
8060 | } |
8061 | |
8062 | // Nothing more to check if the callee is taking no arguments |
8063 | if (Outs.empty()) |
8064 | return true; |
8065 | |
8066 | SmallVector<CCValAssign, 16> ArgLocs; |
8067 | CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C); |
8068 | |
8069 | analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo); |
8070 | |
8071 | if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) { |
// When the call is musttail, additional checks have already been done and we
// can safely ignore this check.
8073 | // At least two cases here: if caller is fastcc then we can't have any |
8074 | // memory arguments (we'd be expected to clean up the stack afterwards). If |
8075 | // caller is C then we could potentially use its argument area. |
8076 | |
8077 | // FIXME: for now we take the most conservative of these in both cases: |
8078 | // disallow all variadic memory operands. |
8079 | for (const CCValAssign &ArgLoc : ArgLocs) |
8080 | if (!ArgLoc.isRegLoc()) |
8081 | return false; |
8082 | } |
8083 | |
8084 | const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
8085 | |
// If any of the arguments is passed indirectly, it must be SVE, so the
// 'getBytesInStackArgArea' is not sufficient to determine whether we need to
// allocate space on the stack. That is why we check for this explicitly here
// and reject the tail call.
8090 | if (llvm::any_of(Range&: ArgLocs, P: [&](CCValAssign &A) { |
8091 | assert((A.getLocInfo() != CCValAssign::Indirect || |
8092 | A.getValVT().isScalableVector() || |
8093 | Subtarget->isWindowsArm64EC()) && |
"Expected value to be scalable");
8095 | return A.getLocInfo() == CCValAssign::Indirect; |
8096 | })) |
8097 | return false; |
8098 | |
8099 | // If the stack arguments for this call do not fit into our own save area then |
8100 | // the call cannot be made tail. |
8101 | if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) |
8102 | return false; |
8103 | |
8104 | const MachineRegisterInfo &MRI = MF.getRegInfo(); |
8105 | if (!parametersInCSRMatch(MRI, CallerPreservedMask: CallerPreserved, ArgLocs, OutVals)) |
8106 | return false; |
8107 | |
8108 | return true; |
8109 | } |
8110 | |
8111 | SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, |
8112 | SelectionDAG &DAG, |
8113 | MachineFrameInfo &MFI, |
8114 | int ClobberedFI) const { |
8115 | SmallVector<SDValue, 8> ArgChains; |
8116 | int64_t FirstByte = MFI.getObjectOffset(ObjectIdx: ClobberedFI); |
8117 | int64_t LastByte = FirstByte + MFI.getObjectSize(ObjectIdx: ClobberedFI) - 1; |
8118 | |
8119 | // Include the original chain at the beginning of the list. When this is |
8120 | // used by target LowerCall hooks, this helps legalize find the |
8121 | // CALLSEQ_BEGIN node. |
8122 | ArgChains.push_back(Elt: Chain); |
8123 | |
8124 | // Add a chain value for each stack argument corresponding |
8125 | for (SDNode *U : DAG.getEntryNode().getNode()->uses()) |
8126 | if (LoadSDNode *L = dyn_cast<LoadSDNode>(Val: U)) |
8127 | if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Val: L->getBasePtr())) |
8128 | if (FI->getIndex() < 0) { |
8129 | int64_t InFirstByte = MFI.getObjectOffset(ObjectIdx: FI->getIndex()); |
8130 | int64_t InLastByte = InFirstByte; |
8131 | InLastByte += MFI.getObjectSize(ObjectIdx: FI->getIndex()) - 1; |
8132 | |
8133 | if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || |
8134 | (FirstByte <= InFirstByte && InFirstByte <= LastByte)) |
8135 | ArgChains.push_back(Elt: SDValue(L, 1)); |
8136 | } |
8137 | |
8138 | // Build a tokenfactor for all the chains. |
8139 | return DAG.getNode(Opcode: ISD::TokenFactor, DL: SDLoc(Chain), VT: MVT::Other, Ops: ArgChains); |
8140 | } |
8141 | |
8142 | bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, |
8143 | bool TailCallOpt) const { |
8144 | return (CallCC == CallingConv::Fast && TailCallOpt) || |
8145 | CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail; |
8146 | } |
8147 | |
8148 | // Check if the value is zero-extended from i1 to i8 |
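// i.e. every bit above bit 0 of the low byte (mask 0xFE) is known to be zero.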
8149 | static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) { |
8150 | unsigned SizeInBits = Arg.getValueType().getSizeInBits(); |
8151 | if (SizeInBits < 8) |
8152 | return false; |
8153 | |
APInt RequiredZero(SizeInBits, 0xFE);
KnownBits Bits = DAG.computeKnownBits(Op: Arg, Depth: 4);
bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8157 | return ZExtBool; |
8158 | } |
8159 | |
8160 | void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, |
8161 | SDNode *Node) const { |
8162 | // Live-in physreg copies that are glued to SMSTART are applied as |
8163 | // implicit-def's in the InstrEmitter. Here we remove them, allowing the |
8164 | // register allocator to pass call args in callee saved regs, without extra |
8165 | // copies to avoid these fake clobbers of actually-preserved GPRs. |
8166 | if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 || |
8167 | MI.getOpcode() == AArch64::MSRpstatePseudo) { |
8168 | for (unsigned I = MI.getNumOperands() - 1; I > 0; --I) |
8169 | if (MachineOperand &MO = MI.getOperand(i: I); |
8170 | MO.isReg() && MO.isImplicit() && MO.isDef() && |
8171 | (AArch64::GPR32RegClass.contains(Reg: MO.getReg()) || |
8172 | AArch64::GPR64RegClass.contains(Reg: MO.getReg()))) |
8173 | MI.removeOperand(OpNo: I); |
8174 | |
8175 | // The SVE vector length can change when entering/leaving streaming mode. |
8176 | if (MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSM || |
8177 | MI.getOperand(i: 0).getImm() == AArch64SVCR::SVCRSMZA) { |
8178 | MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false, |
8179 | /*IsImplicit=*/isImp: true)); |
8180 | MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: true, |
8181 | /*IsImplicit=*/isImp: true)); |
8182 | } |
8183 | } |
8184 | |
8185 | // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that |
8186 | // have nothing to do with VG, were it not that they are used to materialise a |
8187 | // frame-address. If they contain a frame-index to a scalable vector, this |
8188 | // will likely require an ADDVL instruction to materialise the address, thus |
8189 | // reading VG. |
8190 | const MachineFunction &MF = *MI.getMF(); |
8191 | if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() && |
8192 | (MI.getOpcode() == AArch64::ADDXri || |
8193 | MI.getOpcode() == AArch64::SUBXri)) { |
8194 | const MachineOperand &MO = MI.getOperand(i: 1); |
8195 | if (MO.isFI() && MF.getFrameInfo().getStackID(ObjectIdx: MO.getIndex()) == |
8196 | TargetStackID::ScalableVector) |
8197 | MI.addOperand(Op: MachineOperand::CreateReg(Reg: AArch64::VG, /*IsDef=*/isDef: false, |
8198 | /*IsImplicit=*/isImp: true)); |
8199 | } |
8200 | } |
8201 | |
8202 | SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, |
8203 | bool Enable, SDValue Chain, |
8204 | SDValue InGlue, |
8205 | unsigned Condition, |
8206 | SDValue PStateSM) const { |
8207 | MachineFunction &MF = DAG.getMachineFunction(); |
8208 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
8209 | FuncInfo->setHasStreamingModeChanges(true); |
8210 | |
8211 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
8212 | SDValue RegMask = DAG.getRegisterMask(RegMask: TRI->getSMStartStopCallPreservedMask()); |
8213 | SDValue MSROp = |
8214 | DAG.getTargetConstant(Val: (int32_t)AArch64SVCR::SVCRSM, DL, VT: MVT::i32); |
8215 | SDValue ConditionOp = DAG.getTargetConstant(Val: Condition, DL, VT: MVT::i64); |
8216 | SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp}; |
8217 | if (Condition != AArch64SME::Always) { |
assert(PStateSM && "PStateSM should be defined");
8219 | Ops.push_back(Elt: PStateSM); |
8220 | } |
8221 | Ops.push_back(Elt: RegMask); |
8222 | |
8223 | if (InGlue) |
8224 | Ops.push_back(Elt: InGlue); |
8225 | |
8226 | unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP; |
8227 | return DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops); |
8228 | } |
8229 | |
8230 | static unsigned getSMCondition(const SMEAttrs &CallerAttrs, |
8231 | const SMEAttrs &CalleeAttrs) { |
8232 | if (!CallerAttrs.hasStreamingCompatibleInterface() || |
8233 | CallerAttrs.hasStreamingBody()) |
8234 | return AArch64SME::Always; |
8235 | if (CalleeAttrs.hasNonStreamingInterface()) |
8236 | return AArch64SME::IfCallerIsStreaming; |
8237 | if (CalleeAttrs.hasStreamingInterface()) |
8238 | return AArch64SME::IfCallerIsNonStreaming; |
8239 | |
llvm_unreachable("Unsupported attributes");
8241 | } |
8242 | |
8243 | /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, |
8244 | /// and add input and output parameter nodes. |
8245 | SDValue |
8246 | AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, |
8247 | SmallVectorImpl<SDValue> &InVals) const { |
8248 | SelectionDAG &DAG = CLI.DAG; |
8249 | SDLoc &DL = CLI.DL; |
8250 | SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; |
8251 | SmallVector<SDValue, 32> &OutVals = CLI.OutVals; |
8252 | SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; |
8253 | SDValue Chain = CLI.Chain; |
8254 | SDValue Callee = CLI.Callee; |
8255 | bool &IsTailCall = CLI.IsTailCall; |
8256 | CallingConv::ID &CallConv = CLI.CallConv; |
8257 | bool IsVarArg = CLI.IsVarArg; |
8258 | |
8259 | MachineFunction &MF = DAG.getMachineFunction(); |
8260 | MachineFunction::CallSiteInfo CSInfo; |
8261 | bool IsThisReturn = false; |
8262 | |
8263 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
8264 | bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; |
8265 | bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType; |
8266 | bool IsSibCall = false; |
8267 | bool GuardWithBTI = false; |
8268 | |
8269 | if (CLI.CB && CLI.CB->hasFnAttr(Kind: Attribute::ReturnsTwice) && |
8270 | !Subtarget->noBTIAtReturnTwice()) { |
8271 | GuardWithBTI = FuncInfo->branchTargetEnforcement(); |
8272 | } |
8273 | |
8274 | // Analyze operands of the call, assigning locations to each operand. |
8275 | SmallVector<CCValAssign, 16> ArgLocs; |
8276 | CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); |
8277 | |
8278 | if (IsVarArg) { |
8279 | unsigned NumArgs = Outs.size(); |
8280 | |
8281 | for (unsigned i = 0; i != NumArgs; ++i) { |
8282 | if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) |
8283 | report_fatal_error(reason: "Passing SVE types to variadic functions is " |
8284 | "currently not supported" ); |
8285 | } |
8286 | } |
8287 | |
8288 | analyzeCallOperands(TLI: *this, Subtarget, CLI, CCInfo); |
8289 | |
8290 | CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv); |
8291 | // Assign locations to each value returned by this call. |
8292 | SmallVector<CCValAssign, 16> RVLocs; |
8293 | CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, |
8294 | *DAG.getContext()); |
8295 | RetCCInfo.AnalyzeCallResult(Ins, Fn: RetCC); |
8296 | |
8297 | // Check callee args/returns for SVE registers and set calling convention |
8298 | // accordingly. |
8299 | if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { |
8300 | auto HasSVERegLoc = [](CCValAssign &Loc) { |
8301 | if (!Loc.isRegLoc()) |
8302 | return false; |
8303 | return AArch64::ZPRRegClass.contains(Reg: Loc.getLocReg()) || |
8304 | AArch64::PPRRegClass.contains(Reg: Loc.getLocReg()); |
8305 | }; |
8306 | if (any_of(Range&: RVLocs, P: HasSVERegLoc) || any_of(Range&: ArgLocs, P: HasSVERegLoc)) |
8307 | CallConv = CallingConv::AArch64_SVE_VectorCall; |
8308 | } |
8309 | |
8310 | if (IsTailCall) { |
8311 | // Check if it's really possible to do a tail call. |
8312 | IsTailCall = isEligibleForTailCallOptimization(CLI); |
8313 | |
8314 | // A sibling call is one where we're under the usual C ABI and not planning |
8315 | // to change that but can still do a tail call: |
8316 | if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail && |
8317 | CallConv != CallingConv::SwiftTail) |
8318 | IsSibCall = true; |
8319 | |
8320 | if (IsTailCall) |
8321 | ++NumTailCalls; |
8322 | } |
8323 | |
8324 | if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) |
8325 | report_fatal_error(reason: "failed to perform tail call elimination on a call " |
8326 | "site marked musttail" ); |
8327 | |
8328 | // Get a count of how many bytes are to be pushed on the stack. |
8329 | unsigned NumBytes = CCInfo.getStackSize(); |
8330 | |
8331 | if (IsSibCall) { |
8332 | // Since we're not changing the ABI to make this a tail call, the memory |
8333 | // operands are already available in the caller's incoming argument space. |
8334 | NumBytes = 0; |
8335 | } |
8336 | |
8337 | // FPDiff is the byte offset of the call's argument area from the callee's. |
8338 | // Stores to callee stack arguments will be placed in FixedStackSlots offset |
8339 | // by this amount for a tail call. In a sibling call it must be 0 because the |
8340 | // caller will deallocate the entire stack and the callee still expects its |
8341 | // arguments to begin at SP+0. Completely unused for non-tail calls. |
8342 | int FPDiff = 0; |
8343 | |
8344 | if (IsTailCall && !IsSibCall) { |
8345 | unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); |
8346 | |
8347 | // Since callee will pop argument stack as a tail call, we must keep the |
8348 | // popped size 16-byte aligned. |
8349 | NumBytes = alignTo(Value: NumBytes, Align: 16); |
8350 | |
8351 | // FPDiff will be negative if this tail call requires more space than we |
8352 | // would automatically have in our incoming argument space. Positive if we |
8353 | // can actually shrink the stack. |
8354 | FPDiff = NumReusableBytes - NumBytes; |
8355 | |
8356 | // Update the required reserved area if this is the tail call requiring the |
8357 | // most argument stack space. |
8358 | if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff) |
8359 | FuncInfo->setTailCallReservedStack(-FPDiff); |
8360 | |
8361 | // The stack pointer must be 16-byte aligned at all times it's used for a |
8362 | // memory operation, which in practice means at *all* times and in |
8363 | // particular across call boundaries. Therefore our own arguments started at |
8364 | // a 16-byte aligned SP and the delta applied for the tail call should |
8365 | // satisfy the same constraint. |
8366 | assert(FPDiff % 16 == 0 && "unaligned stack on tail call" ); |
8367 | } |
8368 | |
8369 | // Determine whether we need any streaming mode changes. |
8370 | SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction()); |
8371 | if (CLI.CB) |
8372 | CalleeAttrs = SMEAttrs(*CLI.CB); |
8373 | else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee)) |
8374 | CalleeAttrs = SMEAttrs(ES->getSymbol()); |
8375 | |
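// Helper for the SME remarks emitted below: appends a description of the call
// site ("call from '<caller>' to '<callee>'") to an analysis remark.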
8376 | auto DescribeCallsite = |
8377 | [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { |
8378 | R << "call from '" << ore::NV("Caller" , MF.getName()) << "' to '" ; |
8379 | if (auto *ES = dyn_cast<ExternalSymbolSDNode>(Val&: CLI.Callee)) |
8380 | R << ore::NV("Callee" , ES->getSymbol()); |
8381 | else if (CLI.CB && CLI.CB->getCalledFunction()) |
8382 | R << ore::NV("Callee" , CLI.CB->getCalledFunction()->getName()); |
8383 | else |
8384 | R << "unknown callee" ; |
8385 | R << "'" ; |
8386 | return R; |
8387 | }; |
8388 | |
8389 | bool RequiresLazySave = CallerAttrs.requiresLazySave(Callee: CalleeAttrs); |
8390 | if (RequiresLazySave) { |
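// Set up the lazy save: store the number of ZA save slices (RDSVL #1) into
// the TPIDR2 block at offset 8, then point TPIDR2_EL0 at the block via the
// aarch64.sme.set.tpidr2 intrinsic.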
8391 | const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); |
8392 | MachinePointerInfo MPI = |
8393 | MachinePointerInfo::getStack(MF, Offset: TPIDR2.FrameIndex); |
8394 | SDValue TPIDR2ObjAddr = DAG.getFrameIndex( |
8395 | FI: TPIDR2.FrameIndex, |
8396 | VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout())); |
8397 | SDValue NumZaSaveSlicesAddr = |
8398 | DAG.getNode(Opcode: ISD::ADD, DL, VT: TPIDR2ObjAddr.getValueType(), N1: TPIDR2ObjAddr, |
8399 | N2: DAG.getConstant(Val: 8, DL, VT: TPIDR2ObjAddr.getValueType())); |
8400 | SDValue NumZaSaveSlices = DAG.getNode(Opcode: AArch64ISD::RDSVL, DL, VT: MVT::i64, |
8401 | Operand: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
8402 | Chain = DAG.getTruncStore(Chain, dl: DL, Val: NumZaSaveSlices, Ptr: NumZaSaveSlicesAddr, |
8403 | PtrInfo: MPI, SVT: MVT::i16); |
8404 | Chain = DAG.getNode( |
8405 | Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Chain, |
8406 | N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32), |
8407 | N3: TPIDR2ObjAddr); |
8408 | OptimizationRemarkEmitter ORE(&MF.getFunction()); |
8409 | ORE.emit(RemarkBuilder: [&]() { |
8410 | auto R = CLI.CB ? OptimizationRemarkAnalysis("sme" , "SMELazySaveZA" , |
8411 | CLI.CB) |
8412 | : OptimizationRemarkAnalysis("sme" , "SMELazySaveZA" , |
8413 | &MF.getFunction()); |
8414 | return DescribeCallsite(R) << " sets up a lazy save for ZA" ; |
8415 | }); |
8416 | } |
8417 | |
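// If a streaming-mode change is needed, materialise the caller's PSTATE.SM
// value: a compile-time constant when the caller's streaming state is known
// statically, otherwise a runtime query.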
8418 | SDValue PStateSM; |
8419 | bool RequiresSMChange = CallerAttrs.requiresSMChange(Callee: CalleeAttrs); |
8420 | if (RequiresSMChange) { |
8421 | if (CallerAttrs.hasStreamingInterfaceOrBody()) |
8422 | PStateSM = DAG.getConstant(Val: 1, DL, VT: MVT::i64); |
8423 | else if (CallerAttrs.hasNonStreamingInterface()) |
8424 | PStateSM = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
8425 | else |
8426 | PStateSM = getRuntimePStateSM(DAG, Chain, DL, VT: MVT::i64); |
8427 | OptimizationRemarkEmitter ORE(&MF.getFunction()); |
8428 | ORE.emit(RemarkBuilder: [&]() { |
8429 | auto R = CLI.CB ? OptimizationRemarkAnalysis("sme" , "SMETransition" , |
8430 | CLI.CB) |
8431 | : OptimizationRemarkAnalysis("sme" , "SMETransition" , |
8432 | &MF.getFunction()); |
8433 | DescribeCallsite(R) << " requires a streaming mode transition" ; |
8434 | return R; |
8435 | }); |
8436 | } |
8437 | |
8438 | SDValue ZTFrameIdx; |
8439 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
8440 | bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs); |
8441 | |
8442 | // If the caller has ZT0 state which will not be preserved by the callee, |
8443 | // spill ZT0 before the call. |
8444 | if (ShouldPreserveZT0) { |
8445 | unsigned ZTObj = MFI.CreateSpillStackObject(Size: 64, Alignment: Align(16)); |
8446 | ZTFrameIdx = DAG.getFrameIndex( |
8447 | FI: ZTObj, |
8448 | VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout())); |
8449 | |
8450 | Chain = DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other), |
8451 | Ops: {Chain, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx}); |
8452 | } |
8453 | |
// If the caller shares ZT0 but the callee does not share ZA, we need to stop
// PSTATE.ZA before the call if there is no lazy-save active.
8456 | bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(Callee: CalleeAttrs); |
8457 | assert((!DisableZA || !RequiresLazySave) && |
8458 | "Lazy-save should have PSTATE.SM=1 on entry to the function" ); |
8459 | |
8460 | if (DisableZA) |
8461 | Chain = DAG.getNode( |
8462 | Opcode: AArch64ISD::SMSTOP, DL, VT: MVT::Other, N1: Chain, |
8463 | N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32), |
8464 | N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64)); |
8465 | |
8466 | // Adjust the stack pointer for the new arguments... |
8467 | // These operations are automatically eliminated by the prolog/epilog pass |
8468 | if (!IsSibCall) |
8469 | Chain = DAG.getCALLSEQ_START(Chain, InSize: IsTailCall ? 0 : NumBytes, OutSize: 0, DL); |
8470 | |
8471 | SDValue StackPtr = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::SP, |
8472 | VT: getPointerTy(DL: DAG.getDataLayout())); |
8473 | |
8474 | SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
8475 | SmallSet<unsigned, 8> RegsUsed; |
8476 | SmallVector<SDValue, 8> MemOpChains; |
8477 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
8478 | |
8479 | if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) { |
8480 | const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); |
8481 | for (const auto &F : Forwards) { |
8482 | SDValue Val = DAG.getCopyFromReg(Chain, dl: DL, Reg: F.VReg, VT: F.VT); |
8483 | RegsToPass.emplace_back(Args: F.PReg, Args&: Val); |
8484 | } |
8485 | } |
8486 | |
8487 | // Walk the register/memloc assignments, inserting copies/loads. |
unsigned ExtraArgLocs = 0;
8489 | for (unsigned i = 0, e = Outs.size(); i != e; ++i) { |
8490 | CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; |
8491 | SDValue Arg = OutVals[i]; |
8492 | ISD::ArgFlagsTy Flags = Outs[i].Flags; |
8493 | |
8494 | // Promote the value if needed. |
8495 | switch (VA.getLocInfo()) { |
8496 | default: |
8497 | llvm_unreachable("Unknown loc info!" ); |
8498 | case CCValAssign::Full: |
8499 | break; |
8500 | case CCValAssign::SExt: |
8501 | Arg = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
8502 | break; |
8503 | case CCValAssign::ZExt: |
8504 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
8505 | break; |
8506 | case CCValAssign::AExt: |
8507 | if (Outs[i].ArgVT == MVT::i1) { |
8508 | // AAPCS requires i1 to be zero-extended to 8-bits by the caller. |
8509 | // |
8510 | // Check if we actually have to do this, because the value may |
8511 | // already be zero-extended. |
8512 | // |
8513 | // We cannot just emit a (zext i8 (trunc (assert-zext i8))) |
8514 | // and rely on DAGCombiner to fold this, because the following |
8515 | // (anyext i32) is combined with (zext i8) in DAG.getNode: |
8516 | // |
8517 | // (ext (zext x)) -> (zext x) |
8518 | // |
8519 | // This will give us (zext i32), which we cannot remove, so |
8520 | // try to check this beforehand. |
8521 | if (!checkZExtBool(Arg, DAG)) { |
8522 | Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg); |
8523 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i8, Operand: Arg); |
8524 | } |
8525 | } |
8526 | Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
8527 | break; |
8528 | case CCValAssign::AExtUpper: |
8529 | assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits" ); |
8530 | Arg = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
8531 | Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg, |
8532 | N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT())); |
8533 | break; |
8534 | case CCValAssign::BCvt: |
8535 | Arg = DAG.getBitcast(VT: VA.getLocVT(), V: Arg); |
8536 | break; |
8537 | case CCValAssign::Trunc: |
8538 | Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT()); |
8539 | break; |
8540 | case CCValAssign::FPExt: |
8541 | Arg = DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
8542 | break; |
8543 | case CCValAssign::Indirect: |
8544 | bool isScalable = VA.getValVT().isScalableVT(); |
8545 | assert((isScalable || Subtarget->isWindowsArm64EC()) && |
8546 | "Indirect arguments should be scalable on most subtargets" ); |
8547 | |
8548 | uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue(); |
8549 | uint64_t PartSize = StoreSize; |
8550 | unsigned NumParts = 1; |
8551 | if (Outs[i].Flags.isInConsecutiveRegs()) { |
8552 | while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast()) |
8553 | ++NumParts; |
8554 | StoreSize *= NumParts; |
8555 | } |
8556 | |
8557 | Type *Ty = EVT(VA.getValVT()).getTypeForEVT(Context&: *DAG.getContext()); |
8558 | Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty); |
8559 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
8560 | int FI = MFI.CreateStackObject(Size: StoreSize, Alignment, isSpillSlot: false); |
8561 | if (isScalable) |
8562 | MFI.setStackID(ObjectIdx: FI, ID: TargetStackID::ScalableVector); |
8563 | |
8564 | MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI); |
8565 | SDValue Ptr = DAG.getFrameIndex( |
8566 | FI, VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout())); |
8567 | SDValue SpillSlot = Ptr; |
8568 | |
8569 | // Ensure we generate all stores for each tuple part, whilst updating the |
8570 | // pointer after each store correctly using vscale. |
8571 | while (NumParts) { |
8572 | SDValue Store = DAG.getStore(Chain, dl: DL, Val: OutVals[i], Ptr, PtrInfo: MPI); |
8573 | MemOpChains.push_back(Elt: Store); |
8574 | |
8575 | NumParts--; |
8576 | if (NumParts > 0) { |
8577 | SDValue BytesIncrement; |
8578 | if (isScalable) { |
8579 | BytesIncrement = DAG.getVScale( |
8580 | DL, VT: Ptr.getValueType(), |
8581 | MulImm: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize)); |
8582 | } else { |
8583 | BytesIncrement = DAG.getConstant( |
8584 | Val: APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL, |
8585 | VT: Ptr.getValueType()); |
8586 | } |
8587 | SDNodeFlags Flags; |
8588 | Flags.setNoUnsignedWrap(true); |
8589 | |
8590 | MPI = MachinePointerInfo(MPI.getAddrSpace()); |
8591 | Ptr = DAG.getNode(Opcode: ISD::ADD, DL, VT: Ptr.getValueType(), N1: Ptr, |
8592 | N2: BytesIncrement, Flags); |
8593 | ExtraArgLocs++; |
8594 | i++; |
8595 | } |
8596 | } |
8597 | |
8598 | Arg = SpillSlot; |
8599 | break; |
8600 | } |
8601 | |
8602 | if (VA.isRegLoc()) { |
8603 | if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && |
8604 | Outs[0].VT == MVT::i64) { |
8605 | assert(VA.getLocVT() == MVT::i64 && |
8606 | "unexpected calling convention register assignment" ); |
8607 | assert(!Ins.empty() && Ins[0].VT == MVT::i64 && |
8608 | "unexpected use of 'returned'" ); |
8609 | IsThisReturn = true; |
8610 | } |
8611 | if (RegsUsed.count(V: VA.getLocReg())) { |
8612 | // If this register has already been used then we're trying to pack |
8613 | // parts of an [N x i32] into an X-register. The extension type will |
8614 | // take care of putting the two halves in the right place but we have to |
8615 | // combine them. |
8616 | SDValue &Bits = |
8617 | llvm::find_if(Range&: RegsToPass, |
8618 | P: [=](const std::pair<unsigned, SDValue> &Elt) { |
8619 | return Elt.first == VA.getLocReg(); |
8620 | }) |
8621 | ->second; |
8622 | Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg); |
// Call site info is used for tracking a function's parameter entry
// values. For now we only track the simple case where the parameter
// is transferred in a whole register.
8626 | llvm::erase_if(C&: CSInfo.ArgRegPairs, |
8627 | P: [&VA](MachineFunction::ArgRegPair ArgReg) { |
8628 | return ArgReg.Reg == VA.getLocReg(); |
8629 | }); |
8630 | } else { |
// Add an extra level of indirection for streaming mode changes by
// using a pseudo copy node that the simple register coalescer cannot
// rematerialise between an smstart/smstop and the call.
8634 | if (RequiresSMChange && isPassedInFPR(VT: Arg.getValueType())) |
8635 | Arg = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, |
8636 | VT: Arg.getValueType(), Operand: Arg); |
8637 | RegsToPass.emplace_back(Args: VA.getLocReg(), Args&: Arg); |
8638 | RegsUsed.insert(V: VA.getLocReg()); |
8639 | const TargetOptions &Options = DAG.getTarget().Options; |
8640 | if (Options.EmitCallSiteInfo) |
8641 | CSInfo.ArgRegPairs.emplace_back(Args: VA.getLocReg(), Args&: i); |
8642 | } |
8643 | } else { |
8644 | assert(VA.isMemLoc()); |
8645 | |
8646 | SDValue DstAddr; |
8647 | MachinePointerInfo DstInfo; |
8648 | |
8649 | // FIXME: This works on big-endian for composite byvals, which are the |
8650 | // common case. It should also work for fundamental types too. |
8651 | uint32_t BEAlign = 0; |
8652 | unsigned OpSize; |
8653 | if (VA.getLocInfo() == CCValAssign::Indirect || |
8654 | VA.getLocInfo() == CCValAssign::Trunc) |
8655 | OpSize = VA.getLocVT().getFixedSizeInBits(); |
8656 | else |
8657 | OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 |
8658 | : VA.getValVT().getSizeInBits(); |
8659 | OpSize = (OpSize + 7) / 8; |
8660 | if (!Subtarget->isLittleEndian() && !Flags.isByVal() && |
8661 | !Flags.isInConsecutiveRegs()) { |
8662 | if (OpSize < 8) |
8663 | BEAlign = 8 - OpSize; |
8664 | } |
8665 | unsigned LocMemOffset = VA.getLocMemOffset(); |
8666 | int32_t Offset = LocMemOffset + BEAlign; |
8667 | SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL); |
8668 | PtrOff = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff); |
8669 | |
8670 | if (IsTailCall) { |
8671 | Offset = Offset + FPDiff; |
8672 | int FI = MF.getFrameInfo().CreateFixedObject(Size: OpSize, SPOffset: Offset, IsImmutable: true); |
8673 | |
8674 | DstAddr = DAG.getFrameIndex(FI, VT: PtrVT); |
8675 | DstInfo = MachinePointerInfo::getFixedStack(MF, FI); |
8676 | |
8677 | // Make sure any stack arguments overlapping with where we're storing |
8678 | // are loaded before this eventual operation. Otherwise they'll be |
8679 | // clobbered. |
8680 | Chain = addTokenForArgument(Chain, DAG, MFI&: MF.getFrameInfo(), ClobberedFI: FI); |
8681 | } else { |
8682 | SDValue PtrOff = DAG.getIntPtrConstant(Val: Offset, DL); |
8683 | |
8684 | DstAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: StackPtr, N2: PtrOff); |
8685 | DstInfo = MachinePointerInfo::getStack(MF, Offset: LocMemOffset); |
8686 | } |
8687 | |
8688 | if (Outs[i].Flags.isByVal()) { |
8689 | SDValue SizeNode = |
8690 | DAG.getConstant(Val: Outs[i].Flags.getByValSize(), DL, VT: MVT::i64); |
8691 | SDValue Cpy = DAG.getMemcpy( |
8692 | Chain, dl: DL, Dst: DstAddr, Src: Arg, Size: SizeNode, |
8693 | Alignment: Outs[i].Flags.getNonZeroByValAlign(), |
8694 | /*isVol = */ false, /*AlwaysInline = */ false, |
8695 | /*CI=*/nullptr, OverrideTailCall: std::nullopt, DstPtrInfo: DstInfo, SrcPtrInfo: MachinePointerInfo()); |
8696 | |
8697 | MemOpChains.push_back(Elt: Cpy); |
8698 | } else { |
8699 | // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already |
8700 | // promoted to a legal register type i32, we should truncate Arg back to |
8701 | // i1/i8/i16. |
8702 | if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || |
8703 | VA.getValVT() == MVT::i16) |
8704 | Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VA.getValVT(), Operand: Arg); |
8705 | |
8706 | SDValue Store = DAG.getStore(Chain, dl: DL, Val: Arg, Ptr: DstAddr, PtrInfo: DstInfo); |
8707 | MemOpChains.push_back(Elt: Store); |
8708 | } |
8709 | } |
8710 | } |
8711 | |
8712 | if (IsVarArg && Subtarget->isWindowsArm64EC()) { |
8713 | SDValue ParamPtr = StackPtr; |
8714 | if (IsTailCall) { |
8715 | // Create a dummy object at the top of the stack that can be used to get |
8716 | // the SP after the epilogue |
8717 | int FI = MF.getFrameInfo().CreateFixedObject(Size: 1, SPOffset: FPDiff, IsImmutable: true); |
8718 | ParamPtr = DAG.getFrameIndex(FI, VT: PtrVT); |
8719 | } |
8720 | |
8721 | // For vararg calls, the Arm64EC ABI requires values in x4 and x5 |
8722 | // describing the argument list. x4 contains the address of the |
8723 | // first stack parameter. x5 contains the size in bytes of all parameters |
8724 | // passed on the stack. |
8725 | RegsToPass.emplace_back(Args: AArch64::X4, Args&: ParamPtr); |
8726 | RegsToPass.emplace_back(Args: AArch64::X5, |
8727 | Args: DAG.getConstant(Val: NumBytes, DL, VT: MVT::i64)); |
8728 | } |
8729 | |
8730 | if (!MemOpChains.empty()) |
8731 | Chain = DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOpChains); |
8732 | |
8733 | SDValue InGlue; |
8734 | if (RequiresSMChange) { |
8735 | |
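// Bracket the streaming-mode change with VG_SAVE/VG_RESTORE pseudos; the
// matching VG_RESTORE is emitted once the call has returned.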
8736 | Chain = DAG.getNode(Opcode: AArch64ISD::VG_SAVE, DL, |
8737 | VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), N: Chain); |
8738 | InGlue = Chain.getValue(R: 1); |
8739 | |
8740 | SDValue NewChain = changeStreamingMode( |
8741 | DAG, DL, Enable: CalleeAttrs.hasStreamingInterface(), Chain, InGlue, |
8742 | Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); |
8743 | Chain = NewChain.getValue(R: 0); |
8744 | InGlue = NewChain.getValue(R: 1); |
8745 | } |
8746 | |
8747 | // Build a sequence of copy-to-reg nodes chained together with token chain |
8748 | // and flag operands which copy the outgoing args into the appropriate regs. |
8749 | for (auto &RegToPass : RegsToPass) { |
8750 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RegToPass.first, |
8751 | N: RegToPass.second, Glue: InGlue); |
8752 | InGlue = Chain.getValue(R: 1); |
8753 | } |
8754 | |
8755 | // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every |
8756 | // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol |
8757 | // node so that legalize doesn't hack it. |
8758 | if (auto *G = dyn_cast<GlobalAddressSDNode>(Val&: Callee)) { |
8759 | auto GV = G->getGlobal(); |
8760 | unsigned OpFlags = |
8761 | Subtarget->classifyGlobalFunctionReference(GV, TM: getTargetMachine()); |
8762 | if (OpFlags & AArch64II::MO_GOT) { |
8763 | Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags); |
8764 | Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee); |
8765 | } else { |
8766 | const GlobalValue *GV = G->getGlobal(); |
8767 | Callee = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: OpFlags); |
8768 | } |
8769 | } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Val&: Callee)) { |
8770 | bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large && |
8771 | Subtarget->isTargetMachO()) || |
8772 | MF.getFunction().getParent()->getRtLibUseGOT(); |
8773 | const char *Sym = S->getSymbol(); |
8774 | if (UseGot) { |
8775 | Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: AArch64II::MO_GOT); |
8776 | Callee = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: Callee); |
8777 | } else { |
8778 | Callee = DAG.getTargetExternalSymbol(Sym, VT: PtrVT, TargetFlags: 0); |
8779 | } |
8780 | } |
8781 | |
8782 | // We don't usually want to end the call-sequence here because we would tidy |
8783 | // the frame up *after* the call, however in the ABI-changing tail-call case |
8784 | // we've carefully laid out the parameters so that when sp is reset they'll be |
8785 | // in the correct location. |
8786 | if (IsTailCall && !IsSibCall) { |
8787 | Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: InGlue, DL); |
8788 | InGlue = Chain.getValue(R: 1); |
8789 | } |
8790 | |
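// Pick the base call opcode; it is refined below for ObjC ARC attached-call
// markers, Arm64EC thunks, BTI-guarded calls and pointer-authenticated calls.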
8791 | unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL; |
8792 | |
8793 | std::vector<SDValue> Ops; |
8794 | Ops.push_back(x: Chain); |
8795 | Ops.push_back(x: Callee); |
8796 | |
8797 | // Calls with operand bundle "clang.arc.attachedcall" are special. They should |
8798 | // be expanded to the call, directly followed by a special marker sequence and |
8799 | // a call to an ObjC library function. Use CALL_RVMARKER to do that. |
8800 | if (CLI.CB && objcarc::hasAttachedCallOpBundle(CB: CLI.CB)) { |
8801 | assert(!IsTailCall && |
8802 | "tail calls cannot be marked with clang.arc.attachedcall" ); |
8803 | Opc = AArch64ISD::CALL_RVMARKER; |
8804 | |
8805 | // Add a target global address for the retainRV/claimRV runtime function |
8806 | // just before the call target. |
8807 | Function *ARCFn = *objcarc::getAttachedARCFunction(CB: CLI.CB); |
8808 | auto GA = DAG.getTargetGlobalAddress(GV: ARCFn, DL, VT: PtrVT); |
8809 | Ops.insert(position: Ops.begin() + 1, x: GA); |
8810 | } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) { |
8811 | Opc = AArch64ISD::CALL_ARM64EC_TO_X64; |
8812 | } else if (GuardWithBTI) { |
8813 | Opc = AArch64ISD::CALL_BTI; |
8814 | } |
8815 | |
8816 | if (IsTailCall) { |
8817 | // Each tail call may have to adjust the stack by a different amount, so |
8818 | // this information must travel along with the operation for eventual |
8819 | // consumption by emitEpilogue. |
8820 | Ops.push_back(x: DAG.getTargetConstant(Val: FPDiff, DL, VT: MVT::i32)); |
8821 | } |
8822 | |
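// Pointer-authenticated calls carry the key and the discriminator (split into
// integer and address components) as extra operands.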
8823 | if (CLI.PAI) { |
8824 | const uint64_t Key = CLI.PAI->Key; |
8825 | assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) && |
8826 | "Invalid auth call key" ); |
8827 | |
8828 | // Split the discriminator into address/integer components. |
8829 | SDValue AddrDisc, IntDisc; |
8830 | std::tie(args&: IntDisc, args&: AddrDisc) = |
8831 | extractPtrauthBlendDiscriminators(Disc: CLI.PAI->Discriminator, DAG: &DAG); |
8832 | |
8833 | if (Opc == AArch64ISD::CALL_RVMARKER) |
8834 | Opc = AArch64ISD::AUTH_CALL_RVMARKER; |
8835 | else |
8836 | Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL; |
8837 | Ops.push_back(x: DAG.getTargetConstant(Val: Key, DL, VT: MVT::i32)); |
8838 | Ops.push_back(x: IntDisc); |
8839 | Ops.push_back(x: AddrDisc); |
8840 | } |
8841 | |
8842 | // Add argument registers to the end of the list so that they are known live |
8843 | // into the call. |
8844 | for (auto &RegToPass : RegsToPass) |
8845 | Ops.push_back(x: DAG.getRegister(Reg: RegToPass.first, |
8846 | VT: RegToPass.second.getValueType())); |
8847 | |
8848 | // Add a register mask operand representing the call-preserved registers. |
8849 | const uint32_t *Mask; |
8850 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
8851 | if (IsThisReturn) { |
8852 | // For 'this' returns, use the X0-preserving mask if applicable |
8853 | Mask = TRI->getThisReturnPreservedMask(MF, CallConv); |
8854 | if (!Mask) { |
8855 | IsThisReturn = false; |
8856 | Mask = TRI->getCallPreservedMask(MF, CallConv); |
8857 | } |
8858 | } else |
8859 | Mask = TRI->getCallPreservedMask(MF, CallConv); |
8860 | |
8861 | if (Subtarget->hasCustomCallingConv()) |
8862 | TRI->UpdateCustomCallPreservedMask(MF, Mask: &Mask); |
8863 | |
8864 | if (TRI->isAnyArgRegReserved(MF)) |
8865 | TRI->emitReservedArgRegCallError(MF); |
8866 | |
8867 | assert(Mask && "Missing call preserved mask for calling convention" ); |
8868 | Ops.push_back(x: DAG.getRegisterMask(RegMask: Mask)); |
8869 | |
8870 | if (InGlue.getNode()) |
8871 | Ops.push_back(x: InGlue); |
8872 | |
8873 | SDVTList NodeTys = DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue); |
8874 | |
// If we're doing a tail call, use a TC_RETURN here rather than an
8876 | // actual call instruction. |
8877 | if (IsTailCall) { |
8878 | MF.getFrameInfo().setHasTailCall(); |
8879 | SDValue Ret = DAG.getNode(Opcode: Opc, DL, VTList: NodeTys, Ops); |
8880 | if (IsCFICall) |
8881 | Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); |
8882 | |
8883 | DAG.addNoMergeSiteInfo(Node: Ret.getNode(), NoMerge: CLI.NoMerge); |
8884 | DAG.addCallSiteInfo(Node: Ret.getNode(), CallInfo: std::move(CSInfo)); |
8885 | return Ret; |
8886 | } |
8887 | |
8888 | // Returns a chain and a flag for retval copy to use. |
8889 | Chain = DAG.getNode(Opcode: Opc, DL, VTList: NodeTys, Ops); |
8890 | if (IsCFICall) |
8891 | Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); |
8892 | |
8893 | DAG.addNoMergeSiteInfo(Node: Chain.getNode(), NoMerge: CLI.NoMerge); |
8894 | InGlue = Chain.getValue(R: 1); |
8895 | DAG.addCallSiteInfo(Node: Chain.getNode(), CallInfo: std::move(CSInfo)); |
8896 | |
8897 | uint64_t CalleePopBytes = |
8898 | DoesCalleeRestoreStack(CallCC: CallConv, TailCallOpt) ? alignTo(Value: NumBytes, Align: 16) : 0; |
8899 | |
8900 | Chain = DAG.getCALLSEQ_END(Chain, Size1: NumBytes, Size2: CalleePopBytes, Glue: InGlue, DL); |
8901 | InGlue = Chain.getValue(R: 1); |
8902 | |
8903 | // Handle result values, copying them out of physregs into vregs that we |
8904 | // return. |
8905 | SDValue Result = LowerCallResult( |
8906 | Chain, InGlue, CallConv, isVarArg: IsVarArg, RVLocs, DL, DAG, InVals, isThisReturn: IsThisReturn, |
8907 | ThisVal: IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange); |
8908 | |
8909 | if (!Ins.empty()) |
8910 | InGlue = Result.getValue(R: Result->getNumValues() - 1); |
8911 | |
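// Switch back to the caller's streaming mode after the call, and emit the
// VG_RESTORE matching the VG_SAVE above.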
8912 | if (RequiresSMChange) { |
8913 | assert(PStateSM && "Expected a PStateSM to be set" ); |
8914 | Result = changeStreamingMode( |
8915 | DAG, DL, Enable: !CalleeAttrs.hasStreamingInterface(), Chain: Result, InGlue, |
8916 | Condition: getSMCondition(CallerAttrs, CalleeAttrs), PStateSM); |
8917 | InGlue = Result.getValue(R: 1); |
8918 | |
8919 | Result = |
8920 | DAG.getNode(Opcode: AArch64ISD::VG_RESTORE, DL, |
8921 | VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops: {Result, InGlue}); |
8922 | } |
8923 | |
8924 | if (CallerAttrs.requiresEnablingZAAfterCall(Callee: CalleeAttrs)) |
8925 | // Unconditionally resume ZA. |
8926 | Result = DAG.getNode( |
8927 | Opcode: AArch64ISD::SMSTART, DL, VT: MVT::Other, N1: Result, |
8928 | N2: DAG.getTargetConstant(Val: (int32_t)(AArch64SVCR::SVCRZA), DL, VT: MVT::i32), |
8929 | N3: DAG.getConstant(Val: AArch64SME::Always, DL, VT: MVT::i64)); |
8930 | |
8931 | if (ShouldPreserveZT0) |
8932 | Result = |
8933 | DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL, VTList: DAG.getVTList(VT: MVT::Other), |
8934 | Ops: {Result, DAG.getConstant(Val: 0, DL, VT: MVT::i32), ZTFrameIdx}); |
8935 | |
8936 | if (RequiresLazySave) { |
8937 | // Conditionally restore the lazy save using a pseudo node. |
8938 | TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); |
8939 | SDValue RegMask = DAG.getRegisterMask( |
8940 | RegMask: TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); |
8941 | SDValue RestoreRoutine = DAG.getTargetExternalSymbol( |
8942 | Sym: "__arm_tpidr2_restore" , VT: getPointerTy(DL: DAG.getDataLayout())); |
8943 | SDValue TPIDR2_EL0 = DAG.getNode( |
8944 | Opcode: ISD::INTRINSIC_W_CHAIN, DL, VT: MVT::i64, N1: Result, |
8945 | N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_get_tpidr2, DL, VT: MVT::i32)); |
8946 | |
8947 | // Copy the address of the TPIDR2 block into X0 before 'calling' the |
8948 | // RESTORE_ZA pseudo. |
8949 | SDValue Glue; |
8950 | SDValue TPIDR2Block = DAG.getFrameIndex( |
8951 | FI: TPIDR2.FrameIndex, |
8952 | VT: DAG.getTargetLoweringInfo().getFrameIndexTy(DL: DAG.getDataLayout())); |
8953 | Result = DAG.getCopyToReg(Chain: Result, dl: DL, Reg: AArch64::X0, N: TPIDR2Block, Glue); |
8954 | Result = |
8955 | DAG.getNode(Opcode: AArch64ISD::RESTORE_ZA, DL, VT: MVT::Other, |
8956 | Ops: {Result, TPIDR2_EL0, DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64), |
8957 | RestoreRoutine, RegMask, Result.getValue(R: 1)}); |
8958 | |
8959 | // Finally reset the TPIDR2_EL0 register to 0. |
8960 | Result = DAG.getNode( |
8961 | Opcode: ISD::INTRINSIC_VOID, DL, VT: MVT::Other, N1: Result, |
8962 | N2: DAG.getConstant(Val: Intrinsic::aarch64_sme_set_tpidr2, DL, VT: MVT::i32), |
8963 | N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
8964 | TPIDR2.Uses++; |
8965 | } |
8966 | |
8967 | if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { |
8968 | for (unsigned I = 0; I < InVals.size(); ++I) { |
8969 | // The smstart/smstop is chained as part of the call, but when the |
8970 | // resulting chain is discarded (which happens when the call is not part |
8971 | // of a chain, e.g. a call to @llvm.cos()), we need to ensure the |
8972 | // smstart/smstop is chained to the result value. We can do that by doing |
8973 | // a vreg -> vreg copy. |
8974 | Register Reg = MF.getRegInfo().createVirtualRegister( |
8975 | RegClass: getRegClassFor(VT: InVals[I].getValueType().getSimpleVT())); |
8976 | SDValue X = DAG.getCopyToReg(Chain: Result, dl: DL, Reg, N: InVals[I]); |
8977 | InVals[I] = DAG.getCopyFromReg(Chain: X, dl: DL, Reg, |
8978 | VT: InVals[I].getValueType()); |
8979 | } |
8980 | } |
8981 | |
8982 | if (CallConv == CallingConv::PreserveNone) { |
8983 | for (const ISD::OutputArg &O : Outs) { |
8984 | if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() || |
8985 | O.Flags.isSwiftAsync()) { |
8986 | MachineFunction &MF = DAG.getMachineFunction(); |
8987 | DAG.getContext()->diagnose(DI: DiagnosticInfoUnsupported( |
8988 | MF.getFunction(), |
8989 | "Swift attributes can't be used with preserve_none" , |
8990 | DL.getDebugLoc())); |
8991 | break; |
8992 | } |
8993 | } |
8994 | } |
8995 | |
8996 | return Result; |
8997 | } |
8998 | |
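/// Return true if the values in Outs can all be assigned locations by the
/// return calling convention, i.e. the return can be lowered as-is.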
8999 | bool AArch64TargetLowering::CanLowerReturn( |
9000 | CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, |
9001 | const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { |
9002 | CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv); |
9003 | SmallVector<CCValAssign, 16> RVLocs; |
9004 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); |
9005 | return CCInfo.CheckReturn(Outs, Fn: RetCC); |
9006 | } |
9007 | |
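/// Lower outgoing return values into the registers dictated by the return
/// calling convention, emitting the SMSTOP required when returning from a
/// locally streaming function, and finish with RET_GLUE (or the Arm64EC entry
/// thunk's dispatch-return sequence).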
9008 | SDValue |
9009 | AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
9010 | bool isVarArg, |
9011 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
9012 | const SmallVectorImpl<SDValue> &OutVals, |
9013 | const SDLoc &DL, SelectionDAG &DAG) const { |
9014 | auto &MF = DAG.getMachineFunction(); |
9015 | auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
9016 | |
9017 | CCAssignFn *RetCC = CCAssignFnForReturn(CC: CallConv); |
9018 | SmallVector<CCValAssign, 16> RVLocs; |
9019 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); |
9020 | CCInfo.AnalyzeReturn(Outs, Fn: RetCC); |
9021 | |
9022 | // Copy the result values into the output registers. |
9023 | SDValue Glue; |
9024 | SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; |
9025 | SmallSet<unsigned, 4> RegsUsed; |
9026 | for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); |
9027 | ++i, ++realRVLocIdx) { |
9028 | CCValAssign &VA = RVLocs[i]; |
9029 | assert(VA.isRegLoc() && "Can only return in registers!" ); |
9030 | SDValue Arg = OutVals[realRVLocIdx]; |
9031 | |
9032 | switch (VA.getLocInfo()) { |
9033 | default: |
9034 | llvm_unreachable("Unknown loc info!" ); |
9035 | case CCValAssign::Full: |
9036 | if (Outs[i].ArgVT == MVT::i1) { |
9037 | // AAPCS requires i1 to be zero-extended to i8 by the producer of the |
9038 | // value. This is strictly redundant on Darwin (which uses "zeroext |
9039 | // i1"), but will be optimised out before ISel. |
9040 | Arg = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i1, Operand: Arg); |
9041 | Arg = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: VA.getLocVT(), Operand: Arg); |
9042 | } |
9043 | break; |
9044 | case CCValAssign::BCvt: |
9045 | Arg = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: VA.getLocVT(), Operand: Arg); |
9046 | break; |
9047 | case CCValAssign::AExt: |
9048 | case CCValAssign::ZExt: |
9049 | Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT()); |
9050 | break; |
9051 | case CCValAssign::AExtUpper: |
9052 | assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits" ); |
9053 | Arg = DAG.getZExtOrTrunc(Op: Arg, DL, VT: VA.getLocVT()); |
9054 | Arg = DAG.getNode(Opcode: ISD::SHL, DL, VT: VA.getLocVT(), N1: Arg, |
9055 | N2: DAG.getConstant(Val: 32, DL, VT: VA.getLocVT())); |
9056 | break; |
9057 | } |
9058 | |
9059 | if (RegsUsed.count(V: VA.getLocReg())) { |
9060 | SDValue &Bits = |
9061 | llvm::find_if(Range&: RetVals, P: [=](const std::pair<unsigned, SDValue> &Elt) { |
9062 | return Elt.first == VA.getLocReg(); |
9063 | })->second; |
9064 | Bits = DAG.getNode(Opcode: ISD::OR, DL, VT: Bits.getValueType(), N1: Bits, N2: Arg); |
9065 | } else { |
9066 | RetVals.emplace_back(Args: VA.getLocReg(), Args&: Arg); |
9067 | RegsUsed.insert(V: VA.getLocReg()); |
9068 | } |
9069 | } |
9070 | |
9071 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
9072 | |
9073 | // Emit SMSTOP before returning from a locally streaming function |
9074 | SMEAttrs FuncAttrs(MF.getFunction()); |
9075 | if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) { |
9076 | if (FuncAttrs.hasStreamingCompatibleInterface()) { |
9077 | Register Reg = FuncInfo->getPStateSMReg(); |
9078 | assert(Reg.isValid() && "PStateSM Register is invalid" ); |
9079 | SDValue PStateSM = DAG.getCopyFromReg(Chain, dl: DL, Reg, VT: MVT::i64); |
9080 | Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, |
9081 | /*Glue*/ InGlue: SDValue(), |
9082 | Condition: AArch64SME::IfCallerIsNonStreaming, PStateSM); |
9083 | } else |
9084 | Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain, |
9085 | /*Glue*/ InGlue: SDValue(), Condition: AArch64SME::Always); |
9086 | Glue = Chain.getValue(R: 1); |
9087 | } |
9088 | |
9089 | SmallVector<SDValue, 4> RetOps(1, Chain); |
9090 | for (auto &RetVal : RetVals) { |
9091 | if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() && |
9092 | isPassedInFPR(VT: RetVal.second.getValueType())) |
9093 | RetVal.second = DAG.getNode(Opcode: AArch64ISD::COALESCER_BARRIER, DL, |
9094 | VT: RetVal.second.getValueType(), Operand: RetVal.second); |
9095 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetVal.first, N: RetVal.second, Glue); |
9096 | Glue = Chain.getValue(R: 1); |
9097 | RetOps.push_back( |
9098 | Elt: DAG.getRegister(Reg: RetVal.first, VT: RetVal.second.getValueType())); |
9099 | } |
9100 | |
9101 | // Windows AArch64 ABIs require that for returning structs by value we copy |
9102 | // the sret argument into X0 for the return. |
9103 | // We saved the argument into a virtual register in the entry block, |
9104 | // so now we copy the value out and into X0. |
9105 | if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { |
9106 | SDValue Val = DAG.getCopyFromReg(Chain: RetOps[0], dl: DL, Reg: SRetReg, |
9107 | VT: getPointerTy(DL: MF.getDataLayout())); |
9108 | |
9109 | unsigned RetValReg = AArch64::X0; |
9110 | if (CallConv == CallingConv::ARM64EC_Thunk_X64) |
9111 | RetValReg = AArch64::X8; |
9112 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: RetValReg, N: Val, Glue); |
9113 | Glue = Chain.getValue(R: 1); |
9114 | |
9115 | RetOps.push_back( |
9116 | Elt: DAG.getRegister(Reg: RetValReg, VT: getPointerTy(DL: DAG.getDataLayout()))); |
9117 | } |
9118 | |
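// If some callee-saved registers are preserved via copies rather than in the
// prologue/epilogue, add them as return operands so they are kept live.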
9119 | const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(MF: &MF); |
9120 | if (I) { |
9121 | for (; *I; ++I) { |
9122 | if (AArch64::GPR64RegClass.contains(Reg: *I)) |
9123 | RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::i64)); |
9124 | else if (AArch64::FPR64RegClass.contains(Reg: *I)) |
9125 | RetOps.push_back(Elt: DAG.getRegister(Reg: *I, VT: MVT::getFloatingPointVT(BitWidth: 64))); |
9126 | else |
9127 | llvm_unreachable("Unexpected register class in CSRsViaCopy!" ); |
9128 | } |
9129 | } |
9130 | |
9131 | RetOps[0] = Chain; // Update chain. |
9132 | |
9133 | // Add the glue if we have it. |
9134 | if (Glue.getNode()) |
9135 | RetOps.push_back(Elt: Glue); |
9136 | |
9137 | if (CallConv == CallingConv::ARM64EC_Thunk_X64) { |
9138 | // ARM64EC entry thunks use a special return sequence: instead of a regular |
9139 | // "ret" instruction, they need to explicitly call the emulator. |
9140 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
9141 | SDValue Arm64ECRetDest = |
9142 | DAG.getExternalSymbol(Sym: "__os_arm64x_dispatch_ret" , VT: PtrVT); |
9143 | Arm64ECRetDest = |
9144 | getAddr(N: cast<ExternalSymbolSDNode>(Val&: Arm64ECRetDest), DAG, Flags: 0); |
9145 | Arm64ECRetDest = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Arm64ECRetDest, |
9146 | PtrInfo: MachinePointerInfo()); |
9147 | RetOps.insert(I: RetOps.begin() + 1, Elt: Arm64ECRetDest); |
9148 | RetOps.insert(I: RetOps.begin() + 2, Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)); |
9149 | return DAG.getNode(Opcode: AArch64ISD::TC_RETURN, DL, VT: MVT::Other, Ops: RetOps); |
9150 | } |
9151 | |
9152 | return DAG.getNode(Opcode: AArch64ISD::RET_GLUE, DL, VT: MVT::Other, Ops: RetOps); |
9153 | } |
9154 | |
9155 | //===----------------------------------------------------------------------===// |
9156 | // Other Lowering Code |
9157 | //===----------------------------------------------------------------------===// |
9158 | |
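// The getTargetNode overloads below build the target-specific form of each
// kind of symbol (global address, jump table, constant pool, block address,
// external symbol) so that the address-generation helpers getGOT,
// getAddrLarge, getAddr and getAddrTiny can be templated over the node type.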
9159 | SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, |
9160 | SelectionDAG &DAG, |
9161 | unsigned Flag) const { |
9162 | return DAG.getTargetGlobalAddress(GV: N->getGlobal(), DL: SDLoc(N), VT: Ty, |
9163 | offset: N->getOffset(), TargetFlags: Flag); |
9164 | } |
9165 | |
9166 | SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, |
9167 | SelectionDAG &DAG, |
9168 | unsigned Flag) const { |
9169 | return DAG.getTargetJumpTable(JTI: N->getIndex(), VT: Ty, TargetFlags: Flag); |
9170 | } |
9171 | |
9172 | SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, |
9173 | SelectionDAG &DAG, |
9174 | unsigned Flag) const { |
9175 | return DAG.getTargetConstantPool(C: N->getConstVal(), VT: Ty, Align: N->getAlign(), |
9176 | Offset: N->getOffset(), TargetFlags: Flag); |
9177 | } |
9178 | |
SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
9180 | SelectionDAG &DAG, |
9181 | unsigned Flag) const { |
9182 | return DAG.getTargetBlockAddress(BA: N->getBlockAddress(), VT: Ty, Offset: 0, TargetFlags: Flag); |
9183 | } |
9184 | |
9185 | SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty, |
9186 | SelectionDAG &DAG, |
9187 | unsigned Flag) const { |
9188 | return DAG.getTargetExternalSymbol(Sym: N->getSymbol(), VT: Ty, TargetFlags: Flag); |
9189 | } |
9190 | |
9191 | // (loadGOT sym) |
9192 | template <class NodeTy> |
9193 | SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, |
9194 | unsigned Flags) const { |
9195 | LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n" ); |
9196 | SDLoc DL(N); |
9197 | EVT Ty = getPointerTy(DL: DAG.getDataLayout()); |
9198 | SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); |
9199 | // FIXME: Once remat is capable of dealing with instructions with register |
9200 | // operands, expand this into two nodes instead of using a wrapper node. |
9201 | return DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: Ty, Operand: GotAddr); |
9202 | } |
9203 | |
9204 | // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) |
9205 | template <class NodeTy> |
9206 | SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, |
9207 | unsigned Flags) const { |
9208 | LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n" ); |
9209 | SDLoc DL(N); |
9210 | EVT Ty = getPointerTy(DL: DAG.getDataLayout()); |
9211 | const unsigned char MO_NC = AArch64II::MO_NC; |
9212 | return DAG.getNode( |
9213 | AArch64ISD::WrapperLarge, DL, Ty, |
9214 | getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), |
9215 | getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), |
9216 | getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), |
9217 | getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); |
9218 | } |
9219 | |
9220 | // (addlow (adrp %hi(sym)) %lo(sym)) |
9221 | template <class NodeTy> |
9222 | SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, |
9223 | unsigned Flags) const { |
9224 | LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n" ); |
9225 | SDLoc DL(N); |
9226 | EVT Ty = getPointerTy(DL: DAG.getDataLayout()); |
9227 | SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); |
9228 | SDValue Lo = getTargetNode(N, Ty, DAG, |
9229 | AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); |
9230 | SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: Ty, Operand: Hi); |
9231 | return DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: Ty, N1: ADRP, N2: Lo); |
9232 | } |
9233 | |
9234 | // (adr sym) |
9235 | template <class NodeTy> |
9236 | SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG, |
9237 | unsigned Flags) const { |
9238 | LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n" ); |
9239 | SDLoc DL(N); |
9240 | EVT Ty = getPointerTy(DL: DAG.getDataLayout()); |
9241 | SDValue Sym = getTargetNode(N, Ty, DAG, Flags); |
9242 | return DAG.getNode(Opcode: AArch64ISD::ADR, DL, VT: Ty, Operand: Sym); |
9243 | } |
9244 | |
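// Lower a global address by classifying the reference and then picking a GOT
// load (getGOT), the large-code-model wrapper (getAddrLarge), the
// tiny-code-model ADR (getAddrTiny) or the usual ADRP/ADDlow pair (getAddr).
// DLL-imported and COFF-stub symbols need an extra load through the computed
// pointer.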
9245 | SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, |
9246 | SelectionDAG &DAG) const { |
9247 | GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Val&: Op); |
9248 | const GlobalValue *GV = GN->getGlobal(); |
9249 | unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, TM: getTargetMachine()); |
9250 | |
9251 | if (OpFlags != AArch64II::MO_NO_FLAG) |
9252 | assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && |
9253 | "unexpected offset in global node" ); |
9254 | |
9255 | // This also catches the large code model case for Darwin, and tiny code |
9256 | // model with got relocations. |
9257 | if ((OpFlags & AArch64II::MO_GOT) != 0) { |
9258 | return getGOT(N: GN, DAG, Flags: OpFlags); |
9259 | } |
9260 | |
9261 | SDValue Result; |
9262 | if (getTargetMachine().getCodeModel() == CodeModel::Large && |
9263 | !getTargetMachine().isPositionIndependent()) { |
9264 | Result = getAddrLarge(N: GN, DAG, Flags: OpFlags); |
9265 | } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) { |
9266 | Result = getAddrTiny(N: GN, DAG, Flags: OpFlags); |
9267 | } else { |
9268 | Result = getAddr(N: GN, DAG, Flags: OpFlags); |
9269 | } |
9270 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
9271 | SDLoc DL(GN); |
9272 | if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)) |
9273 | Result = DAG.getLoad(VT: PtrVT, dl: DL, Chain: DAG.getEntryNode(), Ptr: Result, |
9274 | PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction())); |
9275 | return Result; |
9276 | } |
9277 | |
9278 | /// Convert a TLS address reference into the correct sequence of loads |
9279 | /// and calls to compute the variable's address (for Darwin, currently) and |
9280 | /// return an SDValue containing the final node. |
9281 | |
9282 | /// Darwin only has one TLS scheme which must be capable of dealing with the |
9283 | /// fully general situation, in the worst case. This means: |
9284 | /// + "extern __thread" declaration. |
9285 | /// + Defined in a possibly unknown dynamic library. |
9286 | /// |
9287 | /// The general system is that each __thread variable has a [3 x i64] descriptor |
9288 | /// which contains information used by the runtime to calculate the address. The |
9289 | /// only part of this the compiler needs to know about is the first xword, which |
9290 | /// contains a function pointer that must be called with the address of the |
9291 | /// entire descriptor in "x0". |
9292 | /// |
9293 | /// Since this descriptor may be in a different unit, in general even the |
9294 | /// descriptor must be accessed via an indirect load. The "ideal" code sequence |
9295 | /// is: |
9296 | /// adrp x0, _var@TLVPPAGE |
9297 | /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor |
9298 | /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, |
9299 | /// ; the function pointer |
9300 | /// blr x1 ; Uses descriptor address in x0 |
9301 | /// ; Address of _var is now in x0. |
9302 | /// |
9303 | /// If the address of _var's descriptor *is* known to the linker, then it can |
9304 | /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for |
9305 | /// a slight efficiency gain. |
9306 | SDValue |
9307 | AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, |
9308 | SelectionDAG &DAG) const { |
9309 | assert(Subtarget->isTargetDarwin() && |
9310 | "This function expects a Darwin target" ); |
9311 | |
9312 | SDLoc DL(Op); |
9313 | MVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
9314 | MVT PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout()); |
9315 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Val&: Op)->getGlobal(); |
9316 | |
9317 | SDValue TLVPAddr = |
9318 | DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS); |
9319 | SDValue DescAddr = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TLVPAddr); |
9320 | |
9321 | // The first entry in the descriptor is a function pointer that we must call |
9322 | // to obtain the address of the variable. |
9323 | SDValue Chain = DAG.getEntryNode(); |
9324 | SDValue FuncTLVGet = DAG.getLoad( |
9325 | VT: PtrMemVT, dl: DL, Chain, Ptr: DescAddr, |
9326 | PtrInfo: MachinePointerInfo::getGOT(MF&: DAG.getMachineFunction()), |
9327 | Alignment: Align(PtrMemVT.getSizeInBits() / 8), |
9328 | MMOFlags: MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); |
9329 | Chain = FuncTLVGet.getValue(R: 1); |
9330 | |
9331 | // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. |
9332 | FuncTLVGet = DAG.getZExtOrTrunc(Op: FuncTLVGet, DL, VT: PtrVT); |
9333 | |
9334 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
9335 | MFI.setAdjustsStack(true); |
9336 | |
9337 | // TLS calls preserve all registers except those that absolutely must be |
9338 | // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be |
9339 | // silly). |
9340 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
9341 | const uint32_t *Mask = TRI->getTLSCallPreservedMask(); |
9342 | if (Subtarget->hasCustomCallingConv()) |
9343 | TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask); |
9344 | |
9345 | // Finally, we can make the call. This is just a degenerate version of a |
9346 | // normal AArch64 call node: x0 takes the address of the descriptor, and |
9347 | // returns the address of the variable in this thread. |
9348 | Chain = DAG.getCopyToReg(Chain, dl: DL, Reg: AArch64::X0, N: DescAddr, Glue: SDValue()); |
9349 | |
9350 | unsigned Opcode = AArch64ISD::CALL; |
9351 | SmallVector<SDValue, 8> Ops; |
9352 | Ops.push_back(Elt: Chain); |
9353 | Ops.push_back(Elt: FuncTLVGet); |
9354 | |
9355 | // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0). |
9356 | if (DAG.getMachineFunction().getFunction().hasFnAttribute(Kind: "ptrauth-calls" )) { |
9357 | Opcode = AArch64ISD::AUTH_CALL; |
9358 | Ops.push_back(Elt: DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32)); |
9359 | Ops.push_back(Elt: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64)); // Integer Disc. |
9360 | Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::NoRegister, VT: MVT::i64)); // Addr Disc. |
9361 | } |
9362 | |
9363 | Ops.push_back(Elt: DAG.getRegister(Reg: AArch64::X0, VT: MVT::i64)); |
9364 | Ops.push_back(Elt: DAG.getRegisterMask(RegMask: Mask)); |
9365 | Ops.push_back(Elt: Chain.getValue(R: 1)); |
9366 | Chain = DAG.getNode(Opcode, DL, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), Ops); |
9367 | return DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::X0, VT: PtrVT, Glue: Chain.getValue(R: 1)); |
9368 | } |
9369 | |
9370 | /// Convert a thread-local variable reference into a sequence of instructions to |
9371 | /// compute the variable's address for the local exec TLS model of ELF targets. |
9372 | /// The sequence depends on the maximum TLS area size. |
9373 | SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, |
9374 | SDValue ThreadBase, |
9375 | const SDLoc &DL, |
9376 | SelectionDAG &DAG) const { |
9377 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
9378 | SDValue TPOff, Addr; |
9379 | |
9380 | switch (DAG.getTarget().Options.TLSSize) { |
9381 | default: |
9382 | llvm_unreachable("Unexpected TLS size" ); |
9383 | |
9384 | case 12: { |
9385 | // mrs x0, TPIDR_EL0 |
9386 | // add x0, x0, :tprel_lo12:a |
9387 | SDValue Var = DAG.getTargetGlobalAddress( |
9388 | GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); |
9389 | return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase, |
9390 | Op2: Var, |
9391 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9392 | 0); |
9393 | } |
9394 | |
9395 | case 24: { |
9396 | // mrs x0, TPIDR_EL0 |
9397 | // add x0, x0, :tprel_hi12:a |
9398 | // add x0, x0, :tprel_lo12_nc:a |
9399 | SDValue HiVar = DAG.getTargetGlobalAddress( |
9400 | GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12); |
9401 | SDValue LoVar = DAG.getTargetGlobalAddress( |
9402 | GV, DL, VT: PtrVT, offset: 0, |
9403 | TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
9404 | Addr = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: ThreadBase, |
9405 | Op2: HiVar, |
9406 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9407 | 0); |
9408 | return SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: Addr, |
9409 | Op2: LoVar, |
9410 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9411 | 0); |
9412 | } |
9413 | |
9414 | case 32: { |
9415 | // mrs x1, TPIDR_EL0 |
9416 | // movz x0, #:tprel_g1:a |
9417 | // movk x0, #:tprel_g0_nc:a |
9418 | // add x0, x1, x0 |
9419 | SDValue HiVar = DAG.getTargetGlobalAddress( |
9420 | GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1); |
9421 | SDValue LoVar = DAG.getTargetGlobalAddress( |
9422 | GV, DL, VT: PtrVT, offset: 0, |
9423 | TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); |
9424 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar, |
9425 | Op2: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)), |
9426 | 0); |
9427 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar, |
9428 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9429 | 0); |
9430 | return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff); |
9431 | } |
9432 | |
9433 | case 48: { |
9434 | // mrs x1, TPIDR_EL0 |
9435 | // movz x0, #:tprel_g2:a |
9436 | // movk x0, #:tprel_g1_nc:a |
9437 | // movk x0, #:tprel_g0_nc:a |
9438 | // add x0, x1, x0 |
9439 | SDValue HiVar = DAG.getTargetGlobalAddress( |
9440 | GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G2); |
9441 | SDValue MiVar = DAG.getTargetGlobalAddress( |
9442 | GV, DL, VT: PtrVT, offset: 0, |
9443 | TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); |
9444 | SDValue LoVar = DAG.getTargetGlobalAddress( |
9445 | GV, DL, VT: PtrVT, offset: 0, |
9446 | TargetFlags: AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); |
9447 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVZXi, dl: DL, VT: PtrVT, Op1: HiVar, |
9448 | Op2: DAG.getTargetConstant(Val: 32, DL, VT: MVT::i32)), |
9449 | 0); |
9450 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: MiVar, |
9451 | Op3: DAG.getTargetConstant(Val: 16, DL, VT: MVT::i32)), |
9452 | 0); |
9453 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::MOVKXi, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar, |
9454 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9455 | 0); |
9456 | return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff); |
9457 | } |
9458 | } |
9459 | } |
9460 | |
9461 | /// When accessing thread-local variables under either the general-dynamic or |
9462 | /// local-dynamic system, we make a "TLS-descriptor" call. The variable will |
9463 | /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry |
9464 | /// is a function pointer to carry out the resolution. |
9465 | /// |
9466 | /// The sequence is: |
9467 | /// adrp x0, :tlsdesc:var |
9468 | /// ldr x1, [x0, #:tlsdesc_lo12:var] |
9469 | /// add x0, x0, #:tlsdesc_lo12:var |
9470 | /// .tlsdesccall var |
9471 | /// blr x1 |
9472 | /// (TPIDR_EL0 offset now in x0) |
9473 | /// |
/// The above sequence must be produced unscheduled so that the linker can
/// optimize/relax it. A pseudo-instruction (TLSDESC_CALLSEQ) is therefore used
/// to represent the whole sequence; it is expanded very late in the
/// compilation flow to ensure the sequence is emitted exactly as shown above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());

SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

Chain =
DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);

return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
9493 | |
9494 | SDValue |
9495 | AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, |
9496 | SelectionDAG &DAG) const { |
9497 | assert(Subtarget->isTargetELF() && "This function expects an ELF target" ); |
9498 | |
9499 | const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op); |
9500 | |
9501 | TLSModel::Model Model = getTargetMachine().getTLSModel(GV: GA->getGlobal()); |
9502 | |
9503 | if (!EnableAArch64ELFLocalDynamicTLSGeneration) { |
9504 | if (Model == TLSModel::LocalDynamic) |
9505 | Model = TLSModel::GeneralDynamic; |
9506 | } |
9507 | |
9508 | if (getTargetMachine().getCodeModel() == CodeModel::Large && |
9509 | Model != TLSModel::LocalExec) |
9510 | report_fatal_error(reason: "ELF TLS only supported in small memory model or " |
9511 | "in local exec TLS model" ); |
9512 | // Different choices can be made for the maximum size of the TLS area for a |
9513 | // module. For the small address model, the default TLS size is 16MiB and the |
9514 | // maximum TLS size is 4GiB. |
9515 | // FIXME: add tiny and large code model support for TLS access models other |
9516 | // than local exec. We currently generate the same code as small for tiny, |
9517 | // which may be larger than needed. |
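// For reference (register choices here are illustrative, not fixed by this
// code), an initial-exec access of a variable "var" handled below is
// typically emitted as:
//   mrs  x1, TPIDR_EL0
//   adrp x0, :gottprel:var
//   ldr  x0, [x0, #:gottprel_lo12:var]
//   add  x0, x1, x0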
9518 | |
9519 | SDValue TPOff; |
9520 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
9521 | SDLoc DL(Op); |
9522 | const GlobalValue *GV = GA->getGlobal(); |
9523 | |
9524 | SDValue ThreadBase = DAG.getNode(Opcode: AArch64ISD::THREAD_POINTER, DL, VT: PtrVT); |
9525 | |
9526 | if (Model == TLSModel::LocalExec) { |
9527 | return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); |
9528 | } else if (Model == TLSModel::InitialExec) { |
9529 | TPOff = DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS); |
9530 | TPOff = DAG.getNode(Opcode: AArch64ISD::LOADgot, DL, VT: PtrVT, Operand: TPOff); |
9531 | } else if (Model == TLSModel::LocalDynamic) { |
// Local-dynamic accesses proceed in two phases: first, a general-dynamic TLS
// descriptor call against the special symbol _TLS_MODULE_BASE_ calculates the
// beginning of the module's TLS region; this is then followed by a DTPREL
// offset calculation for the variable itself.
9536 | |
9537 | // These accesses will need deduplicating if there's more than one. |
9538 | AArch64FunctionInfo *MFI = |
9539 | DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
9540 | MFI->incNumLocalDynamicTLSAccesses(); |
9541 | |
// The call also needs a relocation for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though, so we need another copy of
// the address.
9545 | SDValue SymAddr = DAG.getTargetExternalSymbol(Sym: "_TLS_MODULE_BASE_" , VT: PtrVT, |
9546 | TargetFlags: AArch64II::MO_TLS); |
9547 | |
9548 | // Now we can calculate the offset from TPIDR_EL0 to this module's |
9549 | // thread-local area. |
9550 | TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); |
9551 | |
9552 | // Now use :dtprel_whatever: operations to calculate this variable's offset |
9553 | // in its thread-storage area. |
9554 | SDValue HiVar = DAG.getTargetGlobalAddress( |
9555 | GV, DL, VT: MVT::i64, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12); |
9556 | SDValue LoVar = DAG.getTargetGlobalAddress( |
9557 | GV, DL, VT: MVT::i64, offset: 0, |
9558 | TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
9559 | |
9560 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: HiVar, |
9561 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9562 | 0); |
9563 | TPOff = SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TPOff, Op2: LoVar, |
9564 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9565 | 0); |
9566 | } else if (Model == TLSModel::GeneralDynamic) { |
// The call also needs a relocation for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though, so we need another copy of
// the address.
9570 | SDValue SymAddr = |
9571 | DAG.getTargetGlobalAddress(GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS); |
9572 | |
9573 | // Finally we can make a call to calculate the offset from tpidr_el0. |
9574 | TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); |
9575 | } else |
9576 | llvm_unreachable("Unsupported ELF TLS access model" ); |
9577 | |
9578 | return DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: ThreadBase, N2: TPOff); |
9579 | } |
9580 | |
9581 | SDValue |
9582 | AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, |
9583 | SelectionDAG &DAG) const { |
9584 | assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering" ); |
9585 | |
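// For reference, the overall access pattern produced below is roughly as
// follows (registers and the exact relocation spellings are illustrative):
//   ldr  x8, [x18, #0x58]           // ThreadLocalStoragePointer from the TEB
//   adrp x9, _tls_index
//   ldr  w9, [x9, :lo12:_tls_index]
//   ldr  x8, [x8, x9, lsl #3]       // this module's TLS block
//   add  x0, x8, #:secrel_hi12:var
//   add  x0, x0, :secrel_lo12:var   // plus var's offset within the block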
9586 | SDValue Chain = DAG.getEntryNode(); |
9587 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
9588 | SDLoc DL(Op); |
9589 | |
9590 | SDValue TEB = DAG.getRegister(Reg: AArch64::X18, VT: MVT::i64); |
9591 | |
9592 | // Load the ThreadLocalStoragePointer from the TEB |
9593 | // A pointer to the TLS array is located at offset 0x58 from the TEB. |
9594 | SDValue TLSArray = |
9595 | DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TEB, N2: DAG.getIntPtrConstant(Val: 0x58, DL)); |
9596 | TLSArray = DAG.getLoad(VT: PtrVT, dl: DL, Chain, Ptr: TLSArray, PtrInfo: MachinePointerInfo()); |
9597 | Chain = TLSArray.getValue(R: 1); |
9598 | |
// Load the TLS index from the C runtime.
// This does the same as getAddr(), but without having a GlobalAddressSDNode.
// It also does the same as LOADgot, but uses a generic i32 load rather than
// the i64-only LOADgot.
9603 | SDValue TLSIndexHi = |
9604 | DAG.getTargetExternalSymbol(Sym: "_tls_index" , VT: PtrVT, TargetFlags: AArch64II::MO_PAGE); |
9605 | SDValue TLSIndexLo = DAG.getTargetExternalSymbol( |
9606 | Sym: "_tls_index" , VT: PtrVT, TargetFlags: AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
9607 | SDValue ADRP = DAG.getNode(Opcode: AArch64ISD::ADRP, DL, VT: PtrVT, Operand: TLSIndexHi); |
9608 | SDValue TLSIndex = |
9609 | DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: ADRP, N2: TLSIndexLo); |
9610 | TLSIndex = DAG.getLoad(VT: MVT::i32, dl: DL, Chain, Ptr: TLSIndex, PtrInfo: MachinePointerInfo()); |
9611 | Chain = TLSIndex.getValue(R: 1); |
9612 | |
// The pointer to the thread's TLS data area lives in the TLS array at the
// offset TLSIndex * 8.
9615 | TLSIndex = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: PtrVT, Operand: TLSIndex); |
9616 | SDValue Slot = DAG.getNode(Opcode: ISD::SHL, DL, VT: PtrVT, N1: TLSIndex, |
9617 | N2: DAG.getConstant(Val: 3, DL, VT: PtrVT)); |
9618 | SDValue TLS = DAG.getLoad(VT: PtrVT, dl: DL, Chain, |
9619 | Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: TLSArray, N2: Slot), |
9620 | PtrInfo: MachinePointerInfo()); |
9621 | Chain = TLS.getValue(R: 1); |
9622 | |
9623 | const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Val&: Op); |
9624 | const GlobalValue *GV = GA->getGlobal(); |
9625 | SDValue TGAHi = DAG.getTargetGlobalAddress( |
9626 | GV, DL, VT: PtrVT, offset: 0, TargetFlags: AArch64II::MO_TLS | AArch64II::MO_HI12); |
9627 | SDValue TGALo = DAG.getTargetGlobalAddress( |
9628 | GV, DL, VT: PtrVT, offset: 0, |
9629 | TargetFlags: AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); |
9630 | |
9631 | // Add the offset from the start of the .tls section (section base). |
9632 | SDValue Addr = |
9633 | SDValue(DAG.getMachineNode(Opcode: AArch64::ADDXri, dl: DL, VT: PtrVT, Op1: TLS, Op2: TGAHi, |
9634 | Op3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i32)), |
9635 | 0); |
9636 | Addr = DAG.getNode(Opcode: AArch64ISD::ADDlow, DL, VT: PtrVT, N1: Addr, N2: TGALo); |
9637 | return Addr; |
9638 | } |
9639 | |
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
if (DAG.getTarget().useEmulatedTLS())
return LowerToTLSEmulatedModel(GA, DAG);

if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
if (Subtarget->isTargetWindows())
return LowerWindowsGlobalTLSAddress(Op, DAG);

llvm_unreachable("Unexpected platform trying to use TLS");
}
9655 | |
9656 | //===----------------------------------------------------------------------===// |
9657 | // PtrAuthGlobalAddress lowering |
9658 | // |
9659 | // We have 3 lowering alternatives to choose from: |
9660 | // - MOVaddrPAC: similar to MOVaddr, with added PAC. |
9661 | // If the GV doesn't need a GOT load (i.e., is locally defined) |
9662 | // materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC. |
9663 | // |
9664 | // - LOADgotPAC: similar to LOADgot, with added PAC. |
9665 | // If the GV needs a GOT load, materialize the pointer using the usual |
9666 | // GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT |
9667 | // section is assumed to be read-only (for example, via relro mechanism). See |
9668 | // LowerMOVaddrPAC. |
9669 | // |
9670 | // - LOADauthptrstatic: similar to LOADgot, but use a |
9671 | // special stub slot instead of a GOT slot. |
9672 | // Load a signed pointer for symbol 'sym' from a stub slot named |
9673 | // 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation |
9674 | // resolving. This usually lowers to adrp+ldr, but also emits an entry into |
9675 | // .data with an @AUTH relocation. See LowerLOADauthptrstatic. |
9676 | // |
// All 3 are pseudos that are expanded late into longer sequences: this lets us
// provide integrity guarantees on the to-be-signed intermediate values.
9679 | // |
9680 | // LOADauthptrstatic is undesirable because it requires a large section filled |
9681 | // with often similarly-signed pointers, making it a good harvesting target. |
9682 | // Thus, it's only used for ptrauth references to extern_weak to avoid null |
9683 | // checks. |
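//
// For illustration only (the expansion happens in a later pass and the exact
// scratch registers are an assumption here): a locally-defined global signed
// with key IA and constant discriminator 42 is emitted as MOVaddrPAC, which
// later expands to roughly:
//   adrp  x16, sym
//   add   x16, x16, :lo12:sym
//   mov   x17, #42
//   pacia x16, x17
// A preemptible symbol instead goes through LOADgotPAC, loading the raw
// pointer from the GOT before signing it in the same way.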
9684 | |
9685 | SDValue AArch64TargetLowering::LowerPtrAuthGlobalAddressStatically( |
9686 | SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC, |
9687 | SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) const { |
9688 | const auto *TGN = cast<GlobalAddressSDNode>(Val: TGA.getNode()); |
9689 | assert(TGN->getGlobal()->hasExternalWeakLinkage()); |
9690 | |
9691 | // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the |
9692 | // offset alone as a pointer if the symbol wasn't available, which would |
9693 | // probably break null checks in users. Ptrauth complicates things further: |
9694 | // error out. |
9695 | if (TGN->getOffset() != 0) |
9696 | report_fatal_error( |
9697 | reason: "unsupported non-zero offset in weak ptrauth global reference" ); |
9698 | |
9699 | if (!isNullConstant(V: AddrDiscriminator)) |
9700 | report_fatal_error(reason: "unsupported weak addr-div ptrauth global" ); |
9701 | |
9702 | SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32); |
9703 | return SDValue(DAG.getMachineNode(Opcode: AArch64::LOADauthptrstatic, dl: DL, VT: MVT::i64, |
9704 | Ops: {TGA, Key, Discriminator}), |
9705 | 0); |
9706 | } |
9707 | |
9708 | SDValue |
9709 | AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op, |
9710 | SelectionDAG &DAG) const { |
9711 | SDValue Ptr = Op.getOperand(i: 0); |
9712 | uint64_t KeyC = Op.getConstantOperandVal(i: 1); |
9713 | SDValue AddrDiscriminator = Op.getOperand(i: 2); |
9714 | uint64_t DiscriminatorC = Op.getConstantOperandVal(i: 3); |
9715 | EVT VT = Op.getValueType(); |
9716 | SDLoc DL(Op); |
9717 | |
9718 | if (KeyC > AArch64PACKey::LAST) |
9719 | report_fatal_error(reason: "key in ptrauth global out of range [0, " + |
9720 | Twine((int)AArch64PACKey::LAST) + "]" ); |
9721 | |
9722 | // Blend only works if the integer discriminator is 16-bit wide. |
9723 | if (!isUInt<16>(x: DiscriminatorC)) |
9724 | report_fatal_error( |
9725 | reason: "constant discriminator in ptrauth global out of range [0, 0xffff]" ); |
9726 | |
9727 | // Choosing between 3 lowering alternatives is target-specific. |
9728 | if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO()) |
9729 | report_fatal_error(reason: "ptrauth global lowering only supported on MachO/ELF" ); |
9730 | |
9731 | int64_t PtrOffsetC = 0; |
9732 | if (Ptr.getOpcode() == ISD::ADD) { |
9733 | PtrOffsetC = Ptr.getConstantOperandVal(i: 1); |
9734 | Ptr = Ptr.getOperand(i: 0); |
9735 | } |
9736 | const auto *PtrN = cast<GlobalAddressSDNode>(Val: Ptr.getNode()); |
9737 | const GlobalValue *PtrGV = PtrN->getGlobal(); |
9738 | |
9739 | // Classify the reference to determine whether it needs a GOT load. |
9740 | const unsigned OpFlags = |
9741 | Subtarget->ClassifyGlobalReference(GV: PtrGV, TM: getTargetMachine()); |
9742 | const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0); |
9743 | assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) && |
9744 | "unsupported non-GOT op flags on ptrauth global reference" ); |
9745 | |
9746 | // Fold any offset into the GV; our pseudos expect it there. |
9747 | PtrOffsetC += PtrN->getOffset(); |
9748 | SDValue TPtr = DAG.getTargetGlobalAddress(GV: PtrGV, DL, VT, offset: PtrOffsetC, |
9749 | /*TargetFlags=*/0); |
9750 | assert(PtrN->getTargetFlags() == 0 && |
9751 | "unsupported target flags on ptrauth global" ); |
9752 | |
9753 | SDValue Key = DAG.getTargetConstant(Val: KeyC, DL, VT: MVT::i32); |
9754 | SDValue Discriminator = DAG.getTargetConstant(Val: DiscriminatorC, DL, VT: MVT::i64); |
9755 | SDValue TAddrDiscriminator = !isNullConstant(V: AddrDiscriminator) |
9756 | ? AddrDiscriminator |
9757 | : DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64); |
9758 | |
9759 | // No GOT load needed -> MOVaddrPAC |
9760 | if (!NeedsGOTLoad) { |
9761 | assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT" ); |
9762 | return SDValue( |
9763 | DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, VT: MVT::i64, |
9764 | Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}), |
9765 | 0); |
9766 | } |
9767 | |
9768 | // GOT load -> LOADgotPAC |
9769 | // Note that we disallow extern_weak refs to avoid null checks later. |
9770 | if (!PtrGV->hasExternalWeakLinkage()) |
9771 | return SDValue( |
9772 | DAG.getMachineNode(Opcode: AArch64::LOADgotPAC, dl: DL, VT: MVT::i64, |
9773 | Ops: {TPtr, Key, TAddrDiscriminator, Discriminator}), |
9774 | 0); |
9775 | |
9776 | // extern_weak ref -> LOADauthptrstatic |
9777 | return LowerPtrAuthGlobalAddressStatically( |
9778 | TGA: TPtr, DL, VT, KeyC: (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator, |
9779 | DAG); |
9780 | } |
9781 | |
9782 | // Looks through \param Val to determine the bit that can be used to |
9783 | // check the sign of the value. It returns the unextended value and |
9784 | // the sign bit position. |
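// For example, (sign_extend_inreg i64 %x, i8) yields {%x, 7}, a sign_extend
// from i32 %y yields {%y, 31}, and any other i64 value is returned unchanged
// together with bit position 63.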
std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
return {Val.getOperand(0),
cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
1};

if (Val.getOpcode() == ISD::SIGN_EXTEND)
return {Val.getOperand(0),
Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};

return {Val, Val.getValueSizeInBits() - 1};
}
9797 | |
9798 | SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { |
9799 | SDValue Chain = Op.getOperand(i: 0); |
9800 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 1))->get(); |
9801 | SDValue LHS = Op.getOperand(i: 2); |
9802 | SDValue RHS = Op.getOperand(i: 3); |
9803 | SDValue Dest = Op.getOperand(i: 4); |
9804 | SDLoc dl(Op); |
9805 | |
9806 | MachineFunction &MF = DAG.getMachineFunction(); |
9807 | // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions |
9808 | // will not be produced, as they are conditional branch instructions that do |
9809 | // not set flags. |
9810 | bool ProduceNonFlagSettingCondBr = |
9811 | !MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening); |
9812 | |
9813 | // Handle f128 first, since lowering it will result in comparing the return |
9814 | // value of a libcall against zero, which is just what the rest of LowerBR_CC |
9815 | // is expecting to deal with. |
9816 | if (LHS.getValueType() == MVT::f128) { |
9817 | softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS); |
9818 | |
9819 | // If softenSetCCOperands returned a scalar, we need to compare the result |
9820 | // against zero to select between true and false values. |
9821 | if (!RHS.getNode()) { |
9822 | RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType()); |
9823 | CC = ISD::SETNE; |
9824 | } |
9825 | } |
9826 | |
9827 | // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch |
9828 | // instruction. |
9829 | if (ISD::isOverflowIntrOpRes(Op: LHS) && isOneConstant(V: RHS) && |
9830 | (CC == ISD::SETEQ || CC == ISD::SETNE)) { |
9831 | // Only lower legal XALUO ops. |
9832 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: LHS->getValueType(ResNo: 0))) |
9833 | return SDValue(); |
9834 | |
9835 | // The actual operation with overflow check. |
9836 | AArch64CC::CondCode OFCC; |
9837 | SDValue Value, Overflow; |
9838 | std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: LHS.getValue(R: 0), DAG); |
9839 | |
9840 | if (CC == ISD::SETNE) |
9841 | OFCC = getInvertedCondCode(Code: OFCC); |
9842 | SDValue CCVal = DAG.getConstant(Val: OFCC, DL: dl, VT: MVT::i32); |
9843 | |
9844 | return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal, |
9845 | N4: Overflow); |
9846 | } |
9847 | |
9848 | if (LHS.getValueType().isInteger()) { |
9849 | assert((LHS.getValueType() == RHS.getValueType()) && |
9850 | (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); |
9851 | |
9852 | // If the RHS of the comparison is zero, we can potentially fold this |
9853 | // to a specialized branch. |
9854 | const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS); |
9855 | if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) { |
9856 | if (CC == ISD::SETEQ) { |
9857 | // See if we can use a TBZ to fold in an AND as well. |
9858 | // TBZ has a smaller branch displacement than CBZ. If the offset is |
9859 | // out of bounds, a late MI-layer pass rewrites branches. |
9860 | // 403.gcc is an example that hits this case. |
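// For example, (brcond (seteq (and x, 4), 0), dest) becomes
// "tbz x, #2, dest" here, while a plain (seteq x, 0) uses the CBZ below.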
9861 | if (LHS.getOpcode() == ISD::AND && |
9862 | isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) && |
9863 | isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) { |
9864 | SDValue Test = LHS.getOperand(i: 0); |
9865 | uint64_t Mask = LHS.getConstantOperandVal(i: 1); |
9866 | return DAG.getNode(Opcode: AArch64ISD::TBZ, DL: dl, VT: MVT::Other, N1: Chain, N2: Test, |
9867 | N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL: dl, VT: MVT::i64), |
9868 | N4: Dest); |
9869 | } |
9870 | |
9871 | return DAG.getNode(Opcode: AArch64ISD::CBZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest); |
9872 | } else if (CC == ISD::SETNE) { |
9873 | // See if we can use a TBZ to fold in an AND as well. |
9874 | // TBZ has a smaller branch displacement than CBZ. If the offset is |
9875 | // out of bounds, a late MI-layer pass rewrites branches. |
9876 | // 403.gcc is an example that hits this case. |
9877 | if (LHS.getOpcode() == ISD::AND && |
9878 | isa<ConstantSDNode>(Val: LHS.getOperand(i: 1)) && |
9879 | isPowerOf2_64(Value: LHS.getConstantOperandVal(i: 1))) { |
9880 | SDValue Test = LHS.getOperand(i: 0); |
9881 | uint64_t Mask = LHS.getConstantOperandVal(i: 1); |
9882 | return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL: dl, VT: MVT::Other, N1: Chain, N2: Test, |
9883 | N3: DAG.getConstant(Val: Log2_64(Value: Mask), DL: dl, VT: MVT::i64), |
9884 | N4: Dest); |
9885 | } |
9886 | |
9887 | return DAG.getNode(Opcode: AArch64ISD::CBNZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest); |
9888 | } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { |
9889 | // Don't combine AND since emitComparison converts the AND to an ANDS |
9890 | // (a.k.a. TST) and the test in the test bit and branch instruction |
9891 | // becomes redundant. This would also increase register pressure. |
9892 | uint64_t SignBitPos; |
9893 | std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS); |
9894 | return DAG.getNode(Opcode: AArch64ISD::TBNZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS, |
9895 | N3: DAG.getConstant(Val: SignBitPos, DL: dl, VT: MVT::i64), N4: Dest); |
9896 | } |
9897 | } |
9898 | if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && |
9899 | LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) { |
9900 | // Don't combine AND since emitComparison converts the AND to an ANDS |
9901 | // (a.k.a. TST) and the test in the test bit and branch instruction |
9902 | // becomes redundant. This would also increase register pressure. |
9903 | uint64_t SignBitPos; |
9904 | std::tie(args&: LHS, args&: SignBitPos) = lookThroughSignExtension(Val: LHS); |
9905 | return DAG.getNode(Opcode: AArch64ISD::TBZ, DL: dl, VT: MVT::Other, N1: Chain, N2: LHS, |
9906 | N3: DAG.getConstant(Val: SignBitPos, DL: dl, VT: MVT::i64), N4: Dest); |
9907 | } |
9908 | |
9909 | SDValue CCVal; |
9910 | SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl); |
9911 | return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CCVal, |
9912 | N4: Cmp); |
9913 | } |
9914 | |
9915 | assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 || |
9916 | LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); |
9917 | |
9918 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
9919 | // clean. Some of them require two branches to implement. |
9920 | SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
9921 | AArch64CC::CondCode CC1, CC2; |
9922 | changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2); |
9923 | SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32); |
9924 | SDValue BR1 = |
9925 | DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: Chain, N2: Dest, N3: CC1Val, N4: Cmp); |
9926 | if (CC2 != AArch64CC::AL) { |
9927 | SDValue CC2Val = DAG.getConstant(Val: CC2, DL: dl, VT: MVT::i32); |
9928 | return DAG.getNode(Opcode: AArch64ISD::BRCOND, DL: dl, VT: MVT::Other, N1: BR1, N2: Dest, N3: CC2Val, |
9929 | N4: Cmp); |
9930 | } |
9931 | |
9932 | return BR1; |
9933 | } |
9934 | |
9935 | SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, |
9936 | SelectionDAG &DAG) const { |
9937 | if (!Subtarget->isNeonAvailable() && |
9938 | !Subtarget->useSVEForFixedLengthVectors()) |
9939 | return SDValue(); |
9940 | |
9941 | EVT VT = Op.getValueType(); |
9942 | EVT IntVT = VT.changeTypeToInteger(); |
9943 | SDLoc DL(Op); |
9944 | |
9945 | SDValue In1 = Op.getOperand(i: 0); |
9946 | SDValue In2 = Op.getOperand(i: 1); |
9947 | EVT SrcVT = In2.getValueType(); |
9948 | |
9949 | if (!SrcVT.bitsEq(VT)) |
9950 | In2 = DAG.getFPExtendOrRound(Op: In2, DL, VT); |
9951 | |
9952 | if (VT.isScalableVector()) |
9953 | IntVT = |
9954 | getPackedSVEVectorVT(VT: VT.getVectorElementType().changeTypeToInteger()); |
9955 | |
9956 | if (VT.isFixedLengthVector() && |
9957 | useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) { |
9958 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
9959 | |
9960 | In1 = convertToScalableVector(DAG, VT: ContainerVT, V: In1); |
9961 | In2 = convertToScalableVector(DAG, VT: ContainerVT, V: In2); |
9962 | |
9963 | SDValue Res = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL, VT: ContainerVT, N1: In1, N2: In2); |
9964 | return convertFromScalableVector(DAG, VT, V: Res); |
9965 | } |
9966 | |
9967 | auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) { |
9968 | if (VT.isScalableVector()) |
9969 | return getSVESafeBitCast(VT, Op, DAG); |
9970 | |
9971 | return DAG.getBitcast(VT, V: Op); |
9972 | }; |
9973 | |
9974 | SDValue VecVal1, VecVal2; |
9975 | EVT VecVT; |
9976 | auto SetVecVal = [&](int Idx = -1) { |
9977 | if (!VT.isVector()) { |
9978 | VecVal1 = |
9979 | DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In1); |
9980 | VecVal2 = |
9981 | DAG.getTargetInsertSubreg(SRIdx: Idx, DL, VT: VecVT, Operand: DAG.getUNDEF(VT: VecVT), Subreg: In2); |
9982 | } else { |
9983 | VecVal1 = BitCast(VecVT, In1, DAG); |
9984 | VecVal2 = BitCast(VecVT, In2, DAG); |
9985 | } |
9986 | }; |
9987 | if (VT.isVector()) { |
9988 | VecVT = IntVT; |
9989 | SetVecVal(); |
9990 | } else if (VT == MVT::f64) { |
9991 | VecVT = MVT::v2i64; |
9992 | SetVecVal(AArch64::dsub); |
9993 | } else if (VT == MVT::f32) { |
9994 | VecVT = MVT::v4i32; |
9995 | SetVecVal(AArch64::ssub); |
9996 | } else if (VT == MVT::f16 || VT == MVT::bf16) { |
9997 | VecVT = MVT::v8i16; |
9998 | SetVecVal(AArch64::hsub); |
9999 | } else { |
10000 | llvm_unreachable("Invalid type for copysign!" ); |
10001 | } |
10002 | |
10003 | unsigned BitWidth = In1.getScalarValueSizeInBits(); |
10004 | SDValue SignMaskV = DAG.getConstant(Val: ~APInt::getSignMask(BitWidth), DL, VT: VecVT); |
10005 | |
10006 | // We want to materialize a mask with every bit but the high bit set, but the |
10007 | // AdvSIMD immediate moves cannot materialize that in a single instruction for |
10008 | // 64-bit elements. Instead, materialize all bits set and then negate that. |
10009 | if (VT == MVT::f64 || VT == MVT::v2f64) { |
10010 | SignMaskV = DAG.getConstant(Val: APInt::getAllOnes(numBits: BitWidth), DL, VT: VecVT); |
10011 | SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2f64, Operand: SignMaskV); |
10012 | SignMaskV = DAG.getNode(Opcode: ISD::FNEG, DL, VT: MVT::v2f64, Operand: SignMaskV); |
10013 | SignMaskV = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v2i64, Operand: SignMaskV); |
10014 | } |
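// BSP(Mask, A, B) computes (A & Mask) | (B & ~Mask), so with ~sign-bit as the
// mask the result takes its magnitude bits from In1 and its sign bit from
// In2, which is exactly FCOPYSIGN.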
10015 | |
10016 | SDValue BSP = |
10017 | DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT: VecVT, N1: SignMaskV, N2: VecVal1, N3: VecVal2); |
10018 | if (VT == MVT::f16 || VT == MVT::bf16) |
10019 | return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT, Operand: BSP); |
10020 | if (VT == MVT::f32) |
10021 | return DAG.getTargetExtractSubreg(SRIdx: AArch64::ssub, DL, VT, Operand: BSP); |
10022 | if (VT == MVT::f64) |
10023 | return DAG.getTargetExtractSubreg(SRIdx: AArch64::dsub, DL, VT, Operand: BSP); |
10024 | |
10025 | return BitCast(VT, BSP, DAG); |
10026 | } |
10027 | |
10028 | SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, |
10029 | SelectionDAG &DAG) const { |
10030 | if (DAG.getMachineFunction().getFunction().hasFnAttribute( |
10031 | Kind: Attribute::NoImplicitFloat)) |
10032 | return SDValue(); |
10033 | |
10034 | EVT VT = Op.getValueType(); |
10035 | if (VT.isScalableVector() || |
10036 | useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
10037 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::CTPOP_MERGE_PASSTHRU); |
10038 | |
10039 | if (!Subtarget->isNeonAvailable()) |
10040 | return SDValue(); |
10041 | |
10042 | bool IsParity = Op.getOpcode() == ISD::PARITY; |
10043 | SDValue Val = Op.getOperand(i: 0); |
10044 | SDLoc DL(Op); |
10045 | |
// For i32, a generic parity expansion using EORs is more efficient than going
// through the floating-point/AdvSIMD registers.
10048 | if (VT == MVT::i32 && IsParity) |
10049 | return SDValue(); |
10050 | |
// When there is no scalar popcount instruction available for the
// general-purpose registers, GPR popcount can be lowered efficiently to the
// following sequence that uses AdvSIMD registers/instructions, as long as the
// copies to/from the AdvSIMD registers are cheap.
10055 | // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd |
10056 | // CNT V0.8B, V0.8B // 8xbyte pop-counts |
10057 | // ADDV B0, V0.8B // sum 8xbyte pop-counts |
10058 | // UMOV X0, V0.B[0] // copy byte result back to integer reg |
10059 | if (VT == MVT::i32 || VT == MVT::i64) { |
10060 | if (VT == MVT::i32) |
10061 | Val = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: Val); |
10062 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i8, Operand: Val); |
10063 | |
10064 | SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v8i8, Operand: Val); |
10065 | SDValue UaddLV = DAG.getNode(Opcode: AArch64ISD::UADDLV, DL, VT: MVT::v4i32, Operand: CtPop); |
10066 | UaddLV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: UaddLV, |
10067 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
10068 | |
10069 | if (IsParity) |
10070 | UaddLV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UaddLV, |
10071 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
10072 | |
10073 | if (VT == MVT::i64) |
10074 | UaddLV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i64, Operand: UaddLV); |
10075 | return UaddLV; |
10076 | } else if (VT == MVT::i128) { |
10077 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v16i8, Operand: Val); |
10078 | |
10079 | SDValue CtPop = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: MVT::v16i8, Operand: Val); |
10080 | SDValue UaddLV = DAG.getNode(Opcode: AArch64ISD::UADDLV, DL, VT: MVT::v4i32, Operand: CtPop); |
10081 | UaddLV = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i32, N1: UaddLV, |
10082 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
10083 | |
10084 | if (IsParity) |
10085 | UaddLV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::i32, N1: UaddLV, |
10086 | N2: DAG.getConstant(Val: 1, DL, VT: MVT::i32)); |
10087 | |
10088 | return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i128, Operand: UaddLV); |
10089 | } |
10090 | |
10091 | assert(!IsParity && "ISD::PARITY of vector types not supported" ); |
10092 | |
10093 | assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || |
10094 | VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && |
10095 | "Unexpected type for custom ctpop lowering" ); |
10096 | |
10097 | EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; |
10098 | Val = DAG.getBitcast(VT: VT8Bit, V: Val); |
10099 | Val = DAG.getNode(Opcode: ISD::CTPOP, DL, VT: VT8Bit, Operand: Val); |
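// When +dotprod is available, the per-byte counts can be summed into each
// element with a single UDOT against an all-ones vector (plus a UADDLP for
// v2i64). For example, for v4i32 (registers illustrative):
//   movi v1.16b, #1
//   cnt  v0.16b, v0.16b
//   udot v2.4s, v0.16b, v1.16b   // v2 starts out zeroed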
10100 | |
10101 | if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 && |
10102 | VT.getVectorNumElements() >= 2) { |
10103 | EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT; |
10104 | SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: DT); |
10105 | SDValue Ones = DAG.getConstant(Val: 1, DL, VT: VT8Bit); |
10106 | |
10107 | if (VT == MVT::v2i64) { |
10108 | Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val); |
10109 | Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT, Operand: Val); |
10110 | } else if (VT == MVT::v2i32) { |
10111 | Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val); |
10112 | } else if (VT == MVT::v4i32) { |
10113 | Val = DAG.getNode(Opcode: AArch64ISD::UDOT, DL, VT: DT, N1: Zeros, N2: Ones, N3: Val); |
10114 | } else { |
10115 | llvm_unreachable("Unexpected type for custom ctpop lowering" ); |
10116 | } |
10117 | |
10118 | return Val; |
10119 | } |
10120 | |
10121 | // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. |
10122 | unsigned EltSize = 8; |
10123 | unsigned NumElts = VT.is64BitVector() ? 8 : 16; |
10124 | while (EltSize != VT.getScalarSizeInBits()) { |
10125 | EltSize *= 2; |
10126 | NumElts /= 2; |
10127 | MVT WidenVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: EltSize), NumElements: NumElts); |
10128 | Val = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: WidenVT, Operand: Val); |
10129 | } |
10130 | |
10131 | return Val; |
10132 | } |
10133 | |
10134 | SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { |
10135 | EVT VT = Op.getValueType(); |
10136 | assert(VT.isScalableVector() || |
10137 | useSVEForFixedLengthVectorVT( |
10138 | VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())); |
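// CTTZ is lowered as CTLZ of the bit-reversed input; for SVE both steps map
// onto predicated RBIT and CLZ instructions.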
10139 | |
10140 | SDLoc DL(Op); |
10141 | SDValue RBIT = DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT, Operand: Op.getOperand(i: 0)); |
10142 | return DAG.getNode(Opcode: ISD::CTLZ, DL, VT, Operand: RBIT); |
10143 | } |
10144 | |
10145 | SDValue AArch64TargetLowering::LowerMinMax(SDValue Op, |
10146 | SelectionDAG &DAG) const { |
10147 | |
10148 | EVT VT = Op.getValueType(); |
10149 | SDLoc DL(Op); |
10150 | unsigned Opcode = Op.getOpcode(); |
10151 | ISD::CondCode CC; |
10152 | switch (Opcode) { |
10153 | default: |
10154 | llvm_unreachable("Wrong instruction" ); |
10155 | case ISD::SMAX: |
10156 | CC = ISD::SETGT; |
10157 | break; |
10158 | case ISD::SMIN: |
10159 | CC = ISD::SETLT; |
10160 | break; |
10161 | case ISD::UMAX: |
10162 | CC = ISD::SETUGT; |
10163 | break; |
10164 | case ISD::UMIN: |
10165 | CC = ISD::SETULT; |
10166 | break; |
10167 | } |
10168 | |
10169 | if (VT.isScalableVector() || |
10170 | useSVEForFixedLengthVectorVT( |
10171 | VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { |
10172 | switch (Opcode) { |
10173 | default: |
10174 | llvm_unreachable("Wrong instruction" ); |
10175 | case ISD::SMAX: |
10176 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMAX_PRED); |
10177 | case ISD::SMIN: |
10178 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SMIN_PRED); |
10179 | case ISD::UMAX: |
10180 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMAX_PRED); |
10181 | case ISD::UMIN: |
10182 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::UMIN_PRED); |
10183 | } |
10184 | } |
10185 | |
10186 | SDValue Op0 = Op.getOperand(i: 0); |
10187 | SDValue Op1 = Op.getOperand(i: 1); |
10188 | SDValue Cond = DAG.getSetCC(DL, VT, LHS: Op0, RHS: Op1, Cond: CC); |
10189 | return DAG.getSelect(DL, VT, Cond, LHS: Op0, RHS: Op1); |
10190 | } |
10191 | |
10192 | SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op, |
10193 | SelectionDAG &DAG) const { |
10194 | EVT VT = Op.getValueType(); |
10195 | |
10196 | if (VT.isScalableVector() || |
10197 | useSVEForFixedLengthVectorVT( |
10198 | VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) |
10199 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::BITREVERSE_MERGE_PASSTHRU); |
10200 | |
10201 | SDLoc DL(Op); |
10202 | SDValue REVB; |
10203 | MVT VST; |
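// RBIT on the .8b/.16b form reverses the bits within each byte, so reversing
// the byte order within each element first (REV32/REV64) gives a full
// per-element bit reversal. For example, for v4i32:
//   rev32 v0.16b, v0.16b
//   rbit  v0.16b, v0.16b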
10204 | |
10205 | switch (VT.getSimpleVT().SimpleTy) { |
10206 | default: |
10207 | llvm_unreachable("Invalid type for bitreverse!" ); |
10208 | |
10209 | case MVT::v2i32: { |
10210 | VST = MVT::v8i8; |
10211 | REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0)); |
10212 | |
10213 | break; |
10214 | } |
10215 | |
10216 | case MVT::v4i32: { |
10217 | VST = MVT::v16i8; |
10218 | REVB = DAG.getNode(Opcode: AArch64ISD::REV32, DL, VT: VST, Operand: Op.getOperand(i: 0)); |
10219 | |
10220 | break; |
10221 | } |
10222 | |
10223 | case MVT::v1i64: { |
10224 | VST = MVT::v8i8; |
10225 | REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0)); |
10226 | |
10227 | break; |
10228 | } |
10229 | |
10230 | case MVT::v2i64: { |
10231 | VST = MVT::v16i8; |
10232 | REVB = DAG.getNode(Opcode: AArch64ISD::REV64, DL, VT: VST, Operand: Op.getOperand(i: 0)); |
10233 | |
10234 | break; |
10235 | } |
10236 | } |
10237 | |
10238 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, |
10239 | Operand: DAG.getNode(Opcode: ISD::BITREVERSE, DL, VT: VST, Operand: REVB)); |
10240 | } |
10241 | |
// Check whether N is a chain of ORs whose leaves are XORs, collecting each
// XOR's operand pair into WorkList.
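// For example, (or (xor a0, a1), (or (xor b0, b1), (xor c0, c1))) yields the
// pairs {a0,a1}, {b0,b1} and {c0,c1}, as long as no more than MaxXors leaves
// are collected.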
10243 | static bool |
10244 | isOrXorChain(SDValue N, unsigned &Num, |
10245 | SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) { |
10246 | if (Num == MaxXors) |
10247 | return false; |
10248 | |
10249 | // Skip the one-use zext |
10250 | if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse()) |
10251 | N = N->getOperand(Num: 0); |
10252 | |
10253 | // The leaf node must be XOR |
10254 | if (N->getOpcode() == ISD::XOR) { |
10255 | WorkList.push_back(Elt: std::make_pair(x: N->getOperand(Num: 0), y: N->getOperand(Num: 1))); |
10256 | Num++; |
10257 | return true; |
10258 | } |
10259 | |
10260 | // All the non-leaf nodes must be OR. |
10261 | if (N->getOpcode() != ISD::OR || !N->hasOneUse()) |
10262 | return false; |
10263 | |
10264 | if (isOrXorChain(N: N->getOperand(Num: 0), Num, WorkList) && |
10265 | isOrXorChain(N: N->getOperand(Num: 1), Num, WorkList)) |
10266 | return true; |
10267 | return false; |
10268 | } |
10269 | |
// Transform chains of ORs and XORs, which are usually produced by expanded
// memcmp/bcmp calls.
10271 | static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) { |
10272 | SDValue LHS = N->getOperand(Num: 0); |
10273 | SDValue RHS = N->getOperand(Num: 1); |
10274 | SDLoc DL(N); |
10275 | EVT VT = N->getValueType(ResNo: 0); |
10276 | SmallVector<std::pair<SDValue, SDValue>, 16> WorkList; |
10277 | |
10278 | // Only handle integer compares. |
10279 | if (N->getOpcode() != ISD::SETCC) |
10280 | return SDValue(); |
10281 | |
10282 | ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get(); |
10283 | // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as: |
10284 | // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag |
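// For example, (seteq (or (xor a0, a1), (xor b0, b1)), 0), as produced by a
// memcmp of two 16-byte buffers, becomes
// (and (seteq a0, a1), (seteq b0, b1)), which later folds into a CMP/CCMP
// chain.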
10285 | unsigned NumXors = 0; |
10286 | if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(V: RHS) && |
10287 | LHS->getOpcode() == ISD::OR && LHS->hasOneUse() && |
10288 | isOrXorChain(N: LHS, Num&: NumXors, WorkList)) { |
10289 | SDValue XOR0, XOR1; |
10290 | std::tie(args&: XOR0, args&: XOR1) = WorkList[0]; |
10291 | unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR; |
10292 | SDValue Cmp = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond); |
10293 | for (unsigned I = 1; I < WorkList.size(); I++) { |
10294 | std::tie(args&: XOR0, args&: XOR1) = WorkList[I]; |
10295 | SDValue CmpChain = DAG.getSetCC(DL, VT, LHS: XOR0, RHS: XOR1, Cond); |
10296 | Cmp = DAG.getNode(Opcode: LogicOp, DL, VT, N1: Cmp, N2: CmpChain); |
10297 | } |
10298 | |
// All XOR pairs have been folded into a single combined compare; return it.
10300 | return Cmp; |
10301 | } |
10302 | |
10303 | return SDValue(); |
10304 | } |
10305 | |
10306 | SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { |
10307 | |
10308 | if (Op.getValueType().isVector()) |
10309 | return LowerVSETCC(Op, DAG); |
10310 | |
10311 | bool IsStrict = Op->isStrictFPOpcode(); |
10312 | bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; |
10313 | unsigned OpNo = IsStrict ? 1 : 0; |
10314 | SDValue Chain; |
10315 | if (IsStrict) |
10316 | Chain = Op.getOperand(i: 0); |
10317 | SDValue LHS = Op.getOperand(i: OpNo + 0); |
10318 | SDValue RHS = Op.getOperand(i: OpNo + 1); |
10319 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: OpNo + 2))->get(); |
10320 | SDLoc dl(Op); |
10321 | |
10322 | // We chose ZeroOrOneBooleanContents, so use zero and one. |
10323 | EVT VT = Op.getValueType(); |
10324 | SDValue TVal = DAG.getConstant(Val: 1, DL: dl, VT); |
10325 | SDValue FVal = DAG.getConstant(Val: 0, DL: dl, VT); |
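// For example, an i32 (setcc eq w0, w1) becomes a SUBS of w0 and w1 feeding a
// CSEL of 0/1 on the inverted condition, which instruction selection matches
// as CSINC (i.e. cmp + cset).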
10326 | |
10327 | // Handle f128 first, since one possible outcome is a normal integer |
10328 | // comparison which gets picked up by the next if statement. |
10329 | if (LHS.getValueType() == MVT::f128) { |
10330 | softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS, Chain, |
10331 | IsSignaling); |
10332 | |
10333 | // If softenSetCCOperands returned a scalar, use it. |
10334 | if (!RHS.getNode()) { |
10335 | assert(LHS.getValueType() == Op.getValueType() && |
10336 | "Unexpected setcc expansion!" ); |
10337 | return IsStrict ? DAG.getMergeValues(Ops: {LHS, Chain}, dl) : LHS; |
10338 | } |
10339 | } |
10340 | |
10341 | if (LHS.getValueType().isInteger()) { |
10342 | SDValue CCVal; |
10343 | SDValue Cmp = getAArch64Cmp( |
10344 | LHS, RHS, CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), AArch64cc&: CCVal, DAG, dl); |
10345 | |
10346 | // Note that we inverted the condition above, so we reverse the order of |
10347 | // the true and false operands here. This will allow the setcc to be |
10348 | // matched to a single CSINC instruction. |
10349 | SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CCVal, N4: Cmp); |
10350 | return IsStrict ? DAG.getMergeValues(Ops: {Res, Chain}, dl) : Res; |
10351 | } |
10352 | |
10353 | // Now we know we're dealing with FP values. |
10354 | assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 || |
10355 | LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); |
10356 | |
10357 | // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead |
10358 | // and do the comparison. |
10359 | SDValue Cmp; |
10360 | if (IsStrict) |
10361 | Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling); |
10362 | else |
10363 | Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
10364 | |
10365 | AArch64CC::CondCode CC1, CC2; |
10366 | changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2); |
10367 | SDValue Res; |
10368 | if (CC2 == AArch64CC::AL) { |
10369 | changeFPCCToAArch64CC(CC: ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()), CondCode&: CC1, |
10370 | CondCode2&: CC2); |
10371 | SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32); |
10372 | |
10373 | // Note that we inverted the condition above, so we reverse the order of |
10374 | // the true and false operands here. This will allow the setcc to be |
10375 | // matched to a single CSINC instruction. |
10376 | Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: FVal, N2: TVal, N3: CC1Val, N4: Cmp); |
10377 | } else { |
10378 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't |
10379 | // totally clean. Some of them require two CSELs to implement. As is in |
10380 | // this case, we emit the first CSEL and then emit a second using the output |
10381 | // of the first as the RHS. We're effectively OR'ing the two CC's together. |
10382 | |
10383 | // FIXME: It would be nice if we could match the two CSELs to two CSINCs. |
10384 | SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32); |
10385 | SDValue CS1 = |
10386 | DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp); |
10387 | |
10388 | SDValue CC2Val = DAG.getConstant(Val: CC2, DL: dl, VT: MVT::i32); |
10389 | Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp); |
10390 | } |
10391 | return IsStrict ? DAG.getMergeValues(Ops: {Res, Cmp.getValue(R: 1)}, dl) : Res; |
10392 | } |
10393 | |
10394 | SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op, |
10395 | SelectionDAG &DAG) const { |
10396 | |
10397 | SDValue LHS = Op.getOperand(i: 0); |
10398 | SDValue RHS = Op.getOperand(i: 1); |
10399 | EVT VT = LHS.getValueType(); |
10400 | if (VT != MVT::i32 && VT != MVT::i64) |
10401 | return SDValue(); |
10402 | |
10403 | SDLoc DL(Op); |
10404 | SDValue Carry = Op.getOperand(i: 2); |
10405 | // SBCS uses a carry not a borrow so the carry flag should be inverted first. |
10406 | SDValue InvCarry = valueToCarryFlag(Value: Carry, DAG, Invert: true); |
10407 | SDValue Cmp = DAG.getNode(Opcode: AArch64ISD::SBCS, DL, VTList: DAG.getVTList(VT1: VT, VT2: MVT::Glue), |
10408 | N1: LHS, N2: RHS, N3: InvCarry); |
10409 | |
10410 | EVT OpVT = Op.getValueType(); |
10411 | SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OpVT); |
10412 | SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OpVT); |
10413 | |
10414 | ISD::CondCode Cond = cast<CondCodeSDNode>(Val: Op.getOperand(i: 3))->get(); |
10415 | ISD::CondCode CondInv = ISD::getSetCCInverse(Operation: Cond, Type: VT); |
10416 | SDValue CCVal = |
10417 | DAG.getConstant(Val: changeIntCCToAArch64CC(CC: CondInv), DL, VT: MVT::i32); |
10418 | // Inputs are swapped because the condition is inverted. This will allow |
10419 | // matching with a single CSINC instruction. |
10420 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OpVT, N1: FVal, N2: TVal, N3: CCVal, |
10421 | N4: Cmp.getValue(R: 1)); |
10422 | } |
10423 | |
10424 | SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, |
10425 | SDValue RHS, SDValue TVal, |
10426 | SDValue FVal, const SDLoc &dl, |
10427 | SelectionDAG &DAG) const { |
10428 | // Handle f128 first, because it will result in a comparison of some RTLIB |
10429 | // call result against zero. |
10430 | if (LHS.getValueType() == MVT::f128) { |
10431 | softenSetCCOperands(DAG, VT: MVT::f128, NewLHS&: LHS, NewRHS&: RHS, CCCode&: CC, DL: dl, OldLHS: LHS, OldRHS: RHS); |
10432 | |
10433 | // If softenSetCCOperands returned a scalar, we need to compare the result |
10434 | // against zero to select between true and false values. |
10435 | if (!RHS.getNode()) { |
10436 | RHS = DAG.getConstant(Val: 0, DL: dl, VT: LHS.getValueType()); |
10437 | CC = ISD::SETNE; |
10438 | } |
10439 | } |
10440 | |
10441 | // Also handle f16, for which we need to do a f32 comparison. |
10442 | if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) || |
10443 | LHS.getValueType() == MVT::bf16) { |
10444 | LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: LHS); |
10445 | RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::f32, Operand: RHS); |
10446 | } |
10447 | |
10448 | // Next, handle integers. |
10449 | if (LHS.getValueType().isInteger()) { |
10450 | assert((LHS.getValueType() == RHS.getValueType()) && |
10451 | (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); |
10452 | |
10453 | ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val&: FVal); |
10454 | ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val&: TVal); |
10455 | ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Val&: RHS); |
// Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
// into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
// supported types.
10459 | if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal && |
10460 | CTVal->isOne() && CFVal->isAllOnes() && |
10461 | LHS.getValueType() == TVal.getValueType()) { |
10462 | EVT VT = LHS.getValueType(); |
10463 | SDValue Shift = |
10464 | DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS, |
10465 | N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT)); |
10466 | return DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Shift, N2: DAG.getConstant(Val: 1, DL: dl, VT)); |
10467 | } |
10468 | |
// Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
// (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
// (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
// Both require fewer instructions than compare and conditional select.
10473 | if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal && |
10474 | RHSC && RHSC->isZero() && CFVal && CFVal->isZero() && |
10475 | LHS.getValueType() == RHS.getValueType()) { |
10476 | EVT VT = LHS.getValueType(); |
10477 | SDValue Shift = |
10478 | DAG.getNode(Opcode: ISD::SRA, DL: dl, VT, N1: LHS, |
10479 | N2: DAG.getConstant(Val: VT.getSizeInBits() - 1, DL: dl, VT)); |
10480 | |
10481 | if (CC == ISD::SETGT) |
10482 | Shift = DAG.getNOT(DL: dl, Val: Shift, VT); |
10483 | |
10484 | return DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: LHS, N2: Shift); |
10485 | } |
10486 | |
10487 | unsigned Opcode = AArch64ISD::CSEL; |
10488 | |
// If both the TVal and the FVal are constants, see if we can swap them in
// order to form a CSINV or CSINC out of them.
10491 | if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) { |
10492 | std::swap(a&: TVal, b&: FVal); |
10493 | std::swap(a&: CTVal, b&: CFVal); |
10494 | CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
10495 | } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) { |
10496 | std::swap(a&: TVal, b&: FVal); |
10497 | std::swap(a&: CTVal, b&: CFVal); |
10498 | CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
10499 | } else if (TVal.getOpcode() == ISD::XOR) { |
10500 | // If TVal is a NOT we want to swap TVal and FVal so that we can match |
10501 | // with a CSINV rather than a CSEL. |
10502 | if (isAllOnesConstant(V: TVal.getOperand(i: 1))) { |
10503 | std::swap(a&: TVal, b&: FVal); |
10504 | std::swap(a&: CTVal, b&: CFVal); |
10505 | CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
10506 | } |
10507 | } else if (TVal.getOpcode() == ISD::SUB) { |
10508 | // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so |
10509 | // that we can match with a CSNEG rather than a CSEL. |
10510 | if (isNullConstant(V: TVal.getOperand(i: 0))) { |
10511 | std::swap(a&: TVal, b&: FVal); |
10512 | std::swap(a&: CTVal, b&: CFVal); |
10513 | CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
10514 | } |
10515 | } else if (CTVal && CFVal) { |
10516 | const int64_t TrueVal = CTVal->getSExtValue(); |
10517 | const int64_t FalseVal = CFVal->getSExtValue(); |
10518 | bool Swap = false; |
10519 | |
10520 | // If both TVal and FVal are constants, see if FVal is the |
10521 | // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC |
10522 | // instead of a CSEL in that case. |
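// For example, {TVal=8, FVal=7} can be encoded with CSINC (possibly after
// swapping the operands and inverting the condition), {TVal=5, FVal=-5} with
// CSNEG, and {TVal=C, FVal=~C} with CSINV; FVal is then dropped below because
// it can be recomputed from TVal.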
10523 | if (TrueVal == ~FalseVal) { |
10524 | Opcode = AArch64ISD::CSINV; |
10525 | } else if (FalseVal > std::numeric_limits<int64_t>::min() && |
10526 | TrueVal == -FalseVal) { |
10527 | Opcode = AArch64ISD::CSNEG; |
10528 | } else if (TVal.getValueType() == MVT::i32) { |
10529 | // If our operands are only 32-bit wide, make sure we use 32-bit |
10530 | // arithmetic for the check whether we can use CSINC. This ensures that |
10531 | // the addition in the check will wrap around properly in case there is |
10532 | // an overflow (which would not be the case if we do the check with |
10533 | // 64-bit arithmetic). |
10534 | const uint32_t TrueVal32 = CTVal->getZExtValue(); |
10535 | const uint32_t FalseVal32 = CFVal->getZExtValue(); |
10536 | |
10537 | if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { |
10538 | Opcode = AArch64ISD::CSINC; |
10539 | |
10540 | if (TrueVal32 > FalseVal32) { |
10541 | Swap = true; |
10542 | } |
10543 | } |
10544 | } else { |
10545 | // 64-bit check whether we can use CSINC. |
10546 | const uint64_t TrueVal64 = TrueVal; |
10547 | const uint64_t FalseVal64 = FalseVal; |
10548 | |
10549 | if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) { |
10550 | Opcode = AArch64ISD::CSINC; |
10551 | |
10552 | if (TrueVal > FalseVal) { |
10553 | Swap = true; |
10554 | } |
10555 | } |
10556 | } |
10557 | |
10558 | // Swap TVal and FVal if necessary. |
10559 | if (Swap) { |
10560 | std::swap(a&: TVal, b&: FVal); |
10561 | std::swap(a&: CTVal, b&: CFVal); |
10562 | CC = ISD::getSetCCInverse(Operation: CC, Type: LHS.getValueType()); |
10563 | } |
10564 | |
10565 | if (Opcode != AArch64ISD::CSEL) { |
10566 | // Drop FVal since we can get its value by simply inverting/negating |
10567 | // TVal. |
10568 | FVal = TVal; |
10569 | } |
10570 | } |
10571 | |
10572 | // Avoid materializing a constant when possible by reusing a known value in |
10573 | // a register. However, don't perform this optimization if the known value |
10574 | // is one, zero or negative one in the case of a CSEL. We can always |
10575 | // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the |
10576 | // FVal, respectively. |
10577 | ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(Val&: RHS); |
10578 | if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && |
10579 | !RHSVal->isZero() && !RHSVal->isAllOnes()) { |
10580 | AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
10581 | // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to |
10582 | // "a != C ? x : a" to avoid materializing C. |
10583 | if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) |
10584 | TVal = LHS; |
10585 | else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) |
10586 | FVal = LHS; |
10587 | } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { |
10588 | assert (CTVal && CFVal && "Expected constant operands for CSNEG." ); |
10589 | // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to |
10590 | // avoid materializing C. |
10591 | AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
10592 | if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { |
10593 | Opcode = AArch64ISD::CSINV; |
10594 | TVal = LHS; |
10595 | FVal = DAG.getConstant(Val: 0, DL: dl, VT: FVal.getValueType()); |
10596 | } |
10597 | } |
10598 | |
10599 | SDValue CCVal; |
10600 | SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, AArch64cc&: CCVal, DAG, dl); |
10601 | EVT VT = TVal.getValueType(); |
10602 | return DAG.getNode(Opcode, DL: dl, VT, N1: TVal, N2: FVal, N3: CCVal, N4: Cmp); |
10603 | } |
10604 | |
10605 | // Now we know we're dealing with FP values. |
10606 | assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || |
10607 | LHS.getValueType() == MVT::f64); |
10608 | assert(LHS.getValueType() == RHS.getValueType()); |
10609 | EVT VT = TVal.getValueType(); |
10610 | SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
10611 | |
10612 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
10613 | // clean. Some of them require two CSELs to implement. |
10614 | AArch64CC::CondCode CC1, CC2; |
10615 | changeFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2); |
10616 | |
10617 | if (DAG.getTarget().Options.UnsafeFPMath) { |
10618 | // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and |
10619 | // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. |
10620 | ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(Val&: RHS); |
10621 | if (RHSVal && RHSVal->isZero()) { |
10622 | ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(Val&: FVal); |
10623 | ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(Val&: TVal); |
10624 | |
10625 | if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && |
10626 | CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) |
10627 | TVal = LHS; |
10628 | else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && |
10629 | CFVal && CFVal->isZero() && |
10630 | FVal.getValueType() == LHS.getValueType()) |
10631 | FVal = LHS; |
10632 | } |
10633 | } |
10634 | |
10635 | // Emit first, and possibly only, CSEL. |
10636 | SDValue CC1Val = DAG.getConstant(Val: CC1, DL: dl, VT: MVT::i32); |
10637 | SDValue CS1 = DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: FVal, N3: CC1Val, N4: Cmp); |
10638 | |
10639 | // If we need a second CSEL, emit it, using the output of the first as the |
10640 | // RHS. We're effectively OR'ing the two CC's together. |
10641 | if (CC2 != AArch64CC::AL) { |
10642 | SDValue CC2Val = DAG.getConstant(Val: CC2, DL: dl, VT: MVT::i32); |
10643 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: TVal, N2: CS1, N3: CC2Val, N4: Cmp); |
10644 | } |
10645 | |
10646 | // Otherwise, return the output of the first CSEL. |
10647 | return CS1; |
10648 | } |
10649 | |
10650 | SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op, |
10651 | SelectionDAG &DAG) const { |
10652 | EVT Ty = Op.getValueType(); |
10653 | auto Idx = Op.getConstantOperandAPInt(i: 2); |
10654 | int64_t IdxVal = Idx.getSExtValue(); |
10655 | assert(Ty.isScalableVector() && |
10656 | "Only expect scalable vectors for custom lowering of VECTOR_SPLICE" ); |
10657 | |
10658 | // We can use the splice instruction for certain index values where we are |
10659 | // able to efficiently generate the correct predicate. The index will be |
10660 | // inverted and used directly as the input to the ptrue instruction, i.e. |
10661 | // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the |
10662 | // splice predicate. However, we can only do this if we can guarantee that |
10663 | // there are enough elements in the vector, hence we check the index <= min |
10664 | // number of elements. |
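      | // For example, with IdxVal == -2 the pattern is VL2; after the reverse only
      | // the last two lanes are active, so the SPLICE produces the final two
      | // elements of the first operand followed by elements of the second operand.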
10665 | std::optional<unsigned> PredPattern; |
10666 | if (Ty.isScalableVector() && IdxVal < 0 && |
10667 | (PredPattern = getSVEPredPatternFromNumElements(MinNumElts: std::abs(i: IdxVal))) != |
10668 | std::nullopt) { |
10669 | SDLoc DL(Op); |
10670 | |
10671 | // Create a predicate where all but the last -IdxVal elements are false. |
10672 | EVT PredVT = Ty.changeVectorElementType(EltVT: MVT::i1); |
10673 | SDValue Pred = getPTrue(DAG, DL, VT: PredVT, Pattern: *PredPattern); |
10674 | Pred = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: PredVT, Operand: Pred); |
10675 | |
10676 | // Now splice the two inputs together using the predicate. |
10677 | return DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: Ty, N1: Pred, N2: Op.getOperand(i: 0), |
10678 | N3: Op.getOperand(i: 1)); |
10679 | } |
10680 | |
10681 | // We can select to an EXT instruction when indexing the first 256 bytes. |
10682 | unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements(); |
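      | // BlockSize is the number of bits each element occupies in a minimum-length
      | // vector, so IdxVal * BlockSize / 8 is the byte offset the EXT immediate
      | // must encode.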
10683 | if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256) |
10684 | return Op; |
10685 | |
10686 | return SDValue(); |
10687 | } |
10688 | |
10689 | SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, |
10690 | SelectionDAG &DAG) const { |
10691 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 4))->get(); |
10692 | SDValue LHS = Op.getOperand(i: 0); |
10693 | SDValue RHS = Op.getOperand(i: 1); |
10694 | SDValue TVal = Op.getOperand(i: 2); |
10695 | SDValue FVal = Op.getOperand(i: 3); |
10696 | SDLoc DL(Op); |
10697 | return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG); |
10698 | } |
10699 | |
10700 | SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, |
10701 | SelectionDAG &DAG) const { |
10702 | SDValue CCVal = Op->getOperand(Num: 0); |
10703 | SDValue TVal = Op->getOperand(Num: 1); |
10704 | SDValue FVal = Op->getOperand(Num: 2); |
10705 | SDLoc DL(Op); |
10706 | |
10707 | EVT Ty = Op.getValueType(); |
10708 | if (Ty == MVT::aarch64svcount) { |
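      | // svcount selects are handled by bitcasting to the equivalent predicate
      | // type nxv16i1, selecting, and bitcasting the result back.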
10709 | TVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: TVal); |
10710 | FVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i1, Operand: FVal); |
10711 | SDValue Sel = |
10712 | DAG.getNode(Opcode: ISD::SELECT, DL, VT: MVT::nxv16i1, N1: CCVal, N2: TVal, N3: FVal); |
10713 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Ty, Operand: Sel); |
10714 | } |
10715 | |
10716 | if (Ty.isScalableVector()) { |
10717 | MVT PredVT = MVT::getVectorVT(VT: MVT::i1, EC: Ty.getVectorElementCount()); |
10718 | SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: CCVal); |
10719 | return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal); |
10720 | } |
10721 | |
10722 | if (useSVEForFixedLengthVectorVT(VT: Ty, OverrideNEON: !Subtarget->isNeonAvailable())) { |
10723 | // FIXME: Ideally this would be the same as above using i1 types, however |
10724 | // for the moment we can't deal with fixed i1 vector types properly, so |
10725 | // instead extend the predicate to a result type sized integer vector. |
10726 | MVT SplatValVT = MVT::getIntegerVT(BitWidth: Ty.getScalarSizeInBits()); |
10727 | MVT PredVT = MVT::getVectorVT(VT: SplatValVT, EC: Ty.getVectorElementCount()); |
10728 | SDValue SplatVal = DAG.getSExtOrTrunc(Op: CCVal, DL, VT: SplatValVT); |
10729 | SDValue SplatPred = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: PredVT, Operand: SplatVal); |
10730 | return DAG.getNode(Opcode: ISD::VSELECT, DL, VT: Ty, N1: SplatPred, N2: TVal, N3: FVal); |
10731 | } |
10732 | |
10733 | // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select |
10734 | // instruction. |
10735 | if (ISD::isOverflowIntrOpRes(Op: CCVal)) { |
10736 | // Only lower legal XALUO ops. |
10737 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: CCVal->getValueType(ResNo: 0))) |
10738 | return SDValue(); |
10739 | |
10740 | AArch64CC::CondCode OFCC; |
10741 | SDValue Value, Overflow; |
10742 | std::tie(args&: Value, args&: Overflow) = getAArch64XALUOOp(CC&: OFCC, Op: CCVal.getValue(R: 0), DAG); |
10743 | SDValue CCVal = DAG.getConstant(Val: OFCC, DL, VT: MVT::i32); |
10744 | |
10745 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: Op.getValueType(), N1: TVal, N2: FVal, |
10746 | N3: CCVal, N4: Overflow); |
10747 | } |
10748 | |
10749 | // Lower it the same way as we would lower a SELECT_CC node. |
10750 | ISD::CondCode CC; |
10751 | SDValue LHS, RHS; |
10752 | if (CCVal.getOpcode() == ISD::SETCC) { |
10753 | LHS = CCVal.getOperand(i: 0); |
10754 | RHS = CCVal.getOperand(i: 1); |
10755 | CC = cast<CondCodeSDNode>(Val: CCVal.getOperand(i: 2))->get(); |
10756 | } else { |
10757 | LHS = CCVal; |
10758 | RHS = DAG.getConstant(Val: 0, DL, VT: CCVal.getValueType()); |
10759 | CC = ISD::SETNE; |
10760 | } |
10761 | |
10762 | // If we are lowering an f16 or bf16 and do not have full fp16 support, convert
10763 | // to an f32 in order to use FCSELSrrr.
10764 | if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { |
10765 | TVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32, |
10766 | Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: TVal); |
10767 | FVal = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32, |
10768 | Operand: DAG.getUNDEF(VT: MVT::f32), Subreg: FVal); |
10769 | } |
10770 | |
10771 | SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, dl: DL, DAG); |
10772 | |
10773 | if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) { |
10774 | return DAG.getTargetExtractSubreg(SRIdx: AArch64::hsub, DL, VT: Ty, Operand: Res); |
10775 | } |
10776 | |
10777 | return Res; |
10778 | } |
10779 | |
10780 | SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, |
10781 | SelectionDAG &DAG) const { |
10782 | // Jump table entries as PC relative offsets. No additional tweaking |
10783 | // is necessary here. Just get the address of the jump table. |
10784 | JumpTableSDNode *JT = cast<JumpTableSDNode>(Val&: Op); |
10785 | |
10786 | CodeModel::Model CM = getTargetMachine().getCodeModel(); |
10787 | if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() && |
10788 | !Subtarget->isTargetMachO()) |
10789 | return getAddrLarge(N: JT, DAG); |
10790 | if (CM == CodeModel::Tiny) |
10791 | return getAddrTiny(N: JT, DAG); |
10792 | return getAddr(N: JT, DAG); |
10793 | } |
10794 | |
10795 | SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op, |
10796 | SelectionDAG &DAG) const { |
10797 | // Jump table entries as PC relative offsets. No additional tweaking |
10798 | // is necessary here. Just get the address of the jump table. |
10799 | SDLoc DL(Op); |
10800 | SDValue JT = Op.getOperand(i: 1); |
10801 | SDValue Entry = Op.getOperand(i: 2); |
10802 | int JTI = cast<JumpTableSDNode>(Val: JT.getNode())->getIndex(); |
10803 | |
10804 | auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
10805 | AFI->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr); |
10806 | |
10807 | // With aarch64-jump-table-hardening, we only expand the jump table dispatch |
10808 | // sequence later, to guarantee the integrity of the intermediate values. |
10809 | if (DAG.getMachineFunction().getFunction().hasFnAttribute( |
10810 | Kind: "aarch64-jump-table-hardening" )) { |
10811 | CodeModel::Model CM = getTargetMachine().getCodeModel(); |
10812 | if (Subtarget->isTargetMachO()) { |
10813 | if (CM != CodeModel::Small && CM != CodeModel::Large) |
10814 | report_fatal_error(reason: "Unsupported code-model for hardened jump-table" ); |
10815 | } else { |
10816 | // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO. |
10817 | assert(Subtarget->isTargetELF() && |
10818 | "jump table hardening only supported on MachO/ELF" ); |
10819 | if (CM != CodeModel::Small) |
10820 | report_fatal_error(reason: "Unsupported code-model for hardened jump-table" ); |
10821 | } |
10822 | |
10823 | SDValue X16Copy = DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::X16, |
10824 | N: Entry, Glue: SDValue()); |
10825 | SDNode *B = DAG.getMachineNode(Opcode: AArch64::BR_JumpTable, dl: DL, VT: MVT::Other, |
10826 | Op1: DAG.getTargetJumpTable(JTI, VT: MVT::i32), |
10827 | Op2: X16Copy.getValue(R: 0), Op3: X16Copy.getValue(R: 1)); |
10828 | return SDValue(B, 0); |
10829 | } |
10830 | |
10831 | SDNode *Dest = |
10832 | DAG.getMachineNode(Opcode: AArch64::JumpTableDest32, dl: DL, VT1: MVT::i64, VT2: MVT::i64, Op1: JT, |
10833 | Op2: Entry, Op3: DAG.getTargetJumpTable(JTI, VT: MVT::i32)); |
10834 | SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Chain: Op.getOperand(i: 0), DL); |
10835 | return DAG.getNode(Opcode: ISD::BRIND, DL, VT: MVT::Other, N1: JTInfo, N2: SDValue(Dest, 0)); |
10836 | } |
10837 | |
10838 | SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const { |
10839 | SDValue Chain = Op.getOperand(i: 0); |
10840 | SDValue Dest = Op.getOperand(i: 1); |
10841 | |
10842 | // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
10843 | // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
10844 | if (Dest->isMachineOpcode() && |
10845 | Dest->getMachineOpcode() == AArch64::JumpTableDest32) |
10846 | return SDValue(); |
10847 | |
10848 | const MachineFunction &MF = DAG.getMachineFunction(); |
10849 | std::optional<uint16_t> BADisc = |
10850 | Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(ParentFn: MF.getFunction()); |
10851 | if (!BADisc) |
10852 | return SDValue(); |
10853 | |
10854 | SDLoc DL(Op); |
10855 | |
10856 | SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64); |
10857 | SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32); |
10858 | SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64); |
10859 | |
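      | // BRA authenticates the target with the IA key and the constant block-address
      | // discriminator before branching; XZR means no address diversity is used.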
10860 | SDNode *BrA = DAG.getMachineNode(Opcode: AArch64::BRA, dl: DL, VT: MVT::Other, |
10861 | Ops: {Dest, Key, Disc, AddrDisc, Chain}); |
10862 | return SDValue(BrA, 0); |
10863 | } |
10864 | |
10865 | SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, |
10866 | SelectionDAG &DAG) const { |
10867 | ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Val&: Op); |
10868 | CodeModel::Model CM = getTargetMachine().getCodeModel(); |
10869 | if (CM == CodeModel::Large) { |
10870 | // Use the GOT for the large code model on iOS. |
10871 | if (Subtarget->isTargetMachO()) { |
10872 | return getGOT(N: CP, DAG); |
10873 | } |
10874 | if (!getTargetMachine().isPositionIndependent()) |
10875 | return getAddrLarge(N: CP, DAG); |
10876 | } else if (CM == CodeModel::Tiny) { |
10877 | return getAddrTiny(N: CP, DAG); |
10878 | } |
10879 | return getAddr(N: CP, DAG); |
10880 | } |
10881 | |
10882 | SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, |
10883 | SelectionDAG &DAG) const { |
10884 | BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Val&: Op); |
10885 | const BlockAddress *BA = BAN->getBlockAddress(); |
10886 | |
10887 | if (std::optional<uint16_t> BADisc = |
10888 | Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled( |
10889 | ParentFn: *BA->getFunction())) { |
10890 | SDLoc DL(Op); |
10891 | |
10892 | // This isn't cheap, but BRIND is rare. |
10893 | SDValue TargetBA = DAG.getTargetBlockAddress(BA, VT: BAN->getValueType(ResNo: 0)); |
10894 | |
10895 | SDValue Disc = DAG.getTargetConstant(Val: *BADisc, DL, VT: MVT::i64); |
10896 | |
10897 | SDValue Key = DAG.getTargetConstant(Val: AArch64PACKey::IA, DL, VT: MVT::i32); |
10898 | SDValue AddrDisc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64); |
10899 | |
10900 | SDNode *MOV = |
10901 | DAG.getMachineNode(Opcode: AArch64::MOVaddrPAC, dl: DL, ResultTys: {MVT::Other, MVT::Glue}, |
10902 | Ops: {TargetBA, Key, AddrDisc, Disc}); |
10903 | return DAG.getCopyFromReg(Chain: SDValue(MOV, 0), dl: DL, Reg: AArch64::X16, VT: MVT::i64, |
10904 | Glue: SDValue(MOV, 1)); |
10905 | } |
10906 | |
10907 | CodeModel::Model CM = getTargetMachine().getCodeModel(); |
10908 | if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) { |
10909 | if (!getTargetMachine().isPositionIndependent()) |
10910 | return getAddrLarge(N: BAN, DAG); |
10911 | } else if (CM == CodeModel::Tiny) { |
10912 | return getAddrTiny(N: BAN, DAG); |
10913 | } |
10914 | return getAddr(N: BAN, DAG); |
10915 | } |
10916 | |
10917 | SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, |
10918 | SelectionDAG &DAG) const { |
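      | // Darwin uses a single char *va_list, so va_start just stores the address of
      | // the first variadic stack slot into it.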
10919 | AArch64FunctionInfo *FuncInfo = |
10920 | DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); |
10921 | |
10922 | SDLoc DL(Op); |
10923 | SDValue FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), |
10924 | VT: getPointerTy(DL: DAG.getDataLayout())); |
10925 | FR = DAG.getZExtOrTrunc(Op: FR, DL, VT: getPointerMemTy(DL: DAG.getDataLayout())); |
10926 | const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue(); |
10927 | return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1), |
10928 | PtrInfo: MachinePointerInfo(SV)); |
10929 | } |
10930 | |
10931 | SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, |
10932 | SelectionDAG &DAG) const { |
10933 | MachineFunction &MF = DAG.getMachineFunction(); |
10934 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
10935 | |
10936 | SDLoc DL(Op); |
10937 | SDValue FR; |
10938 | if (Subtarget->isWindowsArm64EC()) { |
10939 | // With the Arm64EC ABI, we compute the address of the varargs save area |
10940 | // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry, |
10941 | // but calls from an entry thunk can pass in a different address. |
10942 | Register VReg = MF.addLiveIn(PReg: AArch64::X4, RC: &AArch64::GPR64RegClass); |
10943 | SDValue Val = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: VReg, VT: MVT::i64); |
10944 | uint64_t StackOffset; |
10945 | if (FuncInfo->getVarArgsGPRSize() > 0) |
10946 | StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize(); |
10947 | else |
10948 | StackOffset = FuncInfo->getVarArgsStackOffset(); |
10949 | FR = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Val, |
10950 | N2: DAG.getConstant(Val: StackOffset, DL, VT: MVT::i64)); |
10951 | } else { |
10952 | FR = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRSize() > 0 |
10953 | ? FuncInfo->getVarArgsGPRIndex() |
10954 | : FuncInfo->getVarArgsStackIndex(), |
10955 | VT: getPointerTy(DL: DAG.getDataLayout())); |
10956 | } |
10957 | const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue(); |
10958 | return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: FR, Ptr: Op.getOperand(i: 1), |
10959 | PtrInfo: MachinePointerInfo(SV)); |
10960 | } |
10961 | |
10962 | SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, |
10963 | SelectionDAG &DAG) const { |
10964 | // The layout of the va_list struct is specified in the AArch64 Procedure Call |
10965 | // Standard, section B.3. |
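      | // For reference, that layout is:
      | //   struct va_list {
      | //     void *__stack;   // offset 0
      | //     void *__gr_top;  // offset 8  (4 on ILP32)
      | //     void *__vr_top;  // offset 16 (8 on ILP32)
      | //     int   __gr_offs; // offset 24 (12 on ILP32)
      | //     int   __vr_offs; // offset 28 (16 on ILP32)
      | //   };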
10966 | MachineFunction &MF = DAG.getMachineFunction(); |
10967 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
10968 | unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; |
10969 | auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout()); |
10970 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
10971 | SDLoc DL(Op); |
10972 | |
10973 | SDValue Chain = Op.getOperand(i: 0); |
10974 | SDValue VAList = Op.getOperand(i: 1); |
10975 | const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue(); |
10976 | SmallVector<SDValue, 4> MemOps; |
10977 | |
10978 | // void *__stack at offset 0 |
10979 | unsigned Offset = 0; |
10980 | SDValue Stack = DAG.getFrameIndex(FI: FuncInfo->getVarArgsStackIndex(), VT: PtrVT); |
10981 | Stack = DAG.getZExtOrTrunc(Op: Stack, DL, VT: PtrMemVT); |
10982 | MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: Stack, Ptr: VAList, |
10983 | PtrInfo: MachinePointerInfo(SV), Alignment: Align(PtrSize))); |
10984 | |
10985 | // void *__gr_top at offset 8 (4 on ILP32) |
10986 | Offset += PtrSize; |
10987 | int GPRSize = FuncInfo->getVarArgsGPRSize(); |
10988 | if (GPRSize > 0) { |
10989 | SDValue GRTop, GRTopAddr; |
10990 | |
10991 | GRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList, |
10992 | N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT)); |
10993 | |
10994 | GRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsGPRIndex(), VT: PtrVT); |
10995 | GRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: GRTop, |
10996 | N2: DAG.getConstant(Val: GPRSize, DL, VT: PtrVT)); |
10997 | GRTop = DAG.getZExtOrTrunc(Op: GRTop, DL, VT: PtrMemVT); |
10998 | |
10999 | MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: GRTop, Ptr: GRTopAddr, |
11000 | PtrInfo: MachinePointerInfo(SV, Offset), |
11001 | Alignment: Align(PtrSize))); |
11002 | } |
11003 | |
11004 | // void *__vr_top at offset 16 (8 on ILP32) |
11005 | Offset += PtrSize; |
11006 | int FPRSize = FuncInfo->getVarArgsFPRSize(); |
11007 | if (FPRSize > 0) { |
11008 | SDValue VRTop, VRTopAddr; |
11009 | VRTopAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList, |
11010 | N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT)); |
11011 | |
11012 | VRTop = DAG.getFrameIndex(FI: FuncInfo->getVarArgsFPRIndex(), VT: PtrVT); |
11013 | VRTop = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VRTop, |
11014 | N2: DAG.getConstant(Val: FPRSize, DL, VT: PtrVT)); |
11015 | VRTop = DAG.getZExtOrTrunc(Op: VRTop, DL, VT: PtrMemVT); |
11016 | |
11017 | MemOps.push_back(Elt: DAG.getStore(Chain, dl: DL, Val: VRTop, Ptr: VRTopAddr, |
11018 | PtrInfo: MachinePointerInfo(SV, Offset), |
11019 | Alignment: Align(PtrSize))); |
11020 | } |
11021 | |
11022 | // int __gr_offs at offset 24 (12 on ILP32) |
11023 | Offset += PtrSize; |
11024 | SDValue GROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList, |
11025 | N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT)); |
11026 | MemOps.push_back( |
11027 | Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getConstant(Val: -GPRSize, DL, VT: MVT::i32), |
11028 | Ptr: GROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4))); |
11029 | |
11030 | // int __vr_offs at offset 28 (16 on ILP32) |
11031 | Offset += 4; |
11032 | SDValue VROffsAddr = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList, |
11033 | N2: DAG.getConstant(Val: Offset, DL, VT: PtrVT)); |
11034 | MemOps.push_back( |
11035 | Elt: DAG.getStore(Chain, dl: DL, Val: DAG.getConstant(Val: -FPRSize, DL, VT: MVT::i32), |
11036 | Ptr: VROffsAddr, PtrInfo: MachinePointerInfo(SV, Offset), Alignment: Align(4))); |
11037 | |
11038 | return DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: MemOps); |
11039 | } |
11040 | |
11041 | SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, |
11042 | SelectionDAG &DAG) const { |
11043 | MachineFunction &MF = DAG.getMachineFunction(); |
11044 | Function &F = MF.getFunction(); |
11045 | |
11046 | if (Subtarget->isCallingConvWin64(CC: F.getCallingConv(), IsVarArg: F.isVarArg())) |
11047 | return LowerWin64_VASTART(Op, DAG); |
11048 | else if (Subtarget->isTargetDarwin()) |
11049 | return LowerDarwin_VASTART(Op, DAG); |
11050 | else |
11051 | return LowerAAPCS_VASTART(Op, DAG); |
11052 | } |
11053 | |
11054 | SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, |
11055 | SelectionDAG &DAG) const { |
11056 | // The AAPCS va_list has three pointers and two ints (= 32 bytes, or 20 on
11057 | // ILP32); Darwin and Windows use a single pointer.
11058 | SDLoc DL(Op); |
11059 | unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; |
11060 | unsigned VaListSize = |
11061 | (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) |
11062 | ? PtrSize |
11063 | : Subtarget->isTargetILP32() ? 20 : 32; |
11064 | const Value *DestSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 3))->getValue(); |
11065 | const Value *SrcSV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 4))->getValue(); |
11066 | |
11067 | return DAG.getMemcpy(Chain: Op.getOperand(i: 0), dl: DL, Dst: Op.getOperand(i: 1), Src: Op.getOperand(i: 2), |
11068 | Size: DAG.getConstant(Val: VaListSize, DL, VT: MVT::i32), |
11069 | Alignment: Align(PtrSize), isVol: false, AlwaysInline: false, /*CI=*/nullptr, |
11070 | OverrideTailCall: std::nullopt, DstPtrInfo: MachinePointerInfo(DestSV), |
11071 | SrcPtrInfo: MachinePointerInfo(SrcSV)); |
11072 | } |
11073 | |
11074 | SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { |
11075 | assert(Subtarget->isTargetDarwin() && |
11076 | "automatic va_arg instruction only works on Darwin" ); |
11077 | |
11078 | const Value *V = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue(); |
11079 | EVT VT = Op.getValueType(); |
11080 | SDLoc DL(Op); |
11081 | SDValue Chain = Op.getOperand(i: 0); |
11082 | SDValue Addr = Op.getOperand(i: 1); |
11083 | MaybeAlign Align(Op.getConstantOperandVal(i: 3)); |
11084 | unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; |
11085 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
11086 | auto PtrMemVT = getPointerMemTy(DL: DAG.getDataLayout()); |
11087 | SDValue VAList = |
11088 | DAG.getLoad(VT: PtrMemVT, dl: DL, Chain, Ptr: Addr, PtrInfo: MachinePointerInfo(V)); |
11089 | Chain = VAList.getValue(R: 1); |
11090 | VAList = DAG.getZExtOrTrunc(Op: VAList, DL, VT: PtrVT); |
11091 | |
11092 | if (VT.isScalableVector()) |
11093 | report_fatal_error(reason: "Passing SVE types to variadic functions is " |
11094 | "currently not supported" ); |
11095 | |
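      | // Arguments with an alignment greater than the slot size require the va_list
      | // pointer to be rounded up first: p = (p + align - 1) & -align.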
11096 | if (Align && *Align > MinSlotSize) { |
11097 | VAList = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList, |
11098 | N2: DAG.getConstant(Val: Align->value() - 1, DL, VT: PtrVT)); |
11099 | VAList = DAG.getNode(Opcode: ISD::AND, DL, VT: PtrVT, N1: VAList, |
11100 | N2: DAG.getConstant(Val: -(int64_t)Align->value(), DL, VT: PtrVT)); |
11101 | } |
11102 | |
11103 | Type *ArgTy = VT.getTypeForEVT(Context&: *DAG.getContext()); |
11104 | unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(Ty: ArgTy); |
11105 | |
11106 | // Scalar integer and FP values smaller than 64 bits are implicitly extended |
11107 | // up to 64 bits. At the very least, we have to increase the striding of the |
11108 | // vaargs list to match this, and for FP values we need to introduce |
11109 | // FP_ROUND nodes as well. |
11110 | if (VT.isInteger() && !VT.isVector()) |
11111 | ArgSize = std::max(a: ArgSize, b: MinSlotSize); |
11112 | bool NeedFPTrunc = false; |
11113 | if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { |
11114 | ArgSize = 8; |
11115 | NeedFPTrunc = true; |
11116 | } |
11117 | |
11118 | // Increment the pointer, VAList, to the next vaarg |
11119 | SDValue VANext = DAG.getNode(Opcode: ISD::ADD, DL, VT: PtrVT, N1: VAList, |
11120 | N2: DAG.getConstant(Val: ArgSize, DL, VT: PtrVT)); |
11121 | VANext = DAG.getZExtOrTrunc(Op: VANext, DL, VT: PtrMemVT); |
11122 | |
11123 | // Store the incremented VAList to the legalized pointer |
11124 | SDValue APStore = |
11125 | DAG.getStore(Chain, dl: DL, Val: VANext, Ptr: Addr, PtrInfo: MachinePointerInfo(V)); |
11126 | |
11127 | // Load the actual argument out of the pointer VAList |
11128 | if (NeedFPTrunc) { |
11129 | // Load the value as an f64. |
11130 | SDValue WideFP = |
11131 | DAG.getLoad(VT: MVT::f64, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo()); |
11132 | // Round the value down to an f32. |
11133 | SDValue NarrowFP = |
11134 | DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: WideFP.getValue(R: 0), |
11135 | N2: DAG.getIntPtrConstant(Val: 1, DL, /*isTarget=*/true)); |
11136 | SDValue Ops[] = { NarrowFP, WideFP.getValue(R: 1) }; |
11137 | // Merge the rounded value with the chain output of the load. |
11138 | return DAG.getMergeValues(Ops, dl: DL); |
11139 | } |
11140 | |
11141 | return DAG.getLoad(VT, dl: DL, Chain: APStore, Ptr: VAList, PtrInfo: MachinePointerInfo()); |
11142 | } |
11143 | |
11144 | SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, |
11145 | SelectionDAG &DAG) const { |
11146 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
11147 | MFI.setFrameAddressIsTaken(true); |
11148 | |
11149 | EVT VT = Op.getValueType(); |
11150 | SDLoc DL(Op); |
11151 | unsigned Depth = Op.getConstantOperandVal(i: 0); |
11152 | SDValue FrameAddr = |
11153 | DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT: MVT::i64); |
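      | // Each frame record stores the caller's frame pointer at offset 0, so
      | // non-zero depths are handled by chasing that chain of saved FPs.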
11154 | while (Depth--) |
11155 | FrameAddr = DAG.getLoad(VT, dl: DL, Chain: DAG.getEntryNode(), Ptr: FrameAddr, |
11156 | PtrInfo: MachinePointerInfo()); |
11157 | |
11158 | if (Subtarget->isTargetILP32()) |
11159 | FrameAddr = DAG.getNode(Opcode: ISD::AssertZext, DL, VT: MVT::i64, N1: FrameAddr, |
11160 | N2: DAG.getValueType(VT)); |
11161 | |
11162 | return FrameAddr; |
11163 | } |
11164 | |
11165 | SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, |
11166 | SelectionDAG &DAG) const { |
11167 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); |
11168 | |
11169 | EVT VT = getPointerTy(DL: DAG.getDataLayout()); |
11170 | SDLoc DL(Op); |
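      | // @llvm.sponentry wants the stack pointer as it was on entry to the function;
      | // model that as the address of a fixed object at offset 0 from the incoming SP.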
11171 | int FI = MFI.CreateFixedObject(Size: 4, SPOffset: 0, IsImmutable: false); |
11172 | return DAG.getFrameIndex(FI, VT); |
11173 | } |
11174 | |
11175 | #define GET_REGISTER_MATCHER |
11176 | #include "AArch64GenAsmMatcher.inc" |
11177 | |
11178 | // FIXME? Maybe this could be a TableGen attribute on some registers and |
11179 | // this table could be generated automatically from RegInfo. |
11180 | Register AArch64TargetLowering:: |
11181 | getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { |
11182 | Register Reg = MatchRegisterName(Name: RegName); |
11183 | if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { |
11184 | const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo(); |
11185 | unsigned DwarfRegNum = MRI->getDwarfRegNum(RegNum: Reg, isEH: false); |
11186 | if (!Subtarget->isXRegisterReserved(i: DwarfRegNum) && |
11187 | !MRI->isReservedReg(MF, Reg)) |
11188 | Reg = 0; |
11189 | } |
11190 | if (Reg) |
11191 | return Reg; |
11192 | report_fatal_error(reason: Twine("Invalid register name \"" |
11193 | + StringRef(RegName) + "\"." )); |
11194 | } |
11195 | |
11196 | SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op, |
11197 | SelectionDAG &DAG) const { |
11198 | DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true); |
11199 | |
11200 | EVT VT = Op.getValueType(); |
11201 | SDLoc DL(Op); |
11202 | |
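      | // In a frame record the return address is saved at FP + 8, so the address of
      | // the return address is simply FP plus that offset.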
11203 | SDValue FrameAddr = |
11204 | DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::FP, VT); |
11205 | SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout())); |
11206 | |
11207 | return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset); |
11208 | } |
11209 | |
11210 | SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, |
11211 | SelectionDAG &DAG) const { |
11212 | MachineFunction &MF = DAG.getMachineFunction(); |
11213 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
11214 | MFI.setReturnAddressIsTaken(true); |
11215 | |
11216 | EVT VT = Op.getValueType(); |
11217 | SDLoc DL(Op); |
11218 | unsigned Depth = Op.getConstantOperandVal(i: 0); |
11219 | SDValue ReturnAddress; |
11220 | if (Depth) { |
11221 | SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); |
11222 | SDValue Offset = DAG.getConstant(Val: 8, DL, VT: getPointerTy(DL: DAG.getDataLayout())); |
11223 | ReturnAddress = DAG.getLoad( |
11224 | VT, dl: DL, Chain: DAG.getEntryNode(), |
11225 | Ptr: DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: FrameAddr, N2: Offset), PtrInfo: MachinePointerInfo()); |
11226 | } else { |
11227 | // Return LR, which contains the return address. Mark it an implicit |
11228 | // live-in. |
11229 | Register Reg = MF.addLiveIn(PReg: AArch64::LR, RC: &AArch64::GPR64RegClass); |
11230 | ReturnAddress = DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg, VT); |
11231 | } |
11232 | |
11233 | // The XPACLRI instruction assembles to a hint-space instruction before
11234 | // Armv8.3-A, so it can be used safely on any pre-Armv8.3-A
11235 | // architecture. On Armv8.3-A and onwards XPACI is available, so use
11236 | // that instead.
11237 | SDNode *St; |
11238 | if (Subtarget->hasPAuth()) { |
11239 | St = DAG.getMachineNode(Opcode: AArch64::XPACI, dl: DL, VT, Op1: ReturnAddress); |
11240 | } else { |
11241 | // XPACLRI operates on LR therefore we must move the operand accordingly. |
11242 | SDValue Chain = |
11243 | DAG.getCopyToReg(Chain: DAG.getEntryNode(), dl: DL, Reg: AArch64::LR, N: ReturnAddress); |
11244 | St = DAG.getMachineNode(Opcode: AArch64::XPACLRI, dl: DL, VT, Op1: Chain); |
11245 | } |
11246 | return SDValue(St, 0); |
11247 | } |
11248 | |
11249 | /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
11250 | /// i32 values and take a 2 x i32 value to shift plus a shift amount.
11251 | SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op, |
11252 | SelectionDAG &DAG) const { |
11253 | SDValue Lo, Hi; |
11254 | expandShiftParts(N: Op.getNode(), Lo, Hi, DAG); |
11255 | return DAG.getMergeValues(Ops: {Lo, Hi}, dl: SDLoc(Op)); |
11256 | } |
11257 | |
11258 | bool AArch64TargetLowering::isOffsetFoldingLegal( |
11259 | const GlobalAddressSDNode *GA) const { |
11260 | // Offsets are folded in the DAG combine rather than here so that we can |
11261 | // intelligently choose an offset based on the uses. |
11262 | return false; |
11263 | } |
11264 | |
11265 | bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, |
11266 | bool OptForSize) const { |
11267 | bool IsLegal = false; |
11268 | // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
11269 | // and for the 16-bit case when the target has full fp16 support.
11270 | // We encode bf16 bit patterns as if they were fp16. This results in very |
11271 | // strange looking assembly but should populate the register with appropriate |
11272 | // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will |
11273 | // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the |
11274 | // FP16 1.9375 which shares the same bit pattern as BF16 1.5. |
11275 | // FIXME: We should be able to handle f128 as well with a clever lowering. |
11276 | const APInt ImmInt = Imm.bitcastToAPInt(); |
11277 | if (VT == MVT::f64) |
11278 | IsLegal = AArch64_AM::getFP64Imm(Imm: ImmInt) != -1 || Imm.isPosZero(); |
11279 | else if (VT == MVT::f32) |
11280 | IsLegal = AArch64_AM::getFP32Imm(Imm: ImmInt) != -1 || Imm.isPosZero(); |
11281 | else if (VT == MVT::f16 || VT == MVT::bf16) |
11282 | IsLegal = |
11283 | (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(Imm: ImmInt) != -1) || |
11284 | Imm.isPosZero(); |
11285 | |
11286 | // If we can not materialize in immediate field for fmov, check if the |
11287 | // value can be encoded as the immediate operand of a logical instruction. |
11288 | // The immediate value will be created with either MOVZ, MOVN, or ORR. |
11289 | // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to |
11290 | // generate that fmov. |
11291 | if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) { |
11292 | // The cost is actually exactly the same for mov+fmov vs. adrp+ldr; |
11293 | // however the mov+fmov sequence is always better because of the reduced |
11294 | // cache pressure. The timings are still the same if you consider |
11295 | // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the |
11296 | // movw+movk is fused). So we limit the expansion to at most 2 instructions.
11297 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
11298 | AArch64_IMM::expandMOVImm(Imm: ImmInt.getZExtValue(), BitSize: VT.getSizeInBits(), Insn); |
11299 | unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2)); |
11300 | IsLegal = Insn.size() <= Limit; |
11301 | } |
11302 | |
11303 | LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal " ) << VT |
11304 | << " imm value: " ; Imm.dump();); |
11305 | return IsLegal; |
11306 | } |
11307 | |
11308 | //===----------------------------------------------------------------------===// |
11309 | // AArch64 Optimization Hooks |
11310 | //===----------------------------------------------------------------------===// |
11311 | |
11312 | static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, |
11313 | SDValue Operand, SelectionDAG &DAG, |
11314 | int &ExtraSteps) {
11315 | EVT VT = Operand.getValueType(); |
11316 | if ((ST->hasNEON() && |
11317 | (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || |
11318 | VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || |
11319 | VT == MVT::v4f32)) || |
11320 | (ST->hasSVE() && |
11321 | (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) { |
11322 | if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) { |
11323 | // For the reciprocal estimates, convergence is quadratic, so the number |
11324 | // of digits is doubled after each iteration. In ARMv8, the accuracy of |
11325 | // the initial estimate is 2^-8. Thus the number of extra steps to refine |
11326 | // the result for float (23 mantissa bits) is 2 and for double (52 |
11327 | // mantissa bits) is 3. |
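      | // For example, float has a 24-bit significand: ceil(log2(24)) - ceil(log2(8))
      | // = 5 - 3 = 2 extra steps; double has 53 bits, giving 6 - 3 = 3 extra steps.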
11328 | constexpr unsigned AccurateBits = 8; |
11329 | unsigned DesiredBits = |
11330 | APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(VT)); |
11331 | ExtraSteps = DesiredBits <= AccurateBits |
11332 | ? 0 |
11333 | : Log2_64_Ceil(Value: DesiredBits) - Log2_64_Ceil(Value: AccurateBits); |
11334 | } |
11335 | |
11336 | return DAG.getNode(Opcode, DL: SDLoc(Operand), VT, Operand); |
11337 | } |
11338 | |
11339 | return SDValue(); |
11340 | } |
11341 | |
11342 | SDValue |
11343 | AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG, |
11344 | const DenormalMode &Mode) const { |
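      | // With x == 0.0 the estimate expansion would compute 0 * rsqrt(0) = 0 * inf =
      | // NaN, so flag that case; the caller then selects getSqrtResultForDenormInput's
      | // value (the input itself) instead.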
11345 | SDLoc DL(Op); |
11346 | EVT VT = Op.getValueType(); |
11347 | EVT CCVT = getSetCCResultType(DAG.getDataLayout(), C&: *DAG.getContext(), VT); |
11348 | SDValue FPZero = DAG.getConstantFP(Val: 0.0, DL, VT); |
11349 | return DAG.getSetCC(DL, VT: CCVT, LHS: Op, RHS: FPZero, Cond: ISD::SETEQ); |
11350 | } |
11351 | |
11352 | SDValue |
11353 | AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op, |
11354 | SelectionDAG &DAG) const { |
11355 | return Op; |
11356 | } |
11357 | |
11358 | SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, |
11359 | SelectionDAG &DAG, int Enabled, |
11360 | int &ExtraSteps,
11361 | bool &UseOneConst, |
11362 | bool Reciprocal) const { |
11363 | if (Enabled == ReciprocalEstimate::Enabled || |
11364 | (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) |
11365 | if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRSQRTE, Operand, |
11366 | DAG, ExtraSteps)) { |
11367 | SDLoc DL(Operand); |
11368 | EVT VT = Operand.getValueType(); |
11369 | |
11370 | SDNodeFlags Flags; |
11371 | Flags.setAllowReassociation(true); |
11372 | |
11373 | // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) |
11374 | // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) |
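      | // Each step computes E' = E * FRSQRTS(X, E * E), where FRSQRTS returns
      | // 0.5 * (3 - X * E * E), matching the Newton iteration above.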
11375 | for (int i = ExtraSteps; i > 0; --i) { |
11376 | SDValue Step = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Estimate, |
11377 | Flags); |
11378 | Step = DAG.getNode(Opcode: AArch64ISD::FRSQRTS, DL, VT, N1: Operand, N2: Step, Flags); |
11379 | Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags); |
11380 | } |
11381 | if (!Reciprocal) |
11382 | Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Operand, N2: Estimate, Flags); |
11383 | |
11384 | ExtraSteps = 0; |
11385 | return Estimate; |
11386 | } |
11387 | |
11388 | return SDValue(); |
11389 | } |
11390 | |
11391 | SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, |
11392 | SelectionDAG &DAG, int Enabled, |
11393 | int &ExtraSteps) const {
11394 | if (Enabled == ReciprocalEstimate::Enabled) |
11395 | if (SDValue Estimate = getEstimate(ST: Subtarget, Opcode: AArch64ISD::FRECPE, Operand, |
11396 | DAG, ExtraSteps)) { |
11397 | SDLoc DL(Operand); |
11398 | EVT VT = Operand.getValueType(); |
11399 | |
11400 | SDNodeFlags Flags; |
11401 | Flags.setAllowReassociation(true); |
11402 | |
11403 | // Newton reciprocal iteration: E * (2 - X * E) |
11404 | // AArch64 reciprocal iteration instruction: (2 - M * N) |
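      | // Each step computes E' = E * FRECPS(X, E), where FRECPS returns 2 - X * E,
      | // matching the Newton iteration above.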
11405 | for (int i = ExtraSteps; i > 0; --i) { |
11406 | SDValue Step = DAG.getNode(Opcode: AArch64ISD::FRECPS, DL, VT, N1: Operand, |
11407 | N2: Estimate, Flags); |
11408 | Estimate = DAG.getNode(Opcode: ISD::FMUL, DL, VT, N1: Estimate, N2: Step, Flags); |
11409 | } |
11410 | |
11411 | ExtraSteps = 0; |
11412 | return Estimate; |
11413 | } |
11414 | |
11415 | return SDValue(); |
11416 | } |
11417 | |
11418 | //===----------------------------------------------------------------------===// |
11419 | // AArch64 Inline Assembly Support |
11420 | //===----------------------------------------------------------------------===// |
11421 | |
11422 | // Table of Constraints |
11423 | // TODO: This is the current set of constraints supported by ARM for the |
11424 | // compiler; not all of them may make sense.
11425 | // |
11426 | // r - A general register |
11427 | // w - An FP/SIMD register of some size in the range v0-v31 |
11428 | // x - An FP/SIMD register of some size in the range v0-v15 |
11429 | // I - Constant that can be used with an ADD instruction |
11430 | // J - Constant that can be used with a SUB instruction |
11431 | // K - Constant that can be used with a 32-bit logical instruction |
11432 | // L - Constant that can be used with a 64-bit logical instruction |
11433 | // M - Constant that can be used as a 32-bit MOV immediate |
11434 | // N - Constant that can be used as a 64-bit MOV immediate |
11435 | // Q - A memory reference with base register and no offset |
11436 | // S - A symbolic address |
11437 | // Y - Floating point constant zero |
11438 | // Z - Integer constant zero |
11439 | // |
11440 | // Note that general register operands will be output using their 64-bit x |
11441 | // register name, whatever the size of the variable, unless the asm operand |
11442 | // is prefixed by the %w modifier. Floating-point and SIMD register operands |
11443 | // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or |
11444 | // %q modifier. |
11445 | const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { |
11446 | // At this point, we have to lower this constraint to something else, so we |
11447 | // lower it to an "r" or "w". However, by doing this we will force the result |
11448 | // to be in register, while the X constraint is much more permissive. |
11449 | // |
11450 | // Although we are correct (we are free to emit anything, without |
11451 | // constraints), we might break use cases that would expect us to be more |
11452 | // efficient and emit something else. |
11453 | if (!Subtarget->hasFPARMv8()) |
11454 | return "r" ; |
11455 | |
11456 | if (ConstraintVT.isFloatingPoint()) |
11457 | return "w" ; |
11458 | |
11459 | if (ConstraintVT.isVector() && |
11460 | (ConstraintVT.getSizeInBits() == 64 || |
11461 | ConstraintVT.getSizeInBits() == 128)) |
11462 | return "w" ; |
11463 | |
11464 | return "r" ; |
11465 | } |
11466 | |
11467 | enum class PredicateConstraint { Uph, Upl, Upa }; |
11468 | |
11469 | static std::optional<PredicateConstraint> |
11470 | parsePredicateConstraint(StringRef Constraint) { |
11471 | return StringSwitch<std::optional<PredicateConstraint>>(Constraint) |
11472 | .Case(S: "Uph" , Value: PredicateConstraint::Uph) |
11473 | .Case(S: "Upl" , Value: PredicateConstraint::Upl) |
11474 | .Case(S: "Upa" , Value: PredicateConstraint::Upa) |
11475 | .Default(Value: std::nullopt); |
11476 | } |
11477 | |
11478 | static const TargetRegisterClass * |
11479 | getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) { |
11480 | if (VT != MVT::aarch64svcount && |
11481 | (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)) |
11482 | return nullptr; |
11483 | |
11484 | switch (Constraint) { |
11485 | case PredicateConstraint::Uph: |
11486 | return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass |
11487 | : &AArch64::PPR_p8to15RegClass; |
11488 | case PredicateConstraint::Upl: |
11489 | return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass |
11490 | : &AArch64::PPR_3bRegClass; |
11491 | case PredicateConstraint::Upa: |
11492 | return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass |
11493 | : &AArch64::PPRRegClass; |
11494 | } |
11495 | |
11496 | llvm_unreachable("Missing PredicateConstraint!" ); |
11497 | } |
11498 | |
11499 | enum class ReducedGprConstraint { Uci, Ucj }; |
11500 | |
11501 | static std::optional<ReducedGprConstraint> |
11502 | parseReducedGprConstraint(StringRef Constraint) { |
11503 | return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint) |
11504 | .Case(S: "Uci" , Value: ReducedGprConstraint::Uci) |
11505 | .Case(S: "Ucj" , Value: ReducedGprConstraint::Ucj) |
11506 | .Default(Value: std::nullopt); |
11507 | } |
11508 | |
11509 | static const TargetRegisterClass * |
11510 | getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) { |
11511 | if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64) |
11512 | return nullptr; |
11513 | |
11514 | switch (Constraint) { |
11515 | case ReducedGprConstraint::Uci: |
11516 | return &AArch64::MatrixIndexGPR32_8_11RegClass; |
11517 | case ReducedGprConstraint::Ucj: |
11518 | return &AArch64::MatrixIndexGPR32_12_15RegClass; |
11519 | } |
11520 | |
11521 | llvm_unreachable("Missing ReducedGprConstraint!" ); |
11522 | } |
11523 | |
11524 | // The set of cc code supported is from |
11525 | // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands |
11526 | static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) { |
11527 | AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint) |
11528 | .Case(S: "{@cchi}" , Value: AArch64CC::HI) |
11529 | .Case(S: "{@cccs}" , Value: AArch64CC::HS) |
11530 | .Case(S: "{@cclo}" , Value: AArch64CC::LO) |
11531 | .Case(S: "{@ccls}" , Value: AArch64CC::LS) |
11532 | .Case(S: "{@cccc}" , Value: AArch64CC::LO) |
11533 | .Case(S: "{@cceq}" , Value: AArch64CC::EQ) |
11534 | .Case(S: "{@ccgt}" , Value: AArch64CC::GT) |
11535 | .Case(S: "{@ccge}" , Value: AArch64CC::GE) |
11536 | .Case(S: "{@cclt}" , Value: AArch64CC::LT) |
11537 | .Case(S: "{@ccle}" , Value: AArch64CC::LE) |
11538 | .Case(S: "{@cchs}" , Value: AArch64CC::HS) |
11539 | .Case(S: "{@ccne}" , Value: AArch64CC::NE) |
11540 | .Case(S: "{@ccvc}" , Value: AArch64CC::VC) |
11541 | .Case(S: "{@ccpl}" , Value: AArch64CC::PL) |
11542 | .Case(S: "{@ccvs}" , Value: AArch64CC::VS) |
11543 | .Case(S: "{@ccmi}" , Value: AArch64CC::MI) |
11544 | .Default(Value: AArch64CC::Invalid); |
11545 | return Cond; |
11546 | } |
11547 | |
11548 | /// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, |
11549 | /// WZR, invert(<cond>)'. |
11550 | static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, |
11551 | SelectionDAG &DAG) { |
11552 | return DAG.getNode( |
11553 | Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32), |
11554 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32), |
11555 | N3: DAG.getConstant(Val: getInvertedCondCode(Code: CC), DL, VT: MVT::i32), N4: NZCV); |
11556 | } |
11557 | |
11558 | // Lower @cc flag output via getSETCC. |
11559 | SDValue AArch64TargetLowering::LowerAsmOutputForConstraint( |
11560 | SDValue &Chain, SDValue &Glue, const SDLoc &DL, |
11561 | const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { |
11562 | AArch64CC::CondCode Cond = parseConstraintCode(Constraint: OpInfo.ConstraintCode); |
11563 | if (Cond == AArch64CC::Invalid) |
11564 | return SDValue(); |
11565 | // The output variable should be a scalar integer. |
11566 | if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || |
11567 | OpInfo.ConstraintVT.getSizeInBits() < 8) |
11568 | report_fatal_error(reason: "Flag output operand is of invalid type" ); |
11569 | |
11570 | // Get NZCV register. Only update chain when copyfrom is glued. |
11571 | if (Glue.getNode()) { |
11572 | Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32, Glue); |
11573 | Chain = Glue.getValue(R: 1); |
11574 | } else |
11575 | Glue = DAG.getCopyFromReg(Chain, dl: DL, Reg: AArch64::NZCV, VT: MVT::i32); |
11576 | // Extract CC code. |
11577 | SDValue CC = getSETCC(CC: Cond, NZCV: Glue, DL, DAG); |
11578 | |
11579 | SDValue Result; |
11580 | |
11581 | // Truncate or ZERO_EXTEND based on value types. |
11582 | if (OpInfo.ConstraintVT.getSizeInBits() <= 32) |
11583 | Result = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: OpInfo.ConstraintVT, Operand: CC); |
11584 | else |
11585 | Result = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: OpInfo.ConstraintVT, Operand: CC); |
11586 | |
11587 | return Result; |
11588 | } |
11589 | |
11590 | /// getConstraintType - Given a constraint letter, return the type of |
11591 | /// constraint it is for this target. |
11592 | AArch64TargetLowering::ConstraintType |
11593 | AArch64TargetLowering::getConstraintType(StringRef Constraint) const { |
11594 | if (Constraint.size() == 1) { |
11595 | switch (Constraint[0]) { |
11596 | default: |
11597 | break; |
11598 | case 'x': |
11599 | case 'w': |
11600 | case 'y': |
11601 | return C_RegisterClass; |
11602 | // An address with a single base register. Due to the way we |
11603 | // currently handle addresses it is the same as 'r'. |
11604 | case 'Q': |
11605 | return C_Memory; |
11606 | case 'I': |
11607 | case 'J': |
11608 | case 'K': |
11609 | case 'L': |
11610 | case 'M': |
11611 | case 'N': |
11612 | case 'Y': |
11613 | case 'Z': |
11614 | return C_Immediate; |
11615 | case 'z': |
11616 | case 'S': // A symbol or label reference with a constant offset |
11617 | return C_Other; |
11618 | } |
11619 | } else if (parsePredicateConstraint(Constraint)) |
11620 | return C_RegisterClass; |
11621 | else if (parseReducedGprConstraint(Constraint)) |
11622 | return C_RegisterClass; |
11623 | else if (parseConstraintCode(Constraint) != AArch64CC::Invalid) |
11624 | return C_Other; |
11625 | return TargetLowering::getConstraintType(Constraint); |
11626 | } |
11627 | |
11628 | /// Examine constraint type and operand type and determine a weight value. |
11629 | /// This object must already have been set up with the operand type |
11630 | /// and the current alternative constraint selected. |
11631 | TargetLowering::ConstraintWeight |
11632 | AArch64TargetLowering::getSingleConstraintMatchWeight( |
11633 | AsmOperandInfo &info, const char *constraint) const { |
11634 | ConstraintWeight weight = CW_Invalid; |
11635 | Value *CallOperandVal = info.CallOperandVal; |
11636 | // If we don't have a value, we can't do a match, |
11637 | // but allow it at the lowest weight. |
11638 | if (!CallOperandVal) |
11639 | return CW_Default; |
11640 | Type *type = CallOperandVal->getType(); |
11641 | // Look at the constraint type. |
11642 | switch (*constraint) { |
11643 | default: |
11644 | weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); |
11645 | break; |
11646 | case 'x': |
11647 | case 'w': |
11648 | case 'y': |
11649 | if (type->isFloatingPointTy() || type->isVectorTy()) |
11650 | weight = CW_Register; |
11651 | break; |
11652 | case 'z': |
11653 | weight = CW_Constant; |
11654 | break; |
11655 | case 'U': |
11656 | if (parsePredicateConstraint(Constraint: constraint) || |
11657 | parseReducedGprConstraint(Constraint: constraint)) |
11658 | weight = CW_Register; |
11659 | break; |
11660 | } |
11661 | return weight; |
11662 | } |
11663 | |
11664 | std::pair<unsigned, const TargetRegisterClass *> |
11665 | AArch64TargetLowering::getRegForInlineAsmConstraint( |
11666 | const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { |
11667 | if (Constraint.size() == 1) { |
11668 | switch (Constraint[0]) { |
11669 | case 'r': |
11670 | if (VT.isScalableVector()) |
11671 | return std::make_pair(x: 0U, y: nullptr); |
11672 | if (Subtarget->hasLS64() && VT.getSizeInBits() == 512) |
11673 | return std::make_pair(x: 0U, y: &AArch64::GPR64x8ClassRegClass); |
11674 | if (VT.getFixedSizeInBits() == 64) |
11675 | return std::make_pair(x: 0U, y: &AArch64::GPR64commonRegClass); |
11676 | return std::make_pair(x: 0U, y: &AArch64::GPR32commonRegClass); |
11677 | case 'w': { |
11678 | if (!Subtarget->hasFPARMv8()) |
11679 | break; |
11680 | if (VT.isScalableVector()) { |
11681 | if (VT.getVectorElementType() != MVT::i1) |
11682 | return std::make_pair(x: 0U, y: &AArch64::ZPRRegClass); |
11683 | return std::make_pair(x: 0U, y: nullptr); |
11684 | } |
11685 | if (VT == MVT::Other) |
11686 | break; |
11687 | uint64_t VTSize = VT.getFixedSizeInBits(); |
11688 | if (VTSize == 16) |
11689 | return std::make_pair(x: 0U, y: &AArch64::FPR16RegClass); |
11690 | if (VTSize == 32) |
11691 | return std::make_pair(x: 0U, y: &AArch64::FPR32RegClass); |
11692 | if (VTSize == 64) |
11693 | return std::make_pair(x: 0U, y: &AArch64::FPR64RegClass); |
11694 | if (VTSize == 128) |
11695 | return std::make_pair(x: 0U, y: &AArch64::FPR128RegClass); |
11696 | break; |
11697 | } |
11698 | // The instructions that this constraint is designed for can |
11699 | // only take 128-bit registers so just use that regclass. |
11700 | case 'x': |
11701 | if (!Subtarget->hasFPARMv8()) |
11702 | break; |
11703 | if (VT.isScalableVector()) |
11704 | return std::make_pair(x: 0U, y: &AArch64::ZPR_4bRegClass); |
11705 | if (VT.getSizeInBits() == 128) |
11706 | return std::make_pair(x: 0U, y: &AArch64::FPR128_loRegClass); |
11707 | break; |
11708 | case 'y': |
11709 | if (!Subtarget->hasFPARMv8()) |
11710 | break; |
11711 | if (VT.isScalableVector()) |
11712 | return std::make_pair(x: 0U, y: &AArch64::ZPR_3bRegClass); |
11713 | break; |
11714 | } |
11715 | } else { |
11716 | if (const auto PC = parsePredicateConstraint(Constraint)) |
11717 | if (const auto *RegClass = getPredicateRegisterClass(Constraint: *PC, VT)) |
11718 | return std::make_pair(x: 0U, y&: RegClass); |
11719 | |
11720 | if (const auto RGC = parseReducedGprConstraint(Constraint)) |
11721 | if (const auto *RegClass = getReducedGprRegisterClass(Constraint: *RGC, VT)) |
11722 | return std::make_pair(x: 0U, y&: RegClass); |
11723 | } |
11724 | if (StringRef("{cc}" ).equals_insensitive(RHS: Constraint) || |
11725 | parseConstraintCode(Constraint) != AArch64CC::Invalid) |
11726 | return std::make_pair(x: unsigned(AArch64::NZCV), y: &AArch64::CCRRegClass); |
11727 | |
11728 | if (Constraint == "{za}" ) { |
11729 | return std::make_pair(x: unsigned(AArch64::ZA), y: &AArch64::MPRRegClass); |
11730 | } |
11731 | |
11732 | if (Constraint == "{zt0}" ) { |
11733 | return std::make_pair(x: unsigned(AArch64::ZT0), y: &AArch64::ZTRRegClass); |
11734 | } |
11735 | |
11736 | // Use the default implementation in TargetLowering to convert the register |
11737 | // constraint into a member of a register class. |
11738 | std::pair<unsigned, const TargetRegisterClass *> Res; |
11739 | Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
11740 | |
11741 | // Not found as a standard register? |
11742 | if (!Res.second) { |
11743 | unsigned Size = Constraint.size(); |
11744 | if ((Size == 4 || Size == 5) && Constraint[0] == '{' && |
11745 | tolower(c: Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { |
11746 | int RegNo; |
11747 | bool Failed = Constraint.slice(Start: 2, End: Size - 1).getAsInteger(Radix: 10, Result&: RegNo); |
11748 | if (!Failed && RegNo >= 0 && RegNo <= 31) { |
11749 | // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. |
11750 | // By default we'll emit v0-v31 for this unless there's a modifier where |
11751 | // we'll emit the correct register as well. |
11752 | if (VT != MVT::Other && VT.getSizeInBits() == 64) { |
11753 | Res.first = AArch64::FPR64RegClass.getRegister(i: RegNo); |
11754 | Res.second = &AArch64::FPR64RegClass; |
11755 | } else { |
11756 | Res.first = AArch64::FPR128RegClass.getRegister(i: RegNo); |
11757 | Res.second = &AArch64::FPR128RegClass; |
11758 | } |
11759 | } |
11760 | } |
11761 | } |
11762 | |
11763 | if (Res.second && !Subtarget->hasFPARMv8() && |
11764 | !AArch64::GPR32allRegClass.hasSubClassEq(RC: Res.second) && |
11765 | !AArch64::GPR64allRegClass.hasSubClassEq(RC: Res.second)) |
11766 | return std::make_pair(x: 0U, y: nullptr); |
11767 | |
11768 | return Res; |
11769 | } |
11770 | |
11771 | EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL, |
11772 | llvm::Type *Ty, |
11773 | bool AllowUnknown) const { |
11774 | if (Subtarget->hasLS64() && Ty->isIntegerTy(Bitwidth: 512)) |
11775 | return EVT(MVT::i64x8); |
11776 | |
11777 | return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown); |
11778 | } |
11779 | |
11780 | /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops |
11781 | /// vector. If it is invalid, don't add anything to Ops. |
11782 | void AArch64TargetLowering::LowerAsmOperandForConstraint( |
11783 | SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, |
11784 | SelectionDAG &DAG) const { |
11785 | SDValue Result; |
11786 | |
11787 | // Currently only support length 1 constraints. |
11788 | if (Constraint.size() != 1) |
11789 | return; |
11790 | |
11791 | char ConstraintLetter = Constraint[0]; |
11792 | switch (ConstraintLetter) { |
11793 | default: |
11794 | break; |
11795 | |
11796 | // This set of constraints deal with valid constants for various instructions. |
11797 | // Validate and return a target constant for them if we can. |
11798 | case 'z': { |
11799 | // 'z' maps to xzr or wzr so it needs an input of 0. |
11800 | if (!isNullConstant(V: Op)) |
11801 | return; |
11802 | |
11803 | if (Op.getValueType() == MVT::i64) |
11804 | Result = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64); |
11805 | else |
11806 | Result = DAG.getRegister(Reg: AArch64::WZR, VT: MVT::i32); |
11807 | break; |
11808 | } |
11809 | case 'S': |
11810 | // Use the generic code path for "s". In GCC's aarch64 port, "S" is |
11811 | // supported for PIC while "s" isn't, making "s" less useful. We implement |
11812 | // "S" but not "s". |
11813 | TargetLowering::LowerAsmOperandForConstraint(Op, Constraint: "s" , Ops, DAG); |
11814 | break; |
11815 | |
11816 | case 'I': |
11817 | case 'J': |
11818 | case 'K': |
11819 | case 'L': |
11820 | case 'M': |
11821 | case 'N': |
11822 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val&: Op); |
11823 | if (!C) |
11824 | return; |
11825 | |
11826 | // Grab the value and do some validation. |
11827 | uint64_t CVal = C->getZExtValue(); |
11828 | switch (ConstraintLetter) { |
11829 | // The I constraint applies only to simple ADD or SUB immediate operands: |
11830 | // i.e. 0 to 4095 with optional shift by 12 |
11831 | // The J constraint applies only to ADD or SUB immediates that would be |
11832 | // valid when negated, i.e. if [an add pattern] were to be output as a SUB |
11833 | // instruction [or vice versa], in other words -1 to -4095 with optional |
11834 | // left shift by 12. |
11835 | case 'I': |
11836 | if (isUInt<12>(x: CVal) || isShiftedUInt<12, 12>(x: CVal)) |
11837 | break; |
11838 | return; |
11839 | case 'J': { |
11840 | uint64_t NVal = -C->getSExtValue(); |
11841 | if (isUInt<12>(x: NVal) || isShiftedUInt<12, 12>(x: NVal)) { |
11842 | CVal = C->getSExtValue(); |
11843 | break; |
11844 | } |
11845 | return; |
11846 | } |
11847 | // The K and L constraints apply *only* to logical immediates, including |
11848 | // what used to be the MOVI alias for ORR (though the MOVI alias has now |
11849 | // been removed and MOV should be used). So these constraints have to |
11850 | // distinguish between bit patterns that are valid 32-bit or 64-bit |
11851 | // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but |
11852 | // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice |
11853 | // versa. |
11854 | case 'K': |
11855 | if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32)) |
11856 | break; |
11857 | return; |
11858 | case 'L': |
11859 | if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64)) |
11860 | break; |
11861 | return; |
11862 | // The M and N constraints are a superset of K and L respectively, for use |
11863 | // with the MOV (immediate) alias. As well as the logical immediates they |
11864 | // also match 32 or 64-bit immediates that can be loaded either using a |
11865 | // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11866 | // (M) or 64-bit 0x1234000000000000 (N) etc. |
11867 | // As a note some of this code is liberally stolen from the asm parser. |
11868 | case 'M': { |
11869 | if (!isUInt<32>(x: CVal)) |
11870 | return; |
11871 | if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 32)) |
11872 | break; |
11873 | if ((CVal & 0xFFFF) == CVal) |
11874 | break; |
11875 | if ((CVal & 0xFFFF0000ULL) == CVal) |
11876 | break; |
11877 | uint64_t NCVal = ~(uint32_t)CVal; |
11878 | if ((NCVal & 0xFFFFULL) == NCVal) |
11879 | break; |
11880 | if ((NCVal & 0xFFFF0000ULL) == NCVal) |
11881 | break; |
11882 | return; |
11883 | } |
11884 | case 'N': { |
11885 | if (AArch64_AM::isLogicalImmediate(imm: CVal, regSize: 64)) |
11886 | break; |
11887 | if ((CVal & 0xFFFFULL) == CVal) |
11888 | break; |
11889 | if ((CVal & 0xFFFF0000ULL) == CVal) |
11890 | break; |
11891 | if ((CVal & 0xFFFF00000000ULL) == CVal) |
11892 | break; |
11893 | if ((CVal & 0xFFFF000000000000ULL) == CVal) |
11894 | break; |
11895 | uint64_t NCVal = ~CVal; |
11896 | if ((NCVal & 0xFFFFULL) == NCVal) |
11897 | break; |
11898 | if ((NCVal & 0xFFFF0000ULL) == NCVal) |
11899 | break; |
11900 | if ((NCVal & 0xFFFF00000000ULL) == NCVal) |
11901 | break; |
11902 | if ((NCVal & 0xFFFF000000000000ULL) == NCVal) |
11903 | break; |
11904 | return; |
11905 | } |
11906 | default: |
11907 | return; |
11908 | } |
11909 | |
11910 | // All assembler immediates are 64-bit integers. |
11911 | Result = DAG.getTargetConstant(Val: CVal, DL: SDLoc(Op), VT: MVT::i64); |
11912 | break; |
11913 | } |
11914 | |
11915 | if (Result.getNode()) { |
11916 | Ops.push_back(x: Result); |
11917 | return; |
11918 | } |
11919 | |
11920 | return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
11921 | } |
11922 | |
11923 | //===----------------------------------------------------------------------===// |
11924 | // AArch64 Advanced SIMD Support |
11925 | //===----------------------------------------------------------------------===// |
11926 | |
11927 | /// WidenVector - Given a value in the V64 register class, produce the |
11928 | /// equivalent value in the V128 register class. |
11929 | static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { |
11930 | EVT VT = V64Reg.getValueType(); |
11931 | unsigned NarrowSize = VT.getVectorNumElements(); |
11932 | MVT EltTy = VT.getVectorElementType().getSimpleVT(); |
11933 | MVT WideTy = MVT::getVectorVT(VT: EltTy, NumElements: 2 * NarrowSize); |
11934 | SDLoc DL(V64Reg); |
11935 | |
11936 | return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideTy, N1: DAG.getUNDEF(VT: WideTy), |
11937 | N2: V64Reg, N3: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
11938 | } |
11939 | |
11940 | /// getExtFactor - Determine the adjustment factor for the position when |
11941 | /// generating an "extract from vector registers" instruction. |
11942 | static unsigned getExtFactor(SDValue &V) { |
11943 | EVT EltType = V.getValueType().getVectorElementType(); |
11944 | return EltType.getSizeInBits() / 8; |
11945 | } |
11946 | |
11947 | // Check if a vector is built from one vector via extracted elements of |
11948 | // another together with an AND mask, ensuring that all elements fit |
11949 | // within range. This can be reconstructed using AND and NEON's TBL1. |
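| // For example (illustrative, v8i8 case): a BUILD_VECTOR whose element i is |
| //   extract_elt(Src, and(extract_elt(MaskVec, i), 7)) |
| // can be emitted as |
| //   tbl1(concat_vectors(Src, undef), and(MaskVec, splat(7))) |
| // since TBL1 performs the per-lane table lookup at run time. |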
11950 | static SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { |
11951 | assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!" ); |
11952 | SDLoc dl(Op); |
11953 | EVT VT = Op.getValueType(); |
11954 | assert(!VT.isScalableVector() && |
11955 | "Scalable vectors cannot be used with ISD::BUILD_VECTOR" ); |
11956 | |
11957 | // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map |
11958 | // directly to TBL1. |
11959 | if (VT != MVT::v16i8 && VT != MVT::v8i8) |
11960 | return SDValue(); |
11961 | |
11962 | unsigned NumElts = VT.getVectorNumElements(); |
11963 | assert((NumElts == 8 || NumElts == 16) && |
11964 | "Need to have exactly 8 or 16 elements in vector." ); |
11965 | |
11966 | SDValue SourceVec; |
11967 | SDValue MaskSourceVec; |
11968 | SmallVector<SDValue, 16> AndMaskConstants; |
11969 | |
11970 | for (unsigned i = 0; i < NumElts; ++i) { |
11971 | SDValue V = Op.getOperand(i); |
11972 | if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
11973 | return SDValue(); |
11974 | |
11975 | SDValue OperandSourceVec = V.getOperand(i: 0); |
11976 | if (!SourceVec) |
11977 | SourceVec = OperandSourceVec; |
11978 | else if (SourceVec != OperandSourceVec) |
11979 | return SDValue(); |
11980 | |
11981 | // This only looks at shuffles with elements that are |
11982 | // a) truncated by a constant AND mask extracted from a mask vector, or |
11983 | // b) extracted directly from a mask vector. |
11984 | SDValue MaskSource = V.getOperand(i: 1); |
11985 | if (MaskSource.getOpcode() == ISD::AND) { |
11986 | if (!isa<ConstantSDNode>(Val: MaskSource.getOperand(i: 1))) |
11987 | return SDValue(); |
11988 | |
11989 | AndMaskConstants.push_back(Elt: MaskSource.getOperand(i: 1)); |
11990 | MaskSource = MaskSource->getOperand(Num: 0); |
11991 | } else if (!AndMaskConstants.empty()) { |
11992 | // Either all or no operands should have an AND mask. |
11993 | return SDValue(); |
11994 | } |
11995 | |
11996 | // An ANY_EXTEND may be inserted between the AND and the source vector |
11997 | // extraction. We don't care about that, so we can just skip it. |
11998 | if (MaskSource.getOpcode() == ISD::ANY_EXTEND) |
11999 | MaskSource = MaskSource.getOperand(i: 0); |
12000 | |
12001 | if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
12002 | return SDValue(); |
12003 | |
12004 | SDValue MaskIdx = MaskSource.getOperand(i: 1); |
12005 | if (!isa<ConstantSDNode>(Val: MaskIdx) || |
12006 | !cast<ConstantSDNode>(Val&: MaskIdx)->getConstantIntValue()->equalsInt(V: i)) |
12007 | return SDValue(); |
12008 | |
12009 | // We only apply this if all elements come from the same vector with the |
12010 | // same vector type. |
12011 | if (!MaskSourceVec) { |
12012 | MaskSourceVec = MaskSource->getOperand(Num: 0); |
12013 | if (MaskSourceVec.getValueType() != VT) |
12014 | return SDValue(); |
12015 | } else if (MaskSourceVec != MaskSource->getOperand(Num: 0)) { |
12016 | return SDValue(); |
12017 | } |
12018 | } |
12019 | |
12020 | // We need a v16i8 for TBL, so we extend the source with a placeholder vector |
12021 | // for v8i8 to get a v16i8. As the pattern we are replacing is extract + |
12022 | // insert, we know that the index in the mask must be smaller than the number |
12023 | // of elements in the source, or we would have an out-of-bounds access. |
12024 | if (NumElts == 8) |
12025 | SourceVec = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: MVT::v16i8, N1: SourceVec, |
12026 | N2: DAG.getUNDEF(VT)); |
12027 | |
12028 | // Preconditions met, so we can use a vector (AND +) TBL to build this vector. |
12029 | if (!AndMaskConstants.empty()) |
12030 | MaskSourceVec = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: MaskSourceVec, |
12031 | N2: DAG.getBuildVector(VT, DL: dl, Ops: AndMaskConstants)); |
12032 | |
12033 | return DAG.getNode( |
12034 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT, |
12035 | N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL: dl, VT: MVT::i32), N2: SourceVec, |
12036 | N3: MaskSourceVec); |
12037 | } |
12038 | |
12039 | // Gather data to see if the operation can be modelled as a |
12040 | // shuffle in combination with VEXTs. |
12041 | SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, |
12042 | SelectionDAG &DAG) const { |
12043 | assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!" ); |
12044 | LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n" ); |
12045 | SDLoc dl(Op); |
12046 | EVT VT = Op.getValueType(); |
12047 | assert(!VT.isScalableVector() && |
12048 | "Scalable vectors cannot be used with ISD::BUILD_VECTOR" ); |
12049 | unsigned NumElts = VT.getVectorNumElements(); |
12050 | |
12051 | struct ShuffleSourceInfo { |
12052 | SDValue Vec; |
12053 | unsigned MinElt; |
12054 | unsigned MaxElt; |
12055 | |
12056 | // We may insert some combination of BITCASTs and VEXT nodes to force Vec to |
12057 | // be compatible with the shuffle we intend to construct. As a result |
12058 | // ShuffleVec will be some sliding window into the original Vec. |
12059 | SDValue ShuffleVec; |
12060 | |
12061 | // Code should guarantee that element i in Vec starts at element "WindowBase |
12062 | // + i * WindowScale" in ShuffleVec. |
12063 | int WindowBase; |
12064 | int WindowScale; |
12065 | |
12066 | ShuffleSourceInfo(SDValue Vec) |
12067 | : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0), |
12068 | ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} |
12069 | |
12070 | bool operator==(SDValue OtherVec) { return Vec == OtherVec; } |
12071 | }; |
12072 | |
12073 | // First gather all vectors used as an immediate source for this BUILD_VECTOR |
12074 | // node. |
12075 | SmallVector<ShuffleSourceInfo, 2> Sources; |
12076 | for (unsigned i = 0; i < NumElts; ++i) { |
12077 | SDValue V = Op.getOperand(i); |
12078 | if (V.isUndef()) |
12079 | continue; |
12080 | else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
12081 | !isa<ConstantSDNode>(Val: V.getOperand(i: 1)) || |
12082 | V.getOperand(i: 0).getValueType().isScalableVector()) { |
12083 | LLVM_DEBUG( |
12084 | dbgs() << "Reshuffle failed: " |
12085 | "a shuffle can only come from building a vector from " |
12086 | "various elements of other fixed-width vectors, provided " |
12087 | "their indices are constant\n" ); |
12088 | return SDValue(); |
12089 | } |
12090 | |
12091 | // Add this element source to the list if it's not already there. |
12092 | SDValue SourceVec = V.getOperand(i: 0); |
12093 | auto Source = find(Range&: Sources, Val: SourceVec); |
12094 | if (Source == Sources.end()) |
12095 | Source = Sources.insert(I: Sources.end(), Elt: ShuffleSourceInfo(SourceVec)); |
12096 | |
12097 | // Update the minimum and maximum lane number seen. |
12098 | unsigned EltNo = V.getConstantOperandVal(i: 1); |
12099 | Source->MinElt = std::min(a: Source->MinElt, b: EltNo); |
12100 | Source->MaxElt = std::max(a: Source->MaxElt, b: EltNo); |
12101 | } |
12102 | |
12103 | // If we have 3 or 4 sources, try to generate a TBL, which will at least be |
12104 | // better than moving to/from gpr registers for larger vectors. |
12105 | if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) { |
12106 | // Construct a mask for the tbl. We may need to adjust the index for types |
12107 | // larger than i8. |
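| // For example, with i16 elements the value taken from lane L of source S |
| // expands to byte indices 16 * S + 2 * L and 16 * S + 2 * L + 1 in the mask. |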
12108 | SmallVector<unsigned, 16> Mask; |
12109 | unsigned OutputFactor = VT.getScalarSizeInBits() / 8; |
12110 | for (unsigned I = 0; I < NumElts; ++I) { |
12111 | SDValue V = Op.getOperand(i: I); |
12112 | if (V.isUndef()) { |
12113 | for (unsigned OF = 0; OF < OutputFactor; OF++) |
12114 | Mask.push_back(Elt: -1); |
12115 | continue; |
12116 | } |
12117 | // Set the Mask lanes adjusted for the size of the input and output |
12118 | // lanes. The Mask is always i8, so it will set OutputFactor lanes per |
12119 | // output element, adjusted in their positions per input and output types. |
12120 | unsigned Lane = V.getConstantOperandVal(i: 1); |
12121 | for (unsigned S = 0; S < Sources.size(); S++) { |
12122 | if (V.getOperand(i: 0) == Sources[S].Vec) { |
12123 | unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits(); |
12124 | unsigned InputBase = 16 * S + Lane * InputSize / 8; |
12125 | for (unsigned OF = 0; OF < OutputFactor; OF++) |
12126 | Mask.push_back(Elt: InputBase + OF); |
12127 | break; |
12128 | } |
12129 | } |
12130 | } |
12131 | |
12132 | // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to |
12133 | // v16i8, and the TBLMask |
12134 | SmallVector<SDValue, 16> TBLOperands; |
12135 | TBLOperands.push_back(Elt: DAG.getConstant(Val: Sources.size() == 3 |
12136 | ? Intrinsic::aarch64_neon_tbl3 |
12137 | : Intrinsic::aarch64_neon_tbl4, |
12138 | DL: dl, VT: MVT::i32)); |
12139 | for (unsigned i = 0; i < Sources.size(); i++) { |
12140 | SDValue Src = Sources[i].Vec; |
12141 | EVT SrcVT = Src.getValueType(); |
12142 | Src = DAG.getBitcast(VT: SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, V: Src); |
12143 | assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) && |
12144 | "Expected a legally typed vector" ); |
12145 | if (SrcVT.is64BitVector()) |
12146 | Src = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: MVT::v16i8, N1: Src, |
12147 | N2: DAG.getUNDEF(VT: MVT::v8i8)); |
12148 | TBLOperands.push_back(Elt: Src); |
12149 | } |
12150 | |
12151 | SmallVector<SDValue, 16> TBLMask; |
12152 | for (unsigned i = 0; i < Mask.size(); i++) |
12153 | TBLMask.push_back(Elt: DAG.getConstant(Val: Mask[i], DL: dl, VT: MVT::i32)); |
12154 | assert((Mask.size() == 8 || Mask.size() == 16) && |
12155 | "Expected a v8i8 or v16i8 Mask" ); |
12156 | TBLOperands.push_back( |
12157 | Elt: DAG.getBuildVector(VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL: dl, Ops: TBLMask)); |
12158 | |
12159 | SDValue Shuffle = |
12160 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, |
12161 | VT: Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, Ops: TBLOperands); |
12162 | return DAG.getBitcast(VT, V: Shuffle); |
12163 | } |
12164 | |
12165 | if (Sources.size() > 2) { |
12166 | LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something " |
12167 | << "sensible when at most two source vectors are " |
12168 | << "involved\n" ); |
12169 | return SDValue(); |
12170 | } |
12171 | |
12172 | // Find out the smallest element size among result and two sources, and use |
12173 | // it as element size to build the shuffle_vector. |
12174 | EVT SmallestEltTy = VT.getVectorElementType(); |
12175 | for (auto &Source : Sources) { |
12176 | EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); |
12177 | if (SrcEltTy.bitsLT(VT: SmallestEltTy)) { |
12178 | SmallestEltTy = SrcEltTy; |
12179 | } |
12180 | } |
12181 | unsigned ResMultiplier = |
12182 | VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits(); |
12183 | uint64_t VTSize = VT.getFixedSizeInBits(); |
12184 | NumElts = VTSize / SmallestEltTy.getFixedSizeInBits(); |
12185 | EVT ShuffleVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SmallestEltTy, NumElements: NumElts); |
12186 | |
12187 | // If the source vector is too wide or too narrow, we may nevertheless be able |
12188 | // to construct a compatible shuffle either by concatenating it with UNDEF or |
12189 | // extracting a suitable range of elements. |
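| // For example, a v2i32 source feeding a v4i32 result is padded with undef to |
| // v4i32, while a v8i32 source has a v4i32 half extracted, or an EXT emitted |
| // when the referenced lanes straddle both halves. |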
12190 | for (auto &Src : Sources) { |
12191 | EVT SrcVT = Src.ShuffleVec.getValueType(); |
12192 | |
12193 | TypeSize SrcVTSize = SrcVT.getSizeInBits(); |
12194 | if (SrcVTSize == TypeSize::getFixed(ExactSize: VTSize)) |
12195 | continue; |
12196 | |
12197 | // This stage of the search produces a source with the same element type as |
12198 | // the original, but with a total width matching the BUILD_VECTOR output. |
12199 | EVT EltVT = SrcVT.getVectorElementType(); |
12200 | unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); |
12201 | EVT DestVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: NumSrcElts); |
12202 | |
12203 | if (SrcVTSize.getFixedValue() < VTSize) { |
12204 | assert(2 * SrcVTSize == VTSize); |
12205 | // We can pad out the smaller vector for free, so if it's part of a |
12206 | // shuffle... |
12207 | Src.ShuffleVec = |
12208 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: DestVT, N1: Src.ShuffleVec, |
12209 | N2: DAG.getUNDEF(VT: Src.ShuffleVec.getValueType())); |
12210 | continue; |
12211 | } |
12212 | |
12213 | if (SrcVTSize.getFixedValue() != 2 * VTSize) { |
12214 | LLVM_DEBUG( |
12215 | dbgs() << "Reshuffle failed: result vector too small to extract\n" ); |
12216 | return SDValue(); |
12217 | } |
12218 | |
12219 | if (Src.MaxElt - Src.MinElt >= NumSrcElts) { |
12220 | LLVM_DEBUG( |
12221 | dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n" ); |
12222 | return SDValue(); |
12223 | } |
12224 | |
12225 | if (Src.MinElt >= NumSrcElts) { |
12226 | // The extraction can just take the second half |
12227 | Src.ShuffleVec = |
12228 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec, |
12229 | N2: DAG.getConstant(Val: NumSrcElts, DL: dl, VT: MVT::i64)); |
12230 | Src.WindowBase = -NumSrcElts; |
12231 | } else if (Src.MaxElt < NumSrcElts) { |
12232 | // The extraction can just take the first half |
12233 | Src.ShuffleVec = |
12234 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec, |
12235 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
12236 | } else { |
12237 | // An actual VEXT is needed |
12238 | SDValue VEXTSrc1 = |
12239 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec, |
12240 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
12241 | SDValue VEXTSrc2 = |
12242 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: DestVT, N1: Src.ShuffleVec, |
12243 | N2: DAG.getConstant(Val: NumSrcElts, DL: dl, VT: MVT::i64)); |
12244 | unsigned Imm = Src.MinElt * getExtFactor(V&: VEXTSrc1); |
12245 | |
12246 | if (!SrcVT.is64BitVector()) { |
12247 | LLVM_DEBUG( |
12248 | dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT " |
12249 | "for SVE vectors." ); |
12250 | return SDValue(); |
12251 | } |
12252 | |
12253 | Src.ShuffleVec = DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: DestVT, N1: VEXTSrc1, |
12254 | N2: VEXTSrc2, |
12255 | N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
12256 | Src.WindowBase = -Src.MinElt; |
12257 | } |
12258 | } |
12259 | |
12260 | // Another possible incompatibility occurs from the vector element types. We |
12261 | // can fix this by bitcasting the source vectors to the same type we intend |
12262 | // for the shuffle. |
12263 | for (auto &Src : Sources) { |
12264 | EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); |
12265 | if (SrcEltTy == SmallestEltTy) |
12266 | continue; |
12267 | assert(ShuffleVT.getVectorElementType() == SmallestEltTy); |
12268 | if (DAG.getDataLayout().isBigEndian()) { |
12269 | Src.ShuffleVec = |
12270 | DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec); |
12271 | } else { |
12272 | Src.ShuffleVec = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ShuffleVT, Operand: Src.ShuffleVec); |
12273 | } |
12274 | Src.WindowScale = |
12275 | SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits(); |
12276 | Src.WindowBase *= Src.WindowScale; |
12277 | } |
12278 | |
12279 | // Final check before we try to actually produce a shuffle. |
12280 | LLVM_DEBUG(for (auto Src |
12281 | : Sources) |
12282 | assert(Src.ShuffleVec.getValueType() == ShuffleVT);); |
12283 | |
12284 | // The stars all align, our next step is to produce the mask for the shuffle. |
12285 | SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); |
12286 | int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); |
12287 | for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { |
12288 | SDValue Entry = Op.getOperand(i); |
12289 | if (Entry.isUndef()) |
12290 | continue; |
12291 | |
12292 | auto Src = find(Range&: Sources, Val: Entry.getOperand(i: 0)); |
12293 | int EltNo = cast<ConstantSDNode>(Val: Entry.getOperand(i: 1))->getSExtValue(); |
12294 | |
12295 | // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit |
12296 | // trunc. So only std::min(SrcBits, DestBits) actually get defined in this |
12297 | // segment. |
12298 | EVT OrigEltTy = Entry.getOperand(i: 0).getValueType().getVectorElementType(); |
12299 | int BitsDefined = std::min(a: OrigEltTy.getScalarSizeInBits(), |
12300 | b: VT.getScalarSizeInBits()); |
12301 | int LanesDefined = BitsDefined / BitsPerShuffleLane; |
12302 | |
12303 | // This source is expected to fill ResMultiplier lanes of the final shuffle, |
12304 | // starting at the appropriate offset. |
12305 | int *LaneMask = &Mask[i * ResMultiplier]; |
12306 | |
12307 | int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; |
12308 | ExtractBase += NumElts * (Src - Sources.begin()); |
12309 | for (int j = 0; j < LanesDefined; ++j) |
12310 | LaneMask[j] = ExtractBase + j; |
12311 | } |
12312 | |
12313 | // Final check before we try to produce nonsense... |
12314 | if (!isShuffleMaskLegal(M: Mask, VT: ShuffleVT)) { |
12315 | LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n" ); |
12316 | return SDValue(); |
12317 | } |
12318 | |
12319 | SDValue ShuffleOps[] = { DAG.getUNDEF(VT: ShuffleVT), DAG.getUNDEF(VT: ShuffleVT) }; |
12320 | for (unsigned i = 0; i < Sources.size(); ++i) |
12321 | ShuffleOps[i] = Sources[i].ShuffleVec; |
12322 | |
12323 | SDValue Shuffle = DAG.getVectorShuffle(VT: ShuffleVT, dl, N1: ShuffleOps[0], |
12324 | N2: ShuffleOps[1], Mask); |
12325 | SDValue V; |
12326 | if (DAG.getDataLayout().isBigEndian()) { |
12327 | V = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Shuffle); |
12328 | } else { |
12329 | V = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Shuffle); |
12330 | } |
12331 | |
12332 | LLVM_DEBUG(dbgs() << "Reshuffle, creating node: " ; Shuffle.dump(); |
12333 | dbgs() << "Reshuffle, creating node: " ; V.dump();); |
12334 | |
12335 | return V; |
12336 | } |
12337 | |
12338 | // Check if an EXT instruction can handle the shuffle mask when the |
12339 | // vector sources of the shuffle are the same. |
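| // For example, for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is accepted with |
| // Imm == 3: the indices are consecutive and wrap around the single source. |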
12340 | static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { |
12341 | unsigned NumElts = VT.getVectorNumElements(); |
12342 | |
12343 | // Assume that the first shuffle index is not UNDEF. Fail if it is. |
12344 | if (M[0] < 0) |
12345 | return false; |
12346 | |
12347 | Imm = M[0]; |
12348 | |
12349 | // If this is a VEXT shuffle, the immediate value is the index of the first |
12350 | // element. The other shuffle indices must be the successive elements after |
12351 | // the first one. |
12352 | unsigned ExpectedElt = Imm; |
12353 | for (unsigned i = 1; i < NumElts; ++i) { |
12354 | // Increment the expected index. If it wraps around, just follow it |
12355 | // back to index zero and keep going. |
12356 | ++ExpectedElt; |
12357 | if (ExpectedElt == NumElts) |
12358 | ExpectedElt = 0; |
12359 | |
12360 | if (M[i] < 0) |
12361 | continue; // ignore UNDEF indices |
12362 | if (ExpectedElt != static_cast<unsigned>(M[i])) |
12363 | return false; |
12364 | } |
12365 | |
12366 | return true; |
12367 | } |
12368 | |
12369 | // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from |
12370 | // v4i32s. This is really a truncate, which we can construct out of (legal) |
12371 | // concats and truncate nodes. |
12372 | static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) { |
12373 | if (V.getValueType() != MVT::v16i8) |
12374 | return SDValue(); |
12375 | assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR" ); |
12376 | |
12377 | for (unsigned X = 0; X < 4; X++) { |
12378 | // Check the first item in each group is an extract from lane 0 of a v4i32 |
12379 | // or v4i16. |
12380 | SDValue BaseExt = V.getOperand(i: X * 4); |
12381 | if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
12382 | (BaseExt.getOperand(i: 0).getValueType() != MVT::v4i16 && |
12383 | BaseExt.getOperand(i: 0).getValueType() != MVT::v4i32) || |
12384 | !isa<ConstantSDNode>(Val: BaseExt.getOperand(i: 1)) || |
12385 | BaseExt.getConstantOperandVal(i: 1) != 0) |
12386 | return SDValue(); |
12387 | SDValue Base = BaseExt.getOperand(i: 0); |
12388 | // And check the other items are extracts from the same vector. |
12389 | for (unsigned Y = 1; Y < 4; Y++) { |
12390 | SDValue Ext = V.getOperand(i: X * 4 + Y); |
12391 | if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
12392 | Ext.getOperand(i: 0) != Base || |
12393 | !isa<ConstantSDNode>(Val: Ext.getOperand(i: 1)) || |
12394 | Ext.getConstantOperandVal(i: 1) != Y) |
12395 | return SDValue(); |
12396 | } |
12397 | } |
12398 | |
12399 | // Turn the buildvector into a series of truncates and concats, which will |
12400 | // become uzp1s. Any v4i32s we found get truncated to v4i16, which are |
12401 | // concat together to produce 2 v8i16. These are both truncated and concat |
12402 | // together. |
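| // For example, with v4i32 inputs a, b, c and d this builds |
| //   concat(trunc(concat(trunc(a), trunc(b))), trunc(concat(trunc(c), trunc(d)))) |
| // where the inner truncs are v4i32 -> v4i16 and the outer ones v8i16 -> v8i8. |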
12403 | SDLoc DL(V); |
12404 | SDValue Trunc[4] = { |
12405 | V.getOperand(i: 0).getOperand(i: 0), V.getOperand(i: 4).getOperand(i: 0), |
12406 | V.getOperand(i: 8).getOperand(i: 0), V.getOperand(i: 12).getOperand(i: 0)}; |
12407 | for (SDValue &V : Trunc) |
12408 | if (V.getValueType() == MVT::v4i32) |
12409 | V = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v4i16, Operand: V); |
12410 | SDValue Concat0 = |
12411 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[0], N2: Trunc[1]); |
12412 | SDValue Concat1 = |
12413 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v8i16, N1: Trunc[2], N2: Trunc[3]); |
12414 | SDValue Trunc0 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat0); |
12415 | SDValue Trunc1 = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::v8i8, Operand: Concat1); |
12416 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: Trunc0, N2: Trunc1); |
12417 | } |
12418 | |
12419 | /// Check if a vector shuffle corresponds to a DUP instruction with a larger |
12420 | /// element width than the vector lane type. If that is the case the function |
12421 | /// returns true and writes the value of the DUP instruction lane operand into |
12422 | /// DupLaneOp. |
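| // For example, for v8i16 and BlockSize == 64 the mask |
| // <4, 5, 6, 7, 4, 5, 6, 7> duplicates the second 64-bit block, so the |
| // function returns true with DupLaneOp == 1. |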
12423 | static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize, |
12424 | unsigned &DupLaneOp) { |
12425 | assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && |
12426 | "Only possible block sizes for wide DUP are: 16, 32, 64" ); |
12427 | |
12428 | if (BlockSize <= VT.getScalarSizeInBits()) |
12429 | return false; |
12430 | if (BlockSize % VT.getScalarSizeInBits() != 0) |
12431 | return false; |
12432 | if (VT.getSizeInBits() % BlockSize != 0) |
12433 | return false; |
12434 | |
12435 | size_t SingleVecNumElements = VT.getVectorNumElements(); |
12436 | size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits(); |
12437 | size_t NumBlocks = VT.getSizeInBits() / BlockSize; |
12438 | |
12439 | // We are looking for masks like |
12440 | // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element |
12441 | // might be replaced by 'undefined'. BlockElts will eventually contain |
12442 | // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7] |
12443 | // for the above examples) |
12444 | SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1); |
12445 | for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++) |
12446 | for (size_t I = 0; I < NumEltsPerBlock; I++) { |
12447 | int Elt = M[BlockIndex * NumEltsPerBlock + I]; |
12448 | if (Elt < 0) |
12449 | continue; |
12450 | // For now we don't support shuffles that use the second operand |
12451 | if ((unsigned)Elt >= SingleVecNumElements) |
12452 | return false; |
12453 | if (BlockElts[I] < 0) |
12454 | BlockElts[I] = Elt; |
12455 | else if (BlockElts[I] != Elt) |
12456 | return false; |
12457 | } |
12458 | |
12459 | // We found a candidate block (possibly with some undefs). It must be a |
12460 | // sequence of consecutive integers starting with a value divisible by |
12461 | // NumEltsPerBlock with some values possibly replaced by undef-s. |
12462 | |
12463 | // Find first non-undef element |
12464 | auto FirstRealEltIter = find_if(Range&: BlockElts, P: [](int Elt) { return Elt >= 0; }); |
12465 | assert(FirstRealEltIter != BlockElts.end() && |
12466 | "Shuffle with all-undefs must have been caught by previous cases, " |
12467 | "e.g. isSplat()" ); |
12468 | if (FirstRealEltIter == BlockElts.end()) { |
12469 | DupLaneOp = 0; |
12470 | return true; |
12471 | } |
12472 | |
12473 | // Index of FirstRealElt in BlockElts |
12474 | size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin(); |
12475 | |
12476 | if ((unsigned)*FirstRealEltIter < FirstRealIndex) |
12477 | return false; |
12478 | // BlockElts[0] must have the following value if it isn't undef: |
12479 | size_t Elt0 = *FirstRealEltIter - FirstRealIndex; |
12480 | |
12481 | // Check the first element |
12482 | if (Elt0 % NumEltsPerBlock != 0) |
12483 | return false; |
12484 | // Check that the sequence indeed consists of consecutive integers (modulo |
12485 | // undefs) |
12486 | for (size_t I = 0; I < NumEltsPerBlock; I++) |
12487 | if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I) |
12488 | return false; |
12489 | |
12490 | DupLaneOp = Elt0 / NumEltsPerBlock; |
12491 | return true; |
12492 | } |
12493 | |
12494 | // Check if an EXT instruction can handle the shuffle mask when the |
12495 | // vector sources of the shuffle are different. |
12496 | static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, |
12497 | unsigned &Imm) { |
12498 | // Look for the first non-undef element. |
12499 | const int *FirstRealElt = find_if(Range&: M, P: [](int Elt) { return Elt >= 0; }); |
12500 | |
12501 | // Benefit from APInt to handle overflow when calculating the expected element. |
12502 | unsigned NumElts = VT.getVectorNumElements(); |
12503 | unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); |
12504 | APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); |
12505 | // The following shuffle indices must be the successive elements after the |
12506 | // first real element. |
12507 | bool FoundWrongElt = std::any_of(first: FirstRealElt + 1, last: M.end(), pred: [&](int Elt) { |
12508 | return Elt != ExpectedElt++ && Elt != -1; |
12509 | }); |
12510 | if (FoundWrongElt) |
12511 | return false; |
12512 | |
12513 | // The index of an EXT is the first element if it is not UNDEF. |
12514 | // Watch out for the beginning UNDEFs. The EXT index should be the expected |
12515 | // value of the first element. E.g. |
12516 | // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. |
12517 | // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. |
12518 | // ExpectedElt is the last mask index plus 1. |
12519 | Imm = ExpectedElt.getZExtValue(); |
12520 | |
12521 | // There are two different cases that require reversing the input vectors. |
12522 | // For example, for vector <4 x i32> we have the following cases, |
12523 | // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) |
12524 | // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) |
12525 | // For both cases, we finally use mask <5, 6, 7, 0>, which requires |
12526 | // to reverse two input vectors. |
12527 | if (Imm < NumElts) |
12528 | ReverseEXT = true; |
12529 | else |
12530 | Imm -= NumElts; |
12531 | |
12532 | return true; |
12533 | } |
12534 | |
12535 | /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of |
12536 | /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
12537 | /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. |
12538 | static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
12539 | unsigned NumElts = VT.getVectorNumElements(); |
12540 | if (NumElts % 2 != 0) |
12541 | return false; |
12542 | WhichResult = (M[0] == 0 ? 0 : 1); |
12543 | unsigned Idx = WhichResult * NumElts / 2; |
12544 | for (unsigned i = 0; i != NumElts; i += 2) { |
12545 | if ((M[i] >= 0 && (unsigned)M[i] != Idx) || |
12546 | (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) |
12547 | return false; |
12548 | Idx += 1; |
12549 | } |
12550 | |
12551 | return true; |
12552 | } |
12553 | |
12554 | /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of |
12555 | /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
12556 | /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, |
12557 | static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
12558 | unsigned Half = VT.getVectorNumElements() / 2; |
12559 | WhichResult = (M[0] == 0 ? 0 : 1); |
12560 | for (unsigned j = 0; j != 2; ++j) { |
12561 | unsigned Idx = WhichResult; |
12562 | for (unsigned i = 0; i != Half; ++i) { |
12563 | int MIdx = M[i + j * Half]; |
12564 | if (MIdx >= 0 && (unsigned)MIdx != Idx) |
12565 | return false; |
12566 | Idx += 2; |
12567 | } |
12568 | } |
12569 | |
12570 | return true; |
12571 | } |
12572 | |
12573 | /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of |
12574 | /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". |
12575 | /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. |
12576 | static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { |
12577 | unsigned NumElts = VT.getVectorNumElements(); |
12578 | if (NumElts % 2 != 0) |
12579 | return false; |
12580 | WhichResult = (M[0] == 0 ? 0 : 1); |
12581 | for (unsigned i = 0; i < NumElts; i += 2) { |
12582 | if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || |
12583 | (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) |
12584 | return false; |
12585 | } |
12586 | return true; |
12587 | } |
12588 | |
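| // Check whether the mask is an INS-style mask: all lanes except one are an |
| // identity copy of one of the inputs and the single mismatching lane |
| // (Anomaly) may come from anywhere. For example, with four elements the mask |
| // <0, 1, 6, 3> sets DstIsLeft = true and Anomaly = 2. |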
12589 | static bool isINSMask(ArrayRef<int> M, int NumInputElements, |
12590 | bool &DstIsLeft, int &Anomaly) { |
12591 | if (M.size() != static_cast<size_t>(NumInputElements)) |
12592 | return false; |
12593 | |
12594 | int NumLHSMatch = 0, NumRHSMatch = 0; |
12595 | int LastLHSMismatch = -1, LastRHSMismatch = -1; |
12596 | |
12597 | for (int i = 0; i < NumInputElements; ++i) { |
12598 | if (M[i] == -1) { |
12599 | ++NumLHSMatch; |
12600 | ++NumRHSMatch; |
12601 | continue; |
12602 | } |
12603 | |
12604 | if (M[i] == i) |
12605 | ++NumLHSMatch; |
12606 | else |
12607 | LastLHSMismatch = i; |
12608 | |
12609 | if (M[i] == i + NumInputElements) |
12610 | ++NumRHSMatch; |
12611 | else |
12612 | LastRHSMismatch = i; |
12613 | } |
12614 | |
12615 | if (NumLHSMatch == NumInputElements - 1) { |
12616 | DstIsLeft = true; |
12617 | Anomaly = LastLHSMismatch; |
12618 | return true; |
12619 | } else if (NumRHSMatch == NumInputElements - 1) { |
12620 | DstIsLeft = false; |
12621 | Anomaly = LastRHSMismatch; |
12622 | return true; |
12623 | } |
12624 | |
12625 | return false; |
12626 | } |
12627 | |
12628 | static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { |
12629 | if (VT.getSizeInBits() != 128) |
12630 | return false; |
12631 | |
12632 | unsigned NumElts = VT.getVectorNumElements(); |
12633 | |
12634 | for (int I = 0, E = NumElts / 2; I != E; I++) { |
12635 | if (Mask[I] != I) |
12636 | return false; |
12637 | } |
12638 | |
12639 | int Offset = NumElts / 2; |
12640 | for (int I = NumElts / 2, E = NumElts; I != E; I++) { |
12641 | if (Mask[I] != I + SplitLHS * Offset) |
12642 | return false; |
12643 | } |
12644 | |
12645 | return true; |
12646 | } |
12647 | |
12648 | static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { |
12649 | SDLoc DL(Op); |
12650 | EVT VT = Op.getValueType(); |
12651 | SDValue V0 = Op.getOperand(i: 0); |
12652 | SDValue V1 = Op.getOperand(i: 1); |
12653 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask(); |
12654 | |
12655 | if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || |
12656 | VT.getVectorElementType() != V1.getValueType().getVectorElementType()) |
12657 | return SDValue(); |
12658 | |
12659 | bool SplitV0 = V0.getValueSizeInBits() == 128; |
12660 | |
12661 | if (!isConcatMask(Mask, VT, SplitLHS: SplitV0)) |
12662 | return SDValue(); |
12663 | |
12664 | EVT CastVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
12665 | if (SplitV0) { |
12666 | V0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V0, |
12667 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
12668 | } |
12669 | if (V1.getValueSizeInBits() == 128) { |
12670 | V1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: CastVT, N1: V1, |
12671 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
12672 | } |
12673 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: V0, N2: V1); |
12674 | } |
12675 | |
12676 | /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit |
12677 | /// the specified operations to build the shuffle. ID is the perfect-shuffle |
12678 | /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the |
12679 | /// perfect-shuffle table entry and LHS/RHS are the immediate inputs for this |
12680 | /// stage of the shuffle. |
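| // Each PFEntry packs an opcode in bits [29:26], an LHS ID in bits [25:13] and |
| // an RHS ID in bits [12:0]. An ID encodes the four result lanes in base 9 |
| // (8 meaning undef), so ((0*9+1)*9+2)*9+3 is the identity <0,1,2,3> and |
| // ((4*9+5)*9+6)*9+7 is a plain copy of the RHS, i.e. <4,5,6,7>. |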
12681 | static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, |
12682 | SDValue V2, unsigned PFEntry, SDValue LHS, |
12683 | SDValue RHS, SelectionDAG &DAG, |
12684 | const SDLoc &dl) { |
12685 | unsigned OpNum = (PFEntry >> 26) & 0x0F; |
12686 | unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); |
12687 | unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); |
12688 | |
12689 | enum { |
12690 | OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> |
12691 | OP_VREV, |
12692 | OP_VDUP0, |
12693 | OP_VDUP1, |
12694 | OP_VDUP2, |
12695 | OP_VDUP3, |
12696 | OP_VEXT1, |
12697 | OP_VEXT2, |
12698 | OP_VEXT3, |
12699 | OP_VUZPL, // VUZP, left result |
12700 | OP_VUZPR, // VUZP, right result |
12701 | OP_VZIPL, // VZIP, left result |
12702 | OP_VZIPR, // VZIP, right result |
12703 | OP_VTRNL, // VTRN, left result |
12704 | OP_VTRNR, // VTRN, right result |
12705 | OP_MOVLANE // Move lane. RHSID is the lane to move into |
12706 | }; |
12707 | |
12708 | if (OpNum == OP_COPY) { |
12709 | if (LHSID == (1 * 9 + 2) * 9 + 3) |
12710 | return LHS; |
12711 | assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!" ); |
12712 | return RHS; |
12713 | } |
12714 | |
12715 | if (OpNum == OP_MOVLANE) { |
12716 | // Decompose a PerfectShuffle ID to get the Mask for lane Elt |
12717 | auto getPFIDLane = [](unsigned ID, int Elt) -> int { |
12718 | assert(Elt < 4 && "Expected Perfect Lanes to be less than 4" ); |
12719 | Elt = 3 - Elt; |
12720 | while (Elt > 0) { |
12721 | ID /= 9; |
12722 | Elt--; |
12723 | } |
12724 | return (ID % 9 == 8) ? -1 : ID % 9; |
12725 | }; |
12726 | |
12727 | // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We |
12728 | // get the lane to move from the PFID, which is always from the |
12729 | // original vectors (V1 or V2). |
12730 | SDValue OpLHS = GeneratePerfectShuffle( |
12731 | ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); |
12732 | EVT VT = OpLHS.getValueType(); |
12733 | assert(RHSID < 8 && "Expected a lane index for RHSID!" ); |
12734 | unsigned ExtLane = 0; |
12735 | SDValue Input; |
12736 | |
12737 | // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs |
12738 | // convert into a higher type. |
12739 | if (RHSID & 0x4) { |
12740 | int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1; |
12741 | if (MaskElt == -1) |
12742 | MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1; |
12743 | assert(MaskElt >= 0 && "Didn't expect an undef movlane index!" ); |
12744 | ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2); |
12745 | Input = MaskElt < 2 ? V1 : V2; |
12746 | if (VT.getScalarSizeInBits() == 16) { |
12747 | Input = DAG.getBitcast(VT: MVT::v2f32, V: Input); |
12748 | OpLHS = DAG.getBitcast(VT: MVT::v2f32, V: OpLHS); |
12749 | } else { |
12750 | assert(VT.getScalarSizeInBits() == 32 && |
12751 | "Expected 16 or 32 bit shuffle elements" ); |
12752 | Input = DAG.getBitcast(VT: MVT::v2f64, V: Input); |
12753 | OpLHS = DAG.getBitcast(VT: MVT::v2f64, V: OpLHS); |
12754 | } |
12755 | } else { |
12756 | int MaskElt = getPFIDLane(ID, RHSID); |
12757 | assert(MaskElt >= 0 && "Didn't expect an undef movlane index!" ); |
12758 | ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4); |
12759 | Input = MaskElt < 4 ? V1 : V2; |
12760 | // Be careful about creating illegal types. Use f16 instead of i16. |
12761 | if (VT == MVT::v4i16) { |
12762 | Input = DAG.getBitcast(VT: MVT::v4f16, V: Input); |
12763 | OpLHS = DAG.getBitcast(VT: MVT::v4f16, V: OpLHS); |
12764 | } |
12765 | } |
12766 | SDValue Ext = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, |
12767 | VT: Input.getValueType().getVectorElementType(), |
12768 | N1: Input, N2: DAG.getVectorIdxConstant(Val: ExtLane, DL: dl)); |
12769 | SDValue Ins = |
12770 | DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT: Input.getValueType(), N1: OpLHS, |
12771 | N2: Ext, N3: DAG.getVectorIdxConstant(Val: RHSID & 0x3, DL: dl)); |
12772 | return DAG.getBitcast(VT, V: Ins); |
12773 | } |
12774 | |
12775 | SDValue OpLHS, OpRHS; |
12776 | OpLHS = GeneratePerfectShuffle(ID: LHSID, V1, V2, PFEntry: PerfectShuffleTable[LHSID], LHS, |
12777 | RHS, DAG, dl); |
12778 | OpRHS = GeneratePerfectShuffle(ID: RHSID, V1, V2, PFEntry: PerfectShuffleTable[RHSID], LHS, |
12779 | RHS, DAG, dl); |
12780 | EVT VT = OpLHS.getValueType(); |
12781 | |
12782 | switch (OpNum) { |
12783 | default: |
12784 | llvm_unreachable("Unknown shuffle opcode!" ); |
12785 | case OP_VREV: |
12786 | // VREV divides the vector in half and swaps within the half. |
12787 | if (VT.getVectorElementType() == MVT::i32 || |
12788 | VT.getVectorElementType() == MVT::f32) |
12789 | return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: OpLHS); |
12790 | // vrev <4 x i16> -> REV32 |
12791 | if (VT.getVectorElementType() == MVT::i16 || |
12792 | VT.getVectorElementType() == MVT::f16 || |
12793 | VT.getVectorElementType() == MVT::bf16) |
12794 | return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT, Operand: OpLHS); |
12795 | // vrev <4 x i8> -> REV16 |
12796 | assert(VT.getVectorElementType() == MVT::i8); |
12797 | return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT, Operand: OpLHS); |
12798 | case OP_VDUP0: |
12799 | case OP_VDUP1: |
12800 | case OP_VDUP2: |
12801 | case OP_VDUP3: { |
12802 | EVT EltTy = VT.getVectorElementType(); |
12803 | unsigned Opcode; |
12804 | if (EltTy == MVT::i8) |
12805 | Opcode = AArch64ISD::DUPLANE8; |
12806 | else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16) |
12807 | Opcode = AArch64ISD::DUPLANE16; |
12808 | else if (EltTy == MVT::i32 || EltTy == MVT::f32) |
12809 | Opcode = AArch64ISD::DUPLANE32; |
12810 | else if (EltTy == MVT::i64 || EltTy == MVT::f64) |
12811 | Opcode = AArch64ISD::DUPLANE64; |
12812 | else |
12813 | llvm_unreachable("Invalid vector element type?" ); |
12814 | |
12815 | if (VT.getSizeInBits() == 64) |
12816 | OpLHS = WidenVector(V64Reg: OpLHS, DAG); |
12817 | SDValue Lane = DAG.getConstant(Val: OpNum - OP_VDUP0, DL: dl, VT: MVT::i64); |
12818 | return DAG.getNode(Opcode, DL: dl, VT, N1: OpLHS, N2: Lane); |
12819 | } |
12820 | case OP_VEXT1: |
12821 | case OP_VEXT2: |
12822 | case OP_VEXT3: { |
12823 | unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(V&: OpLHS); |
12824 | return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT, N1: OpLHS, N2: OpRHS, |
12825 | N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
12826 | } |
12827 | case OP_VUZPL: |
12828 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: OpLHS, N2: OpRHS); |
12829 | case OP_VUZPR: |
12830 | return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT, N1: OpLHS, N2: OpRHS); |
12831 | case OP_VZIPL: |
12832 | return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: OpLHS, N2: OpRHS); |
12833 | case OP_VZIPR: |
12834 | return DAG.getNode(Opcode: AArch64ISD::ZIP2, DL: dl, VT, N1: OpLHS, N2: OpRHS); |
12835 | case OP_VTRNL: |
12836 | return DAG.getNode(Opcode: AArch64ISD::TRN1, DL: dl, VT, N1: OpLHS, N2: OpRHS); |
12837 | case OP_VTRNR: |
12838 | return DAG.getNode(Opcode: AArch64ISD::TRN2, DL: dl, VT, N1: OpLHS, N2: OpRHS); |
12839 | } |
12840 | } |
12841 | |
12842 | static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, |
12843 | SelectionDAG &DAG) { |
12844 | // Check to see if we can use the TBL instruction. |
12845 | SDValue V1 = Op.getOperand(i: 0); |
12846 | SDValue V2 = Op.getOperand(i: 1); |
12847 | SDLoc DL(Op); |
12848 | |
12849 | EVT EltVT = Op.getValueType().getVectorElementType(); |
12850 | unsigned BytesPerElt = EltVT.getSizeInBits() / 8; |
12851 | |
12852 | bool Swap = false; |
12853 | if (V1.isUndef() || isZerosVector(N: V1.getNode())) { |
12854 | std::swap(a&: V1, b&: V2); |
12855 | Swap = true; |
12856 | } |
12857 | |
12858 | // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill |
12859 | // out of range values with 0s. We do need to make sure that any out-of-range |
12860 | // values are really out-of-range for a v16i8 vector. |
12861 | bool IsUndefOrZero = V2.isUndef() || isZerosVector(N: V2.getNode()); |
12862 | MVT IndexVT = MVT::v8i8; |
12863 | unsigned IndexLen = 8; |
12864 | if (Op.getValueSizeInBits() == 128) { |
12865 | IndexVT = MVT::v16i8; |
12866 | IndexLen = 16; |
12867 | } |
12868 | |
12869 | SmallVector<SDValue, 8> TBLMask; |
12870 | for (int Val : ShuffleMask) { |
12871 | for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { |
12872 | unsigned Offset = Byte + Val * BytesPerElt; |
12873 | if (Swap) |
12874 | Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen; |
12875 | if (IsUndefOrZero && Offset >= IndexLen) |
12876 | Offset = 255; |
12877 | TBLMask.push_back(Elt: DAG.getConstant(Val: Offset, DL, VT: MVT::i32)); |
12878 | } |
12879 | } |
12880 | |
12881 | SDValue V1Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V1); |
12882 | SDValue V2Cst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IndexVT, Operand: V2); |
12883 | |
12884 | SDValue Shuffle; |
12885 | if (IsUndefOrZero) { |
12886 | if (IndexLen == 8) |
12887 | V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V1Cst); |
12888 | Shuffle = DAG.getNode( |
12889 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, |
12890 | N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst, |
12891 | N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen))); |
12892 | } else { |
12893 | if (IndexLen == 8) { |
12894 | V1Cst = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v16i8, N1: V1Cst, N2: V2Cst); |
12895 | Shuffle = DAG.getNode( |
12896 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, |
12897 | N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl1, DL, VT: MVT::i32), N2: V1Cst, |
12898 | N3: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen))); |
12899 | } else { |
12900 | // FIXME: We cannot, for the moment, emit a TBL2 instruction because we |
12901 | // cannot currently represent the register constraints on the input |
12902 | // table registers. |
12903 | // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, |
12904 | // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], |
12905 | // IndexLen)); |
12906 | Shuffle = DAG.getNode( |
12907 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, |
12908 | N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_tbl2, DL, VT: MVT::i32), N2: V1Cst, |
12909 | N3: V2Cst, |
12910 | N4: DAG.getBuildVector(VT: IndexVT, DL, Ops: ArrayRef(TBLMask.data(), IndexLen))); |
12911 | } |
12912 | } |
12913 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle); |
12914 | } |
12915 | |
12916 | static unsigned getDUPLANEOp(EVT EltType) { |
12917 | if (EltType == MVT::i8) |
12918 | return AArch64ISD::DUPLANE8; |
12919 | if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16) |
12920 | return AArch64ISD::DUPLANE16; |
12921 | if (EltType == MVT::i32 || EltType == MVT::f32) |
12922 | return AArch64ISD::DUPLANE32; |
12923 | if (EltType == MVT::i64 || EltType == MVT::f64) |
12924 | return AArch64ISD::DUPLANE64; |
12925 | |
12926 | llvm_unreachable("Invalid vector element type?" ); |
12927 | } |
12928 | |
12929 | static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, |
12930 | unsigned Opcode, SelectionDAG &DAG) { |
12931 | // Try to eliminate a bitcasted extract subvector before a DUPLANE. |
12932 | auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { |
12933 | // Match: dup (bitcast (extract_subv X, C)), LaneC |
12934 | if (BitCast.getOpcode() != ISD::BITCAST || |
12935 | BitCast.getOperand(i: 0).getOpcode() != ISD::EXTRACT_SUBVECTOR) |
12936 | return false; |
12937 | |
12938 | // The extract index must align in the destination type. That may not |
12939 | // happen if the bitcast is from narrow to wide type. |
12940 | SDValue Extract = BitCast.getOperand(i: 0); |
12941 | unsigned ExtIdx = Extract.getConstantOperandVal(i: 1); |
12942 | unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); |
12943 | unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; |
12944 | unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); |
12945 | if (ExtIdxInBits % CastedEltBitWidth != 0) |
12946 | return false; |
12947 | |
12948 | // Can't handle cases where vector size is not 128-bit |
12949 | if (!Extract.getOperand(i: 0).getValueType().is128BitVector()) |
12950 | return false; |
12951 | |
12952 | // Update the lane value by offsetting with the scaled extract index. |
12953 | LaneC += ExtIdxInBits / CastedEltBitWidth; |
12954 | |
12955 | // Determine the casted vector type of the wide vector input. |
12956 | // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' |
12957 | // Examples: |
12958 | // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 |
12959 | // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 |
12960 | unsigned SrcVecNumElts = |
12961 | Extract.getOperand(i: 0).getValueSizeInBits() / CastedEltBitWidth; |
12962 | CastVT = MVT::getVectorVT(VT: BitCast.getSimpleValueType().getScalarType(), |
12963 | NumElements: SrcVecNumElts); |
12964 | return true; |
12965 | }; |
12966 | MVT CastVT; |
12967 | if (getScaledOffsetDup(V, Lane, CastVT)) { |
12968 | V = DAG.getBitcast(VT: CastVT, V: V.getOperand(i: 0).getOperand(i: 0)); |
12969 | } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
12970 | V.getOperand(i: 0).getValueType().is128BitVector()) { |
12971 | // The lane is incremented by the index of the extract. |
12972 | // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 |
12973 | Lane += V.getConstantOperandVal(i: 1); |
12974 | V = V.getOperand(i: 0); |
12975 | } else if (V.getOpcode() == ISD::CONCAT_VECTORS) { |
12976 | // The lane is decremented if we are splatting from the 2nd operand. |
12977 | // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 |
12978 | unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; |
12979 | Lane -= Idx * VT.getVectorNumElements() / 2; |
12980 | V = WidenVector(V64Reg: V.getOperand(i: Idx), DAG); |
12981 | } else if (VT.getSizeInBits() == 64) { |
12982 | // Widen the operand to 128-bit register with undef. |
12983 | V = WidenVector(V64Reg: V, DAG); |
12984 | } |
12985 | return DAG.getNode(Opcode, DL: dl, VT, N1: V, N2: DAG.getConstant(Val: Lane, DL: dl, VT: MVT::i64)); |
12986 | } |
12987 | |
12988 | // Return true if we can get a new shuffle mask by checking whether every two |
12989 | // adjacent mask values in the given mask array are consecutive and start |
12990 | // from an even number. |
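| // For example, on v4i32 the mask <2, 3, 0, 1> widens to <1, 0> on v2i64, |
| // whereas <1, 2, 3, 0> cannot be widened because its pairs are misaligned. |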
12991 | static bool isWideTypeMask(ArrayRef<int> M, EVT VT, |
12992 | SmallVectorImpl<int> &NewMask) { |
12993 | unsigned NumElts = VT.getVectorNumElements(); |
12994 | if (NumElts % 2 != 0) |
12995 | return false; |
12996 | |
12997 | NewMask.clear(); |
12998 | for (unsigned i = 0; i < NumElts; i += 2) { |
12999 | int M0 = M[i]; |
13000 | int M1 = M[i + 1]; |
13001 | |
13002 | // If both elements are undef, new mask is undef too. |
13003 | if (M0 == -1 && M1 == -1) { |
13004 | NewMask.push_back(Elt: -1); |
13005 | continue; |
13006 | } |
13007 | |
13008 | if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) { |
13009 | NewMask.push_back(Elt: M1 / 2); |
13010 | continue; |
13011 | } |
13012 | |
13013 | if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) { |
13014 | NewMask.push_back(Elt: M0 / 2); |
13015 | continue; |
13016 | } |
13017 | |
13018 | NewMask.clear(); |
13019 | return false; |
13020 | } |
13021 | |
13022 | assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!" ); |
13023 | return true; |
13024 | } |
13025 | |
13026 | // Try to widen element type to get a new mask value for a better permutation |
13027 | // sequence, so that we can use NEON shuffle instructions, such as ZIP1/2, |
13028 | // UZP1/2, TRN1/2, REV, INS, etc. |
13029 | // For example: |
13030 | // shufflevector <4 x i32> %a, <4 x i32> %b, |
13031 | // <4 x i32> <i32 6, i32 7, i32 2, i32 3> |
13032 | // is equivalent to: |
13033 | // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> |
13034 | // Finally, we can get: |
13035 | // mov v0.d[0], v1.d[1] |
13036 | static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) { |
13037 | SDLoc DL(Op); |
13038 | EVT VT = Op.getValueType(); |
13039 | EVT ScalarVT = VT.getVectorElementType(); |
13040 | unsigned ElementSize = ScalarVT.getFixedSizeInBits(); |
13041 | SDValue V0 = Op.getOperand(i: 0); |
13042 | SDValue V1 = Op.getOperand(i: 1); |
13043 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Val&: Op)->getMask(); |
13044 | |
13045 | // When combining adjacent elements, like two i16's -> i32 or two i32's -> i64, |
13046 | // we need to make sure the wider element type is legal. Thus, ElementSize |
13047 | // should not be larger than 32 bits, and the i1 type should also be excluded. |
13048 | if (ElementSize > 32 || ElementSize == 1) |
13049 | return SDValue(); |
13050 | |
13051 | SmallVector<int, 8> NewMask; |
13052 | if (isWideTypeMask(M: Mask, VT, NewMask)) { |
13053 | MVT NewEltVT = VT.isFloatingPoint() |
13054 | ? MVT::getFloatingPointVT(BitWidth: ElementSize * 2) |
13055 | : MVT::getIntegerVT(BitWidth: ElementSize * 2); |
13056 | MVT NewVT = MVT::getVectorVT(VT: NewEltVT, NumElements: VT.getVectorNumElements() / 2); |
13057 | if (DAG.getTargetLoweringInfo().isTypeLegal(VT: NewVT)) { |
13058 | V0 = DAG.getBitcast(VT: NewVT, V: V0); |
13059 | V1 = DAG.getBitcast(VT: NewVT, V: V1); |
13060 | return DAG.getBitcast(VT, |
13061 | V: DAG.getVectorShuffle(VT: NewVT, dl: DL, N1: V0, N2: V1, Mask: NewMask)); |
13062 | } |
13063 | } |
13064 | |
13065 | return SDValue(); |
13066 | } |
13067 | |
13068 | // Try to fold shuffle (tbl2, tbl2) into a single tbl4. |
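| // shuffle(tbl2(a, b, m1), tbl2(c, d, m2), mask) becomes tbl4(a, b, c, d, m), |
| // where m[i] is m1[mask[i]] for mask[i] < 16 and m2[mask[i] - 16] + 32 |
| // otherwise, so the combined mask indexes the wider four-register table. |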
13069 | static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, |
13070 | ArrayRef<int> ShuffleMask, |
13071 | SelectionDAG &DAG) { |
13072 | SDValue Tbl1 = Op->getOperand(Num: 0); |
13073 | SDValue Tbl2 = Op->getOperand(Num: 1); |
13074 | SDLoc dl(Op); |
13075 | SDValue Tbl2ID = |
13076 | DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl2, DL: dl, VT: MVT::i64); |
13077 | |
13078 | EVT VT = Op.getValueType(); |
13079 | if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN || |
13080 | Tbl1->getOperand(Num: 0) != Tbl2ID || |
13081 | Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN || |
13082 | Tbl2->getOperand(Num: 0) != Tbl2ID) |
13083 | return SDValue(); |
13084 | |
13085 | if (Tbl1->getValueType(ResNo: 0) != MVT::v16i8 || |
13086 | Tbl2->getValueType(ResNo: 0) != MVT::v16i8) |
13087 | return SDValue(); |
13088 | |
13089 | SDValue Mask1 = Tbl1->getOperand(Num: 3); |
13090 | SDValue Mask2 = Tbl2->getOperand(Num: 3); |
13091 | SmallVector<SDValue, 16> TBLMaskParts(16, SDValue()); |
13092 | for (unsigned I = 0; I < 16; I++) { |
13093 | if (ShuffleMask[I] < 16) |
13094 | TBLMaskParts[I] = Mask1->getOperand(Num: ShuffleMask[I]); |
13095 | else { |
13096 | auto *C = |
13097 | dyn_cast<ConstantSDNode>(Val: Mask2->getOperand(Num: ShuffleMask[I] - 16)); |
13098 | if (!C) |
13099 | return SDValue(); |
13100 | TBLMaskParts[I] = DAG.getConstant(Val: C->getSExtValue() + 32, DL: dl, VT: MVT::i32); |
13101 | } |
13102 | } |
13103 | |
13104 | SDValue TBLMask = DAG.getBuildVector(VT, DL: dl, Ops: TBLMaskParts); |
13105 | SDValue ID = |
13106 | DAG.getTargetConstant(Val: Intrinsic::aarch64_neon_tbl4, DL: dl, VT: MVT::i64); |
13107 | |
13108 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: dl, VT: MVT::v16i8, |
13109 | Ops: {ID, Tbl1->getOperand(Num: 1), Tbl1->getOperand(Num: 2), |
13110 | Tbl2->getOperand(Num: 1), Tbl2->getOperand(Num: 2), TBLMask}); |
13111 | } |
13112 | |
13113 | // Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros, |
13114 | // but we don't have an appropriate instruction, |
13115 | // so custom-lower it as ZIP1-with-zeros. |
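| // For example, (v8i16 zero_extend_vector_inreg (v16i8 X)) is emitted as |
| // (v8i16 bitcast (zip1 (v16i8 X), (v16i8 zeroes))). |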
13116 | SDValue |
13117 | AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op, |
13118 | SelectionDAG &DAG) const { |
13119 | SDLoc dl(Op); |
13120 | EVT VT = Op.getValueType(); |
13121 | SDValue SrcOp = Op.getOperand(i: 0); |
13122 | EVT SrcVT = SrcOp.getValueType(); |
13123 | assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 && |
13124 | "Unexpected extension factor." ); |
13125 | unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); |
13126 | // FIXME: support multi-step zipping? |
13127 | if (Scale != 2) |
13128 | return SDValue(); |
13129 | SDValue Zeros = DAG.getConstant(Val: 0, DL: dl, VT: SrcVT); |
13130 | return DAG.getBitcast(VT, |
13131 | V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT: SrcVT, N1: SrcOp, N2: Zeros)); |
13132 | } |
13133 | |
13134 | SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, |
13135 | SelectionDAG &DAG) const { |
13136 | SDLoc dl(Op); |
13137 | EVT VT = Op.getValueType(); |
13138 | |
13139 | ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode()); |
13140 | |
13141 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
13142 | return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG); |
13143 | |
13144 | // Convert shuffles that are directly supported on NEON to target-specific |
13145 | // DAG nodes, instead of keeping them as shuffles and matching them again |
13146 | // during code selection. This is more efficient and avoids the possibility |
13147 | // of inconsistencies between legalization and selection. |
13148 | ArrayRef<int> ShuffleMask = SVN->getMask(); |
13149 | |
13150 | SDValue V1 = Op.getOperand(i: 0); |
13151 | SDValue V2 = Op.getOperand(i: 1); |
13152 | |
13153 | assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!" ); |
13154 | assert(ShuffleMask.size() == VT.getVectorNumElements() && |
13155 | "Unexpected VECTOR_SHUFFLE mask size!" ); |
13156 | |
13157 | if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG)) |
13158 | return Res; |
13159 | |
13160 | if (SVN->isSplat()) { |
13161 | int Lane = SVN->getSplatIndex(); |
// If this is an undef splat, generate it via "just" vdup, if possible.
13163 | if (Lane == -1) |
13164 | Lane = 0; |
13165 | |
13166 | if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) |
13167 | return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT: V1.getValueType(), |
13168 | Operand: V1.getOperand(i: 0)); |
13169 | // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- |
13170 | // constant. If so, we can just reference the lane's definition directly. |
13171 | if (V1.getOpcode() == ISD::BUILD_VECTOR && |
13172 | !isa<ConstantSDNode>(Val: V1.getOperand(i: Lane))) |
13173 | return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: V1.getOperand(i: Lane)); |
13174 | |
13175 | // Otherwise, duplicate from the lane of the input vector. |
13176 | unsigned Opcode = getDUPLANEOp(EltType: V1.getValueType().getVectorElementType()); |
13177 | return constructDup(V: V1, Lane, dl, VT, Opcode, DAG); |
13178 | } |
13179 | |
13180 | // Check if the mask matches a DUP for a wider element |
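// For example, a v8i16 mask of <0,1,0,1,0,1,0,1> repeats the first 32-bit
// block of V1, so it can be emitted as DUPLANE32 of lane 0 after bitcasting
// V1 to v4i32 (a sketch of the simplest case; isWideDUPMask picks the lane).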
13181 | for (unsigned LaneSize : {64U, 32U, 16U}) { |
13182 | unsigned Lane = 0; |
13183 | if (isWideDUPMask(M: ShuffleMask, VT, BlockSize: LaneSize, DupLaneOp&: Lane)) { |
13184 | unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64 |
13185 | : LaneSize == 32 ? AArch64ISD::DUPLANE32 |
13186 | : AArch64ISD::DUPLANE16; |
13187 | // Cast V1 to an integer vector with required lane size |
13188 | MVT NewEltTy = MVT::getIntegerVT(BitWidth: LaneSize); |
13189 | unsigned NewEltCount = VT.getSizeInBits() / LaneSize; |
13190 | MVT NewVecTy = MVT::getVectorVT(VT: NewEltTy, NumElements: NewEltCount); |
13191 | V1 = DAG.getBitcast(VT: NewVecTy, V: V1); |
// Construct the DUP instruction
13193 | V1 = constructDup(V: V1, Lane, dl, VT: NewVecTy, Opcode, DAG); |
13194 | // Cast back to the original type |
13195 | return DAG.getBitcast(VT, V: V1); |
13196 | } |
13197 | } |
13198 | |
13199 | unsigned NumElts = VT.getVectorNumElements(); |
13200 | unsigned EltSize = VT.getScalarSizeInBits(); |
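// For example, a v8i16 mask of <3,2,1,0,7,6,5,4> reverses the 16-bit lanes
// within each 64-bit block, i.e. REV64.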
13201 | if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 64)) |
13202 | return DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2); |
13203 | if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 32)) |
13204 | return DAG.getNode(Opcode: AArch64ISD::REV32, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2); |
13205 | if (isREVMask(M: ShuffleMask, EltSize, NumElts, BlockSize: 16)) |
13206 | return DAG.getNode(Opcode: AArch64ISD::REV16, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2); |
13207 | |
13208 | if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) && |
13209 | ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size())) { |
13210 | SDValue Rev = DAG.getNode(Opcode: AArch64ISD::REV64, DL: dl, VT, Operand: V1); |
13211 | return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT, N1: Rev, N2: Rev, |
13212 | N3: DAG.getConstant(Val: 8, DL: dl, VT: MVT::i32)); |
13213 | } |
13214 | |
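// For example, a v16i8 mask of <1,2,...,15,16> takes bytes 1..16 of the
// concatenation V1:V2, i.e. EXT V1, V2, #1 (a byte-sized-element sketch;
// getExtFactor scales the immediate for wider elements).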
13215 | bool ReverseEXT = false; |
13216 | unsigned Imm; |
13217 | if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm)) { |
13218 | if (ReverseEXT) |
13219 | std::swap(a&: V1, b&: V2); |
13220 | Imm *= getExtFactor(V&: V1); |
13221 | return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2, |
13222 | N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
13223 | } else if (V2->isUndef() && isSingletonEXTMask(M: ShuffleMask, VT, Imm)) { |
13224 | Imm *= getExtFactor(V&: V1); |
13225 | return DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1, |
13226 | N3: DAG.getConstant(Val: Imm, DL: dl, VT: MVT::i32)); |
13227 | } |
13228 | |
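// ZIP, UZP and TRN cover the remaining structured two-operand shuffles. For
// a v8i16 shuffle of V1 (lanes 0-7) and V2 (lanes 8-15), for example:
//   ZIP1: <0,8,1,9,2,10,3,11>   UZP1: <0,2,4,6,8,10,12,14>
//   TRN1: <0,8,2,10,4,12,6,14>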
13229 | unsigned WhichResult; |
13230 | if (isZIPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) { |
13231 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; |
13232 | return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2); |
13233 | } |
13234 | if (isUZPMask(M: ShuffleMask, NumElts, WhichResultOut&: WhichResult)) { |
13235 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
13236 | return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2); |
13237 | } |
13238 | if (isTRNMask(M: ShuffleMask, NumElts, WhichResult)) { |
13239 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
13240 | return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V2); |
13241 | } |
13242 | |
13243 | if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) { |
13244 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; |
13245 | return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1); |
13246 | } |
13247 | if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) { |
13248 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
13249 | return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1); |
13250 | } |
13251 | if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) { |
13252 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
13253 | return DAG.getNode(Opcode: Opc, DL: dl, VT: V1.getValueType(), N1: V1, N2: V1); |
13254 | } |
13255 | |
13256 | if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) |
13257 | return Concat; |
13258 | |
13259 | bool DstIsLeft; |
13260 | int Anomaly; |
13261 | int NumInputElements = V1.getValueType().getVectorNumElements(); |
13262 | if (isINSMask(M: ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { |
13263 | SDValue DstVec = DstIsLeft ? V1 : V2; |
13264 | SDValue DstLaneV = DAG.getConstant(Val: Anomaly, DL: dl, VT: MVT::i64); |
13265 | |
13266 | SDValue SrcVec = V1; |
13267 | int SrcLane = ShuffleMask[Anomaly]; |
13268 | if (SrcLane >= NumInputElements) { |
13269 | SrcVec = V2; |
13270 | SrcLane -= NumElts; |
13271 | } |
13272 | SDValue SrcLaneV = DAG.getConstant(Val: SrcLane, DL: dl, VT: MVT::i64); |
13273 | |
13274 | EVT ScalarVT = VT.getVectorElementType(); |
13275 | |
13276 | if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger()) |
13277 | ScalarVT = MVT::i32; |
13278 | |
13279 | return DAG.getNode( |
13280 | Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: DstVec, |
13281 | N2: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: ScalarVT, N1: SrcVec, N2: SrcLaneV), |
13282 | N3: DstLaneV); |
13283 | } |
13284 | |
13285 | if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG)) |
13286 | return NewSD; |
13287 | |
13288 | // If the shuffle is not directly supported and it has 4 elements, use |
13289 | // the PerfectShuffle-generated table to synthesize it from other shuffles. |
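// Each mask index (0-7, or 8 for an undef lane) becomes one base-9 digit of
// the table index computed below.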
13290 | if (NumElts == 4) { |
13291 | unsigned PFIndexes[4]; |
13292 | for (unsigned i = 0; i != 4; ++i) { |
13293 | if (ShuffleMask[i] < 0) |
13294 | PFIndexes[i] = 8; |
13295 | else |
13296 | PFIndexes[i] = ShuffleMask[i]; |
13297 | } |
13298 | |
13299 | // Compute the index in the perfect shuffle table. |
13300 | unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + |
13301 | PFIndexes[2] * 9 + PFIndexes[3]; |
13302 | unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; |
13303 | return GeneratePerfectShuffle(ID: PFTableIndex, V1, V2, PFEntry, LHS: V1, RHS: V2, DAG, |
13304 | dl); |
13305 | } |
13306 | |
13307 | return GenerateTBL(Op, ShuffleMask, DAG); |
13308 | } |
13309 | |
13310 | SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, |
13311 | SelectionDAG &DAG) const { |
13312 | EVT VT = Op.getValueType(); |
13313 | |
13314 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
13315 | return LowerToScalableOp(Op, DAG); |
13316 | |
13317 | assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 && |
13318 | "Unexpected vector type!" ); |
13319 | |
13320 | // We can handle the constant cases during isel. |
13321 | if (isa<ConstantSDNode>(Val: Op.getOperand(i: 0))) |
13322 | return Op; |
13323 | |
13324 | // There isn't a natural way to handle the general i1 case, so we use some |
13325 | // trickery with whilelo. |
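// Sketch: sign-extend the i1 splat value to i64, giving 0 or -1; then
// whilelo(0, 0) yields an all-false predicate, while whilelo(0, -1) (an
// unsigned compare against UINT64_MAX) yields an all-true one.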
13326 | SDLoc DL(Op); |
13327 | SDValue SplatVal = DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: MVT::i64); |
13328 | SplatVal = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: MVT::i64, N1: SplatVal, |
13329 | N2: DAG.getValueType(MVT::i1)); |
13330 | SDValue ID = |
13331 | DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo, DL, VT: MVT::i64); |
13332 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
13333 | if (VT == MVT::nxv1i1) |
13334 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::nxv1i1, |
13335 | N1: DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::nxv2i1, N1: ID, |
13336 | N2: Zero, N3: SplatVal), |
13337 | N2: Zero); |
13338 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: ID, N2: Zero, N3: SplatVal); |
13339 | } |
13340 | |
13341 | SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op, |
13342 | SelectionDAG &DAG) const { |
13343 | SDLoc DL(Op); |
13344 | |
13345 | EVT VT = Op.getValueType(); |
13346 | if (!isTypeLegal(VT) || !VT.isScalableVector()) |
13347 | return SDValue(); |
13348 | |
13349 | // Current lowering only supports the SVE-ACLE types. |
13350 | if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) |
13351 | return SDValue(); |
13352 | |
// The DUPQ operation is independent of element type so normalise to i64s.
13354 | SDValue Idx128 = Op.getOperand(i: 2); |
13355 | |
13356 | // DUPQ can be used when idx is in range. |
13357 | auto *CIdx = dyn_cast<ConstantSDNode>(Val&: Idx128); |
13358 | if (CIdx && (CIdx->getZExtValue() <= 3)) { |
13359 | SDValue CI = DAG.getTargetConstant(Val: CIdx->getZExtValue(), DL, VT: MVT::i64); |
13360 | return DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT, N1: Op.getOperand(i: 1), N2: CI); |
13361 | } |
13362 | |
13363 | SDValue V = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv2i64, Operand: Op.getOperand(i: 1)); |
13364 | |
13365 | // The ACLE says this must produce the same result as: |
13366 | // svtbl(data, svadd_x(svptrue_b64(), |
13367 | // svand_x(svptrue_b64(), svindex_u64(0, 1), 1), |
13368 | // index * 2)) |
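// For example, with index 1 the mask built below is <2,3,2,3,...>, which
// selects the second 128-bit quadword of the input when viewed as nxv2i64.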
13369 | SDValue One = DAG.getConstant(Val: 1, DL, VT: MVT::i64); |
13370 | SDValue SplatOne = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: One); |
13371 | |
13372 | // create the vector 0,1,0,1,... |
13373 | SDValue SV = DAG.getStepVector(DL, ResVT: MVT::nxv2i64); |
13374 | SV = DAG.getNode(Opcode: ISD::AND, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatOne); |
13375 | |
13376 | // create the vector idx64,idx64+1,idx64,idx64+1,... |
13377 | SDValue Idx64 = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: Idx128, N2: Idx128); |
13378 | SDValue SplatIdx64 = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Idx64); |
13379 | SDValue ShuffleMask = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::nxv2i64, N1: SV, N2: SplatIdx64); |
13380 | |
13381 | // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],... |
13382 | SDValue TBL = DAG.getNode(Opcode: AArch64ISD::TBL, DL, VT: MVT::nxv2i64, N1: V, N2: ShuffleMask); |
13383 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: TBL); |
13384 | } |
13385 | |
13386 | |
13387 | static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, |
13388 | APInt &UndefBits) { |
13389 | EVT VT = BVN->getValueType(ResNo: 0); |
13390 | APInt SplatBits, SplatUndef; |
13391 | unsigned SplatBitSize; |
13392 | bool HasAnyUndefs; |
13393 | if (BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { |
13394 | unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; |
13395 | |
13396 | for (unsigned i = 0; i < NumSplats; ++i) { |
13397 | CnstBits <<= SplatBitSize; |
13398 | UndefBits <<= SplatBitSize; |
13399 | CnstBits |= SplatBits.zextOrTrunc(width: VT.getSizeInBits()); |
13400 | UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(width: VT.getSizeInBits()); |
13401 | } |
13402 | |
13403 | return true; |
13404 | } |
13405 | |
13406 | return false; |
13407 | } |
13408 | |
13409 | // Try 64-bit splatted SIMD immediate. |
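// Type 10 immediates have every byte equal to 0x00 or 0xff, e.g. a splat of
// 0xff00ff00ff00ff00 can be materialized with a single 64-bit MOVI.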
13410 | static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
13411 | const APInt &Bits) { |
13412 | if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) { |
13413 | uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue(); |
13414 | EVT VT = Op.getValueType(); |
13415 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; |
13416 | |
13417 | if (AArch64_AM::isAdvSIMDModImmType10(Imm: Value)) { |
13418 | Value = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Value); |
13419 | |
13420 | SDLoc dl(Op); |
13421 | SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13422 | Operand: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32)); |
13423 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov); |
13424 | } |
13425 | } |
13426 | |
13427 | return SDValue(); |
13428 | } |
13429 | |
13430 | // Try 32-bit splatted SIMD immediate. |
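// Types 1-4 place a single non-zero byte at byte position 0/1/2/3 of each
// 32-bit element; with MOVIshift, for instance, a splat of 0x00ab0000
// becomes MOVI #0xab, LSL #16.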
13431 | static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
13432 | const APInt &Bits, |
13433 | const SDValue *LHS = nullptr) { |
13434 | EVT VT = Op.getValueType(); |
13435 | if (VT.isFixedLengthVector() && |
13436 | !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()) |
13437 | return SDValue(); |
13438 | |
13439 | if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) { |
13440 | uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue(); |
13441 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; |
13442 | bool isAdvSIMDModImm = false; |
13443 | uint64_t Shift; |
13444 | |
13445 | if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Imm: Value))) { |
13446 | Value = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Value); |
13447 | Shift = 0; |
13448 | } |
13449 | else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Imm: Value))) { |
13450 | Value = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Value); |
13451 | Shift = 8; |
13452 | } |
13453 | else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Imm: Value))) { |
13454 | Value = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Value); |
13455 | Shift = 16; |
13456 | } |
13457 | else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Imm: Value))) { |
13458 | Value = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Value); |
13459 | Shift = 24; |
13460 | } |
13461 | |
13462 | if (isAdvSIMDModImm) { |
13463 | SDLoc dl(Op); |
13464 | SDValue Mov; |
13465 | |
13466 | if (LHS) |
13467 | Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13468 | N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: MovTy, Operand: *LHS), |
13469 | N2: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32), |
13470 | N3: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32)); |
13471 | else |
13472 | Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13473 | N1: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32), |
13474 | N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32)); |
13475 | |
13476 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov); |
13477 | } |
13478 | } |
13479 | |
13480 | return SDValue(); |
13481 | } |
13482 | |
13483 | // Try 16-bit splatted SIMD immediate. |
13484 | static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
13485 | const APInt &Bits, |
13486 | const SDValue *LHS = nullptr) { |
13487 | EVT VT = Op.getValueType(); |
13488 | if (VT.isFixedLengthVector() && |
13489 | !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable()) |
13490 | return SDValue(); |
13491 | |
13492 | if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) { |
13493 | uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue(); |
13494 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; |
13495 | bool isAdvSIMDModImm = false; |
13496 | uint64_t Shift; |
13497 | |
13498 | if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Imm: Value))) { |
13499 | Value = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Value); |
13500 | Shift = 0; |
13501 | } |
13502 | else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Imm: Value))) { |
13503 | Value = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Value); |
13504 | Shift = 8; |
13505 | } |
13506 | |
13507 | if (isAdvSIMDModImm) { |
13508 | SDLoc dl(Op); |
13509 | SDValue Mov; |
13510 | |
13511 | if (LHS) |
13512 | Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13513 | N1: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT: MovTy, Operand: *LHS), |
13514 | N2: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32), |
13515 | N3: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32)); |
13516 | else |
13517 | Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13518 | N1: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32), |
13519 | N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32)); |
13520 | |
13521 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov); |
13522 | } |
13523 | } |
13524 | |
13525 | return SDValue(); |
13526 | } |
13527 | |
13528 | // Try 32-bit splatted SIMD immediate with shifted ones. |
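// These are the MSL ("shifted ones") forms, e.g. splats of 0x0000abff or
// 0x00abffff; the 264 and 272 below appear to be the AArch64_AM shifter
// encodings for MSL #8 and MSL #16 respectively.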
13529 | static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, |
13530 | SelectionDAG &DAG, const APInt &Bits) { |
13531 | if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) { |
13532 | uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue(); |
13533 | EVT VT = Op.getValueType(); |
13534 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; |
13535 | bool isAdvSIMDModImm = false; |
13536 | uint64_t Shift; |
13537 | |
13538 | if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Imm: Value))) { |
13539 | Value = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Value); |
13540 | Shift = 264; |
13541 | } |
13542 | else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Imm: Value))) { |
13543 | Value = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Value); |
13544 | Shift = 272; |
13545 | } |
13546 | |
13547 | if (isAdvSIMDModImm) { |
13548 | SDLoc dl(Op); |
13549 | SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13550 | N1: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32), |
13551 | N2: DAG.getConstant(Val: Shift, DL: dl, VT: MVT::i32)); |
13552 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov); |
13553 | } |
13554 | } |
13555 | |
13556 | return SDValue(); |
13557 | } |
13558 | |
13559 | // Try 8-bit splatted SIMD immediate. |
13560 | static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
13561 | const APInt &Bits) { |
13562 | if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) { |
13563 | uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue(); |
13564 | EVT VT = Op.getValueType(); |
13565 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; |
13566 | |
13567 | if (AArch64_AM::isAdvSIMDModImmType9(Imm: Value)) { |
13568 | Value = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Value); |
13569 | |
13570 | SDLoc dl(Op); |
13571 | SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13572 | Operand: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32)); |
13573 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov); |
13574 | } |
13575 | } |
13576 | |
13577 | return SDValue(); |
13578 | } |
13579 | |
13580 | // Try FP splatted SIMD immediate. |
13581 | static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, |
13582 | const APInt &Bits) { |
13583 | if (Bits.getHiBits(numBits: 64) == Bits.getLoBits(numBits: 64)) { |
13584 | uint64_t Value = Bits.zextOrTrunc(width: 64).getZExtValue(); |
13585 | EVT VT = Op.getValueType(); |
13586 | bool isWide = (VT.getSizeInBits() == 128); |
13587 | MVT MovTy; |
13588 | bool isAdvSIMDModImm = false; |
13589 | |
13590 | if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Imm: Value))) { |
13591 | Value = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Value); |
13592 | MovTy = isWide ? MVT::v4f32 : MVT::v2f32; |
13593 | } |
13594 | else if (isWide && |
13595 | (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Imm: Value))) { |
13596 | Value = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Value); |
13597 | MovTy = MVT::v2f64; |
13598 | } |
13599 | |
13600 | if (isAdvSIMDModImm) { |
13601 | SDLoc dl(Op); |
13602 | SDValue Mov = DAG.getNode(Opcode: NewOp, DL: dl, VT: MovTy, |
13603 | Operand: DAG.getConstant(Val: Value, DL: dl, VT: MVT::i32)); |
13604 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: dl, VT, Operand: Mov); |
13605 | } |
13606 | } |
13607 | |
13608 | return SDValue(); |
13609 | } |
13610 | |
// Specialized code to quickly find if PotentialBVec is a BuildVector whose
// elements are all the same constant int value; that value is returned in
// the reference argument ConstVal.
13614 | static bool isAllConstantBuildVector(const SDValue &PotentialBVec, |
13615 | uint64_t &ConstVal) { |
13616 | BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(Val: PotentialBVec); |
13617 | if (!Bvec) |
13618 | return false; |
13619 | ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: 0)); |
13620 | if (!FirstElt) |
13621 | return false; |
13622 | EVT VT = Bvec->getValueType(ResNo: 0); |
13623 | unsigned NumElts = VT.getVectorNumElements(); |
13624 | for (unsigned i = 1; i < NumElts; ++i) |
13625 | if (dyn_cast<ConstantSDNode>(Val: Bvec->getOperand(Num: i)) != FirstElt) |
13626 | return false; |
13627 | ConstVal = FirstElt->getZExtValue(); |
13628 | return true; |
13629 | } |
13630 | |
13631 | static bool isAllInactivePredicate(SDValue N) { |
13632 | // Look through cast. |
13633 | while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) |
13634 | N = N.getOperand(i: 0); |
13635 | |
13636 | return ISD::isConstantSplatVectorAllZeros(N: N.getNode()); |
13637 | } |
13638 | |
13639 | static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) { |
13640 | unsigned NumElts = N.getValueType().getVectorMinNumElements(); |
13641 | |
13642 | // Look through cast. |
13643 | while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) { |
13644 | N = N.getOperand(i: 0); |
13645 | // When reinterpreting from a type with fewer elements the "new" elements |
13646 | // are not active, so bail if they're likely to be used. |
13647 | if (N.getValueType().getVectorMinNumElements() < NumElts) |
13648 | return false; |
13649 | } |
13650 | |
13651 | if (ISD::isConstantSplatVectorAllOnes(N: N.getNode())) |
13652 | return true; |
13653 | |
13654 | // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size |
13655 | // or smaller than the implicit element type represented by N. |
13656 | // NOTE: A larger element count implies a smaller element type. |
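// For example, a "ptrue p.h, all" (nxv8i1) can be treated as all active for
// uses needing at most 8 predicate elements (nxv8i1, nxv4i1, nxv2i1), but
// not for an nxv16i1 use.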
13657 | if (N.getOpcode() == AArch64ISD::PTRUE && |
13658 | N.getConstantOperandVal(i: 0) == AArch64SVEPredPattern::all) |
13659 | return N.getValueType().getVectorMinNumElements() >= NumElts; |
13660 | |
13661 | // If we're compiling for a specific vector-length, we can check if the |
13662 | // pattern's VL equals that of the scalable vector at runtime. |
13663 | if (N.getOpcode() == AArch64ISD::PTRUE) { |
13664 | const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); |
13665 | unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); |
13666 | unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); |
13667 | if (MaxSVESize && MinSVESize == MaxSVESize) { |
13668 | unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock; |
13669 | unsigned PatNumElts = |
13670 | getNumElementsFromSVEPredPattern(Pattern: N.getConstantOperandVal(i: 0)); |
13671 | return PatNumElts == (NumElts * VScale); |
13672 | } |
13673 | } |
13674 | |
13675 | return false; |
13676 | } |
13677 | |
13678 | // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), |
13679 | // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a |
// BUILD_VECTOR with constant element C1, C2 is a constant, and:
13681 | // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2) |
13682 | // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2) |
13683 | // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled. |
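// For example, with 32-bit elements:
//   (or (and X, <0x00ffffff splat>), (AArch64ISD::VSHL Y, 24))
//     ==> (AArch64ISD::VSLI X, Y, 24)
// since the AND mask keeps exactly the bits that the shifted-in Y cannot
// touch.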
13684 | static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { |
13685 | EVT VT = N->getValueType(ResNo: 0); |
13686 | |
13687 | if (!VT.isVector()) |
13688 | return SDValue(); |
13689 | |
13690 | SDLoc DL(N); |
13691 | |
13692 | SDValue And; |
13693 | SDValue Shift; |
13694 | |
13695 | SDValue FirstOp = N->getOperand(Num: 0); |
13696 | unsigned FirstOpc = FirstOp.getOpcode(); |
13697 | SDValue SecondOp = N->getOperand(Num: 1); |
13698 | unsigned SecondOpc = SecondOp.getOpcode(); |
13699 | |
13700 | // Is one of the operands an AND or a BICi? The AND may have been optimised to |
13701 | // a BICi in order to use an immediate instead of a register. |
13702 | // Is the other operand an shl or lshr? This will have been turned into: |
13703 | // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift |
13704 | // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec. |
13705 | if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) && |
13706 | (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR || |
13707 | SecondOpc == AArch64ISD::SHL_PRED || |
13708 | SecondOpc == AArch64ISD::SRL_PRED)) { |
13709 | And = FirstOp; |
13710 | Shift = SecondOp; |
13711 | |
13712 | } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) && |
13713 | (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR || |
13714 | FirstOpc == AArch64ISD::SHL_PRED || |
13715 | FirstOpc == AArch64ISD::SRL_PRED)) { |
13716 | And = SecondOp; |
13717 | Shift = FirstOp; |
13718 | } else |
13719 | return SDValue(); |
13720 | |
13721 | bool IsAnd = And.getOpcode() == ISD::AND; |
13722 | bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR || |
13723 | Shift.getOpcode() == AArch64ISD::SRL_PRED; |
13724 | bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED || |
13725 | Shift.getOpcode() == AArch64ISD::SRL_PRED; |
13726 | |
13727 | // Is the shift amount constant and are all lanes active? |
13728 | uint64_t C2; |
13729 | if (ShiftHasPredOp) { |
13730 | if (!isAllActivePredicate(DAG, N: Shift.getOperand(i: 0))) |
13731 | return SDValue(); |
13732 | APInt C; |
13733 | if (!ISD::isConstantSplatVector(N: Shift.getOperand(i: 2).getNode(), SplatValue&: C)) |
13734 | return SDValue(); |
13735 | C2 = C.getZExtValue(); |
13736 | } else if (ConstantSDNode *C2node = |
13737 | dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1))) |
13738 | C2 = C2node->getZExtValue(); |
13739 | else |
13740 | return SDValue(); |
13741 | |
13742 | APInt C1AsAPInt; |
13743 | unsigned ElemSizeInBits = VT.getScalarSizeInBits(); |
13744 | if (IsAnd) { |
13745 | // Is the and mask vector all constant? |
13746 | if (!ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: C1AsAPInt)) |
13747 | return SDValue(); |
13748 | } else { |
13749 | // Reconstruct the corresponding AND immediate from the two BICi immediates. |
13750 | ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 1)); |
13751 | ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(Val: And.getOperand(i: 2)); |
13752 | assert(C1nodeImm && C1nodeShift); |
13753 | C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue()); |
13754 | C1AsAPInt = C1AsAPInt.zextOrTrunc(width: ElemSizeInBits); |
13755 | } |
13756 | |
13757 | // Is C1 == ~(Ones(ElemSizeInBits) << C2) or |
13758 | // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account |
13759 | // how much one can shift elements of a particular size? |
13760 | if (C2 > ElemSizeInBits) |
13761 | return SDValue(); |
13762 | |
13763 | APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(numBits: ElemSizeInBits, hiBitsSet: C2) |
13764 | : APInt::getLowBitsSet(numBits: ElemSizeInBits, loBitsSet: C2); |
13765 | if (C1AsAPInt != RequiredC1) |
13766 | return SDValue(); |
13767 | |
13768 | SDValue X = And.getOperand(i: 0); |
13769 | SDValue Y = ShiftHasPredOp ? Shift.getOperand(i: 1) : Shift.getOperand(i: 0); |
13770 | SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(Val: C2, DL, VT: MVT::i32) |
13771 | : Shift.getOperand(i: 1); |
13772 | |
13773 | unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI; |
13774 | SDValue ResultSLI = DAG.getNode(Opcode: Inst, DL, VT, N1: X, N2: Y, N3: Imm); |
13775 | |
13776 | LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n" ); |
13777 | LLVM_DEBUG(N->dump(&DAG)); |
13778 | LLVM_DEBUG(dbgs() << "into: \n" ); |
13779 | LLVM_DEBUG(ResultSLI->dump(&DAG)); |
13780 | |
13781 | ++NumShiftInserts; |
13782 | return ResultSLI; |
13783 | } |
13784 | |
13785 | SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, |
13786 | SelectionDAG &DAG) const { |
13787 | if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(), |
13788 | OverrideNEON: !Subtarget->isNeonAvailable())) |
13789 | return LowerToScalableOp(Op, DAG); |
13790 | |
13791 | // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) |
13792 | if (SDValue Res = tryLowerToSLI(N: Op.getNode(), DAG)) |
13793 | return Res; |
13794 | |
13795 | EVT VT = Op.getValueType(); |
13796 | if (VT.isScalableVector()) |
13797 | return Op; |
13798 | |
13799 | SDValue LHS = Op.getOperand(i: 0); |
13800 | BuildVectorSDNode *BVN = |
13801 | dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 1).getNode()); |
13802 | if (!BVN) { |
13803 | // OR commutes, so try swapping the operands. |
13804 | LHS = Op.getOperand(i: 1); |
13805 | BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getOperand(i: 0).getNode()); |
13806 | } |
13807 | if (!BVN) |
13808 | return Op; |
13809 | |
13810 | APInt DefBits(VT.getSizeInBits(), 0); |
13811 | APInt UndefBits(VT.getSizeInBits(), 0); |
13812 | if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) { |
13813 | SDValue NewOp; |
13814 | |
13815 | if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG, |
13816 | Bits: DefBits, LHS: &LHS)) || |
13817 | (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG, |
13818 | Bits: DefBits, LHS: &LHS))) |
13819 | return NewOp; |
13820 | |
13821 | if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::ORRi, Op, DAG, |
13822 | Bits: UndefBits, LHS: &LHS)) || |
13823 | (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::ORRi, Op, DAG, |
13824 | Bits: UndefBits, LHS: &LHS))) |
13825 | return NewOp; |
13826 | } |
13827 | |
13828 | // We can always fall back to a non-immediate OR. |
13829 | return Op; |
13830 | } |
13831 | |
13832 | // Normalize the operands of BUILD_VECTOR. The value of constant operands will |
13833 | // be truncated to fit element width. |
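// For example, for a v8i8 build_vector an i32 constant operand of 0x1ff is
// rewritten as the i32 constant 0xff, keeping only the low 8 bits.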
13834 | static SDValue NormalizeBuildVector(SDValue Op, |
13835 | SelectionDAG &DAG) { |
13836 | assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!" ); |
13837 | SDLoc dl(Op); |
13838 | EVT VT = Op.getValueType(); |
EVT EltTy = VT.getVectorElementType();
13840 | |
13841 | if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) |
13842 | return Op; |
13843 | |
13844 | SmallVector<SDValue, 16> Ops; |
13845 | for (SDValue Lane : Op->ops()) { |
13846 | // For integer vectors, type legalization would have promoted the |
13847 | // operands already. Otherwise, if Op is a floating-point splat |
13848 | // (with operands cast to integers), then the only possibilities |
13849 | // are constants and UNDEFs. |
13850 | if (auto *CstLane = dyn_cast<ConstantSDNode>(Val&: Lane)) { |
13851 | APInt LowBits(EltTy.getSizeInBits(), |
13852 | CstLane->getZExtValue()); |
13853 | Lane = DAG.getConstant(Val: LowBits.getZExtValue(), DL: dl, VT: MVT::i32); |
13854 | } else if (Lane.getNode()->isUndef()) { |
13855 | Lane = DAG.getUNDEF(VT: MVT::i32); |
13856 | } else { |
13857 | assert(Lane.getValueType() == MVT::i32 && |
13858 | "Unexpected BUILD_VECTOR operand type" ); |
13859 | } |
13860 | Ops.push_back(Elt: Lane); |
13861 | } |
13862 | return DAG.getBuildVector(VT, DL: dl, Ops); |
13863 | } |
13864 | |
13865 | static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, |
13866 | const AArch64Subtarget *ST) { |
13867 | EVT VT = Op.getValueType(); |
13868 | assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) && |
13869 | "Expected a legal NEON vector" ); |
13870 | |
13871 | APInt DefBits(VT.getSizeInBits(), 0); |
13872 | APInt UndefBits(VT.getSizeInBits(), 0); |
13873 | BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode()); |
13874 | if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) { |
13875 | auto TryMOVIWithBits = [&](APInt DefBits) { |
13876 | SDValue NewOp; |
13877 | if ((NewOp = |
13878 | tryAdvSIMDModImm64(NewOp: AArch64ISD::MOVIedit, Op, DAG, Bits: DefBits)) || |
13879 | (NewOp = |
13880 | tryAdvSIMDModImm32(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) || |
13881 | (NewOp = |
13882 | tryAdvSIMDModImm321s(NewOp: AArch64ISD::MOVImsl, Op, DAG, Bits: DefBits)) || |
13883 | (NewOp = |
13884 | tryAdvSIMDModImm16(NewOp: AArch64ISD::MOVIshift, Op, DAG, Bits: DefBits)) || |
13885 | (NewOp = tryAdvSIMDModImm8(NewOp: AArch64ISD::MOVI, Op, DAG, Bits: DefBits)) || |
13886 | (NewOp = tryAdvSIMDModImmFP(NewOp: AArch64ISD::FMOV, Op, DAG, Bits: DefBits))) |
13887 | return NewOp; |
13888 | |
13889 | APInt NotDefBits = ~DefBits; |
13890 | if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::MVNIshift, Op, DAG, |
13891 | Bits: NotDefBits)) || |
13892 | (NewOp = tryAdvSIMDModImm321s(NewOp: AArch64ISD::MVNImsl, Op, DAG, |
13893 | Bits: NotDefBits)) || |
13894 | (NewOp = |
13895 | tryAdvSIMDModImm16(NewOp: AArch64ISD::MVNIshift, Op, DAG, Bits: NotDefBits))) |
13896 | return NewOp; |
13897 | return SDValue(); |
13898 | }; |
13899 | if (SDValue R = TryMOVIWithBits(DefBits)) |
13900 | return R; |
13901 | if (SDValue R = TryMOVIWithBits(UndefBits)) |
13902 | return R; |
13903 | |
// See if an fneg of the constant can be materialized with a MOVI, etc.
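// For instance, a v2f64 splat of -0.0 has no direct MOVI/FMOV encoding, but
// flipping the per-element sign bits gives all-zero lanes, which MOVI can
// produce; the constant is then rebuilt as (fneg (movi #0)).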
13905 | auto TryWithFNeg = [&](APInt DefBits, MVT FVT) { |
13906 | // FNegate each sub-element of the constant |
13907 | assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0); |
13908 | APInt Neg = APInt::getHighBitsSet(numBits: FVT.getSizeInBits(), hiBitsSet: 1) |
13909 | .zext(width: VT.getSizeInBits()); |
13910 | APInt NegBits(VT.getSizeInBits(), 0); |
13911 | unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits(); |
13912 | for (unsigned i = 0; i < NumElts; i++) |
13913 | NegBits |= Neg << (FVT.getScalarSizeInBits() * i); |
13914 | NegBits = DefBits ^ NegBits; |
13915 | |
13916 | // Try to create the new constants with MOVI, and if so generate a fneg |
13917 | // for it. |
13918 | if (SDValue NewOp = TryMOVIWithBits(NegBits)) { |
13919 | SDLoc DL(Op); |
13920 | MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(VT: FVT, NumElements: NumElts); |
13921 | return DAG.getNode( |
13922 | Opcode: AArch64ISD::NVCAST, DL, VT, |
13923 | Operand: DAG.getNode(Opcode: ISD::FNEG, DL, VT: VFVT, |
13924 | Operand: DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: VFVT, Operand: NewOp))); |
13925 | } |
13926 | return SDValue(); |
13927 | }; |
13928 | SDValue R; |
13929 | if ((R = TryWithFNeg(DefBits, MVT::f32)) || |
13930 | (R = TryWithFNeg(DefBits, MVT::f64)) || |
13931 | (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16)))) |
13932 | return R; |
13933 | } |
13934 | |
13935 | return SDValue(); |
13936 | } |
13937 | |
13938 | SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, |
13939 | SelectionDAG &DAG) const { |
13940 | EVT VT = Op.getValueType(); |
13941 | |
13942 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) { |
13943 | if (auto SeqInfo = cast<BuildVectorSDNode>(Val&: Op)->isConstantSequence()) { |
13944 | SDLoc DL(Op); |
13945 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
13946 | SDValue Start = DAG.getConstant(Val: SeqInfo->first, DL, VT: ContainerVT); |
13947 | SDValue Steps = DAG.getStepVector(DL, ResVT: ContainerVT, StepVal: SeqInfo->second); |
13948 | SDValue Seq = DAG.getNode(Opcode: ISD::ADD, DL, VT: ContainerVT, N1: Start, N2: Steps); |
13949 | return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Seq); |
13950 | } |
13951 | |
13952 | // Revert to common legalisation for all other variants. |
13953 | return SDValue(); |
13954 | } |
13955 | |
13956 | // Try to build a simple constant vector. |
13957 | Op = NormalizeBuildVector(Op, DAG); |
// This might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13959 | // abort. |
13960 | if (Op.getOpcode() != ISD::BUILD_VECTOR) |
13961 | return SDValue(); |
13962 | |
13963 | // Certain vector constants, used to express things like logical NOT and |
13964 | // arithmetic NEG, are passed through unmodified. This allows special |
13965 | // patterns for these operations to match, which will lower these constants |
13966 | // to whatever is proven necessary. |
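// For instance, an all-ones integer splat is left as a BUILD_VECTOR so that
// NOT-style patterns (e.g. BIC/ORN, which look for an xor with -1) can still
// match during selection.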
13967 | BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Val: Op.getNode()); |
13968 | if (BVN->isConstant()) { |
13969 | if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { |
13970 | unsigned BitSize = VT.getVectorElementType().getSizeInBits(); |
13971 | APInt Val(BitSize, |
13972 | Const->getAPIntValue().zextOrTrunc(width: BitSize).getZExtValue()); |
13973 | if (Val.isZero() || (VT.isInteger() && Val.isAllOnes())) |
13974 | return Op; |
13975 | } |
13976 | if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode()) |
13977 | if (Const->isZero() && !Const->isNegative()) |
13978 | return Op; |
13979 | } |
13980 | |
13981 | if (SDValue V = ConstantBuildVector(Op, DAG, ST: Subtarget)) |
13982 | return V; |
13983 | |
13984 | // Scan through the operands to find some interesting properties we can |
13985 | // exploit: |
13986 | // 1) If only one value is used, we can use a DUP, or |
13987 | // 2) if only the low element is not undef, we can just insert that, or |
13988 | // 3) if only one constant value is used (w/ some non-constant lanes), |
13989 | // we can splat the constant value into the whole vector then fill |
13990 | // in the non-constant lanes. |
13991 | // 4) FIXME: If different constant values are used, but we can intelligently |
13992 | // select the values we'll be overwriting for the non-constant |
13993 | // lanes such that we can directly materialize the vector |
13994 | // some other way (MOVI, e.g.), we can be sneaky. |
13995 | // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. |
13996 | SDLoc dl(Op); |
13997 | unsigned NumElts = VT.getVectorNumElements(); |
13998 | bool isOnlyLowElement = true; |
13999 | bool usesOnlyOneValue = true; |
14000 | bool usesOnlyOneConstantValue = true; |
14001 | bool isConstant = true; |
bool AllLanesExtractElt = true;
14003 | unsigned NumConstantLanes = 0; |
14004 | unsigned NumDifferentLanes = 0; |
14005 | unsigned NumUndefLanes = 0; |
14006 | SDValue Value; |
14007 | SDValue ConstantValue; |
14008 | SmallMapVector<SDValue, unsigned, 16> DifferentValueMap; |
14009 | unsigned ConsecutiveValCount = 0; |
14010 | SDValue PrevVal; |
14011 | for (unsigned i = 0; i < NumElts; ++i) { |
14012 | SDValue V = Op.getOperand(i); |
14013 | if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
14014 | AllLanesExtractElt = false; |
14015 | if (V.isUndef()) { |
14016 | ++NumUndefLanes; |
14017 | continue; |
14018 | } |
14019 | if (i > 0) |
14020 | isOnlyLowElement = false; |
14021 | if (!isIntOrFPConstant(V)) |
14022 | isConstant = false; |
14023 | |
14024 | if (isIntOrFPConstant(V)) { |
14025 | ++NumConstantLanes; |
14026 | if (!ConstantValue.getNode()) |
14027 | ConstantValue = V; |
14028 | else if (ConstantValue != V) |
14029 | usesOnlyOneConstantValue = false; |
14030 | } |
14031 | |
14032 | if (!Value.getNode()) |
14033 | Value = V; |
14034 | else if (V != Value) { |
14035 | usesOnlyOneValue = false; |
14036 | ++NumDifferentLanes; |
14037 | } |
14038 | |
14039 | if (PrevVal != V) { |
14040 | ConsecutiveValCount = 0; |
14041 | PrevVal = V; |
14042 | } |
14043 | |
// Keep each distinct value and the length of its last consecutive run. For example,
14045 | // |
14046 | // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, |
14047 | // t24, t24, t24, t24, t24, t24, t24, t24 |
14048 | // t23 = consecutive count 8 |
14049 | // t24 = consecutive count 8 |
14050 | // ------------------------------------------------------------------ |
14051 | // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24, |
14052 | // t24, t24, t24, t24, t24, t24, t24, t24 |
14053 | // t23 = consecutive count 5 |
14054 | // t24 = consecutive count 9 |
14055 | DifferentValueMap[V] = ++ConsecutiveValCount; |
14056 | } |
14057 | |
14058 | if (!Value.getNode()) { |
14059 | LLVM_DEBUG( |
14060 | dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n" ); |
14061 | return DAG.getUNDEF(VT); |
14062 | } |
14063 | |
14064 | // Convert BUILD_VECTOR where all elements but the lowest are undef into |
// SCALAR_TO_VECTOR, except when we have a single-element constant vector,
// as SimplifyDemandedBits will just turn that back into a BUILD_VECTOR.
14067 | if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(V: Value))) { |
14068 | LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " |
14069 | "SCALAR_TO_VECTOR node\n" ); |
14070 | return DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Value); |
14071 | } |
14072 | |
14073 | if (AllLanesExtractElt) { |
14074 | SDNode *Vector = nullptr; |
14075 | bool Even = false; |
14076 | bool Odd = false; |
14077 | // Check whether the extract elements match the Even pattern <0,2,4,...> or |
14078 | // the Odd pattern <1,3,5,...>. |
14079 | for (unsigned i = 0; i < NumElts; ++i) { |
14080 | SDValue V = Op.getOperand(i); |
14081 | const SDNode *N = V.getNode(); |
14082 | if (!isa<ConstantSDNode>(Val: N->getOperand(Num: 1))) { |
14083 | Even = false; |
14084 | Odd = false; |
14085 | break; |
14086 | } |
14087 | SDValue N0 = N->getOperand(Num: 0); |
14088 | |
14089 | // All elements are extracted from the same vector. |
14090 | if (!Vector) { |
14091 | Vector = N0.getNode(); |
14092 | // Check that the type of EXTRACT_VECTOR_ELT matches the type of |
14093 | // BUILD_VECTOR. |
14094 | if (VT.getVectorElementType() != |
14095 | N0.getValueType().getVectorElementType()) |
14096 | break; |
14097 | } else if (Vector != N0.getNode()) { |
14098 | Odd = false; |
14099 | Even = false; |
14100 | break; |
14101 | } |
14102 | |
14103 | // Extracted values are either at Even indices <0,2,4,...> or at Odd |
14104 | // indices <1,3,5,...>. |
14105 | uint64_t Val = N->getConstantOperandVal(Num: 1); |
14106 | if (Val == 2 * i) { |
14107 | Even = true; |
14108 | continue; |
14109 | } |
14110 | if (Val - 1 == 2 * i) { |
14111 | Odd = true; |
14112 | continue; |
14113 | } |
14114 | |
14115 | // Something does not match: abort. |
14116 | Odd = false; |
14117 | Even = false; |
14118 | break; |
14119 | } |
14120 | if (Even || Odd) { |
14121 | SDValue LHS = |
14122 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT, N1: SDValue(Vector, 0), |
14123 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
14124 | SDValue RHS = |
14125 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT, N1: SDValue(Vector, 0), |
14126 | N2: DAG.getConstant(Val: NumElts, DL: dl, VT: MVT::i64)); |
14127 | |
14128 | if (Even && !Odd) |
14129 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: LHS, N2: RHS); |
14130 | if (Odd && !Even) |
14131 | return DAG.getNode(Opcode: AArch64ISD::UZP2, DL: dl, VT, N1: LHS, N2: RHS); |
14132 | } |
14133 | } |
14134 | |
14135 | // Use DUP for non-constant splats. For f32 constant splats, reduce to |
14136 | // i32 and try again. |
14137 | if (usesOnlyOneValue) { |
14138 | if (!isConstant) { |
14139 | if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
14140 | Value.getValueType() != VT) { |
14141 | LLVM_DEBUG( |
14142 | dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n" ); |
14143 | return DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: Value); |
14144 | } |
14145 | |
14146 | // This is actually a DUPLANExx operation, which keeps everything vectory. |
14147 | |
14148 | SDValue Lane = Value.getOperand(i: 1); |
14149 | Value = Value.getOperand(i: 0); |
14150 | if (Value.getValueSizeInBits() == 64) { |
14151 | LLVM_DEBUG( |
14152 | dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " |
14153 | "widening it\n" ); |
14154 | Value = WidenVector(V64Reg: Value, DAG); |
14155 | } |
14156 | |
14157 | unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType()); |
14158 | return DAG.getNode(Opcode, DL: dl, VT, N1: Value, N2: Lane); |
14159 | } |
14160 | |
14161 | if (VT.getVectorElementType().isFloatingPoint()) { |
14162 | SmallVector<SDValue, 8> Ops; |
14163 | EVT EltTy = VT.getVectorElementType(); |
14164 | assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 || |
14165 | EltTy == MVT::f64) && "Unsupported floating-point vector type" ); |
14166 | LLVM_DEBUG( |
14167 | dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " |
14168 | "BITCASTS, and try again\n" ); |
14169 | MVT NewType = MVT::getIntegerVT(BitWidth: EltTy.getSizeInBits()); |
14170 | for (unsigned i = 0; i < NumElts; ++i) |
14171 | Ops.push_back(Elt: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: NewType, Operand: Op.getOperand(i))); |
14172 | EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: NewType, NumElements: NumElts); |
14173 | SDValue Val = DAG.getBuildVector(VT: VecVT, DL: dl, Ops); |
14174 | LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: " ; |
14175 | Val.dump();); |
14176 | Val = LowerBUILD_VECTOR(Op: Val, DAG); |
14177 | if (Val.getNode()) |
14178 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: Val); |
14179 | } |
14180 | } |
14181 | |
14182 | // If we need to insert a small number of different non-constant elements and |
14183 | // the vector width is sufficiently large, prefer using DUP with the common |
14184 | // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred, |
14185 | // skip the constant lane handling below. |
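// For example, a v8i16 where six lanes share one non-constant value and two
// lanes differ is better built as a single DUP plus two INSERT_VECTOR_ELTs.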
14186 | bool PreferDUPAndInsert = |
14187 | !isConstant && NumDifferentLanes >= 1 && |
14188 | NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) && |
14189 | NumDifferentLanes >= NumConstantLanes; |
14190 | |
// If only one constant value was used, and it appears in more than one lane,
14192 | // start by splatting that value, then replace the non-constant lanes. This |
14193 | // is better than the default, which will perform a separate initialization |
14194 | // for each lane. |
14195 | if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) { |
14196 | // Firstly, try to materialize the splat constant. |
14197 | SDValue Val = DAG.getSplatBuildVector(VT, DL: dl, Op: ConstantValue); |
14198 | unsigned BitSize = VT.getScalarSizeInBits(); |
14199 | APInt ConstantValueAPInt(1, 0); |
14200 | if (auto *C = dyn_cast<ConstantSDNode>(Val&: ConstantValue)) |
14201 | ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(width: BitSize); |
14202 | if (!isNullConstant(V: ConstantValue) && !isNullFPConstant(V: ConstantValue) && |
14203 | !ConstantValueAPInt.isAllOnes()) { |
14204 | Val = ConstantBuildVector(Op: Val, DAG, ST: Subtarget); |
14205 | if (!Val) |
14206 | // Otherwise, materialize the constant and splat it. |
14207 | Val = DAG.getNode(Opcode: AArch64ISD::DUP, DL: dl, VT, Operand: ConstantValue); |
14208 | } |
14209 | |
14210 | // Now insert the non-constant lanes. |
14211 | for (unsigned i = 0; i < NumElts; ++i) { |
14212 | SDValue V = Op.getOperand(i); |
14213 | SDValue LaneIdx = DAG.getConstant(Val: i, DL: dl, VT: MVT::i64); |
14214 | if (!isIntOrFPConstant(V)) |
14215 | // Note that type legalization likely mucked about with the VT of the |
14216 | // source operand, so we may have to convert it here before inserting. |
14217 | Val = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Val, N2: V, N3: LaneIdx); |
14218 | } |
14219 | return Val; |
14220 | } |
14221 | |
14222 | // This will generate a load from the constant pool. |
14223 | if (isConstant) { |
14224 | LLVM_DEBUG( |
14225 | dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " |
14226 | "expansion\n" ); |
14227 | return SDValue(); |
14228 | } |
14229 | |
14230 | // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from |
14231 | // v4i32s. This is really a truncate, which we can construct out of (legal) |
14232 | // concats and truncate nodes. |
14233 | if (SDValue M = ReconstructTruncateFromBuildVector(V: Op, DAG)) |
14234 | return M; |
14235 | |
14236 | // Empirical tests suggest this is rarely worth it for vectors of length <= 2. |
14237 | if (NumElts >= 4) { |
14238 | if (SDValue Shuffle = ReconstructShuffle(Op, DAG)) |
14239 | return Shuffle; |
14240 | |
14241 | if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG)) |
14242 | return Shuffle; |
14243 | } |
14244 | |
14245 | if (PreferDUPAndInsert) { |
14246 | // First, build a constant vector with the common element. |
14247 | SmallVector<SDValue, 8> Ops(NumElts, Value); |
14248 | SDValue NewVector = LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT, DL: dl, Ops), DAG); |
14249 | // Next, insert the elements that do not match the common value. |
14250 | for (unsigned I = 0; I < NumElts; ++I) |
14251 | if (Op.getOperand(i: I) != Value) |
14252 | NewVector = |
14253 | DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: NewVector, |
14254 | N2: Op.getOperand(i: I), N3: DAG.getConstant(Val: I, DL: dl, VT: MVT::i64)); |
14255 | |
14256 | return NewVector; |
14257 | } |
14258 | |
14259 | // If vector consists of two different values, try to generate two DUPs and |
14260 | // (CONCAT_VECTORS or VECTOR_SHUFFLE). |
14261 | if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) { |
14262 | SmallVector<SDValue, 2> Vals; |
// Check whether each value's consecutive run length is half the number of
// vector elements. In that case, we can use CONCAT_VECTORS. For example,
14265 | // |
14266 | // canUseVECTOR_CONCAT = true; |
14267 | // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23, |
14268 | // t24, t24, t24, t24, t24, t24, t24, t24 |
14269 | // |
14270 | // canUseVECTOR_CONCAT = false; |
14271 | // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24, |
14272 | // t24, t24, t24, t24, t24, t24, t24, t24 |
14273 | bool canUseVECTOR_CONCAT = true; |
14274 | for (auto Pair : DifferentValueMap) { |
// Check that each value's consecutive run length is NumElts / 2.
14276 | if (Pair.second != NumElts / 2) |
14277 | canUseVECTOR_CONCAT = false; |
14278 | Vals.push_back(Elt: Pair.first); |
14279 | } |
14280 | |
14281 | // If canUseVECTOR_CONCAT is true, we can generate two DUPs and |
14282 | // CONCAT_VECTORs. For example, |
14283 | // |
14284 | // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23, |
14285 | // t24, t24, t24, t24, t24, t24, t24, t24 |
14286 | // ==> |
14287 | // t26: v8i8 = AArch64ISD::DUP t23 |
14288 | // t28: v8i8 = AArch64ISD::DUP t24 |
14289 | // t29: v16i8 = concat_vectors t26, t28 |
14290 | if (canUseVECTOR_CONCAT) { |
14291 | EVT SubVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
14292 | if (isTypeLegal(VT: SubVT) && SubVT.isVector() && |
14293 | SubVT.getVectorNumElements() >= 2) { |
14294 | SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]); |
14295 | SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]); |
14296 | SDValue DUP1 = |
14297 | LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops1), DAG); |
14298 | SDValue DUP2 = |
14299 | LowerBUILD_VECTOR(Op: DAG.getBuildVector(VT: SubVT, DL: dl, Ops: Ops2), DAG); |
14300 | SDValue CONCAT_VECTORS = |
14301 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: DUP1, N2: DUP2); |
14302 | return CONCAT_VECTORS; |
14303 | } |
14304 | } |
14305 | |
14306 | // Let's try to generate VECTOR_SHUFFLE. For example, |
14307 | // |
14308 | // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26 |
14309 | // ==> |
14310 | // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26 |
14311 | // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25 |
14312 | // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28 |
14313 | if (NumElts >= 8) { |
14314 | SmallVector<int, 16> MaskVec; |
// Build the mask for VECTOR_SHUFFLE.
14316 | SDValue FirstLaneVal = Op.getOperand(i: 0); |
14317 | for (unsigned i = 0; i < NumElts; ++i) { |
14318 | SDValue Val = Op.getOperand(i); |
14319 | if (FirstLaneVal == Val) |
14320 | MaskVec.push_back(Elt: i); |
14321 | else |
14322 | MaskVec.push_back(Elt: i + NumElts); |
14323 | } |
14324 | |
14325 | SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]); |
14326 | SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]); |
14327 | SDValue VEC1 = DAG.getBuildVector(VT, DL: dl, Ops: Ops1); |
14328 | SDValue VEC2 = DAG.getBuildVector(VT, DL: dl, Ops: Ops2); |
14329 | SDValue VECTOR_SHUFFLE = |
14330 | DAG.getVectorShuffle(VT, dl, N1: VEC1, N2: VEC2, Mask: MaskVec); |
14331 | return VECTOR_SHUFFLE; |
14332 | } |
14333 | } |
14334 | |
14335 | // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we |
14336 | // know the default expansion would otherwise fall back on something even |
// worse. For a vector with one or two non-undef values, that would be a
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target); for everything else, it would be
// materialization element by element on the stack followed by a load.
14341 | if (!isConstant && !usesOnlyOneValue) { |
14342 | LLVM_DEBUG( |
14343 | dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " |
14344 | "of INSERT_VECTOR_ELT\n" ); |
14345 | |
14346 | SDValue Vec = DAG.getUNDEF(VT); |
14347 | SDValue Op0 = Op.getOperand(i: 0); |
14348 | unsigned i = 0; |
14349 | |
14350 | // Use SCALAR_TO_VECTOR for lane zero to |
14351 | // a) Avoid a RMW dependency on the full vector register, and |
14352 | // b) Allow the register coalescer to fold away the copy if the |
14353 | // value is already in an S or D register, and we're forced to emit an |
14354 | // INSERT_SUBREG that we can't fold anywhere. |
14355 | // |
14356 | // We also allow types like i8 and i16 which are illegal scalar but legal |
14357 | // vector element types. After type-legalization the inserted value is |
14358 | // extended (i32) and it is safe to cast them to the vector type by ignoring |
14359 | // the upper bits of the lowest lane (e.g. v8i8, v4i16). |
14360 | if (!Op0.isUndef()) { |
14361 | LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n" ); |
14362 | Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT, Operand: Op0); |
14363 | ++i; |
14364 | } |
14365 | LLVM_DEBUG(if (i < NumElts) dbgs() |
14366 | << "Creating nodes for the other vector elements:\n" ;); |
14367 | for (; i < NumElts; ++i) { |
14368 | SDValue V = Op.getOperand(i); |
14369 | if (V.isUndef()) |
14370 | continue; |
14371 | SDValue LaneIdx = DAG.getConstant(Val: i, DL: dl, VT: MVT::i64); |
14372 | Vec = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL: dl, VT, N1: Vec, N2: V, N3: LaneIdx); |
14373 | } |
14374 | return Vec; |
14375 | } |
14376 | |
14377 | LLVM_DEBUG( |
14378 | dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " |
14379 | "better alternative\n" ); |
14380 | return SDValue(); |
14381 | } |
14382 | |
14383 | SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op, |
14384 | SelectionDAG &DAG) const { |
14385 | if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(), |
14386 | OverrideNEON: !Subtarget->isNeonAvailable())) |
14387 | return LowerFixedLengthConcatVectorsToSVE(Op, DAG); |
14388 | |
14389 | assert(Op.getValueType().isScalableVector() && |
14390 | isTypeLegal(Op.getValueType()) && |
14391 | "Expected legal scalable vector type!" ); |
14392 | |
14393 | if (isTypeLegal(VT: Op.getOperand(i: 0).getValueType())) { |
14394 | unsigned NumOperands = Op->getNumOperands(); |
14395 | assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && |
14396 | "Unexpected number of operands in CONCAT_VECTORS" ); |
14397 | |
14398 | if (NumOperands == 2) |
14399 | return Op; |
14400 | |
14401 | // Concat each pair of subvectors and pack into the lower half of the array. |
14402 | SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end()); |
14403 | while (ConcatOps.size() > 1) { |
14404 | for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) { |
14405 | SDValue V1 = ConcatOps[I]; |
14406 | SDValue V2 = ConcatOps[I + 1]; |
14407 | EVT SubVT = V1.getValueType(); |
14408 | EVT PairVT = SubVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
14409 | ConcatOps[I / 2] = |
14410 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op), VT: PairVT, N1: V1, N2: V2); |
14411 | } |
14412 | ConcatOps.resize(N: ConcatOps.size() / 2); |
14413 | } |
14414 | return ConcatOps[0]; |
14415 | } |
14416 | |
14417 | return SDValue(); |
14418 | } |
14419 | |
14420 | SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, |
14421 | SelectionDAG &DAG) const { |
14422 | assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!" ); |
14423 | |
14424 | if (useSVEForFixedLengthVectorVT(VT: Op.getValueType(), |
14425 | OverrideNEON: !Subtarget->isNeonAvailable())) |
14426 | return LowerFixedLengthInsertVectorElt(Op, DAG); |
14427 | |
14428 | EVT VT = Op.getOperand(i: 0).getValueType(); |
14429 | |
14430 | if (VT.getScalarType() == MVT::i1) { |
14431 | EVT VectorVT = getPromotedVTForPredicate(VT); |
14432 | SDLoc DL(Op); |
14433 | SDValue ExtendedVector = |
14434 | DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, VT: VectorVT); |
14435 | SDValue ExtendedValue = |
14436 | DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 1), DL, |
14437 | VT: VectorVT.getScalarType().getSizeInBits() < 32 |
14438 | ? MVT::i32 |
14439 | : VectorVT.getScalarType()); |
14440 | ExtendedVector = |
14441 | DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: VectorVT, N1: ExtendedVector, |
14442 | N2: ExtendedValue, N3: Op.getOperand(i: 2)); |
14443 | return DAG.getAnyExtOrTrunc(Op: ExtendedVector, DL, VT); |
14444 | } |
14445 | |
14446 | // Check for non-constant or out of range lane. |
14447 | ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 2)); |
14448 | if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) |
14449 | return SDValue(); |
14450 | |
14451 | return Op; |
14452 | } |
14453 | |
14454 | SDValue |
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14456 | SelectionDAG &DAG) const { |
14457 | assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!" ); |
14458 | EVT VT = Op.getOperand(i: 0).getValueType(); |
14459 | |
14460 | if (VT.getScalarType() == MVT::i1) { |
14461 | // We can't directly extract from an SVE predicate; extend it first. |
14462 | // (This isn't the only possible lowering, but it's straightforward.) |
14463 | EVT VectorVT = getPromotedVTForPredicate(VT); |
14464 | SDLoc DL(Op); |
14465 | SDValue Extend = |
14466 | DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VectorVT, Operand: Op.getOperand(i: 0)); |
MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtractTy,
14469 | N1: Extend, N2: Op.getOperand(i: 1)); |
14470 | return DAG.getAnyExtOrTrunc(Op: Extract, DL, VT: Op.getValueType()); |
14471 | } |
14472 | |
14473 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
14474 | return LowerFixedLengthExtractVectorElt(Op, DAG); |
14475 | |
14476 | // Check for non-constant or out of range lane. |
14477 | ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1)); |
14478 | if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) |
14479 | return SDValue(); |
14480 | |
14481 | // Insertion/extraction are legal for V128 types. |
14482 | if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || |
14483 | VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || |
14484 | VT == MVT::v8f16 || VT == MVT::v8bf16) |
14485 | return Op; |
14486 | |
14487 | if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && |
14488 | VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 && |
14489 | VT != MVT::v4bf16) |
14490 | return SDValue(); |
14491 | |
// For V64 types, we perform extraction by expanding the value
// to a V128 type and performing the extraction on that.
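// For example, extracting lane 3 of a v4i16 roughly becomes:
//   t0: v8i16 = insert_subvector undef, v, 0   (widen to 128 bits)
//   t1: i32   = extract_vector_elt t0, 3       (i8/i16 results extend to i32)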
14494 | SDLoc DL(Op); |
14495 | SDValue WideVec = WidenVector(V64Reg: Op.getOperand(i: 0), DAG); |
14496 | EVT WideTy = WideVec.getValueType(); |
14497 | |
14498 | EVT ExtrTy = WideTy.getVectorElementType(); |
14499 | if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) |
14500 | ExtrTy = MVT::i32; |
14501 | |
14502 | // For extractions, we just return the result directly. |
14503 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ExtrTy, N1: WideVec, |
14504 | N2: Op.getOperand(i: 1)); |
14505 | } |
14506 | |
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14508 | SelectionDAG &DAG) const { |
14509 | EVT VT = Op.getValueType(); |
14510 | assert(VT.isFixedLengthVector() && |
14511 | "Only cases that extract a fixed length vector are supported!" ); |
14512 | EVT InVT = Op.getOperand(i: 0).getValueType(); |
14513 | |
14514 | // If we don't have legal types yet, do nothing |
14515 | if (!isTypeLegal(VT: InVT)) |
14516 | return SDValue(); |
14517 | |
14518 | if (InVT.is128BitVector()) { |
14519 | assert(VT.is64BitVector() && "Extracting unexpected vector type!" ); |
14520 | unsigned Idx = Op.getConstantOperandVal(i: 1); |
14521 | |
14522 | // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. |
14523 | if (Idx == 0) |
14524 | return Op; |
14525 | |
14526 | // If this is extracting the upper 64-bits of a 128-bit vector, we match |
14527 | // that directly. |
14528 | if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable()) |
14529 | return Op; |
14530 | } |
14531 | |
14532 | if (InVT.isScalableVector() || |
14533 | useSVEForFixedLengthVectorVT(VT: InVT, OverrideNEON: !Subtarget->isNeonAvailable())) { |
14534 | SDLoc DL(Op); |
14535 | SDValue Vec = Op.getOperand(i: 0); |
14536 | SDValue Idx = Op.getOperand(i: 1); |
14537 | |
14538 | EVT PackedVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType()); |
14539 | if (PackedVT != InVT) { |
14540 | // Pack input into the bottom part of an SVE register and try again. |
14541 | SDValue Container = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: PackedVT, |
14542 | N1: DAG.getUNDEF(VT: PackedVT), N2: Vec, |
14543 | N3: DAG.getVectorIdxConstant(Val: 0, DL)); |
14544 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Container, N2: Idx); |
14545 | } |
14546 | |
14547 | // This will get matched by custom code during ISelDAGToDAG. |
14548 | if (isNullConstant(V: Idx)) |
14549 | return Op; |
14550 | |
14551 | assert(InVT.isScalableVector() && "Unexpected vector type!" ); |
14552 | // Move requested subvector to the start of the vector and try again. |
14553 | SDValue Splice = DAG.getNode(Opcode: ISD::VECTOR_SPLICE, DL, VT: InVT, N1: Vec, N2: Vec, N3: Idx); |
14554 | return convertFromScalableVector(DAG, VT, V: Splice); |
14555 | } |
14556 | |
14557 | return SDValue(); |
14558 | } |
14559 | |
14560 | SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, |
14561 | SelectionDAG &DAG) const { |
14562 | assert(Op.getValueType().isScalableVector() && |
14563 | "Only expect to lower inserts into scalable vectors!" ); |
14564 | |
14565 | EVT InVT = Op.getOperand(i: 1).getValueType(); |
14566 | unsigned Idx = Op.getConstantOperandVal(i: 2); |
14567 | |
14568 | SDValue Vec0 = Op.getOperand(i: 0); |
14569 | SDValue Vec1 = Op.getOperand(i: 1); |
14570 | SDLoc DL(Op); |
14571 | EVT VT = Op.getValueType(); |
14572 | |
14573 | if (InVT.isScalableVector()) { |
14574 | if (!isTypeLegal(VT)) |
14575 | return SDValue(); |
14576 | |
14577 | // Break down insert_subvector into simpler parts. |
14578 | if (VT.getVectorElementType() == MVT::i1) { |
14579 | unsigned NumElts = VT.getVectorMinNumElements(); |
14580 | EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
14581 | |
14582 | SDValue Lo, Hi; |
14583 | Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0, |
14584 | N2: DAG.getVectorIdxConstant(Val: 0, DL)); |
14585 | Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: Vec0, |
14586 | N2: DAG.getVectorIdxConstant(Val: NumElts / 2, DL)); |
14587 | if (Idx < (NumElts / 2)) |
14588 | Lo = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Lo, N2: Vec1, |
14589 | N3: DAG.getVectorIdxConstant(Val: Idx, DL)); |
14590 | else |
14591 | Hi = DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: HalfVT, N1: Hi, N2: Vec1, |
14592 | N3: DAG.getVectorIdxConstant(Val: Idx - (NumElts / 2), DL)); |
14593 | |
14594 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Lo, N2: Hi); |
14595 | } |
14596 | |
14597 | // Ensure the subvector is half the size of the main vector. |
14598 | if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) |
14599 | return SDValue(); |
14600 | |
// Here narrow and wide refer to the vector element types. After "casting",
// both vectors must have the same bit length, so because the subvector has
// fewer elements, those elements need to be bigger.
14604 | EVT NarrowVT = getPackedSVEVectorVT(EC: VT.getVectorElementCount()); |
14605 | EVT WideVT = getPackedSVEVectorVT(EC: InVT.getVectorElementCount()); |
14606 | |
14607 | // NOP cast operands to the largest legal vector of the same element count. |
14608 | if (VT.isFloatingPoint()) { |
14609 | Vec0 = getSVESafeBitCast(VT: NarrowVT, Op: Vec0, DAG); |
14610 | Vec1 = getSVESafeBitCast(VT: WideVT, Op: Vec1, DAG); |
14611 | } else { |
14612 | // Legal integer vectors are already their largest so Vec0 is fine as is. |
14613 | Vec1 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: WideVT, Operand: Vec1); |
14614 | } |
14615 | |
14616 | // To replace the top/bottom half of vector V with vector SubV we widen the |
14617 | // preserved half of V, concatenate this to SubV (the order depending on the |
14618 | // half being replaced) and then narrow the result. |
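// For example, when replacing the bottom half (Idx == 0), roughly:
//   HiVec0 = uunpkhi Vec0       (widen the preserved top half)
//   Narrow = uzp1 Vec1, HiVec0  (re-pack as [SubV | top half of Vec0])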
14619 | SDValue Narrow; |
14620 | if (Idx == 0) { |
14621 | SDValue HiVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKHI, DL, VT: WideVT, Operand: Vec0); |
14622 | Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: Vec1, N2: HiVec0); |
14623 | } else { |
14624 | assert(Idx == InVT.getVectorMinNumElements() && |
14625 | "Invalid subvector index!" ); |
14626 | SDValue LoVec0 = DAG.getNode(Opcode: AArch64ISD::UUNPKLO, DL, VT: WideVT, Operand: Vec0); |
14627 | Narrow = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: NarrowVT, N1: LoVec0, N2: Vec1); |
14628 | } |
14629 | |
14630 | return getSVESafeBitCast(VT, Op: Narrow, DAG); |
14631 | } |
14632 | |
14633 | if (Idx == 0 && isPackedVectorType(VT, DAG)) { |
14634 | // This will be matched by custom code during ISelDAGToDAG. |
14635 | if (Vec0.isUndef()) |
14636 | return Op; |
14637 | |
14638 | std::optional<unsigned> PredPattern = |
14639 | getSVEPredPatternFromNumElements(MinNumElts: InVT.getVectorNumElements()); |
14640 | auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1); |
14641 | SDValue PTrue = getPTrue(DAG, DL, VT: PredTy, Pattern: *PredPattern); |
14642 | SDValue ScalableVec1 = convertToScalableVector(DAG, VT, V: Vec1); |
14643 | return DAG.getNode(Opcode: ISD::VSELECT, DL, VT, N1: PTrue, N2: ScalableVec1, N3: Vec0); |
14644 | } |
14645 | |
14646 | return SDValue(); |
14647 | } |
14648 | |
14649 | static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) { |
14650 | if (Op.getOpcode() != AArch64ISD::DUP && |
14651 | Op.getOpcode() != ISD::SPLAT_VECTOR && |
14652 | Op.getOpcode() != ISD::BUILD_VECTOR) |
14653 | return false; |
14654 | |
14655 | if (Op.getOpcode() == ISD::BUILD_VECTOR && |
14656 | !isAllConstantBuildVector(PotentialBVec: Op, ConstVal&: SplatVal)) |
14657 | return false; |
14658 | |
14659 | if (Op.getOpcode() != ISD::BUILD_VECTOR && |
14660 | !isa<ConstantSDNode>(Val: Op->getOperand(Num: 0))) |
14661 | return false; |
14662 | |
14663 | SplatVal = Op->getConstantOperandVal(Num: 0); |
14664 | if (Op.getValueType().getVectorElementType() != MVT::i64) |
14665 | SplatVal = (int32_t)SplatVal; |
14666 | |
14667 | Negated = false; |
14668 | if (isPowerOf2_64(Value: SplatVal)) |
14669 | return true; |
14670 | |
14671 | Negated = true; |
14672 | if (isPowerOf2_64(Value: -SplatVal)) { |
14673 | SplatVal = -SplatVal; |
14674 | return true; |
14675 | } |
14676 | |
14677 | return false; |
14678 | } |
14679 | |
14680 | SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { |
14681 | EVT VT = Op.getValueType(); |
14682 | SDLoc dl(Op); |
14683 | |
14684 | if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) |
14685 | return LowerFixedLengthVectorIntDivideToSVE(Op, DAG); |
14686 | |
14687 | assert(VT.isScalableVector() && "Expected a scalable vector." ); |
14688 | |
14689 | bool Signed = Op.getOpcode() == ISD::SDIV; |
14690 | unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; |
14691 | |
14692 | bool Negated; |
14693 | uint64_t SplatVal; |
14694 | if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) { |
14695 | SDValue Pg = getPredicateForScalableVector(DAG, DL&: dl, VT); |
14696 | SDValue Res = |
14697 | DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: dl, VT, N1: Pg, N2: Op->getOperand(Num: 0), |
14698 | N3: DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL: dl, VT: MVT::i32)); |
14699 | if (Negated) |
14700 | Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT, N1: DAG.getConstant(Val: 0, DL: dl, VT), N2: Res); |
14701 | |
14702 | return Res; |
14703 | } |
14704 | |
14705 | if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) |
14706 | return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode); |
14707 | |
14708 | // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit |
14709 | // operations, and truncate the result. |
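// For example, an nxv16i8 sdiv roughly becomes:
//   Lo  = sdiv (sunpklo a), (sunpklo b)   (nxv8i16, lowered again recursively)
//   Hi  = sdiv (sunpkhi a), (sunpkhi b)   (nxv8i16)
//   Res = uzp1 Lo, Hi                     (packed back to nxv16i8)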
14710 | EVT WidenedVT; |
14711 | if (VT == MVT::nxv16i8) |
14712 | WidenedVT = MVT::nxv8i16; |
14713 | else if (VT == MVT::nxv8i16) |
14714 | WidenedVT = MVT::nxv4i32; |
14715 | else |
14716 | llvm_unreachable("Unexpected Custom DIV operation" ); |
14717 | |
14718 | unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; |
14719 | unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI; |
14720 | SDValue Op0Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0)); |
14721 | SDValue Op1Lo = DAG.getNode(Opcode: UnpkLo, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1)); |
14722 | SDValue Op0Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 0)); |
14723 | SDValue Op1Hi = DAG.getNode(Opcode: UnpkHi, DL: dl, VT: WidenedVT, Operand: Op.getOperand(i: 1)); |
14724 | SDValue ResultLo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Lo, N2: Op1Lo); |
14725 | SDValue ResultHi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WidenedVT, N1: Op0Hi, N2: Op1Hi); |
14726 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL: dl, VT, N1: ResultLo, N2: ResultHi); |
14727 | } |
14728 | |
14729 | bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles( |
14730 | EVT VT, unsigned DefinedValues) const { |
14731 | if (!Subtarget->isNeonAvailable()) |
14732 | return false; |
14733 | return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); |
14734 | } |
14735 | |
14736 | bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { |
14737 | // Currently no fixed length shuffles that require SVE are legal. |
14738 | if (useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
14739 | return false; |
14740 | |
14741 | if (VT.getVectorNumElements() == 4 && |
14742 | (VT.is128BitVector() || VT.is64BitVector())) { |
14743 | unsigned Cost = getPerfectShuffleCost(M); |
14744 | if (Cost <= 1) |
14745 | return true; |
14746 | } |
14747 | |
14748 | bool DummyBool; |
14749 | int DummyInt; |
14750 | unsigned DummyUnsigned; |
14751 | |
14752 | unsigned EltSize = VT.getScalarSizeInBits(); |
14753 | unsigned NumElts = VT.getVectorNumElements(); |
14754 | return (ShuffleVectorSDNode::isSplatMask(Mask: &M[0], VT) || |
14755 | isREVMask(M, EltSize, NumElts, BlockSize: 64) || |
14756 | isREVMask(M, EltSize, NumElts, BlockSize: 32) || |
14757 | isREVMask(M, EltSize, NumElts, BlockSize: 16) || |
14758 | isEXTMask(M, VT, ReverseEXT&: DummyBool, Imm&: DummyUnsigned) || |
14759 | isTRNMask(M, NumElts, WhichResult&: DummyUnsigned) || |
14760 | isUZPMask(M, NumElts, WhichResultOut&: DummyUnsigned) || |
14761 | isZIPMask(M, NumElts, WhichResultOut&: DummyUnsigned) || |
14762 | isTRN_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) || |
14763 | isUZP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) || |
14764 | isZIP_v_undef_Mask(M, VT, WhichResult&: DummyUnsigned) || |
14765 | isINSMask(M, NumInputElements: NumElts, DstIsLeft&: DummyBool, Anomaly&: DummyInt) || |
14766 | isConcatMask(Mask: M, VT, SplitLHS: VT.getSizeInBits() == 128)); |
14767 | } |
14768 | |
14769 | bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M, |
14770 | EVT VT) const { |
14771 | // Just delegate to the generic legality, clear masks aren't special. |
14772 | return isShuffleMaskLegal(M, VT); |
14773 | } |
14774 | |
14775 | /// getVShiftImm - Check if this is a valid build_vector for the immediate |
14776 | /// operand of a vector shift operation, where all the elements of the |
14777 | /// build_vector must have the same constant integer value. |
14778 | static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { |
14779 | // Ignore bit_converts. |
14780 | while (Op.getOpcode() == ISD::BITCAST) |
14781 | Op = Op.getOperand(i: 0); |
14782 | BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: Op.getNode()); |
14783 | APInt SplatBits, SplatUndef; |
14784 | unsigned SplatBitSize; |
14785 | bool HasAnyUndefs; |
14786 | if (!BVN || !BVN->isConstantSplat(SplatValue&: SplatBits, SplatUndef, SplatBitSize, |
14787 | HasAnyUndefs, MinSplatBits: ElementBits) || |
14788 | SplatBitSize > ElementBits) |
14789 | return false; |
14790 | Cnt = SplatBits.getSExtValue(); |
14791 | return true; |
14792 | } |
14793 | |
14794 | /// isVShiftLImm - Check if this is a valid build_vector for the immediate |
14795 | /// operand of a vector shift left operation. That value must be in the range: |
14796 | /// 0 <= Value < ElementBits for a left shift; or |
14797 | /// 0 <= Value <= ElementBits for a long left shift. |
14798 | static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { |
14799 | assert(VT.isVector() && "vector shift count is not a vector type" ); |
14800 | int64_t ElementBits = VT.getScalarSizeInBits(); |
14801 | if (!getVShiftImm(Op, ElementBits, Cnt)) |
14802 | return false; |
14803 | return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); |
14804 | } |
14805 | |
14806 | /// isVShiftRImm - Check if this is a valid build_vector for the immediate |
14807 | /// operand of a vector shift right operation. The value must be in the range: |
/// 1 <= Value <= ElementBits for a right shift; or
/// 1 <= Value <= ElementBits/2 for a narrow right shift.
14809 | static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { |
14810 | assert(VT.isVector() && "vector shift count is not a vector type" ); |
14811 | int64_t ElementBits = VT.getScalarSizeInBits(); |
14812 | if (!getVShiftImm(Op, ElementBits, Cnt)) |
14813 | return false; |
14814 | return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); |
14815 | } |
14816 | |
14817 | SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op, |
14818 | SelectionDAG &DAG) const { |
14819 | EVT VT = Op.getValueType(); |
14820 | |
14821 | if (VT.getScalarType() == MVT::i1) { |
14822 | // Lower i1 truncate to `(x & 1) != 0`. |
14823 | SDLoc dl(Op); |
14824 | EVT OpVT = Op.getOperand(i: 0).getValueType(); |
14825 | SDValue Zero = DAG.getConstant(Val: 0, DL: dl, VT: OpVT); |
14826 | SDValue One = DAG.getConstant(Val: 1, DL: dl, VT: OpVT); |
14827 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: OpVT, N1: Op.getOperand(i: 0), N2: One); |
14828 | return DAG.getSetCC(DL: dl, VT, LHS: And, RHS: Zero, Cond: ISD::SETNE); |
14829 | } |
14830 | |
14831 | if (!VT.isVector() || VT.isScalableVector()) |
14832 | return SDValue(); |
14833 | |
14834 | if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(), |
14835 | OverrideNEON: !Subtarget->isNeonAvailable())) |
14836 | return LowerFixedLengthVectorTruncateToSVE(Op, DAG); |
14837 | |
14838 | return SDValue(); |
14839 | } |
14840 | |
// Check if we can lower this SRL to a rounding shift instruction. ResVT is
// possibly a truncated type; it tells how many bits of the value are to be
// used.
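// That is, (srl (add X, 1 << (C - 1)), C) can become a rounding shift right by
// C, provided the add cannot overflow into bits that the (possibly truncated)
// result actually uses.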
14844 | static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, |
14845 | SelectionDAG &DAG, |
14846 | unsigned &ShiftValue, |
14847 | SDValue &RShOperand) { |
14848 | if (Shift->getOpcode() != ISD::SRL) |
14849 | return false; |
14850 | |
14851 | EVT VT = Shift.getValueType(); |
14852 | assert(VT.isScalableVT()); |
14853 | |
14854 | auto ShiftOp1 = |
14855 | dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Shift->getOperand(Num: 1))); |
14856 | if (!ShiftOp1) |
14857 | return false; |
14858 | |
14859 | ShiftValue = ShiftOp1->getZExtValue(); |
14860 | if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits()) |
14861 | return false; |
14862 | |
14863 | SDValue Add = Shift->getOperand(Num: 0); |
14864 | if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse()) |
14865 | return false; |
14866 | |
14867 | assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() && |
14868 | "ResVT must be truncated or same type as the shift." ); |
14869 | // Check if an overflow can lead to incorrect results. |
uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14871 | if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap()) |
14872 | return false; |
14873 | |
14874 | auto AddOp1 = |
14875 | dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: Add->getOperand(Num: 1))); |
14876 | if (!AddOp1) |
14877 | return false; |
14878 | uint64_t AddValue = AddOp1->getZExtValue(); |
14879 | if (AddValue != 1ULL << (ShiftValue - 1)) |
14880 | return false; |
14881 | |
14882 | RShOperand = Add->getOperand(Num: 0); |
14883 | return true; |
14884 | } |
14885 | |
14886 | SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, |
14887 | SelectionDAG &DAG) const { |
14888 | EVT VT = Op.getValueType(); |
14889 | SDLoc DL(Op); |
14890 | int64_t Cnt; |
14891 | |
14892 | if (!Op.getOperand(i: 1).getValueType().isVector()) |
14893 | return Op; |
14894 | unsigned EltSize = VT.getScalarSizeInBits(); |
14895 | |
14896 | switch (Op.getOpcode()) { |
14897 | case ISD::SHL: |
14898 | if (VT.isScalableVector() || |
14899 | useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) |
14900 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SHL_PRED); |
14901 | |
14902 | if (isVShiftLImm(Op: Op.getOperand(i: 1), VT, isLong: false, Cnt) && Cnt < EltSize) |
14903 | return DAG.getNode(Opcode: AArch64ISD::VSHL, DL, VT, N1: Op.getOperand(i: 0), |
14904 | N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32)); |
14905 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, |
14906 | N1: DAG.getConstant(Val: Intrinsic::aarch64_neon_ushl, DL, |
14907 | VT: MVT::i32), |
14908 | N2: Op.getOperand(i: 0), N3: Op.getOperand(i: 1)); |
14909 | case ISD::SRA: |
14910 | case ISD::SRL: |
14911 | if (VT.isScalableVector() && |
14912 | (Subtarget->hasSVE2() || |
14913 | (Subtarget->hasSME() && Subtarget->isStreaming()))) { |
14914 | SDValue RShOperand; |
14915 | unsigned ShiftValue; |
14916 | if (canLowerSRLToRoundingShiftForVT(Shift: Op, ResVT: VT, DAG, ShiftValue, RShOperand)) |
14917 | return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT, |
14918 | N1: getPredicateForVector(DAG, DL, VT), N2: RShOperand, |
14919 | N3: DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32)); |
14920 | } |
14921 | |
14922 | if (VT.isScalableVector() || |
14923 | useSVEForFixedLengthVectorVT(VT, OverrideNEON: !Subtarget->isNeonAvailable())) { |
14924 | unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED |
14925 | : AArch64ISD::SRL_PRED; |
14926 | return LowerToPredicatedOp(Op, DAG, NewOp: Opc); |
14927 | } |
14928 | |
14929 | // Right shift immediate |
14930 | if (isVShiftRImm(Op: Op.getOperand(i: 1), VT, isNarrow: false, Cnt) && Cnt < EltSize) { |
14931 | unsigned Opc = |
14932 | (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; |
14933 | return DAG.getNode(Opcode: Opc, DL, VT, N1: Op.getOperand(i: 0), |
14934 | N2: DAG.getConstant(Val: Cnt, DL, VT: MVT::i32), Flags: Op->getFlags()); |
14935 | } |
14936 | |
// Right shift register. Note that there is no shift-right-register
// instruction; the shift-left-register instruction takes a signed value,
// where negative amounts specify a right shift.
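// For example, (srl V, Amt) roughly becomes:
//   NegAmt = sub 0, Amt
//   Res    = int_aarch64_neon_ushl V, NegAmt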
14940 | unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl |
14941 | : Intrinsic::aarch64_neon_ushl; |
14942 | // negate the shift amount |
14943 | SDValue NegShift = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), |
14944 | N2: Op.getOperand(i: 1)); |
14945 | SDValue NegShiftLeft = |
14946 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, |
14947 | N1: DAG.getConstant(Val: Opc, DL, VT: MVT::i32), N2: Op.getOperand(i: 0), |
14948 | N3: NegShift); |
14949 | return NegShiftLeft; |
14950 | } |
14951 | |
14952 | llvm_unreachable("unexpected shift opcode" ); |
14953 | } |
14954 | |
14955 | static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, |
14956 | AArch64CC::CondCode CC, bool NoNans, EVT VT, |
14957 | const SDLoc &dl, SelectionDAG &DAG) { |
14958 | EVT SrcVT = LHS.getValueType(); |
14959 | assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && |
14960 | "function only supposed to emit natural comparisons" ); |
14961 | |
14962 | APInt SplatValue; |
14963 | APInt SplatUndef; |
14964 | unsigned SplatBitSize = 0; |
14965 | bool HasAnyUndefs; |
14966 | |
14967 | BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode()); |
14968 | bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef, |
14969 | SplatBitSize, HasAnyUndefs); |
14970 | |
14971 | bool IsZero = IsCnst && SplatValue == 0; |
14972 | bool IsOne = |
14973 | IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1; |
14974 | bool IsMinusOne = IsCnst && SplatValue.isAllOnes(); |
14975 | |
14976 | if (SrcVT.getVectorElementType().isFloatingPoint()) { |
14977 | switch (CC) { |
14978 | default: |
14979 | return SDValue(); |
14980 | case AArch64CC::NE: { |
14981 | SDValue Fcmeq; |
14982 | if (IsZero) |
14983 | Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS); |
14984 | else |
14985 | Fcmeq = DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS); |
14986 | return DAG.getNOT(DL: dl, Val: Fcmeq, VT); |
14987 | } |
14988 | case AArch64CC::EQ: |
14989 | if (IsZero) |
14990 | return DAG.getNode(Opcode: AArch64ISD::FCMEQz, DL: dl, VT, Operand: LHS); |
14991 | return DAG.getNode(Opcode: AArch64ISD::FCMEQ, DL: dl, VT, N1: LHS, N2: RHS); |
14992 | case AArch64CC::GE: |
14993 | if (IsZero) |
14994 | return DAG.getNode(Opcode: AArch64ISD::FCMGEz, DL: dl, VT, Operand: LHS); |
14995 | return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: LHS, N2: RHS); |
14996 | case AArch64CC::GT: |
14997 | if (IsZero) |
14998 | return DAG.getNode(Opcode: AArch64ISD::FCMGTz, DL: dl, VT, Operand: LHS); |
14999 | return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: LHS, N2: RHS); |
15000 | case AArch64CC::LE: |
15001 | if (!NoNans) |
15002 | return SDValue(); |
// If we ignore NaNs then we can use the LS implementation.
15004 | [[fallthrough]]; |
15005 | case AArch64CC::LS: |
15006 | if (IsZero) |
15007 | return DAG.getNode(Opcode: AArch64ISD::FCMLEz, DL: dl, VT, Operand: LHS); |
15008 | return DAG.getNode(Opcode: AArch64ISD::FCMGE, DL: dl, VT, N1: RHS, N2: LHS); |
15009 | case AArch64CC::LT: |
15010 | if (!NoNans) |
15011 | return SDValue(); |
// If we ignore NaNs then we can use the MI implementation.
15013 | [[fallthrough]]; |
15014 | case AArch64CC::MI: |
15015 | if (IsZero) |
15016 | return DAG.getNode(Opcode: AArch64ISD::FCMLTz, DL: dl, VT, Operand: LHS); |
15017 | return DAG.getNode(Opcode: AArch64ISD::FCMGT, DL: dl, VT, N1: RHS, N2: LHS); |
15018 | } |
15019 | } |
15020 | |
15021 | switch (CC) { |
15022 | default: |
15023 | return SDValue(); |
15024 | case AArch64CC::NE: { |
15025 | SDValue Cmeq; |
15026 | if (IsZero) |
15027 | Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS); |
15028 | else |
15029 | Cmeq = DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS); |
15030 | return DAG.getNOT(DL: dl, Val: Cmeq, VT); |
15031 | } |
15032 | case AArch64CC::EQ: |
15033 | if (IsZero) |
15034 | return DAG.getNode(Opcode: AArch64ISD::CMEQz, DL: dl, VT, Operand: LHS); |
15035 | return DAG.getNode(Opcode: AArch64ISD::CMEQ, DL: dl, VT, N1: LHS, N2: RHS); |
15036 | case AArch64CC::GE: |
15037 | if (IsZero) |
15038 | return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, Operand: LHS); |
15039 | return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: LHS, N2: RHS); |
15040 | case AArch64CC::GT: |
15041 | if (IsZero) |
15042 | return DAG.getNode(Opcode: AArch64ISD::CMGTz, DL: dl, VT, Operand: LHS); |
15043 | if (IsMinusOne) |
15044 | return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: dl, VT, N1: LHS, N2: RHS); |
15045 | return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: LHS, N2: RHS); |
15046 | case AArch64CC::LE: |
15047 | if (IsZero) |
15048 | return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS); |
15049 | return DAG.getNode(Opcode: AArch64ISD::CMGE, DL: dl, VT, N1: RHS, N2: LHS); |
15050 | case AArch64CC::LS: |
15051 | return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: RHS, N2: LHS); |
15052 | case AArch64CC::LO: |
15053 | return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: RHS, N2: LHS); |
15054 | case AArch64CC::LT: |
15055 | if (IsZero) |
15056 | return DAG.getNode(Opcode: AArch64ISD::CMLTz, DL: dl, VT, Operand: LHS); |
15057 | if (IsOne) |
15058 | return DAG.getNode(Opcode: AArch64ISD::CMLEz, DL: dl, VT, Operand: LHS); |
15059 | return DAG.getNode(Opcode: AArch64ISD::CMGT, DL: dl, VT, N1: RHS, N2: LHS); |
15060 | case AArch64CC::HI: |
15061 | return DAG.getNode(Opcode: AArch64ISD::CMHI, DL: dl, VT, N1: LHS, N2: RHS); |
15062 | case AArch64CC::HS: |
15063 | return DAG.getNode(Opcode: AArch64ISD::CMHS, DL: dl, VT, N1: LHS, N2: RHS); |
15064 | } |
15065 | } |
15066 | |
15067 | SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, |
15068 | SelectionDAG &DAG) const { |
15069 | if (Op.getValueType().isScalableVector()) |
15070 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::SETCC_MERGE_ZERO); |
15071 | |
15072 | if (useSVEForFixedLengthVectorVT(VT: Op.getOperand(i: 0).getValueType(), |
15073 | OverrideNEON: !Subtarget->isNeonAvailable())) |
15074 | return LowerFixedLengthVectorSetccToSVE(Op, DAG); |
15075 | |
15076 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get(); |
15077 | SDValue LHS = Op.getOperand(i: 0); |
15078 | SDValue RHS = Op.getOperand(i: 1); |
15079 | EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); |
15080 | SDLoc dl(Op); |
15081 | |
15082 | if (LHS.getValueType().getVectorElementType().isInteger()) { |
15083 | assert(LHS.getValueType() == RHS.getValueType()); |
15084 | AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); |
15085 | SDValue Cmp = |
15086 | EmitVectorComparison(LHS, RHS, CC: AArch64CC, NoNans: false, VT: CmpVT, dl, DAG); |
15087 | return DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType()); |
15088 | } |
15089 | |
15090 | // Lower isnan(x) | isnan(never-nan) to x != x. |
15091 | // Lower !isnan(x) & !isnan(never-nan) to x == x. |
15092 | if (CC == ISD::SETUO || CC == ISD::SETO) { |
15093 | bool OneNaN = false; |
15094 | if (LHS == RHS) { |
15095 | OneNaN = true; |
15096 | } else if (DAG.isKnownNeverNaN(Op: RHS)) { |
15097 | OneNaN = true; |
15098 | RHS = LHS; |
15099 | } else if (DAG.isKnownNeverNaN(Op: LHS)) { |
15100 | OneNaN = true; |
15101 | LHS = RHS; |
15102 | } |
15103 | if (OneNaN) { |
15104 | CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ; |
15105 | } |
15106 | } |
15107 | |
15108 | const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
15109 | |
// Make v4f16 (only) fcmp operations utilise vector instructions.
// v8f16 support will be a little more complicated.
15112 | if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) || |
15113 | LHS.getValueType().getVectorElementType() == MVT::bf16) { |
15114 | if (LHS.getValueType().getVectorNumElements() == 4) { |
15115 | LHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::v4f32, Operand: LHS); |
15116 | RHS = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: dl, VT: MVT::v4f32, Operand: RHS); |
15117 | SDValue NewSetcc = DAG.getSetCC(DL: dl, VT: MVT::v4i16, LHS, RHS, Cond: CC); |
15118 | DAG.ReplaceAllUsesWith(From: Op, To: NewSetcc); |
15119 | CmpVT = MVT::v4i32; |
15120 | } else |
15121 | return SDValue(); |
15122 | } |
15123 | |
15124 | assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) || |
15125 | LHS.getValueType().getVectorElementType() != MVT::bf16 || |
15126 | LHS.getValueType().getVectorElementType() != MVT::f128); |
15127 | |
15128 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally |
15129 | // clean. Some of them require two branches to implement. |
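// For example, an ordered not-equal compare is emitted as two comparisons
// (greater-than and less-than) whose results are ORed together.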
15130 | AArch64CC::CondCode CC1, CC2; |
15131 | bool ShouldInvert; |
15132 | changeVectorFPCCToAArch64CC(CC, CondCode&: CC1, CondCode2&: CC2, Invert&: ShouldInvert); |
15133 | |
15134 | bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs(); |
15135 | SDValue Cmp = |
15136 | EmitVectorComparison(LHS, RHS, CC: CC1, NoNans: NoNaNs, VT: CmpVT, dl, DAG); |
15137 | if (!Cmp.getNode()) |
15138 | return SDValue(); |
15139 | |
15140 | if (CC2 != AArch64CC::AL) { |
15141 | SDValue Cmp2 = |
15142 | EmitVectorComparison(LHS, RHS, CC: CC2, NoNans: NoNaNs, VT: CmpVT, dl, DAG); |
15143 | if (!Cmp2.getNode()) |
15144 | return SDValue(); |
15145 | |
15146 | Cmp = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: CmpVT, N1: Cmp, N2: Cmp2); |
15147 | } |
15148 | |
15149 | Cmp = DAG.getSExtOrTrunc(Op: Cmp, DL: dl, VT: Op.getValueType()); |
15150 | |
15151 | if (ShouldInvert) |
15152 | Cmp = DAG.getNOT(DL: dl, Val: Cmp, VT: Cmp.getValueType()); |
15153 | |
15154 | return Cmp; |
15155 | } |
15156 | |
15157 | static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, |
15158 | SelectionDAG &DAG) { |
15159 | SDValue VecOp = ScalarOp.getOperand(i: 0); |
15160 | auto Rdx = DAG.getNode(Opcode: Op, DL, VT: VecOp.getSimpleValueType(), Operand: VecOp); |
15161 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarOp.getValueType(), N1: Rdx, |
15162 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
15163 | } |
15164 | |
15165 | static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, |
15166 | SDLoc DL, SelectionDAG &DAG) { |
15167 | unsigned ScalarOpcode; |
15168 | switch (Opcode) { |
15169 | case ISD::VECREDUCE_AND: |
15170 | ScalarOpcode = ISD::AND; |
15171 | break; |
15172 | case ISD::VECREDUCE_OR: |
15173 | ScalarOpcode = ISD::OR; |
15174 | break; |
15175 | case ISD::VECREDUCE_XOR: |
15176 | ScalarOpcode = ISD::XOR; |
15177 | break; |
15178 | default: |
15179 | llvm_unreachable("Expected bitwise vector reduction" ); |
15180 | return SDValue(); |
15181 | } |
15182 | |
15183 | EVT VecVT = Vec.getValueType(); |
15184 | assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() && |
15185 | "Expected power-of-2 length vector" ); |
15186 | |
15187 | EVT ElemVT = VecVT.getVectorElementType(); |
15188 | |
15189 | SDValue Result; |
15190 | unsigned NumElems = VecVT.getVectorNumElements(); |
15191 | |
15192 | // Special case for boolean reductions |
15193 | if (ElemVT == MVT::i1) { |
15194 | // Split large vectors into smaller ones |
15195 | if (NumElems > 16) { |
15196 | SDValue Lo, Hi; |
15197 | std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL); |
15198 | EVT HalfVT = Lo.getValueType(); |
15199 | SDValue HalfVec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: HalfVT, N1: Lo, N2: Hi); |
15200 | return getVectorBitwiseReduce(Opcode, Vec: HalfVec, VT, DL, DAG); |
15201 | } |
15202 | |
15203 | // Vectors that are less than 64 bits get widened to neatly fit a 64 bit |
15204 | // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to |
15205 | // this element size leads to the best codegen, since e.g. setcc results |
15206 | // might need to be truncated otherwise. |
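// For example, a v8i1 AND reduction roughly becomes:
//   t0: v8i8 = sign_extend mask
//   t1: i8   = vecreduce_umin t0
//   t2: i1   = truncate t1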
15207 | EVT ExtendedVT = MVT::getIntegerVT(BitWidth: std::max(a: 64u / NumElems, b: 8u)); |
15208 | |
15209 | // any_ext doesn't work with umin/umax, so only use it for uadd. |
15210 | unsigned ExtendOp = |
15211 | ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND; |
15212 | SDValue Extended = DAG.getNode( |
15213 | Opcode: ExtendOp, DL, VT: VecVT.changeVectorElementType(EltVT: ExtendedVT), Operand: Vec); |
15214 | switch (ScalarOpcode) { |
15215 | case ISD::AND: |
15216 | Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMIN, DL, VT: ExtendedVT, Operand: Extended); |
15217 | break; |
15218 | case ISD::OR: |
15219 | Result = DAG.getNode(Opcode: ISD::VECREDUCE_UMAX, DL, VT: ExtendedVT, Operand: Extended); |
15220 | break; |
15221 | case ISD::XOR: |
15222 | Result = DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ExtendedVT, Operand: Extended); |
15223 | break; |
15224 | default: |
15225 | llvm_unreachable("Unexpected Opcode" ); |
15226 | } |
15227 | |
15228 | Result = DAG.getAnyExtOrTrunc(Op: Result, DL, VT: MVT::i1); |
15229 | } else { |
15230 | // Iteratively split the vector in half and combine using the bitwise |
15231 | // operation until it fits in a 64 bit register. |
15232 | while (VecVT.getSizeInBits() > 64) { |
15233 | SDValue Lo, Hi; |
15234 | std::tie(args&: Lo, args&: Hi) = DAG.SplitVector(N: Vec, DL); |
15235 | VecVT = Lo.getValueType(); |
15236 | NumElems = VecVT.getVectorNumElements(); |
15237 | Vec = DAG.getNode(Opcode: ScalarOpcode, DL, VT: VecVT, N1: Lo, N2: Hi); |
15238 | } |
15239 | |
15240 | EVT ScalarVT = EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: VecVT.getSizeInBits()); |
15241 | |
15242 | // Do the remaining work on a scalar since it allows the code generator to |
15243 | // combine the shift and bitwise operation into one instruction and since |
15244 | // integer instructions can have higher throughput than vector instructions. |
15245 | SDValue Scalar = DAG.getBitcast(VT: ScalarVT, V: Vec); |
15246 | |
15247 | // Iteratively combine the lower and upper halves of the scalar using the |
15248 | // bitwise operation, halving the relevant region of the scalar in each |
15249 | // iteration, until the relevant region is just one element of the original |
15250 | // vector. |
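// For example, an OR reduction of a v8i8 roughly becomes:
//   s = bitcast v to i64
//   s = or s, (srl s, 32)
//   s = or s, (srl s, 16)
//   s = or s, (srl s, 8)
//   result = truncate s to i8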
15251 | for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) { |
15252 | SDValue ShiftAmount = |
15253 | DAG.getConstant(Val: Shift * ElemVT.getSizeInBits(), DL, VT: MVT::i64); |
15254 | SDValue Shifted = |
15255 | DAG.getNode(Opcode: ISD::SRL, DL, VT: ScalarVT, N1: Scalar, N2: ShiftAmount); |
15256 | Scalar = DAG.getNode(Opcode: ScalarOpcode, DL, VT: ScalarVT, N1: Scalar, N2: Shifted); |
15257 | } |
15258 | |
15259 | Result = DAG.getAnyExtOrTrunc(Op: Scalar, DL, VT: ElemVT); |
15260 | } |
15261 | |
15262 | return DAG.getAnyExtOrTrunc(Op: Result, DL, VT); |
15263 | } |
15264 | |
15265 | SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, |
15266 | SelectionDAG &DAG) const { |
15267 | SDValue Src = Op.getOperand(i: 0); |
15268 | |
15269 | // Try to lower fixed length reductions to SVE. |
15270 | EVT SrcVT = Src.getValueType(); |
15271 | bool OverrideNEON = !Subtarget->isNeonAvailable() || |
15272 | Op.getOpcode() == ISD::VECREDUCE_AND || |
15273 | Op.getOpcode() == ISD::VECREDUCE_OR || |
15274 | Op.getOpcode() == ISD::VECREDUCE_XOR || |
15275 | Op.getOpcode() == ISD::VECREDUCE_FADD || |
15276 | (Op.getOpcode() != ISD::VECREDUCE_ADD && |
15277 | SrcVT.getVectorElementType() == MVT::i64); |
15278 | if (SrcVT.isScalableVector() || |
15279 | useSVEForFixedLengthVectorVT( |
15280 | VT: SrcVT, OverrideNEON: OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) { |
15281 | |
15282 | if (SrcVT.getVectorElementType() == MVT::i1) |
15283 | return LowerPredReductionToSVE(ScalarOp: Op, DAG); |
15284 | |
15285 | switch (Op.getOpcode()) { |
15286 | case ISD::VECREDUCE_ADD: |
15287 | return LowerReductionToSVE(Opcode: AArch64ISD::UADDV_PRED, ScalarOp: Op, DAG); |
15288 | case ISD::VECREDUCE_AND: |
15289 | return LowerReductionToSVE(Opcode: AArch64ISD::ANDV_PRED, ScalarOp: Op, DAG); |
15290 | case ISD::VECREDUCE_OR: |
15291 | return LowerReductionToSVE(Opcode: AArch64ISD::ORV_PRED, ScalarOp: Op, DAG); |
15292 | case ISD::VECREDUCE_SMAX: |
15293 | return LowerReductionToSVE(Opcode: AArch64ISD::SMAXV_PRED, ScalarOp: Op, DAG); |
15294 | case ISD::VECREDUCE_SMIN: |
15295 | return LowerReductionToSVE(Opcode: AArch64ISD::SMINV_PRED, ScalarOp: Op, DAG); |
15296 | case ISD::VECREDUCE_UMAX: |
15297 | return LowerReductionToSVE(Opcode: AArch64ISD::UMAXV_PRED, ScalarOp: Op, DAG); |
15298 | case ISD::VECREDUCE_UMIN: |
15299 | return LowerReductionToSVE(Opcode: AArch64ISD::UMINV_PRED, ScalarOp: Op, DAG); |
15300 | case ISD::VECREDUCE_XOR: |
15301 | return LowerReductionToSVE(Opcode: AArch64ISD::EORV_PRED, ScalarOp: Op, DAG); |
15302 | case ISD::VECREDUCE_FADD: |
15303 | return LowerReductionToSVE(Opcode: AArch64ISD::FADDV_PRED, ScalarOp: Op, DAG); |
15304 | case ISD::VECREDUCE_FMAX: |
15305 | return LowerReductionToSVE(Opcode: AArch64ISD::FMAXNMV_PRED, ScalarOp: Op, DAG); |
15306 | case ISD::VECREDUCE_FMIN: |
15307 | return LowerReductionToSVE(Opcode: AArch64ISD::FMINNMV_PRED, ScalarOp: Op, DAG); |
15308 | case ISD::VECREDUCE_FMAXIMUM: |
15309 | return LowerReductionToSVE(Opcode: AArch64ISD::FMAXV_PRED, ScalarOp: Op, DAG); |
15310 | case ISD::VECREDUCE_FMINIMUM: |
15311 | return LowerReductionToSVE(Opcode: AArch64ISD::FMINV_PRED, ScalarOp: Op, DAG); |
15312 | default: |
15313 | llvm_unreachable("Unhandled fixed length reduction" ); |
15314 | } |
15315 | } |
15316 | |
15317 | // Lower NEON reductions. |
15318 | SDLoc dl(Op); |
15319 | switch (Op.getOpcode()) { |
15320 | case ISD::VECREDUCE_AND: |
15321 | case ISD::VECREDUCE_OR: |
15322 | case ISD::VECREDUCE_XOR: |
15323 | return getVectorBitwiseReduce(Opcode: Op.getOpcode(), Vec: Op.getOperand(i: 0), |
15324 | VT: Op.getValueType(), DL: dl, DAG); |
15325 | case ISD::VECREDUCE_ADD: |
15326 | return getReductionSDNode(Op: AArch64ISD::UADDV, DL: dl, ScalarOp: Op, DAG); |
15327 | case ISD::VECREDUCE_SMAX: |
15328 | return getReductionSDNode(Op: AArch64ISD::SMAXV, DL: dl, ScalarOp: Op, DAG); |
15329 | case ISD::VECREDUCE_SMIN: |
15330 | return getReductionSDNode(Op: AArch64ISD::SMINV, DL: dl, ScalarOp: Op, DAG); |
15331 | case ISD::VECREDUCE_UMAX: |
15332 | return getReductionSDNode(Op: AArch64ISD::UMAXV, DL: dl, ScalarOp: Op, DAG); |
15333 | case ISD::VECREDUCE_UMIN: |
15334 | return getReductionSDNode(Op: AArch64ISD::UMINV, DL: dl, ScalarOp: Op, DAG); |
15335 | default: |
15336 | llvm_unreachable("Unhandled reduction" ); |
15337 | } |
15338 | } |
15339 | |
15340 | SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, |
15341 | SelectionDAG &DAG) const { |
15342 | auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); |
15343 | // No point replacing if we don't have the relevant instruction/libcall anyway |
15344 | if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics()) |
15345 | return SDValue(); |
15346 | |
15347 | // LSE has an atomic load-clear instruction, but not a load-and. |
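// For example, (atomic_load_and addr, X) becomes
// (atomic_load_clr addr, (xor X, -1)), which maps onto LDCLR*.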
15348 | SDLoc dl(Op); |
15349 | MVT VT = Op.getSimpleValueType(); |
15350 | assert(VT != MVT::i128 && "Handled elsewhere, code replicated." ); |
15351 | SDValue RHS = Op.getOperand(i: 2); |
15352 | AtomicSDNode *AN = cast<AtomicSDNode>(Val: Op.getNode()); |
15353 | RHS = DAG.getNode(Opcode: ISD::XOR, DL: dl, VT, N1: DAG.getConstant(Val: -1ULL, DL: dl, VT), N2: RHS); |
15354 | return DAG.getAtomic(Opcode: ISD::ATOMIC_LOAD_CLR, dl, MemVT: AN->getMemoryVT(), |
15355 | Chain: Op.getOperand(i: 0), Ptr: Op.getOperand(i: 1), Val: RHS, |
15356 | MMO: AN->getMemOperand()); |
15357 | } |
15358 | |
15359 | SDValue |
15360 | AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, |
15361 | SelectionDAG &DAG) const { |
15362 | |
15363 | SDLoc dl(Op); |
15364 | // Get the inputs. |
15365 | SDNode *Node = Op.getNode(); |
15366 | SDValue Chain = Op.getOperand(i: 0); |
15367 | SDValue Size = Op.getOperand(i: 1); |
15368 | MaybeAlign Align = |
15369 | cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue(); |
15370 | EVT VT = Node->getValueType(ResNo: 0); |
15371 | |
15372 | if (DAG.getMachineFunction().getFunction().hasFnAttribute( |
15373 | Kind: "no-stack-arg-probe" )) { |
15374 | SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: AArch64::SP, VT: MVT::i64); |
15375 | Chain = SP.getValue(R: 1); |
15376 | SP = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i64, N1: SP, N2: Size); |
15377 | if (Align) |
15378 | SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0), |
15379 | N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT)); |
15380 | Chain = DAG.getCopyToReg(Chain, dl, Reg: AArch64::SP, N: SP); |
15381 | SDValue Ops[2] = {SP, Chain}; |
15382 | return DAG.getMergeValues(Ops, dl); |
15383 | } |
15384 | |
15385 | Chain = DAG.getCALLSEQ_START(Chain, InSize: 0, OutSize: 0, DL: dl); |
15386 | |
15387 | EVT PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
15388 | SDValue Callee = DAG.getTargetExternalSymbol(Sym: Subtarget->getChkStkName(), |
15389 | VT: PtrVT, TargetFlags: 0); |
15390 | |
15391 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
15392 | const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask(); |
15393 | if (Subtarget->hasCustomCallingConv()) |
15394 | TRI->UpdateCustomCallPreservedMask(MF&: DAG.getMachineFunction(), Mask: &Mask); |
15395 | |
15396 | Size = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT: MVT::i64, N1: Size, |
15397 | N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i64)); |
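// __chkstk expects the allocation size in X15 in units of 16 bytes, which is
// why the size is shifted right by 4 here and shifted back left by 4 after
// the call returns.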
15398 | Chain = DAG.getCopyToReg(Chain, dl, Reg: AArch64::X15, N: Size, Glue: SDValue()); |
15399 | Chain = |
15400 | DAG.getNode(Opcode: AArch64ISD::CALL, DL: dl, VTList: DAG.getVTList(VT1: MVT::Other, VT2: MVT::Glue), |
15401 | N1: Chain, N2: Callee, N3: DAG.getRegister(Reg: AArch64::X15, VT: MVT::i64), |
15402 | N4: DAG.getRegisterMask(RegMask: Mask), N5: Chain.getValue(R: 1)); |
15403 | // To match the actual intent better, we should read the output from X15 here |
15404 | // again (instead of potentially spilling it to the stack), but rereading Size |
15405 | // from X15 here doesn't work at -O0, since it thinks that X15 is undefined |
15406 | // here. |
15407 | |
15408 | Size = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MVT::i64, N1: Size, |
15409 | N2: DAG.getConstant(Val: 4, DL: dl, VT: MVT::i64)); |
15410 | |
15411 | SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: AArch64::SP, VT: MVT::i64); |
15412 | Chain = SP.getValue(R: 1); |
15413 | SP = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i64, N1: SP, N2: Size); |
15414 | if (Align) |
15415 | SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0), |
15416 | N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT)); |
15417 | Chain = DAG.getCopyToReg(Chain, dl, Reg: AArch64::SP, N: SP); |
15418 | |
15419 | Chain = DAG.getCALLSEQ_END(Chain, Size1: 0, Size2: 0, Glue: SDValue(), DL: dl); |
15420 | |
15421 | SDValue Ops[2] = {SP, Chain}; |
15422 | return DAG.getMergeValues(Ops, dl); |
15423 | } |
15424 | |
15425 | SDValue |
15426 | AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op, |
15427 | SelectionDAG &DAG) const { |
15428 | // Get the inputs. |
15429 | SDNode *Node = Op.getNode(); |
15430 | SDValue Chain = Op.getOperand(i: 0); |
15431 | SDValue Size = Op.getOperand(i: 1); |
15432 | |
15433 | MaybeAlign Align = |
15434 | cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getMaybeAlignValue(); |
15435 | SDLoc dl(Op); |
15436 | EVT VT = Node->getValueType(ResNo: 0); |
15437 | |
15438 | // Construct the new SP value in a GPR. |
15439 | SDValue SP = DAG.getCopyFromReg(Chain, dl, Reg: AArch64::SP, VT: MVT::i64); |
15440 | Chain = SP.getValue(R: 1); |
15441 | SP = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: MVT::i64, N1: SP, N2: Size); |
15442 | if (Align) |
15443 | SP = DAG.getNode(Opcode: ISD::AND, DL: dl, VT, N1: SP.getValue(R: 0), |
15444 | N2: DAG.getConstant(Val: -(uint64_t)Align->value(), DL: dl, VT)); |
15445 | |
15446 | // Set the real SP to the new value with a probing loop. |
15447 | Chain = DAG.getNode(Opcode: AArch64ISD::PROBED_ALLOCA, DL: dl, VT: MVT::Other, N1: Chain, N2: SP); |
15448 | SDValue Ops[2] = {SP, Chain}; |
15449 | return DAG.getMergeValues(Ops, dl); |
15450 | } |
15451 | |
15452 | SDValue |
15453 | AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
15454 | SelectionDAG &DAG) const { |
15455 | MachineFunction &MF = DAG.getMachineFunction(); |
15456 | |
15457 | if (Subtarget->isTargetWindows()) |
15458 | return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG); |
15459 | else if (hasInlineStackProbe(MF)) |
15460 | return LowerInlineDYNAMIC_STACKALLOC(Op, DAG); |
15461 | else |
15462 | return SDValue(); |
15463 | } |
15464 | |
15465 | SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG, |
15466 | unsigned NewOp) const { |
15467 | if (Subtarget->hasSVE2()) |
15468 | return LowerToPredicatedOp(Op, DAG, NewOp); |
15469 | |
15470 | // Default to expand. |
15471 | return SDValue(); |
15472 | } |
15473 | |
15474 | SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, |
15475 | SelectionDAG &DAG) const { |
15476 | EVT VT = Op.getValueType(); |
15477 | assert(VT != MVT::i64 && "Expected illegal VSCALE node" ); |
15478 | |
15479 | SDLoc DL(Op); |
15480 | APInt MulImm = Op.getConstantOperandAPInt(i: 0); |
15481 | return DAG.getZExtOrTrunc(Op: DAG.getVScale(DL, VT: MVT::i64, MulImm: MulImm.sext(width: 64)), DL, |
15482 | VT); |
15483 | } |
15484 | |
15485 | /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. |
15486 | template <unsigned NumVecs> |
15487 | static bool |
15488 | setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, |
15489 | AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { |
15490 | Info.opc = ISD::INTRINSIC_VOID; |
15491 | // Retrieve EC from first vector argument. |
15492 | const EVT VT = TLI.getMemValueType(DL, Ty: CI.getArgOperand(i: 0)->getType()); |
15493 | ElementCount EC = VT.getVectorElementCount(); |
15494 | #ifndef NDEBUG |
15495 | // Check the assumption that all input vectors are the same type. |
15496 | for (unsigned I = 0; I < NumVecs; ++I) |
15497 | assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && |
15498 | "Invalid type." ); |
15499 | #endif |
15500 | // memVT is `NumVecs * VT`. |
15501 | Info.memVT = EVT::getVectorVT(Context&: CI.getType()->getContext(), VT: VT.getScalarType(), |
15502 | EC: EC * NumVecs); |
15503 | Info.ptrVal = CI.getArgOperand(i: CI.arg_size() - 1); |
15504 | Info.offset = 0; |
15505 | Info.align.reset(); |
15506 | Info.flags = MachineMemOperand::MOStore; |
15507 | return true; |
15508 | } |
15509 | |
15510 | /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as |
15511 | /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment |
15512 | /// specified in the intrinsic calls. |
15513 | bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, |
15514 | const CallInst &I, |
15515 | MachineFunction &MF, |
15516 | unsigned Intrinsic) const { |
15517 | auto &DL = I.getDataLayout(); |
15518 | switch (Intrinsic) { |
15519 | case Intrinsic::aarch64_sve_st2: |
15520 | return setInfoSVEStN<2>(TLI: *this, DL, Info, CI: I); |
15521 | case Intrinsic::aarch64_sve_st3: |
15522 | return setInfoSVEStN<3>(TLI: *this, DL, Info, CI: I); |
15523 | case Intrinsic::aarch64_sve_st4: |
15524 | return setInfoSVEStN<4>(TLI: *this, DL, Info, CI: I); |
15525 | case Intrinsic::aarch64_neon_ld2: |
15526 | case Intrinsic::aarch64_neon_ld3: |
15527 | case Intrinsic::aarch64_neon_ld4: |
15528 | case Intrinsic::aarch64_neon_ld1x2: |
15529 | case Intrinsic::aarch64_neon_ld1x3: |
15530 | case Intrinsic::aarch64_neon_ld1x4: { |
15531 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15532 | uint64_t NumElts = DL.getTypeSizeInBits(Ty: I.getType()) / 64; |
15533 | Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts); |
15534 | Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1); |
15535 | Info.offset = 0; |
15536 | Info.align.reset(); |
15537 | // volatile loads with NEON intrinsics not supported |
15538 | Info.flags = MachineMemOperand::MOLoad; |
15539 | return true; |
15540 | } |
15541 | case Intrinsic::aarch64_neon_ld2lane: |
15542 | case Intrinsic::aarch64_neon_ld3lane: |
15543 | case Intrinsic::aarch64_neon_ld4lane: |
15544 | case Intrinsic::aarch64_neon_ld2r: |
15545 | case Intrinsic::aarch64_neon_ld3r: |
15546 | case Intrinsic::aarch64_neon_ld4r: { |
15547 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
// ldN returns a struct whose members all have the same vector type.
15549 | Type *RetTy = I.getType(); |
15550 | auto *StructTy = cast<StructType>(Val: RetTy); |
15551 | unsigned NumElts = StructTy->getNumElements(); |
15552 | Type *VecTy = StructTy->getElementType(N: 0); |
15553 | MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType(); |
15554 | Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts); |
15555 | Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1); |
15556 | Info.offset = 0; |
15557 | Info.align.reset(); |
15558 | // volatile loads with NEON intrinsics not supported |
15559 | Info.flags = MachineMemOperand::MOLoad; |
15560 | return true; |
15561 | } |
15562 | case Intrinsic::aarch64_neon_st2: |
15563 | case Intrinsic::aarch64_neon_st3: |
15564 | case Intrinsic::aarch64_neon_st4: |
15565 | case Intrinsic::aarch64_neon_st1x2: |
15566 | case Intrinsic::aarch64_neon_st1x3: |
15567 | case Intrinsic::aarch64_neon_st1x4: { |
15568 | Info.opc = ISD::INTRINSIC_VOID; |
15569 | unsigned NumElts = 0; |
15570 | for (const Value *Arg : I.args()) { |
15571 | Type *ArgTy = Arg->getType(); |
15572 | if (!ArgTy->isVectorTy()) |
15573 | break; |
15574 | NumElts += DL.getTypeSizeInBits(Ty: ArgTy) / 64; |
15575 | } |
15576 | Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: MVT::i64, NumElements: NumElts); |
15577 | Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1); |
15578 | Info.offset = 0; |
15579 | Info.align.reset(); |
15580 | // volatile stores with NEON intrinsics not supported |
15581 | Info.flags = MachineMemOperand::MOStore; |
15582 | return true; |
15583 | } |
15584 | case Intrinsic::aarch64_neon_st2lane: |
15585 | case Intrinsic::aarch64_neon_st3lane: |
15586 | case Intrinsic::aarch64_neon_st4lane: { |
15587 | Info.opc = ISD::INTRINSIC_VOID; |
15588 | unsigned NumElts = 0; |
// All the vector types are the same.
15590 | Type *VecTy = I.getArgOperand(i: 0)->getType(); |
15591 | MVT EleVT = MVT::getVT(Ty: VecTy).getVectorElementType(); |
15592 | |
15593 | for (const Value *Arg : I.args()) { |
15594 | Type *ArgTy = Arg->getType(); |
15595 | if (!ArgTy->isVectorTy()) |
15596 | break; |
15597 | NumElts += 1; |
15598 | } |
15599 | |
15600 | Info.memVT = EVT::getVectorVT(Context&: I.getType()->getContext(), VT: EleVT, NumElements: NumElts); |
15601 | Info.ptrVal = I.getArgOperand(i: I.arg_size() - 1); |
15602 | Info.offset = 0; |
15603 | Info.align.reset(); |
15604 | // volatile stores with NEON intrinsics not supported |
15605 | Info.flags = MachineMemOperand::MOStore; |
15606 | return true; |
15607 | } |
15608 | case Intrinsic::aarch64_ldaxr: |
15609 | case Intrinsic::aarch64_ldxr: { |
15610 | Type *ValTy = I.getParamElementType(ArgNo: 0); |
15611 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15612 | Info.memVT = MVT::getVT(Ty: ValTy); |
15613 | Info.ptrVal = I.getArgOperand(i: 0); |
15614 | Info.offset = 0; |
15615 | Info.align = DL.getABITypeAlign(Ty: ValTy); |
15616 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; |
15617 | return true; |
15618 | } |
15619 | case Intrinsic::aarch64_stlxr: |
15620 | case Intrinsic::aarch64_stxr: { |
15621 | Type *ValTy = I.getParamElementType(ArgNo: 1); |
15622 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15623 | Info.memVT = MVT::getVT(Ty: ValTy); |
15624 | Info.ptrVal = I.getArgOperand(i: 1); |
15625 | Info.offset = 0; |
15626 | Info.align = DL.getABITypeAlign(Ty: ValTy); |
15627 | Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; |
15628 | return true; |
15629 | } |
15630 | case Intrinsic::aarch64_ldaxp: |
15631 | case Intrinsic::aarch64_ldxp: |
15632 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15633 | Info.memVT = MVT::i128; |
15634 | Info.ptrVal = I.getArgOperand(i: 0); |
15635 | Info.offset = 0; |
15636 | Info.align = Align(16); |
15637 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; |
15638 | return true; |
15639 | case Intrinsic::aarch64_stlxp: |
15640 | case Intrinsic::aarch64_stxp: |
15641 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15642 | Info.memVT = MVT::i128; |
15643 | Info.ptrVal = I.getArgOperand(i: 2); |
15644 | Info.offset = 0; |
15645 | Info.align = Align(16); |
15646 | Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; |
15647 | return true; |
15648 | case Intrinsic::aarch64_sve_ldnt1: { |
15649 | Type *ElTy = cast<VectorType>(Val: I.getType())->getElementType(); |
15650 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15651 | Info.memVT = MVT::getVT(Ty: I.getType()); |
15652 | Info.ptrVal = I.getArgOperand(i: 1); |
15653 | Info.offset = 0; |
15654 | Info.align = DL.getABITypeAlign(Ty: ElTy); |
15655 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; |
15656 | return true; |
15657 | } |
15658 | case Intrinsic::aarch64_sve_stnt1: { |
15659 | Type *ElTy = |
15660 | cast<VectorType>(Val: I.getArgOperand(i: 0)->getType())->getElementType(); |
15661 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15662 | Info.memVT = MVT::getVT(Ty: I.getOperand(i_nocapture: 0)->getType()); |
15663 | Info.ptrVal = I.getArgOperand(i: 2); |
15664 | Info.offset = 0; |
15665 | Info.align = DL.getABITypeAlign(Ty: ElTy); |
15666 | Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; |
15667 | return true; |
15668 | } |
15669 | case Intrinsic::aarch64_mops_memset_tag: { |
15670 | Value *Dst = I.getArgOperand(i: 0); |
15671 | Value *Val = I.getArgOperand(i: 1); |
15672 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
15673 | Info.memVT = MVT::getVT(Ty: Val->getType()); |
15674 | Info.ptrVal = Dst; |
15675 | Info.offset = 0; |
15676 | Info.align = I.getParamAlign(ArgNo: 0).valueOrOne(); |
15677 | Info.flags = MachineMemOperand::MOStore; |
15678 | // The size of the memory being operated on is unknown at this point |
15679 | Info.size = MemoryLocation::UnknownSize; |
15680 | return true; |
15681 | } |
15682 | default: |
15683 | break; |
15684 | } |
15685 | |
15686 | return false; |
15687 | } |
15688 | |
15689 | bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, |
15690 | ISD::LoadExtType ExtTy, |
15691 | EVT NewVT) const { |
15692 | // TODO: This may be worth removing. Check regression tests for diffs. |
15693 | if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT)) |
15694 | return false; |
15695 | |
15696 | // If we're reducing the load width in order to avoid having to use an extra |
15697 | // instruction to do extension then it's probably a good idea. |
15698 | if (ExtTy != ISD::NON_EXTLOAD) |
15699 | return true; |
15700 | // Don't reduce load width if it would prevent us from combining a shift into |
15701 | // the offset. |
15702 | MemSDNode *Mem = dyn_cast<MemSDNode>(Val: Load); |
15703 | assert(Mem); |
15704 | const SDValue &Base = Mem->getBasePtr(); |
15705 | if (Base.getOpcode() == ISD::ADD && |
15706 | Base.getOperand(i: 1).getOpcode() == ISD::SHL && |
15707 | Base.getOperand(i: 1).hasOneUse() && |
15708 | Base.getOperand(i: 1).getOperand(i: 1).getOpcode() == ISD::Constant) { |
15709 | // It's unknown whether a scalable vector has a power-of-2 bitwidth. |
15710 | if (Mem->getMemoryVT().isScalableVector()) |
15711 | return false; |
15712 | // The shift can be combined if it matches the size of the value being |
15713 | // loaded (and so reducing the width would make it not match). |
15714 | uint64_t ShiftAmount = Base.getOperand(i: 1).getConstantOperandVal(i: 1); |
15715 | uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; |
15716 | if (ShiftAmount == Log2_32(Value: LoadBytes)) |
15717 | return false; |
15718 | } |
15719 | // We have no reason to disallow reducing the load width, so allow it. |
15720 | return true; |
15721 | } |
15722 | |
15723 | // Treat a sext_inreg(extract(..)) as free if it has multiple uses. |
15724 | bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const { |
15725 | EVT VT = Extend.getValueType(); |
15726 | if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) { |
15727 | SDValue Extract = Extend.getOperand(i: 0); |
15728 | if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse()) |
15729 | Extract = Extract.getOperand(i: 0); |
15730 | if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) { |
15731 | EVT VecVT = Extract.getOperand(i: 0).getValueType(); |
15732 | if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16) |
15733 | return false; |
15734 | } |
15735 | } |
15736 | return true; |
15737 | } |
15738 | |
15739 | // Truncations from a 64-bit GPR to a 32-bit GPR are free. |
15740 | bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { |
15741 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) |
15742 | return false; |
15743 | uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue(); |
15744 | uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue(); |
15745 | return NumBits1 > NumBits2; |
15746 | } |
15747 | bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { |
15748 | if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) |
15749 | return false; |
15750 | uint64_t NumBits1 = VT1.getFixedSizeInBits(); |
15751 | uint64_t NumBits2 = VT2.getFixedSizeInBits(); |
15752 | return NumBits1 > NumBits2; |
15753 | } |
15754 | |
15755 | /// Check if it is profitable to hoist an instruction in then/else to if. |
15756 | /// Not profitable if I and its user can form an FMA instruction |
15757 | /// because we prefer FMSUB/FMADD. |
15758 | bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { |
15759 | if (I->getOpcode() != Instruction::FMul) |
15760 | return true; |
15761 | |
15762 | if (!I->hasOneUse()) |
15763 | return true; |
15764 | |
15765 | Instruction *User = I->user_back(); |
15766 | |
15767 | if (!(User->getOpcode() == Instruction::FSub || |
15768 | User->getOpcode() == Instruction::FAdd)) |
15769 | return true; |
15770 | |
15771 | const TargetOptions &Options = getTargetMachine().Options; |
15772 | const Function *F = I->getFunction(); |
15773 | const DataLayout &DL = F->getDataLayout(); |
15774 | Type *Ty = User->getOperand(i: 0)->getType(); |
15775 | |
15776 | return !(isFMAFasterThanFMulAndFAdd(F: *F, Ty) && |
15777 | isOperationLegalOrCustom(Op: ISD::FMA, VT: getValueType(DL, Ty)) && |
15778 | (Options.AllowFPOpFusion == FPOpFusion::Fast || |
15779 | Options.UnsafeFPMath)); |
15780 | } |
15781 | |
15782 | // All 32-bit GPR operations implicitly zero the high-half of the corresponding |
15783 | // 64-bit GPR. |
15784 | bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { |
15785 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) |
15786 | return false; |
15787 | unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); |
15788 | unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); |
15789 | return NumBits1 == 32 && NumBits2 == 64; |
15790 | } |
15791 | bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { |
15792 | if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) |
15793 | return false; |
15794 | unsigned NumBits1 = VT1.getSizeInBits(); |
15795 | unsigned NumBits2 = VT2.getSizeInBits(); |
15796 | return NumBits1 == 32 && NumBits2 == 64; |
15797 | } |
15798 | |
15799 | bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { |
15800 | EVT VT1 = Val.getValueType(); |
15801 | if (isZExtFree(VT1, VT2)) { |
15802 | return true; |
15803 | } |
15804 | |
15805 | if (Val.getOpcode() != ISD::LOAD) |
15806 | return false; |
15807 | |
15808 | // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. |
15809 | return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && |
15810 | VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && |
15811 | VT1.getSizeInBits() <= 32); |
15812 | } |
15813 | |
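/// An integer extension is free if every use can absorb it: a shift-left by a
/// constant, a GEP whose implied scaling folds into the addressing-mode shift
/// (shift amount 1 to 4), or a truncate that undoes the extension. FP and
/// vector extensions are never considered free here.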
15814 | bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { |
15815 | if (isa<FPExtInst>(Val: Ext)) |
15816 | return false; |
15817 | |
15818 | // Vector types are not free. |
15819 | if (Ext->getType()->isVectorTy()) |
15820 | return false; |
15821 | |
15822 | for (const Use &U : Ext->uses()) { |
15823 | // The extension is free if we can fold it with a left shift in an |
15824 | // addressing mode or an arithmetic operation: add, sub, and cmp. |
15825 | |
15826 | // Is there a shift? |
15827 | const Instruction *Instr = cast<Instruction>(Val: U.getUser()); |
15828 | |
15829 | // Is this a constant shift? |
15830 | switch (Instr->getOpcode()) { |
15831 | case Instruction::Shl: |
15832 | if (!isa<ConstantInt>(Val: Instr->getOperand(i: 1))) |
15833 | return false; |
15834 | break; |
15835 | case Instruction::GetElementPtr: { |
15836 | gep_type_iterator GTI = gep_type_begin(GEP: Instr); |
15837 | auto &DL = Ext->getDataLayout(); |
15838 | std::advance(i&: GTI, n: U.getOperandNo()-1); |
15839 | Type *IdxTy = GTI.getIndexedType(); |
15840 | // This extension will end up with a shift because of the scaling factor. |
15841 | // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. |
15842 | // Get the shift amount based on the scaling factor: |
15843 | // log2(sizeof(IdxTy)) - log2(8). |
15844 | if (IdxTy->isScalableTy()) |
15845 | return false; |
15846 | uint64_t ShiftAmt = |
15847 | llvm::countr_zero(Val: DL.getTypeStoreSizeInBits(Ty: IdxTy).getFixedValue()) - |
15848 | 3; |
15849 | // Is the constant foldable in the shift of the addressing mode? |
15850 | // I.e., shift amount is between 1 and 4 inclusive. |
15851 | if (ShiftAmt == 0 || ShiftAmt > 4) |
15852 | return false; |
15853 | break; |
15854 | } |
15855 | case Instruction::Trunc: |
15856 | // Check if this is a noop. |
15857 | // trunc(sext ty1 to ty2) to ty1. |
15858 | if (Instr->getType() == Ext->getOperand(i: 0)->getType()) |
15859 | continue; |
15860 | [[fallthrough]]; |
15861 | default: |
15862 | return false; |
15863 | } |
15864 | |
15865 | // At this point we can use the bfm family, so this extension is free |
15866 | // for that use. |
15867 | } |
15868 | return true; |
15869 | } |
15870 | |
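/// Return true if \p V is a shufflevector whose mask elements are all equal,
/// i.e. a splat of a single input lane.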
15871 | static bool isSplatShuffle(Value *V) { |
15872 | if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V)) |
15873 | return all_equal(Range: Shuf->getShuffleMask()); |
15874 | return false; |
15875 | } |
15876 | |
15877 | /// Check if both Op1 and Op2 are shufflevector extracts of either the lower |
15878 | /// or upper half of the vector elements. |
15879 | static bool areExtractShuffleVectors(Value *Op1, Value *Op2, |
15880 | bool AllowSplat = false) { |
15881 | auto areTypesHalfed = [](Value *FullV, Value *HalfV) { |
15882 | auto *FullTy = FullV->getType(); |
15883 | auto *HalfTy = HalfV->getType(); |
15884 | return FullTy->getPrimitiveSizeInBits().getFixedValue() == |
15885 | 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue(); |
15886 | }; |
15887 | |
15888 | auto extractHalf = [](Value *FullV, Value *HalfV) { |
15889 | auto *FullVT = cast<FixedVectorType>(Val: FullV->getType()); |
15890 | auto *HalfVT = cast<FixedVectorType>(Val: HalfV->getType()); |
15891 | return FullVT->getNumElements() == 2 * HalfVT->getNumElements(); |
15892 | }; |
15893 | |
15894 | ArrayRef<int> M1, M2; |
15895 | Value *S1Op1 = nullptr, *S2Op1 = nullptr; |
15896 | if (!match(V: Op1, P: m_Shuffle(v1: m_Value(V&: S1Op1), v2: m_Undef(), mask: m_Mask(M1))) || |
15897 | !match(V: Op2, P: m_Shuffle(v1: m_Value(V&: S2Op1), v2: m_Undef(), mask: m_Mask(M2)))) |
15898 | return false; |
15899 | |
15900 | // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that |
15901 | // it is not checked as an extract below. |
15902 | if (AllowSplat && isSplatShuffle(V: Op1)) |
15903 | S1Op1 = nullptr; |
15904 | if (AllowSplat && isSplatShuffle(V: Op2)) |
15905 | S2Op1 = nullptr; |
15906 | |
15907 | // Check that the operands are half as wide as the result and we extract |
15908 | // half of the elements of the input vectors. |
15909 | if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) || |
15910 | (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2)))) |
15911 | return false; |
15912 | |
15913 | // Check the mask extracts either the lower or upper half of vector |
15914 | // elements. |
15915 | int M1Start = 0; |
15916 | int M2Start = 0; |
15917 | int NumElements = cast<FixedVectorType>(Val: Op1->getType())->getNumElements() * 2; |
15918 | if ((S1Op1 && |
15919 | !ShuffleVectorInst::isExtractSubvectorMask(Mask: M1, NumSrcElts: NumElements, Index&: M1Start)) || |
15920 | (S2Op1 && |
15921 | !ShuffleVectorInst::isExtractSubvectorMask(Mask: M2, NumSrcElts: NumElements, Index&: M2Start))) |
15922 | return false; |
15923 | |
15924 | if ((M1Start != 0 && M1Start != (NumElements / 2)) || |
15925 | (M2Start != 0 && M2Start != (NumElements / 2))) |
15926 | return false; |
15927 | if (S1Op1 && S2Op1 && M1Start != M2Start) |
15928 | return false; |
15929 | |
15930 | return true; |
15931 | } |
15932 | |
15933 | /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth |
15934 | /// of the vector elements. |
15935 | static bool areExtractExts(Value *Ext1, Value *Ext2) { |
15936 | auto areExtDoubled = [](Instruction *Ext) { |
15937 | return Ext->getType()->getScalarSizeInBits() == |
15938 | 2 * Ext->getOperand(i: 0)->getType()->getScalarSizeInBits(); |
15939 | }; |
15940 | |
15941 | if (!match(V: Ext1, P: m_ZExtOrSExt(Op: m_Value())) || |
15942 | !match(V: Ext2, P: m_ZExtOrSExt(Op: m_Value())) || |
15943 | !areExtDoubled(cast<Instruction>(Val: Ext1)) || |
15944 | !areExtDoubled(cast<Instruction>(Val: Ext2))) |
15945 | return false; |
15946 | |
15947 | return true; |
15948 | } |
15949 | |
15950 | /// Check if Op could be used with vmull_high_p64 intrinsic. |
15951 | static bool isOperandOfVmullHighP64(Value *Op) { |
15952 | Value *VectorOperand = nullptr; |
15953 | ConstantInt *ElementIndex = nullptr; |
15954 | return match(V: Op, P: m_ExtractElt(Val: m_Value(V&: VectorOperand), |
15955 | Idx: m_ConstantInt(CI&: ElementIndex))) && |
15956 | ElementIndex->getValue() == 1 && |
15957 | isa<FixedVectorType>(Val: VectorOperand->getType()) && |
15958 | cast<FixedVectorType>(Val: VectorOperand->getType())->getNumElements() == 2; |
15959 | } |
15960 | |
15961 | /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic. |
15962 | static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) { |
15963 | return isOperandOfVmullHighP64(Op: Op1) && isOperandOfVmullHighP64(Op: Op2); |
15964 | } |
15965 | |
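/// Check whether the vector-of-pointers operand of a masked gather/scatter is
/// a scalar_base + vector_offsets GEP that is worth sinking. If the offsets
/// are extended from at most 32 bits to a wider type, also queue that
/// zext/sext in \p Ops so the 32-bit offset forms can be used.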
15966 | static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) { |
15967 | // Restrict ourselves to the form CodeGenPrepare typically constructs. |
15968 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptrs); |
15969 | if (!GEP || GEP->getNumOperands() != 2) |
15970 | return false; |
15971 | |
15972 | Value *Base = GEP->getOperand(i_nocapture: 0); |
15973 | Value *Offsets = GEP->getOperand(i_nocapture: 1); |
15974 | |
15975 | // We only care about scalar_base+vector_offsets. |
15976 | if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy()) |
15977 | return false; |
15978 | |
15979 | // Sink extends that would allow us to use 32-bit offset vectors. |
15980 | if (isa<SExtInst>(Val: Offsets) || isa<ZExtInst>(Val: Offsets)) { |
15981 | auto *OffsetsInst = cast<Instruction>(Val: Offsets); |
15982 | if (OffsetsInst->getType()->getScalarSizeInBits() > 32 && |
15983 | OffsetsInst->getOperand(i: 0)->getType()->getScalarSizeInBits() <= 32) |
15984 | Ops.push_back(Elt: &GEP->getOperandUse(i: 1)); |
15985 | } |
15986 | |
15987 | // Sink the GEP. |
15988 | return true; |
15989 | } |
15990 | |
15991 | /// We want to sink the following cases: |
15992 | /// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale; |
15993 | /// (add|sub|gep) A, ((mul|shl) zext(vscale), imm); |
15994 | static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) { |
15995 | if (match(V: Op, P: m_VScale())) |
15996 | return true; |
15997 | if (match(V: Op, P: m_Shl(L: m_VScale(), R: m_ConstantInt())) || |
15998 | match(V: Op, P: m_Mul(L: m_VScale(), R: m_ConstantInt()))) { |
15999 | Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0)); |
16000 | return true; |
16001 | } |
16002 | if (match(V: Op, P: m_Shl(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt())) || |
16003 | match(V: Op, P: m_Mul(L: m_ZExt(Op: m_VScale()), R: m_ConstantInt()))) { |
16004 | Value *ZExtOp = cast<Instruction>(Val: Op)->getOperand(i: 0); |
16005 | Ops.push_back(Elt: &cast<Instruction>(Val: ZExtOp)->getOperandUse(i: 0)); |
16006 | Ops.push_back(Elt: &cast<Instruction>(Val: Op)->getOperandUse(i: 0)); |
16007 | return true; |
16008 | } |
16009 | return false; |
16010 | } |
16011 | |
16012 | /// Check if sinking \p I's operands to I's basic block is profitable, because |
16013 | /// the operands can be folded into a target instruction, e.g. |
16014 | /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2). |
16015 | bool AArch64TargetLowering::shouldSinkOperands( |
16016 | Instruction *I, SmallVectorImpl<Use *> &Ops) const { |
16017 | if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: I)) { |
16018 | switch (II->getIntrinsicID()) { |
16019 | case Intrinsic::aarch64_neon_smull: |
16020 | case Intrinsic::aarch64_neon_umull: |
16021 | if (areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1), |
16022 | /*AllowSplat=*/true)) { |
16023 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
16024 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
16025 | return true; |
16026 | } |
16027 | [[fallthrough]]; |
16028 | |
16029 | case Intrinsic::fma: |
16030 | if (isa<VectorType>(Val: I->getType()) && |
16031 | cast<VectorType>(Val: I->getType())->getElementType()->isHalfTy() && |
16032 | !Subtarget->hasFullFP16()) |
16033 | return false; |
16034 | [[fallthrough]]; |
16035 | case Intrinsic::aarch64_neon_sqdmull: |
16036 | case Intrinsic::aarch64_neon_sqdmulh: |
16037 | case Intrinsic::aarch64_neon_sqrdmulh: |
16038 | // Sink splats for index lane variants |
16039 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 0))) |
16040 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
16041 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 1))) |
16042 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
16043 | return !Ops.empty(); |
16044 | case Intrinsic::aarch64_neon_fmlal: |
16045 | case Intrinsic::aarch64_neon_fmlal2: |
16046 | case Intrinsic::aarch64_neon_fmlsl: |
16047 | case Intrinsic::aarch64_neon_fmlsl2: |
16048 | // Sink splats for index lane variants |
16049 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 1))) |
16050 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
16051 | if (isSplatShuffle(V: II->getOperand(i_nocapture: 2))) |
16052 | Ops.push_back(Elt: &II->getOperandUse(i: 2)); |
16053 | return !Ops.empty(); |
16054 | case Intrinsic::aarch64_sve_ptest_first: |
16055 | case Intrinsic::aarch64_sve_ptest_last: |
16056 | if (auto *IIOp = dyn_cast<IntrinsicInst>(Val: II->getOperand(i_nocapture: 0))) |
16057 | if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue) |
16058 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
16059 | return !Ops.empty(); |
16060 | case Intrinsic::aarch64_sme_write_horiz: |
16061 | case Intrinsic::aarch64_sme_write_vert: |
16062 | case Intrinsic::aarch64_sme_writeq_horiz: |
16063 | case Intrinsic::aarch64_sme_writeq_vert: { |
16064 | auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 1)); |
16065 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
16066 | return false; |
16067 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
16068 | return true; |
16069 | } |
16070 | case Intrinsic::aarch64_sme_read_horiz: |
16071 | case Intrinsic::aarch64_sme_read_vert: |
16072 | case Intrinsic::aarch64_sme_readq_horiz: |
16073 | case Intrinsic::aarch64_sme_readq_vert: |
16074 | case Intrinsic::aarch64_sme_ld1b_vert: |
16075 | case Intrinsic::aarch64_sme_ld1h_vert: |
16076 | case Intrinsic::aarch64_sme_ld1w_vert: |
16077 | case Intrinsic::aarch64_sme_ld1d_vert: |
16078 | case Intrinsic::aarch64_sme_ld1q_vert: |
16079 | case Intrinsic::aarch64_sme_st1b_vert: |
16080 | case Intrinsic::aarch64_sme_st1h_vert: |
16081 | case Intrinsic::aarch64_sme_st1w_vert: |
16082 | case Intrinsic::aarch64_sme_st1d_vert: |
16083 | case Intrinsic::aarch64_sme_st1q_vert: |
16084 | case Intrinsic::aarch64_sme_ld1b_horiz: |
16085 | case Intrinsic::aarch64_sme_ld1h_horiz: |
16086 | case Intrinsic::aarch64_sme_ld1w_horiz: |
16087 | case Intrinsic::aarch64_sme_ld1d_horiz: |
16088 | case Intrinsic::aarch64_sme_ld1q_horiz: |
16089 | case Intrinsic::aarch64_sme_st1b_horiz: |
16090 | case Intrinsic::aarch64_sme_st1h_horiz: |
16091 | case Intrinsic::aarch64_sme_st1w_horiz: |
16092 | case Intrinsic::aarch64_sme_st1d_horiz: |
16093 | case Intrinsic::aarch64_sme_st1q_horiz: { |
16094 | auto *Idx = dyn_cast<Instruction>(Val: II->getOperand(i_nocapture: 3)); |
16095 | if (!Idx || Idx->getOpcode() != Instruction::Add) |
16096 | return false; |
16097 | Ops.push_back(Elt: &II->getOperandUse(i: 3)); |
16098 | return true; |
16099 | } |
16100 | case Intrinsic::aarch64_neon_pmull: |
16101 | if (!areExtractShuffleVectors(Op1: II->getOperand(i_nocapture: 0), Op2: II->getOperand(i_nocapture: 1))) |
16102 | return false; |
16103 | Ops.push_back(Elt: &II->getOperandUse(i: 0)); |
16104 | Ops.push_back(Elt: &II->getOperandUse(i: 1)); |
16105 | return true; |
16106 | case Intrinsic::aarch64_neon_pmull64: |
16107 | if (!areOperandsOfVmullHighP64(Op1: II->getArgOperand(i: 0), |
16108 | Op2: II->getArgOperand(i: 1))) |
16109 | return false; |
16110 | Ops.push_back(Elt: &II->getArgOperandUse(i: 0)); |
16111 | Ops.push_back(Elt: &II->getArgOperandUse(i: 1)); |
16112 | return true; |
16113 | case Intrinsic::masked_gather: |
16114 | if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 0), Ops)) |
16115 | return false; |
16116 | Ops.push_back(Elt: &II->getArgOperandUse(i: 0)); |
16117 | return true; |
16118 | case Intrinsic::masked_scatter: |
16119 | if (!shouldSinkVectorOfPtrs(Ptrs: II->getArgOperand(i: 1), Ops)) |
16120 | return false; |
16121 | Ops.push_back(Elt: &II->getArgOperandUse(i: 1)); |
16122 | return true; |
16123 | default: |
16124 | return false; |
16125 | } |
16126 | } |
16127 | |
16128 | // Sink vscales closer to uses for better isel |
16129 | switch (I->getOpcode()) { |
16130 | case Instruction::GetElementPtr: |
16131 | case Instruction::Add: |
16132 | case Instruction::Sub: |
16133 | for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) { |
16134 | if (shouldSinkVScale(Op: I->getOperand(i: Op), Ops)) { |
16135 | Ops.push_back(Elt: &I->getOperandUse(i: Op)); |
16136 | return true; |
16137 | } |
16138 | } |
16139 | break; |
16140 | default: |
16141 | break; |
16142 | } |
16143 | |
16144 | if (!I->getType()->isVectorTy()) |
16145 | return false; |
16146 | |
16147 | switch (I->getOpcode()) { |
16148 | case Instruction::Sub: |
16149 | case Instruction::Add: { |
16150 | if (!areExtractExts(Ext1: I->getOperand(i: 0), Ext2: I->getOperand(i: 1))) |
16151 | return false; |
16152 | |
16153 | // If the exts' operands extract either the lower or upper elements, we |
16154 | // can sink them too. |
16155 | auto Ext1 = cast<Instruction>(Val: I->getOperand(i: 0)); |
16156 | auto Ext2 = cast<Instruction>(Val: I->getOperand(i: 1)); |
16157 | if (areExtractShuffleVectors(Op1: Ext1->getOperand(i: 0), Op2: Ext2->getOperand(i: 0))) { |
16158 | Ops.push_back(Elt: &Ext1->getOperandUse(i: 0)); |
16159 | Ops.push_back(Elt: &Ext2->getOperandUse(i: 0)); |
16160 | } |
16161 | |
16162 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
16163 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
16164 | |
16165 | return true; |
16166 | } |
16167 | case Instruction::Or: { |
16168 | // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) -> |
16169 | // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1) |
16170 | if (Subtarget->hasNEON()) { |
16171 | Instruction *OtherAnd, *IA, *IB; |
16172 | Value *MaskValue; |
16173 | // MainAnd refers to the And instruction that has 'Not' as one of its operands |
16174 | if (match(V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_Instruction(I&: OtherAnd)), |
16175 | R: m_OneUse(SubPattern: m_c_And(L: m_OneUse(SubPattern: m_Not(V: m_Value(V&: MaskValue))), |
16176 | R: m_Instruction(I&: IA)))))) { |
16177 | if (match(V: OtherAnd, |
16178 | P: m_c_And(L: m_Specific(V: MaskValue), R: m_Instruction(I&: IB)))) { |
16179 | Instruction *MainAnd = I->getOperand(i: 0) == OtherAnd |
16180 | ? cast<Instruction>(Val: I->getOperand(i: 1)) |
16181 | : cast<Instruction>(Val: I->getOperand(i: 0)); |
16182 | |
16183 | // Both Ands should be in same basic block as Or |
16184 | if (I->getParent() != MainAnd->getParent() || |
16185 | I->getParent() != OtherAnd->getParent()) |
16186 | return false; |
16187 | |
16188 | // Non-mask operands of both Ands should also be in same basic block |
16189 | if (I->getParent() != IA->getParent() || |
16190 | I->getParent() != IB->getParent()) |
16191 | return false; |
16192 | |
16193 | Ops.push_back(Elt: &MainAnd->getOperandUse(i: MainAnd->getOperand(i: 0) == IA ? 1 : 0)); |
16194 | Ops.push_back(Elt: &I->getOperandUse(i: 0)); |
16195 | Ops.push_back(Elt: &I->getOperandUse(i: 1)); |
16196 | |
16197 | return true; |
16198 | } |
16199 | } |
16200 | } |
16201 | |
16202 | return false; |
16203 | } |
16204 | case Instruction::Mul: { |
16205 | int NumZExts = 0, NumSExts = 0; |
16206 | for (auto &Op : I->operands()) { |
16207 | // Make sure we are not already sinking this operand |
16208 | if (any_of(Range&: Ops, P: [&](Use *U) { return U->get() == Op; })) |
16209 | continue; |
16210 | |
16211 | if (match(V: &Op, P: m_SExt(Op: m_Value()))) { |
16212 | NumSExts++; |
16213 | continue; |
16214 | } else if (match(V: &Op, P: m_ZExt(Op: m_Value()))) { |
16215 | NumZExts++; |
16216 | continue; |
16217 | } |
16218 | |
16219 | ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Val&: Op); |
16220 | |
16221 | // If the Shuffle is a splat and the operand is a zext/sext, sinking the |
16222 | // operand and the s/zext can help create indexed s/umull. This is |
16223 | // especially useful to prevent i64 mul being scalarized. |
16224 | if (Shuffle && isSplatShuffle(V: Shuffle) && |
16225 | match(V: Shuffle->getOperand(i_nocapture: 0), P: m_ZExtOrSExt(Op: m_Value()))) { |
16226 | Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0)); |
16227 | Ops.push_back(Elt: &Op); |
16228 | if (match(V: Shuffle->getOperand(i_nocapture: 0), P: m_SExt(Op: m_Value()))) |
16229 | NumSExts++; |
16230 | else |
16231 | NumZExts++; |
16232 | continue; |
16233 | } |
16234 | |
16235 | if (!Shuffle) |
16236 | continue; |
16237 | |
16238 | Value *ShuffleOperand = Shuffle->getOperand(i_nocapture: 0); |
16239 | InsertElementInst *Insert = dyn_cast<InsertElementInst>(Val: ShuffleOperand); |
16240 | if (!Insert) |
16241 | continue; |
16242 | |
16243 | Instruction *OperandInstr = dyn_cast<Instruction>(Val: Insert->getOperand(i_nocapture: 1)); |
16244 | if (!OperandInstr) |
16245 | continue; |
16246 | |
16247 | ConstantInt *ElementConstant = |
16248 | dyn_cast<ConstantInt>(Val: Insert->getOperand(i_nocapture: 2)); |
16249 | // Check that the insertelement is inserting into element 0 |
16250 | if (!ElementConstant || !ElementConstant->isZero()) |
16251 | continue; |
16252 | |
16253 | unsigned Opcode = OperandInstr->getOpcode(); |
16254 | if (Opcode == Instruction::SExt) |
16255 | NumSExts++; |
16256 | else if (Opcode == Instruction::ZExt) |
16257 | NumZExts++; |
16258 | else { |
16259 | // If we find that the top bits are known 0, then we can sink and allow |
16260 | // the backend to generate a umull. |
16261 | unsigned Bitwidth = I->getType()->getScalarSizeInBits(); |
16262 | APInt UpperMask = APInt::getHighBitsSet(numBits: Bitwidth, hiBitsSet: Bitwidth / 2); |
16263 | const DataLayout &DL = I->getDataLayout(); |
16264 | if (!MaskedValueIsZero(V: OperandInstr, Mask: UpperMask, DL)) |
16265 | continue; |
16266 | NumZExts++; |
16267 | } |
16268 | |
16269 | Ops.push_back(Elt: &Shuffle->getOperandUse(i: 0)); |
16270 | Ops.push_back(Elt: &Op); |
16271 | } |
16272 | |
16273 | // It is only profitable to sink if we found two of the same type of extend. |
16274 | return !Ops.empty() && (NumSExts == 2 || NumZExts == 2); |
16275 | } |
16276 | default: |
16277 | return false; |
16278 | } |
16279 | return false; |
16280 | } |
16281 | |
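/// Build the mask for a shuffle that widens each source element to DstWidth
/// bits by interleaving it with lanes taken from a zeroed second operand:
/// source indices land at every (DstWidth / SrcWidth)'th position (offset
/// chosen by endianness) and every other position reads element NumElts, i.e.
/// lane 0 of the second shuffle operand. Returns false for destination widths
/// this lowering does not handle.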
16282 | static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, |
16283 | unsigned NumElts, bool IsLittleEndian, |
16284 | SmallVectorImpl<int> &Mask) { |
16285 | if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64) |
16286 | return false; |
16287 | |
16288 | assert(DstWidth % SrcWidth == 0 && |
16289 | "TBL lowering is not supported for a conversion instruction with this " |
16290 | "source and destination element type." ); |
16291 | |
16292 | unsigned Factor = DstWidth / SrcWidth; |
16293 | unsigned MaskLen = NumElts * Factor; |
16294 | |
16295 | Mask.clear(); |
16296 | Mask.resize(N: MaskLen, NV: NumElts); |
16297 | |
16298 | unsigned SrcIndex = 0; |
16299 | for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor) |
16300 | Mask[I] = SrcIndex++; |
16301 | |
16302 | return true; |
16303 | } |
16304 | |
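/// Lower a zero-extension of \p Op to \p DstTy as a byte shuffle with a
/// zeroed vector, which can be selected to TBL. If \p ZExtTy is wider than
/// \p DstTy, a regular zext of the shuffled result finishes the conversion.
/// Returns nullptr if no suitable shuffle mask can be built.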
16305 | static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, |
16306 | FixedVectorType *ZExtTy, |
16307 | FixedVectorType *DstTy, |
16308 | bool IsLittleEndian) { |
16309 | auto *SrcTy = cast<FixedVectorType>(Val: Op->getType()); |
16310 | unsigned NumElts = SrcTy->getNumElements(); |
16311 | auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth(); |
16312 | auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth(); |
16313 | |
16314 | SmallVector<int> Mask; |
16315 | if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask)) |
16316 | return nullptr; |
16317 | |
16318 | auto *FirstEltZero = Builder.CreateInsertElement( |
16319 | Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getInt8(C: 0), Idx: uint64_t(0)); |
16320 | Value *Result = Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask); |
16321 | Result = Builder.CreateBitCast(V: Result, DestTy: DstTy); |
16322 | if (DstTy != ZExtTy) |
16323 | Result = Builder.CreateZExt(V: Result, DestTy: ZExtTy); |
16324 | return Result; |
16325 | } |
16326 | |
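/// Like createTblShuffleForZExt, but place each source element in the most
/// significant part of its destination lane; the caller completes the sign
/// extension with an arithmetic shift right after bitcasting the result.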
16327 | static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, |
16328 | FixedVectorType *DstTy, |
16329 | bool IsLittleEndian) { |
16330 | auto *SrcTy = cast<FixedVectorType>(Val: Op->getType()); |
16331 | auto SrcWidth = cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth(); |
16332 | auto DstWidth = cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth(); |
16333 | |
16334 | SmallVector<int> Mask; |
16335 | if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts: SrcTy->getNumElements(), |
16336 | IsLittleEndian: !IsLittleEndian, Mask)) |
16337 | return nullptr; |
16338 | |
16339 | auto *FirstEltZero = Builder.CreateInsertElement( |
16340 | Vec: PoisonValue::get(T: SrcTy), NewElt: Builder.getInt8(C: 0), Idx: uint64_t(0)); |
16341 | |
16342 | return Builder.CreateShuffleVector(V1: Op, V2: FirstEltZero, Mask); |
16343 | } |
16344 | |
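/// Lower a vector truncate to i8 elements into aarch64.neon.tbl1-tbl4 calls
/// that pick the relevant byte of every source lane, then replace all uses of
/// \p TI and erase it.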
16345 | static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { |
16346 | IRBuilder<> Builder(TI); |
16347 | SmallVector<Value *> Parts; |
16348 | int NumElements = cast<FixedVectorType>(Val: TI->getType())->getNumElements(); |
16349 | auto *SrcTy = cast<FixedVectorType>(Val: TI->getOperand(i_nocapture: 0)->getType()); |
16350 | auto *DstTy = cast<FixedVectorType>(Val: TI->getType()); |
16351 | assert(SrcTy->getElementType()->isIntegerTy() && |
16352 | "Non-integer type source vector element is not supported" ); |
16353 | assert(DstTy->getElementType()->isIntegerTy(8) && |
16354 | "Unsupported destination vector element type" ); |
16355 | unsigned SrcElemTySz = |
16356 | cast<IntegerType>(Val: SrcTy->getElementType())->getBitWidth(); |
16357 | unsigned DstElemTySz = |
16358 | cast<IntegerType>(Val: DstTy->getElementType())->getBitWidth(); |
16359 | assert((SrcElemTySz % DstElemTySz == 0) && |
16360 | "Cannot lower truncate to tbl instructions for a source element size " |
16361 | "that is not divisible by the destination element size" ); |
16362 | unsigned TruncFactor = SrcElemTySz / DstElemTySz; |
16363 | assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) && |
16364 | "Unsupported source vector element type size" ); |
16365 | Type *VecTy = FixedVectorType::get(ElementType: Builder.getInt8Ty(), NumElts: 16); |
16366 | |
16367 | // Create a mask to choose every nth byte from the source vector table of |
16368 | // bytes to create the truncated destination vector, where 'n' is the truncate |
16369 | // ratio. For example, for a truncate from Yxi64 to Yxi8, choose |
16370 | // 0,8,16,..Y*8th bytes for the little-endian format |
16371 | SmallVector<Constant *, 16> MaskConst; |
16372 | for (int Itr = 0; Itr < 16; Itr++) { |
16373 | if (Itr < NumElements) |
16374 | MaskConst.push_back(Elt: Builder.getInt8( |
16375 | C: IsLittleEndian ? Itr * TruncFactor |
16376 | : Itr * TruncFactor + (TruncFactor - 1))); |
16377 | else |
16378 | MaskConst.push_back(Elt: Builder.getInt8(C: 255)); |
16379 | } |
16380 | |
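// A single TBL lookup can index at most four 128-bit table registers.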
16381 | int MaxTblSz = 128 * 4; |
16382 | int MaxSrcSz = SrcElemTySz * NumElements; |
16383 | int ElemsPerTbl = |
16384 | (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz); |
16385 | assert(ElemsPerTbl <= 16 && |
16386 | "Maximum elements selected using TBL instruction cannot exceed 16!" ); |
16387 | |
16388 | int ShuffleCount = 128 / SrcElemTySz; |
16389 | SmallVector<int> ShuffleLanes; |
16390 | for (int i = 0; i < ShuffleCount; ++i) |
16391 | ShuffleLanes.push_back(Elt: i); |
16392 | |
16393 | // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles |
16394 | // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated, |
16395 | // call TBL & save the result in a vector of TBL results for combining later. |
16396 | SmallVector<Value *> Results; |
16397 | while (ShuffleLanes.back() < NumElements) { |
16398 | Parts.push_back(Elt: Builder.CreateBitCast( |
16399 | V: Builder.CreateShuffleVector(V: TI->getOperand(i_nocapture: 0), Mask: ShuffleLanes), DestTy: VecTy)); |
16400 | |
16401 | if (Parts.size() == 4) { |
16402 | auto *F = Intrinsic::getDeclaration(M: TI->getModule(), |
16403 | id: Intrinsic::aarch64_neon_tbl4, Tys: VecTy); |
16404 | Parts.push_back(Elt: ConstantVector::get(V: MaskConst)); |
16405 | Results.push_back(Elt: Builder.CreateCall(Callee: F, Args: Parts)); |
16406 | Parts.clear(); |
16407 | } |
16408 | |
16409 | for (int i = 0; i < ShuffleCount; ++i) |
16410 | ShuffleLanes[i] += ShuffleCount; |
16411 | } |
16412 | |
16413 | assert((Parts.empty() || Results.empty()) && |
16414 | "Lowering trunc for vectors requiring different TBL instructions is " |
16415 | "not supported!" ); |
16416 | // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD |
16417 | // registers |
16418 | if (!Parts.empty()) { |
16419 | Intrinsic::ID TblID; |
16420 | switch (Parts.size()) { |
16421 | case 1: |
16422 | TblID = Intrinsic::aarch64_neon_tbl1; |
16423 | break; |
16424 | case 2: |
16425 | TblID = Intrinsic::aarch64_neon_tbl2; |
16426 | break; |
16427 | case 3: |
16428 | TblID = Intrinsic::aarch64_neon_tbl3; |
16429 | break; |
16430 | } |
16431 | |
16432 | auto *F = Intrinsic::getDeclaration(M: TI->getModule(), id: TblID, Tys: VecTy); |
16433 | Parts.push_back(Elt: ConstantVector::get(V: MaskConst)); |
16434 | Results.push_back(Elt: Builder.CreateCall(Callee: F, Args: Parts)); |
16435 | } |
16436 | |
16437 | // Extract the destination vector from TBL result(s) after combining them |
16438 | // where applicable. Currently, at most two TBLs are supported. |
16439 | assert(Results.size() <= 2 && "Trunc lowering does not support generation of " |
16440 | "more than 2 tbl instructions!" ); |
16441 | Value *FinalResult = Results[0]; |
16442 | if (Results.size() == 1) { |
16443 | if (ElemsPerTbl < 16) { |
16444 | SmallVector<int> FinalMask(ElemsPerTbl); |
16445 | std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0); |
16446 | FinalResult = Builder.CreateShuffleVector(V: Results[0], Mask: FinalMask); |
16447 | } |
16448 | } else { |
16449 | SmallVector<int> FinalMask(ElemsPerTbl * Results.size()); |
16450 | if (ElemsPerTbl < 16) { |
16451 | std::iota(first: FinalMask.begin(), last: FinalMask.begin() + ElemsPerTbl, value: 0); |
16452 | std::iota(first: FinalMask.begin() + ElemsPerTbl, last: FinalMask.end(), value: 16); |
16453 | } else { |
16454 | std::iota(first: FinalMask.begin(), last: FinalMask.end(), value: 0); |
16455 | } |
16456 | FinalResult = |
16457 | Builder.CreateShuffleVector(V1: Results[0], V2: Results[1], Mask: FinalMask); |
16458 | } |
16459 | |
16460 | TI->replaceAllUsesWith(V: FinalResult); |
16461 | TI->eraseFromParent(); |
16462 | } |
16463 | |
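/// Try to replace fixed-width vector extend, int<->FP convert and truncate
/// instructions with shuffle-based sequences that select to TBL, when this is
/// expected to be profitable (in a loop header and not optimizing for size).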
16464 | bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( |
16465 | Instruction *I, Loop *L, const TargetTransformInfo &TTI) const { |
16466 | // shuffle_vector instructions are serialized when targeting SVE, |
16467 | // see LowerSPLAT_VECTOR. This peephole is not beneficial. |
16468 | if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors()) |
16469 | return false; |
16470 | |
16471 | // Try to optimize conversions using tbl. This requires materializing constant |
16472 | // index vectors, which can increase code size and add loads. Skip the |
16473 | // transform unless the conversion is in a loop block guaranteed to execute |
16474 | // and we are not optimizing for size. |
16475 | Function *F = I->getParent()->getParent(); |
16476 | if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || |
16477 | F->hasOptSize()) |
16478 | return false; |
16479 | |
16480 | auto *SrcTy = dyn_cast<FixedVectorType>(Val: I->getOperand(i: 0)->getType()); |
16481 | auto *DstTy = dyn_cast<FixedVectorType>(Val: I->getType()); |
16482 | if (!SrcTy || !DstTy) |
16483 | return false; |
16484 | |
16485 | // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be |
16486 | // lowered to tbl instructions to insert the original i8 elements |
16487 | // into i8x lanes. This is enabled for cases where it is beneficial. |
16488 | auto *ZExt = dyn_cast<ZExtInst>(Val: I); |
16489 | if (ZExt && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8)) { |
16490 | auto DstWidth = DstTy->getElementType()->getScalarSizeInBits(); |
16491 | if (DstWidth % 8 != 0) |
16492 | return false; |
16493 | |
16494 | auto *TruncDstType = |
16495 | cast<FixedVectorType>(Val: VectorType::getTruncatedElementVectorType(VTy: DstTy)); |
16496 | // If the ZExt can be lowered to a single ZExt to the next power-of-2 and |
16497 | // the remaining ZExt folded into the user, don't use tbl lowering. |
16498 | auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits(); |
16499 | if (TTI.getCastInstrCost(Opcode: I->getOpcode(), Dst: DstTy, Src: TruncDstType, |
16500 | CCH: TargetTransformInfo::getCastContextHint(I), |
16501 | CostKind: TTI::TCK_SizeAndLatency, I) == TTI::TCC_Free) { |
16502 | if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits()) |
16503 | return false; |
16504 | |
16505 | DstTy = TruncDstType; |
16506 | } |
16507 | IRBuilder<> Builder(ZExt); |
16508 | Value *Result = createTblShuffleForZExt( |
16509 | Builder, Op: ZExt->getOperand(i_nocapture: 0), ZExtTy: cast<FixedVectorType>(Val: ZExt->getType()), |
16510 | DstTy, IsLittleEndian: Subtarget->isLittleEndian()); |
16511 | if (!Result) |
16512 | return false; |
16513 | ZExt->replaceAllUsesWith(V: Result); |
16514 | ZExt->eraseFromParent(); |
16515 | return true; |
16516 | } |
16517 | |
16518 | auto *UIToFP = dyn_cast<UIToFPInst>(Val: I); |
16519 | if (UIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) && |
16520 | DstTy->getElementType()->isFloatTy()) { |
16521 | IRBuilder<> Builder(I); |
16522 | Value *ZExt = createTblShuffleForZExt( |
16523 | Builder, Op: I->getOperand(i: 0), ZExtTy: FixedVectorType::getInteger(VTy: DstTy), |
16524 | DstTy: FixedVectorType::getInteger(VTy: DstTy), IsLittleEndian: Subtarget->isLittleEndian()); |
16525 | assert(ZExt && "Cannot fail for the i8 to float conversion" ); |
16526 | auto *UI = Builder.CreateUIToFP(V: ZExt, DestTy: DstTy); |
16527 | I->replaceAllUsesWith(V: UI); |
16528 | I->eraseFromParent(); |
16529 | return true; |
16530 | } |
16531 | |
16532 | auto *SIToFP = dyn_cast<SIToFPInst>(Val: I); |
16533 | if (SIToFP && SrcTy->getElementType()->isIntegerTy(Bitwidth: 8) && |
16534 | DstTy->getElementType()->isFloatTy()) { |
16535 | IRBuilder<> Builder(I); |
16536 | auto *Shuffle = createTblShuffleForSExt(Builder, Op: I->getOperand(i: 0), |
16537 | DstTy: FixedVectorType::getInteger(VTy: DstTy), |
16538 | IsLittleEndian: Subtarget->isLittleEndian()); |
16539 | assert(Shuffle && "Cannot fail for the i8 to float conversion" ); |
16540 | auto *Cast = Builder.CreateBitCast(V: Shuffle, DestTy: VectorType::getInteger(VTy: DstTy)); |
16541 | auto *AShr = Builder.CreateAShr(LHS: Cast, RHS: 24, Name: "" , isExact: true); |
16542 | auto *SI = Builder.CreateSIToFP(V: AShr, DestTy: DstTy); |
16543 | I->replaceAllUsesWith(V: SI); |
16544 | I->eraseFromParent(); |
16545 | return true; |
16546 | } |
16547 | |
16548 | // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui |
16549 | // followed by a truncate lowered using tbl.4. |
16550 | auto *FPToUI = dyn_cast<FPToUIInst>(Val: I); |
16551 | if (FPToUI && |
16552 | (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) && |
16553 | SrcTy->getElementType()->isFloatTy() && |
16554 | DstTy->getElementType()->isIntegerTy(Bitwidth: 8)) { |
16555 | IRBuilder<> Builder(I); |
16556 | auto *WideConv = Builder.CreateFPToUI(V: FPToUI->getOperand(i_nocapture: 0), |
16557 | DestTy: VectorType::getInteger(VTy: SrcTy)); |
16558 | auto *TruncI = Builder.CreateTrunc(V: WideConv, DestTy: DstTy); |
16559 | I->replaceAllUsesWith(V: TruncI); |
16560 | I->eraseFromParent(); |
16561 | createTblForTrunc(TI: cast<TruncInst>(Val: TruncI), IsLittleEndian: Subtarget->isLittleEndian()); |
16562 | return true; |
16563 | } |
16564 | |
16565 | // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate |
16566 | // tbl instruction selecting the lowest/highest (little/big endian) 8 bits |
16567 | // per lane of the input that is represented using 1,2,3 or 4 128-bit table |
16568 | // registers |
16569 | auto *TI = dyn_cast<TruncInst>(Val: I); |
16570 | if (TI && DstTy->getElementType()->isIntegerTy(Bitwidth: 8) && |
16571 | ((SrcTy->getElementType()->isIntegerTy(Bitwidth: 32) || |
16572 | SrcTy->getElementType()->isIntegerTy(Bitwidth: 64)) && |
16573 | (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) { |
16574 | createTblForTrunc(TI, IsLittleEndian: Subtarget->isLittleEndian()); |
16575 | return true; |
16576 | } |
16577 | |
16578 | return false; |
16579 | } |
16580 | |
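/// Paired loads are available for simple 32- and 64-bit integer and FP types
/// and need no extra alignment.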
16581 | bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, |
16582 | Align &RequiredAligment) const { |
16583 | if (!LoadedType.isSimple() || |
16584 | (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) |
16585 | return false; |
16586 | // Cyclone supports unaligned accesses. |
16587 | RequiredAligment = Align(1); |
16588 | unsigned NumBits = LoadedType.getSizeInBits(); |
16589 | return NumBits == 32 || NumBits == 64; |
16590 | } |
16591 | |
16592 | /// A helper function for determining the number of interleaved accesses we |
16593 | /// will generate when lowering accesses of the given type. |
16594 | unsigned AArch64TargetLowering::getNumInterleavedAccesses( |
16595 | VectorType *VecTy, const DataLayout &DL, bool UseScalable) const { |
16596 | unsigned VecSize = 128; |
16597 | unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType()); |
16598 | unsigned MinElts = VecTy->getElementCount().getKnownMinValue(); |
16599 | if (UseScalable && isa<FixedVectorType>(Val: VecTy)) |
16600 | VecSize = std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u); |
16601 | return std::max<unsigned>(a: 1, b: (MinElts * ElSize + 127) / VecSize); |
16602 | } |
16603 | |
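/// On Falkor, propagate the strided-access metadata on a memory instruction
/// into the target-specific MOStridedAccess memory-operand flag.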
16604 | MachineMemOperand::Flags |
16605 | AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const { |
16606 | if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && |
16607 | I.hasMetadata(FALKOR_STRIDED_ACCESS_MD)) |
16608 | return MOStridedAccess; |
16609 | return MachineMemOperand::MONone; |
16610 | } |
16611 | |
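/// Return true if \p VecTy is a legal type for the ldN/stN interleaved-access
/// intrinsics, setting \p UseScalable when the SVE (scalable) forms should be
/// used instead of NEON.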
16612 | bool AArch64TargetLowering::isLegalInterleavedAccessType( |
16613 | VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const { |
16614 | unsigned ElSize = DL.getTypeSizeInBits(Ty: VecTy->getElementType()); |
16615 | auto EC = VecTy->getElementCount(); |
16616 | unsigned MinElts = EC.getKnownMinValue(); |
16617 | |
16618 | UseScalable = false; |
16619 | |
16620 | if (isa<FixedVectorType>(Val: VecTy) && !Subtarget->isNeonAvailable() && |
16621 | (!Subtarget->useSVEForFixedLengthVectors() || |
16622 | !getSVEPredPatternFromNumElements(MinNumElts: MinElts))) |
16623 | return false; |
16624 | |
16625 | if (isa<ScalableVectorType>(Val: VecTy) && |
16626 | !Subtarget->isSVEorStreamingSVEAvailable()) |
16627 | return false; |
16628 | |
16629 | // Ensure the number of vector elements is greater than 1. |
16630 | if (MinElts < 2) |
16631 | return false; |
16632 | |
16633 | // Ensure the element type is legal. |
16634 | if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) |
16635 | return false; |
16636 | |
16637 | if (EC.isScalable()) { |
16638 | UseScalable = true; |
16639 | return isPowerOf2_32(Value: MinElts) && (MinElts * ElSize) % 128 == 0; |
16640 | } |
16641 | |
16642 | unsigned VecSize = DL.getTypeSizeInBits(Ty: VecTy); |
16643 | if (Subtarget->useSVEForFixedLengthVectors()) { |
16644 | unsigned MinSVEVectorSize = |
16645 | std::max(a: Subtarget->getMinSVEVectorSizeInBits(), b: 128u); |
16646 | if (VecSize % MinSVEVectorSize == 0 || |
16647 | (VecSize < MinSVEVectorSize && isPowerOf2_32(Value: MinElts) && |
16648 | (!Subtarget->isNeonAvailable() || VecSize > 128))) { |
16649 | UseScalable = true; |
16650 | return true; |
16651 | } |
16652 | } |
16653 | |
16654 | // Ensure the total vector size is 64 or a multiple of 128. Types larger than |
16655 | // 128 will be split into multiple interleaved accesses. |
16656 | return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0); |
16657 | } |
16658 | |
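/// Map the element type of a fixed-length vector to the scalable vector type
/// with the same element type and a 128-bit known-minimum size, e.g. float to
/// <vscale x 4 x float>.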
16659 | static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) { |
16660 | if (VTy->getElementType() == Type::getDoubleTy(C&: VTy->getContext())) |
16661 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2); |
16662 | |
16663 | if (VTy->getElementType() == Type::getFloatTy(C&: VTy->getContext())) |
16664 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4); |
16665 | |
16666 | if (VTy->getElementType() == Type::getBFloatTy(C&: VTy->getContext())) |
16667 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8); |
16668 | |
16669 | if (VTy->getElementType() == Type::getHalfTy(C&: VTy->getContext())) |
16670 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8); |
16671 | |
16672 | if (VTy->getElementType() == Type::getInt64Ty(C&: VTy->getContext())) |
16673 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 2); |
16674 | |
16675 | if (VTy->getElementType() == Type::getInt32Ty(C&: VTy->getContext())) |
16676 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 4); |
16677 | |
16678 | if (VTy->getElementType() == Type::getInt16Ty(C&: VTy->getContext())) |
16679 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 8); |
16680 | |
16681 | if (VTy->getElementType() == Type::getInt8Ty(C&: VTy->getContext())) |
16682 | return ScalableVectorType::get(ElementType: VTy->getElementType(), MinNumElts: 16); |
16683 | |
16684 | llvm_unreachable("Cannot handle input vector type" ); |
16685 | } |
16686 | |
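/// Get the declaration of the structured-load intrinsic (SVE ldN_sret when
/// \p Scalable, otherwise NEON ldN) for interleave factor \p Factor.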
16687 | static Function *getStructuredLoadFunction(Module *M, unsigned Factor, |
16688 | bool Scalable, Type *LDVTy, |
16689 | Type *PtrTy) { |
16690 | assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor" ); |
16691 | static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret, |
16692 | Intrinsic::aarch64_sve_ld3_sret, |
16693 | Intrinsic::aarch64_sve_ld4_sret}; |
16694 | static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2, |
16695 | Intrinsic::aarch64_neon_ld3, |
16696 | Intrinsic::aarch64_neon_ld4}; |
16697 | if (Scalable) |
16698 | return Intrinsic::getDeclaration(M, id: SVELoads[Factor - 2], Tys: {LDVTy}); |
16699 | |
16700 | return Intrinsic::getDeclaration(M, id: NEONLoads[Factor - 2], Tys: {LDVTy, PtrTy}); |
16701 | } |
16702 | |
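/// Get the declaration of the structured-store intrinsic (SVE stN when
/// \p Scalable, otherwise NEON stN) for interleave factor \p Factor.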
16703 | static Function *getStructuredStoreFunction(Module *M, unsigned Factor, |
16704 | bool Scalable, Type *STVTy, |
16705 | Type *PtrTy) { |
16706 | assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor" ); |
16707 | static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2, |
16708 | Intrinsic::aarch64_sve_st3, |
16709 | Intrinsic::aarch64_sve_st4}; |
16710 | static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2, |
16711 | Intrinsic::aarch64_neon_st3, |
16712 | Intrinsic::aarch64_neon_st4}; |
16713 | if (Scalable) |
16714 | return Intrinsic::getDeclaration(M, id: SVEStores[Factor - 2], Tys: {STVTy}); |
16715 | |
16716 | return Intrinsic::getDeclaration(M, id: NEONStores[Factor - 2], Tys: {STVTy, PtrTy}); |
16717 | } |
16718 | |
16719 | /// Lower an interleaved load into a ldN intrinsic. |
16720 | /// |
16721 | /// E.g. Lower an interleaved load (Factor = 2): |
16722 | /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr |
16723 | /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements |
16724 | /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements |
16725 | /// |
16726 | /// Into: |
16727 | /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) |
16728 | /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 |
16729 | /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 |
16730 | bool AArch64TargetLowering::lowerInterleavedLoad( |
16731 | LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, |
16732 | ArrayRef<unsigned> Indices, unsigned Factor) const { |
16733 | assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && |
16734 | "Invalid interleave factor" ); |
16735 | assert(!Shuffles.empty() && "Empty shufflevector input" ); |
16736 | assert(Shuffles.size() == Indices.size() && |
16737 | "Unmatched number of shufflevectors and indices" ); |
16738 | |
16739 | const DataLayout &DL = LI->getDataLayout(); |
16740 | |
16741 | VectorType *VTy = Shuffles[0]->getType(); |
16742 | |
16743 | // Skip if we do not have NEON and skip illegal vector types. We can |
16744 | // "legalize" wide vector types into multiple interleaved accesses as long as |
16745 | // the vector types are divisible by 128. |
16746 | bool UseScalable; |
16747 | if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable)) |
16748 | return false; |
16749 | |
16750 | unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable); |
16751 | |
16752 | auto *FVTy = cast<FixedVectorType>(Val: VTy); |
16753 | |
16754 | // A pointer vector can not be the return type of the ldN intrinsics. Need to |
16755 | // load integer vectors first and then convert to pointer vectors. |
16756 | Type *EltTy = FVTy->getElementType(); |
16757 | if (EltTy->isPointerTy()) |
16758 | FVTy = |
16759 | FixedVectorType::get(ElementType: DL.getIntPtrType(EltTy), NumElts: FVTy->getNumElements()); |
16760 | |
16761 | // If we're going to generate more than one load, reset the sub-vector type |
16762 | // to something legal. |
16763 | FVTy = FixedVectorType::get(ElementType: FVTy->getElementType(), |
16764 | NumElts: FVTy->getNumElements() / NumLoads); |
16765 | |
16766 | auto *LDVTy = |
16767 | UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: FVTy)) : FVTy; |
16768 | |
16769 | IRBuilder<> Builder(LI); |
16770 | |
16771 | // The base address of the load. |
16772 | Value *BaseAddr = LI->getPointerOperand(); |
16773 | |
16774 | Type *PtrTy = LI->getPointerOperandType(); |
16775 | Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: LDVTy->getContext()), |
16776 | EC: LDVTy->getElementCount()); |
16777 | |
16778 | Function *LdNFunc = getStructuredLoadFunction(M: LI->getModule(), Factor, |
16779 | Scalable: UseScalable, LDVTy, PtrTy); |
16780 | |
16781 | // Holds sub-vectors extracted from the load intrinsic return values. The |
16782 | // sub-vectors are associated with the shufflevector instructions they will |
16783 | // replace. |
16784 | DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; |
16785 | |
16786 | Value *PTrue = nullptr; |
16787 | if (UseScalable) { |
16788 | std::optional<unsigned> PgPattern = |
16789 | getSVEPredPatternFromNumElements(MinNumElts: FVTy->getNumElements()); |
16790 | if (Subtarget->getMinSVEVectorSizeInBits() == |
16791 | Subtarget->getMaxSVEVectorSizeInBits() && |
16792 | Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(Ty: FVTy)) |
16793 | PgPattern = AArch64SVEPredPattern::all; |
16794 | |
16795 | auto *PTruePat = |
16796 | ConstantInt::get(Ty: Type::getInt32Ty(C&: LDVTy->getContext()), V: *PgPattern); |
16797 | PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy}, |
16798 | Args: {PTruePat}); |
16799 | } |
16800 | |
16801 | for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { |
16802 | |
16803 | // If we're generating more than one load, compute the base address of |
16804 | // subsequent loads as an offset from the previous. |
16805 | if (LoadCount > 0) |
16806 | BaseAddr = Builder.CreateConstGEP1_32(Ty: LDVTy->getElementType(), Ptr: BaseAddr, |
16807 | Idx0: FVTy->getNumElements() * Factor); |
16808 | |
16809 | CallInst *LdN; |
16810 | if (UseScalable) |
16811 | LdN = Builder.CreateCall(Callee: LdNFunc, Args: {PTrue, BaseAddr}, Name: "ldN" ); |
16812 | else |
16813 | LdN = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN" ); |
16814 | |
16815 | // Extract and store the sub-vectors returned by the load intrinsic. |
16816 | for (unsigned i = 0; i < Shuffles.size(); i++) { |
16817 | ShuffleVectorInst *SVI = Shuffles[i]; |
16818 | unsigned Index = Indices[i]; |
16819 | |
16820 | Value *SubVec = Builder.CreateExtractValue(Agg: LdN, Idxs: Index); |
16821 | |
16822 | if (UseScalable) |
16823 | SubVec = Builder.CreateExtractVector( |
16824 | DstType: FVTy, SrcVec: SubVec, |
16825 | Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: VTy->getContext()), V: 0)); |
16826 | |
16827 | // Convert the integer vector to pointer vector if the element is pointer. |
16828 | if (EltTy->isPointerTy()) |
16829 | SubVec = Builder.CreateIntToPtr( |
16830 | V: SubVec, DestTy: FixedVectorType::get(ElementType: SVI->getType()->getElementType(), |
16831 | NumElts: FVTy->getNumElements())); |
16832 | |
16833 | SubVecs[SVI].push_back(Elt: SubVec); |
16834 | } |
16835 | } |
16836 | |
16837 | // Replace uses of the shufflevector instructions with the sub-vectors |
16838 | // returned by the load intrinsic. If a shufflevector instruction is |
16839 | // associated with more than one sub-vector, those sub-vectors will be |
16840 | // concatenated into a single wide vector. |
16841 | for (ShuffleVectorInst *SVI : Shuffles) { |
16842 | auto &SubVec = SubVecs[SVI]; |
16843 | auto *WideVec = |
16844 | SubVec.size() > 1 ? concatenateVectors(Builder, Vecs: SubVec) : SubVec[0]; |
16845 | SVI->replaceAllUsesWith(V: WideVec); |
16846 | } |
16847 | |
16848 | return true; |
16849 | } |
16850 | |
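// Search from \p It towards \p End for a nearby store whose pointer strips to
// the same base as \p Ptr with a constant offset exactly 16 bytes away, giving
// up after a small number of instructions. This is used below to spot 64-bit
// st2 candidates that are better left as a zip + paired-store sequence.
//
// Illustrative IR sketch (hypothetical names and values, not from a real
// test):
//   store <4 x i32> %a, ptr %p
//   %p.16 = getelementptr inbounds i8, ptr %p, i64 16
//   store <4 x i32> %b, ptr %p.16
// Here the two stores are 16 bytes apart, so a query for %p returns true.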
16851 | template <typename Iter> |
16852 | bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { |
16853 | int MaxLookupDist = 20; |
16854 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: 0); |
16855 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
16856 | const Value *PtrA1 = |
16857 | Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA); |
16858 | |
16859 | while (++It != End) { |
16860 | if (It->isDebugOrPseudoInst()) |
16861 | continue; |
16862 | if (MaxLookupDist-- == 0) |
16863 | break; |
16864 | if (const auto *SI = dyn_cast<StoreInst>(&*It)) { |
16865 | const Value *PtrB1 = |
16866 | SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets( |
16867 | DL, OffsetB); |
16868 | if (PtrA1 == PtrB1 && |
16869 | (OffsetA.sextOrTrunc(width: IdxWidth) - OffsetB.sextOrTrunc(width: IdxWidth)) |
16870 | .abs() == 16) |
16871 | return true; |
16872 | } |
16873 | } |
16874 | |
16875 | return false; |
16876 | } |
16877 | |
16878 | /// Lower an interleaved store into a stN intrinsic. |
16879 | /// |
16880 | /// E.g. Lower an interleaved store (Factor = 3): |
16881 | /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, |
16882 | /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> |
16883 | /// store <12 x i32> %i.vec, <12 x i32>* %ptr |
16884 | /// |
16885 | /// Into: |
16886 | /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> |
16887 | /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> |
16888 | /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> |
16889 | /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) |
16890 | /// |
16891 | /// Note that the new shufflevectors will be removed and we'll only generate one |
16892 | /// st3 instruction in CodeGen. |
16893 | /// |
16894 | /// Example for a more general valid mask (Factor 3). Lower: |
16895 | /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, |
16896 | /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> |
16897 | /// store <12 x i32> %i.vec, <12 x i32>* %ptr |
16898 | /// |
16899 | /// Into: |
16900 | /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> |
16901 | /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> |
16902 | /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> |
16903 | /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) |
16904 | bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, |
16905 | ShuffleVectorInst *SVI, |
16906 | unsigned Factor) const { |
16907 | |
16908 | assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && |
16909 | "Invalid interleave factor" ); |
16910 | |
16911 | auto *VecTy = cast<FixedVectorType>(Val: SVI->getType()); |
16912 | assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store" ); |
16913 | |
16914 | unsigned LaneLen = VecTy->getNumElements() / Factor; |
16915 | Type *EltTy = VecTy->getElementType(); |
16916 | auto *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: LaneLen); |
16917 | |
16918 | const DataLayout &DL = SI->getDataLayout(); |
16919 | bool UseScalable; |
16920 | |
  // Skip if we do not have NEON, and skip illegal vector types. We can
  // "legalize" wide vector types into multiple interleaved accesses as long
  // as the vector size is divisible by 128 bits.
16924 | if (!isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable)) |
16925 | return false; |
16926 | |
16927 | unsigned NumStores = getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable); |
16928 | |
16929 | Value *Op0 = SVI->getOperand(i_nocapture: 0); |
16930 | Value *Op1 = SVI->getOperand(i_nocapture: 1); |
16931 | IRBuilder<> Builder(SI); |
16932 | |
16933 | // StN intrinsics don't support pointer vectors as arguments. Convert pointer |
16934 | // vectors to integer vectors. |
16935 | if (EltTy->isPointerTy()) { |
16936 | Type *IntTy = DL.getIntPtrType(EltTy); |
16937 | unsigned NumOpElts = |
16938 | cast<FixedVectorType>(Val: Op0->getType())->getNumElements(); |
16939 | |
16940 | // Convert to the corresponding integer vector. |
16941 | auto *IntVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: NumOpElts); |
16942 | Op0 = Builder.CreatePtrToInt(V: Op0, DestTy: IntVecTy); |
16943 | Op1 = Builder.CreatePtrToInt(V: Op1, DestTy: IntVecTy); |
16944 | |
16945 | SubVecTy = FixedVectorType::get(ElementType: IntTy, NumElts: LaneLen); |
16946 | } |
16947 | |
16948 | // If we're going to generate more than one store, reset the lane length |
16949 | // and sub-vector type to something legal. |
16950 | LaneLen /= NumStores; |
16951 | SubVecTy = FixedVectorType::get(ElementType: SubVecTy->getElementType(), NumElts: LaneLen); |
16952 | |
16953 | auto *STVTy = UseScalable ? cast<VectorType>(Val: getSVEContainerIRType(VTy: SubVecTy)) |
16954 | : SubVecTy; |
16955 | |
16956 | // The base address of the store. |
16957 | Value *BaseAddr = SI->getPointerOperand(); |
16958 | |
16959 | auto Mask = SVI->getShuffleMask(); |
16960 | |
  // Bail out if none of the indices are in range. If the shuffle mask is
  // `poison`, `Mask` may be a vector of -1s, and if all of its elements are
  // `poison` an out-of-bounds read would happen below.
16964 | if (llvm::all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) { |
16965 | return false; |
16966 | } |
  // A 64-bit st2 which does not start at element 0 will involve adding extra
  // ext elements, making the st2 unprofitable. Also, if there is a nearby
  // store that points to BaseAddr+16 or BaseAddr-16, the pattern can be better
  // left as a zip;stp pair, which has higher throughput.
16971 | if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && |
16972 | (Mask[0] != 0 || |
16973 | hasNearbyPairedStore(It: SI->getIterator(), End: SI->getParent()->end(), Ptr: BaseAddr, |
16974 | DL) || |
16975 | hasNearbyPairedStore(It: SI->getReverseIterator(), End: SI->getParent()->rend(), |
16976 | Ptr: BaseAddr, DL))) |
16977 | return false; |
16978 | |
16979 | Type *PtrTy = SI->getPointerOperandType(); |
16980 | Type *PredTy = VectorType::get(ElementType: Type::getInt1Ty(C&: STVTy->getContext()), |
16981 | EC: STVTy->getElementCount()); |
16982 | |
16983 | Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor, |
16984 | Scalable: UseScalable, STVTy, PtrTy); |
16985 | |
16986 | Value *PTrue = nullptr; |
16987 | if (UseScalable) { |
16988 | std::optional<unsigned> PgPattern = |
16989 | getSVEPredPatternFromNumElements(MinNumElts: SubVecTy->getNumElements()); |
16990 | if (Subtarget->getMinSVEVectorSizeInBits() == |
16991 | Subtarget->getMaxSVEVectorSizeInBits() && |
16992 | Subtarget->getMinSVEVectorSizeInBits() == |
16993 | DL.getTypeSizeInBits(Ty: SubVecTy)) |
16994 | PgPattern = AArch64SVEPredPattern::all; |
16995 | |
16996 | auto *PTruePat = |
16997 | ConstantInt::get(Ty: Type::getInt32Ty(C&: STVTy->getContext()), V: *PgPattern); |
16998 | PTrue = Builder.CreateIntrinsic(ID: Intrinsic::aarch64_sve_ptrue, Types: {PredTy}, |
16999 | Args: {PTruePat}); |
17000 | } |
17001 | |
17002 | for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { |
17003 | |
17004 | SmallVector<Value *, 5> Ops; |
17005 | |
17006 | // Split the shufflevector operands into sub vectors for the new stN call. |
17007 | for (unsigned i = 0; i < Factor; i++) { |
17008 | Value *Shuffle; |
17009 | unsigned IdxI = StoreCount * LaneLen * Factor + i; |
17010 | if (Mask[IdxI] >= 0) { |
17011 | Shuffle = Builder.CreateShuffleVector( |
17012 | V1: Op0, V2: Op1, Mask: createSequentialMask(Start: Mask[IdxI], NumInts: LaneLen, NumUndefs: 0)); |
17013 | } else { |
17014 | unsigned StartMask = 0; |
17015 | for (unsigned j = 1; j < LaneLen; j++) { |
17016 | unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i; |
17017 | if (Mask[IdxJ] >= 0) { |
17018 | StartMask = Mask[IdxJ] - j; |
17019 | break; |
17020 | } |
17021 | } |
        // Note: Filling undef gaps with random elements is OK, since those
        // elements were being written anyway (with undefs). In the case of
        // all undefs we default to using elements from 0.
        // Note: StartMask cannot be negative; that is checked in
        // isReInterleaveMask.
17027 | Shuffle = Builder.CreateShuffleVector( |
17028 | V1: Op0, V2: Op1, Mask: createSequentialMask(Start: StartMask, NumInts: LaneLen, NumUndefs: 0)); |
17029 | } |
17030 | |
17031 | if (UseScalable) |
17032 | Shuffle = Builder.CreateInsertVector( |
17033 | DstType: STVTy, SrcVec: UndefValue::get(T: STVTy), SubVec: Shuffle, |
17034 | Idx: ConstantInt::get(Ty: Type::getInt64Ty(C&: STVTy->getContext()), V: 0)); |
17035 | |
17036 | Ops.push_back(Elt: Shuffle); |
17037 | } |
17038 | |
17039 | if (UseScalable) |
17040 | Ops.push_back(Elt: PTrue); |
17041 | |
    // If we're generating more than one store, compute the base address of
    // subsequent stores as an offset from the previous one.
17044 | if (StoreCount > 0) |
17045 | BaseAddr = Builder.CreateConstGEP1_32(Ty: SubVecTy->getElementType(), |
17046 | Ptr: BaseAddr, Idx0: LaneLen * Factor); |
17047 | |
17048 | Ops.push_back(Elt: BaseAddr); |
17049 | Builder.CreateCall(Callee: StNFunc, Args: Ops); |
17050 | } |
17051 | return true; |
17052 | } |
17053 | |
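/// Lower a vector.deinterleave2 of a wide load into an ldN intrinsic.
///
/// Illustrative sketch only; the names, types and exact intrinsic mangling
/// below are assumptions for the sake of the example:
///   %wide = load <vscale x 8 x i32>, ptr %ptr
///   %deint = call {<vscale x 4 x i32>, <vscale x 4 x i32>}
///                @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
/// becomes, roughly, a single structured load:
///   %ldN = call {<vscale x 4 x i32>, <vscale x 4 x i32>}
///       @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> %pg, ptr %ptr)
/// where %pg is an all-true predicate.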
17054 | bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( |
17055 | IntrinsicInst *DI, LoadInst *LI) const { |
17056 | // Only deinterleave2 supported at present. |
17057 | if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2) |
17058 | return false; |
17059 | |
17060 | // Only a factor of 2 supported at present. |
17061 | const unsigned Factor = 2; |
17062 | |
17063 | VectorType *VTy = cast<VectorType>(Val: DI->getType()->getContainedType(i: 0)); |
17064 | const DataLayout &DL = DI->getDataLayout(); |
17065 | bool UseScalable; |
17066 | if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable)) |
17067 | return false; |
17068 | |
17069 | // TODO: Add support for using SVE instructions with fixed types later, using |
17070 | // the code from lowerInterleavedLoad to obtain the correct container type. |
17071 | if (UseScalable && !VTy->isScalableTy()) |
17072 | return false; |
17073 | |
17074 | unsigned NumLoads = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable); |
17075 | |
17076 | VectorType *LdTy = |
17077 | VectorType::get(ElementType: VTy->getElementType(), |
17078 | EC: VTy->getElementCount().divideCoefficientBy(RHS: NumLoads)); |
17079 | |
17080 | Type *PtrTy = LI->getPointerOperandType(); |
17081 | Function *LdNFunc = getStructuredLoadFunction(M: DI->getModule(), Factor, |
17082 | Scalable: UseScalable, LDVTy: LdTy, PtrTy); |
17083 | |
17084 | IRBuilder<> Builder(LI); |
17085 | |
17086 | Value *Pred = nullptr; |
17087 | if (UseScalable) |
17088 | Pred = |
17089 | Builder.CreateVectorSplat(EC: LdTy->getElementCount(), V: Builder.getTrue()); |
17090 | |
17091 | Value *BaseAddr = LI->getPointerOperand(); |
17092 | Value *Result; |
17093 | if (NumLoads > 1) { |
17094 | Value *Left = PoisonValue::get(T: VTy); |
17095 | Value *Right = PoisonValue::get(T: VTy); |
17096 | |
17097 | for (unsigned I = 0; I < NumLoads; ++I) { |
17098 | Value *Offset = Builder.getInt64(C: I * Factor); |
17099 | |
17100 | Value *Address = Builder.CreateGEP(Ty: LdTy, Ptr: BaseAddr, IdxList: {Offset}); |
17101 | Value *LdN = nullptr; |
17102 | if (UseScalable) |
17103 | LdN = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, Address}, Name: "ldN" ); |
17104 | else |
17105 | LdN = Builder.CreateCall(Callee: LdNFunc, Args: Address, Name: "ldN" ); |
17106 | |
17107 | Value *Idx = |
17108 | Builder.getInt64(C: I * LdTy->getElementCount().getKnownMinValue()); |
17109 | Left = Builder.CreateInsertVector( |
17110 | DstType: VTy, SrcVec: Left, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 0), Idx); |
17111 | Right = Builder.CreateInsertVector( |
17112 | DstType: VTy, SrcVec: Right, SubVec: Builder.CreateExtractValue(Agg: LdN, Idxs: 1), Idx); |
17113 | } |
17114 | |
17115 | Result = PoisonValue::get(T: DI->getType()); |
17116 | Result = Builder.CreateInsertValue(Agg: Result, Val: Left, Idxs: 0); |
17117 | Result = Builder.CreateInsertValue(Agg: Result, Val: Right, Idxs: 1); |
17118 | } else { |
17119 | if (UseScalable) |
17120 | Result = Builder.CreateCall(Callee: LdNFunc, Args: {Pred, BaseAddr}, Name: "ldN" ); |
17121 | else |
17122 | Result = Builder.CreateCall(Callee: LdNFunc, Args: BaseAddr, Name: "ldN" ); |
17123 | } |
17124 | |
17125 | DI->replaceAllUsesWith(V: Result); |
17126 | return true; |
17127 | } |
17128 | |
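/// Lower a store of a vector.interleave2 result into an stN intrinsic.
///
/// Illustrative sketch only; the names, types and exact intrinsic mangling
/// below are assumptions for the sake of the example:
///   %ileave = call <vscale x 8 x i32>
///       @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l,
///                                        <vscale x 4 x i32> %r)
///   store <vscale x 8 x i32> %ileave, ptr %ptr
/// becomes, roughly, a single structured store:
///   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %l,
///                                           <vscale x 4 x i32> %r,
///                                           <vscale x 4 x i1> %pg, ptr %ptr)
/// where %pg is an all-true predicate.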
17129 | bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( |
17130 | IntrinsicInst *II, StoreInst *SI) const { |
17131 | // Only interleave2 supported at present. |
17132 | if (II->getIntrinsicID() != Intrinsic::vector_interleave2) |
17133 | return false; |
17134 | |
17135 | // Only a factor of 2 supported at present. |
17136 | const unsigned Factor = 2; |
17137 | |
17138 | VectorType *VTy = cast<VectorType>(Val: II->getOperand(i_nocapture: 0)->getType()); |
17139 | const DataLayout &DL = II->getDataLayout(); |
17140 | bool UseScalable; |
17141 | if (!isLegalInterleavedAccessType(VecTy: VTy, DL, UseScalable)) |
17142 | return false; |
17143 | |
17144 | // TODO: Add support for using SVE instructions with fixed types later, using |
17145 | // the code from lowerInterleavedStore to obtain the correct container type. |
17146 | if (UseScalable && !VTy->isScalableTy()) |
17147 | return false; |
17148 | |
17149 | unsigned NumStores = getNumInterleavedAccesses(VecTy: VTy, DL, UseScalable); |
17150 | |
17151 | VectorType *StTy = |
17152 | VectorType::get(ElementType: VTy->getElementType(), |
17153 | EC: VTy->getElementCount().divideCoefficientBy(RHS: NumStores)); |
17154 | |
17155 | Type *PtrTy = SI->getPointerOperandType(); |
17156 | Function *StNFunc = getStructuredStoreFunction(M: SI->getModule(), Factor, |
17157 | Scalable: UseScalable, STVTy: StTy, PtrTy); |
17158 | |
17159 | IRBuilder<> Builder(SI); |
17160 | |
17161 | Value *BaseAddr = SI->getPointerOperand(); |
17162 | Value *Pred = nullptr; |
17163 | |
17164 | if (UseScalable) |
17165 | Pred = |
17166 | Builder.CreateVectorSplat(EC: StTy->getElementCount(), V: Builder.getTrue()); |
17167 | |
17168 | Value *L = II->getOperand(i_nocapture: 0); |
17169 | Value *R = II->getOperand(i_nocapture: 1); |
17170 | |
17171 | for (unsigned I = 0; I < NumStores; ++I) { |
17172 | Value *Address = BaseAddr; |
17173 | if (NumStores > 1) { |
17174 | Value *Offset = Builder.getInt64(C: I * Factor); |
17175 | Address = Builder.CreateGEP(Ty: StTy, Ptr: BaseAddr, IdxList: {Offset}); |
17176 | |
17177 | Value *Idx = |
17178 | Builder.getInt64(C: I * StTy->getElementCount().getKnownMinValue()); |
17179 | L = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 0), Idx); |
17180 | R = Builder.CreateExtractVector(DstType: StTy, SrcVec: II->getOperand(i_nocapture: 1), Idx); |
17181 | } |
17182 | |
17183 | if (UseScalable) |
17184 | Builder.CreateCall(Callee: StNFunc, Args: {L, R, Pred, Address}); |
17185 | else |
17186 | Builder.CreateCall(Callee: StNFunc, Args: {L, R, Address}); |
17187 | } |
17188 | |
17189 | return true; |
17190 | } |
17191 | |
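// Pick the preferred value type for expanding a memcpy/memmove/memset. As a
// rough illustration of the policy below (not an exhaustive description): a
// 16-byte-aligned memcpy of 64 bytes would typically be expanded with 128-bit
// (f128) accesses, an aligned memset of 32 bytes or more may use NEON v16i8
// stores, and an 8-byte copy falls back to a single i64 access.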
17192 | EVT AArch64TargetLowering::getOptimalMemOpType( |
17193 | const MemOp &Op, const AttributeList &FuncAttributes) const { |
17194 | bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat); |
17195 | bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; |
17196 | bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; |
  // Only use AdvSIMD to implement memsets of 32 bytes and above: for smaller
  // memsets it would take one instruction to materialize the v2i64 zero plus
  // one store (with a restrictive addressing mode), so just use i64 stores.
17200 | bool IsSmallMemset = Op.isMemset() && Op.size() < 32; |
17201 | auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { |
17202 | if (Op.isAligned(AlignCheck)) |
17203 | return true; |
17204 | unsigned Fast; |
17205 | return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1), |
17206 | Flags: MachineMemOperand::MONone, Fast: &Fast) && |
17207 | Fast; |
17208 | }; |
17209 | |
17210 | if (CanUseNEON && Op.isMemset() && !IsSmallMemset && |
17211 | AlignmentIsAcceptable(MVT::v16i8, Align(16))) |
17212 | return MVT::v16i8; |
17213 | if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) |
17214 | return MVT::f128; |
17215 | if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) |
17216 | return MVT::i64; |
17217 | if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) |
17218 | return MVT::i32; |
17219 | return MVT::Other; |
17220 | } |
17221 | |
17222 | LLT AArch64TargetLowering::getOptimalMemOpLLT( |
17223 | const MemOp &Op, const AttributeList &FuncAttributes) const { |
17224 | bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Kind: Attribute::NoImplicitFloat); |
17225 | bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; |
17226 | bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; |
  // Only use AdvSIMD to implement memsets of 32 bytes and above: for smaller
  // memsets it would take one instruction to materialize the v2i64 zero plus
  // one store (with a restrictive addressing mode), so just use i64 stores.
17230 | bool IsSmallMemset = Op.isMemset() && Op.size() < 32; |
17231 | auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) { |
17232 | if (Op.isAligned(AlignCheck)) |
17233 | return true; |
17234 | unsigned Fast; |
17235 | return allowsMisalignedMemoryAccesses(VT, AddrSpace: 0, Alignment: Align(1), |
17236 | Flags: MachineMemOperand::MONone, Fast: &Fast) && |
17237 | Fast; |
17238 | }; |
17239 | |
17240 | if (CanUseNEON && Op.isMemset() && !IsSmallMemset && |
17241 | AlignmentIsAcceptable(MVT::v2i64, Align(16))) |
17242 | return LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64); |
17243 | if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16))) |
17244 | return LLT::scalar(SizeInBits: 128); |
17245 | if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8))) |
17246 | return LLT::scalar(SizeInBits: 64); |
17247 | if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4))) |
17248 | return LLT::scalar(SizeInBits: 32); |
17249 | return LLT(); |
17250 | } |
17251 | |
17252 | // 12-bit optionally shifted immediates are legal for adds. |
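// For example (illustrative): 0xabc (a plain 12-bit value) and 0xabc000
// (0xabc shifted left by 12) are both legal add immediates, whereas 0xabcd
// cannot be encoded in a single add/sub and is rejected.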
17253 | bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { |
17254 | if (Immed == std::numeric_limits<int64_t>::min()) { |
17255 | LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed |
17256 | << ": avoid UB for INT64_MIN\n" ); |
17257 | return false; |
17258 | } |
17259 | // Same encoding for add/sub, just flip the sign. |
17260 | Immed = std::abs(i: Immed); |
17261 | bool IsLegal = ((Immed >> 12) == 0 || |
17262 | ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); |
17263 | LLVM_DEBUG(dbgs() << "Is " << Immed |
17264 | << " legal add imm: " << (IsLegal ? "yes" : "no" ) << "\n" ); |
17265 | return IsLegal; |
17266 | } |
17267 | |
17268 | bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const { |
17269 | // We will only emit addvl/inc* instructions for SVE2 |
17270 | if (!Subtarget->hasSVE2()) |
17271 | return false; |
17272 | |
17273 | // addvl's immediates are in terms of the number of bytes in a register. |
17274 | // Since there are 16 in the base supported size (128bits), we need to |
17275 | // divide the immediate by that much to give us a useful immediate to |
17276 | // multiply by vscale. We can't have a remainder as a result of this. |
17277 | if (Imm % 16 == 0) |
17278 | return isInt<6>(x: Imm / 16); |
17279 | |
17280 | // Inc[b|h|w|d] instructions take a pattern and a positive immediate |
17281 | // multiplier. For now, assume a pattern of 'all'. Incb would be a subset |
17282 | // of addvl as a result, so only take h|w|d into account. |
17283 | // Dec[h|w|d] will cover subtractions. |
17284 | // Immediates are in the range [1,16], so we can't do a 2's complement check. |
17285 | // FIXME: Can we make use of other patterns to cover other immediates? |
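  // Worked examples (illustrative): Imm == 32 is a multiple of 16 and maps to
  // addvl #2; Imm == 40 is not, but 40 / 8 == 5 <= 16, so it maps to inch with
  // multiplier 5; Imm == 7 matches none of the cases below and is rejected.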
17286 | |
17287 | // inch|dech |
17288 | if (Imm % 8 == 0) |
17289 | return std::abs(i: Imm / 8) <= 16; |
17290 | // incw|decw |
17291 | if (Imm % 4 == 0) |
17292 | return std::abs(i: Imm / 4) <= 16; |
17293 | // incd|decd |
17294 | if (Imm % 2 == 0) |
17295 | return std::abs(i: Imm / 2) <= 16; |
17296 | |
17297 | return false; |
17298 | } |
17299 | |
17300 | // Return false to prevent folding |
17301 | // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine, |
17302 | // if the folding leads to worse code. |
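// For example (illustrative): with c1 == 0x1000 (a legal add immediate) and
// c2 == 0x1001, the product c1*c2 == 0x1001000 is not a legal add immediate
// and needs a MOVZ+MOVK pair to materialize, so the fold is rejected.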
17303 | bool AArch64TargetLowering::isMulAddWithConstProfitable( |
17304 | SDValue AddNode, SDValue ConstNode) const { |
17305 | // Let the DAGCombiner decide for vector types and large types. |
17306 | const EVT VT = AddNode.getValueType(); |
17307 | if (VT.isVector() || VT.getScalarSizeInBits() > 64) |
17308 | return true; |
17309 | |
  // The fold is worse if c1 is a legal add immediate while c1*c2 is not, and
  // c1*c2 has to be materialized with at least two instructions.
17312 | const ConstantSDNode *C1Node = cast<ConstantSDNode>(Val: AddNode.getOperand(i: 1)); |
17313 | const ConstantSDNode *C2Node = cast<ConstantSDNode>(Val&: ConstNode); |
17314 | const int64_t C1 = C1Node->getSExtValue(); |
17315 | const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue(); |
17316 | if (!isLegalAddImmediate(Immed: C1) || isLegalAddImmediate(Immed: C1C2.getSExtValue())) |
17317 | return true; |
17318 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
17319 | // Adapt to the width of a register. |
17320 | unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64; |
17321 | AArch64_IMM::expandMOVImm(Imm: C1C2.getZExtValue(), BitSize, Insn); |
17322 | if (Insn.size() > 1) |
17323 | return false; |
17324 | |
17325 | // Default to true and let the DAGCombiner decide. |
17326 | return true; |
17327 | } |
17328 | |
17329 | // Integer comparisons are implemented with ADDS/SUBS, so the range of valid |
17330 | // immediates is the same as for an add or a sub. |
17331 | bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { |
17332 | return isLegalAddImmediate(Immed); |
17333 | } |
17334 | |
17335 | /// isLegalAddressingMode - Return true if the addressing mode represented |
17336 | /// by AM is legal for this target, for a load/store of the specified type. |
17337 | bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, |
17338 | const AddrMode &AMode, Type *Ty, |
17339 | unsigned AS, Instruction *I) const { |
17340 | // AArch64 has five basic addressing modes: |
17341 | // reg |
17342 | // reg + 9-bit signed offset |
17343 | // reg + SIZE_IN_BYTES * 12-bit unsigned offset |
17344 | // reg1 + reg2 |
17345 | // reg + SIZE_IN_BYTES * reg |
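  // In assembly these correspond roughly to (illustrative, for 64-bit loads):
  //   ldr x0, [x1]                  (reg)
  //   ldur x0, [x1, #-9]            (reg + 9-bit signed offset)
  //   ldr x0, [x1, #4088]           (reg + 8 * uimm12)
  //   ldr x0, [x1, x2]              (reg1 + reg2)
  //   ldr x0, [x1, x2, lsl #3]      (reg + 8 * reg)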
17346 | |
17347 | // No global is ever allowed as a base. |
17348 | if (AMode.BaseGV) |
17349 | return false; |
17350 | |
17351 | // No reg+reg+imm addressing. |
17352 | if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale) |
17353 | return false; |
17354 | |
17355 | // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and |
17356 | // `2*ScaledReg` into `BaseReg + ScaledReg` |
17357 | AddrMode AM = AMode; |
17358 | if (AM.Scale && !AM.HasBaseReg) { |
17359 | if (AM.Scale == 1) { |
17360 | AM.HasBaseReg = true; |
17361 | AM.Scale = 0; |
17362 | } else if (AM.Scale == 2) { |
17363 | AM.HasBaseReg = true; |
17364 | AM.Scale = 1; |
17365 | } else { |
17366 | return false; |
17367 | } |
17368 | } |
17369 | |
17370 | // A base register is required in all addressing modes. |
17371 | if (!AM.HasBaseReg) |
17372 | return false; |
17373 | |
17374 | if (Ty->isScalableTy()) { |
17375 | if (isa<ScalableVectorType>(Val: Ty)) { |
17376 | // See if we have a foldable vscale-based offset, for vector types which |
17377 | // are either legal or smaller than the minimum; more work will be |
17378 | // required if we need to consider addressing for types which need |
17379 | // legalization by splitting. |
17380 | uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8; |
17381 | if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale && |
17382 | (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 && |
17383 | isPowerOf2_64(Value: VecNumBytes)) |
17384 | return isInt<4>(x: AM.ScalableOffset / (int64_t)VecNumBytes); |
17385 | |
17386 | uint64_t VecElemNumBytes = |
17387 | DL.getTypeSizeInBits(Ty: cast<VectorType>(Val: Ty)->getElementType()) / 8; |
17388 | return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && |
17389 | (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes); |
17390 | } |
17391 | |
17392 | return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale; |
17393 | } |
17394 | |
17395 | // No scalable offsets allowed for non-scalable types. |
17396 | if (AM.ScalableOffset) |
17397 | return false; |
17398 | |
17399 | // check reg + imm case: |
17400 | // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 |
17401 | uint64_t NumBytes = 0; |
17402 | if (Ty->isSized()) { |
17403 | uint64_t NumBits = DL.getTypeSizeInBits(Ty); |
17404 | NumBytes = NumBits / 8; |
17405 | if (!isPowerOf2_64(Value: NumBits)) |
17406 | NumBytes = 0; |
17407 | } |
17408 | |
17409 | return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, Offset: AM.BaseOffs, |
17410 | Scale: AM.Scale); |
17411 | } |
17412 | |
// Check whether the two offsets belong to the same imm24 range and share the
// same high 12 bits; if so, the high part can be folded into the offset of an
// add instruction.
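// For example (illustrative): with MinOffset == 0x1234 and MaxOffset == 0x1300
// both offsets have the high part 0x1000, which is itself a legal add
// immediate, so 0x1000 is returned as the preferred new base offset.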
17415 | int64_t |
17416 | AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset, |
17417 | int64_t MaxOffset) const { |
17418 | int64_t HighPart = MinOffset & ~0xfffULL; |
17419 | if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(Immed: HighPart)) { |
17420 | // Rebase the value to an integer multiple of imm12. |
17421 | return HighPart; |
17422 | } |
17423 | |
17424 | return 0; |
17425 | } |
17426 | |
17427 | bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { |
17428 | // Consider splitting large offset of struct or array. |
17429 | return true; |
17430 | } |
17431 | |
17432 | bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( |
17433 | const MachineFunction &MF, EVT VT) const { |
17434 | VT = VT.getScalarType(); |
17435 | |
17436 | if (!VT.isSimple()) |
17437 | return false; |
17438 | |
17439 | switch (VT.getSimpleVT().SimpleTy) { |
17440 | case MVT::f16: |
17441 | return Subtarget->hasFullFP16(); |
17442 | case MVT::f32: |
17443 | case MVT::f64: |
17444 | return true; |
17445 | default: |
17446 | break; |
17447 | } |
17448 | |
17449 | return false; |
17450 | } |
17451 | |
17452 | bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, |
17453 | Type *Ty) const { |
17454 | switch (Ty->getScalarType()->getTypeID()) { |
17455 | case Type::FloatTyID: |
17456 | case Type::DoubleTyID: |
17457 | return true; |
17458 | default: |
17459 | return false; |
17460 | } |
17461 | } |
17462 | |
17463 | bool AArch64TargetLowering::generateFMAsInMachineCombiner( |
17464 | EVT VT, CodeGenOptLevel OptLevel) const { |
17465 | return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() && |
17466 | !useSVEForFixedLengthVectorVT(VT); |
17467 | } |
17468 | |
17469 | const MCPhysReg * |
17470 | AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { |
17471 | // LR is a callee-save register, but we must treat it as clobbered by any call |
17472 | // site. Hence we include LR in the scratch registers, which are in turn added |
17473 | // as implicit-defs for stackmaps and patchpoints. |
17474 | static const MCPhysReg ScratchRegs[] = { |
17475 | AArch64::X16, AArch64::X17, AArch64::LR, 0 |
17476 | }; |
17477 | return ScratchRegs; |
17478 | } |
17479 | |
17480 | ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const { |
17481 | static const MCPhysReg RCRegs[] = {AArch64::FPCR}; |
17482 | return RCRegs; |
17483 | } |
17484 | |
17485 | bool |
17486 | AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, |
17487 | CombineLevel Level) const { |
17488 | assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA || |
17489 | N->getOpcode() == ISD::SRL) && |
17490 | "Expected shift op" ); |
17491 | |
17492 | SDValue ShiftLHS = N->getOperand(Num: 0); |
17493 | EVT VT = N->getValueType(ResNo: 0); |
17494 | |
17495 | // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not |
17496 | // combine it with shift 'N' to let it be lowered to UBFX except: |
17497 | // ((x >> C) & mask) << C. |
17498 | if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && |
17499 | isa<ConstantSDNode>(Val: ShiftLHS.getOperand(i: 1))) { |
17500 | uint64_t TruncMask = ShiftLHS.getConstantOperandVal(i: 1); |
17501 | if (isMask_64(Value: TruncMask)) { |
17502 | SDValue AndLHS = ShiftLHS.getOperand(i: 0); |
17503 | if (AndLHS.getOpcode() == ISD::SRL) { |
17504 | if (auto *SRLC = dyn_cast<ConstantSDNode>(Val: AndLHS.getOperand(i: 1))) { |
17505 | if (N->getOpcode() == ISD::SHL) |
17506 | if (auto *SHLC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1))) |
17507 | return SRLC->getZExtValue() == SHLC->getZExtValue(); |
17508 | return false; |
17509 | } |
17510 | } |
17511 | } |
17512 | } |
17513 | return true; |
17514 | } |
17515 | |
17516 | bool AArch64TargetLowering::isDesirableToCommuteXorWithShift( |
17517 | const SDNode *N) const { |
17518 | assert(N->getOpcode() == ISD::XOR && |
17519 | (N->getOperand(0).getOpcode() == ISD::SHL || |
17520 | N->getOperand(0).getOpcode() == ISD::SRL) && |
17521 | "Expected XOR(SHIFT) pattern" ); |
17522 | |
17523 | // Only commute if the entire NOT mask is a hidden shifted mask. |
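  // For example (illustrative), for i32 values: xor (shl x, 8), 0xffffff00 has
  // MaskIdx == 8 and MaskLen == 24 == BitWidth - ShiftAmt, so the xor and the
  // shift are commuted, while xor (shl x, 8), 0xff00 (MaskLen == 8) is not.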
17524 | auto *XorC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
17525 | auto *ShiftC = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1)); |
17526 | if (XorC && ShiftC) { |
17527 | unsigned MaskIdx, MaskLen; |
17528 | if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) { |
17529 | unsigned ShiftAmt = ShiftC->getZExtValue(); |
17530 | unsigned BitWidth = N->getValueType(ResNo: 0).getScalarSizeInBits(); |
17531 | if (N->getOperand(Num: 0).getOpcode() == ISD::SHL) |
17532 | return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt); |
17533 | return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt); |
17534 | } |
17535 | } |
17536 | |
17537 | return false; |
17538 | } |
17539 | |
17540 | bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask( |
17541 | const SDNode *N, CombineLevel Level) const { |
17542 | assert(((N->getOpcode() == ISD::SHL && |
17543 | N->getOperand(0).getOpcode() == ISD::SRL) || |
17544 | (N->getOpcode() == ISD::SRL && |
17545 | N->getOperand(0).getOpcode() == ISD::SHL)) && |
17546 | "Expected shift-shift mask" ); |
17547 | // Don't allow multiuse shift folding with the same shift amount. |
17548 | if (!N->getOperand(Num: 0)->hasOneUse()) |
17549 | return false; |
17550 | |
17551 | // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns. |
17552 | EVT VT = N->getValueType(ResNo: 0); |
17553 | if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) { |
17554 | auto *C1 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0).getOperand(i: 1)); |
17555 | auto *C2 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
17556 | return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue()); |
17557 | } |
17558 | |
17559 | return true; |
17560 | } |
17561 | |
17562 | bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant( |
17563 | unsigned BinOpcode, EVT VT) const { |
17564 | return VT.isScalableVector() && isTypeLegal(VT); |
17565 | } |
17566 | |
17567 | bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, |
17568 | Type *Ty) const { |
17569 | assert(Ty->isIntegerTy()); |
17570 | |
17571 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
17572 | if (BitSize == 0) |
17573 | return false; |
17574 | |
17575 | int64_t Val = Imm.getSExtValue(); |
17576 | if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: BitSize)) |
17577 | return true; |
17578 | |
17579 | if ((int64_t)Val < 0) |
17580 | Val = ~Val; |
17581 | if (BitSize == 32) |
17582 | Val &= (1LL << 32) - 1; |
17583 | |
17584 | unsigned Shift = llvm::Log2_64(Value: (uint64_t)Val) / 16; |
17585 | // MOVZ is free so return true for one or fewer MOVK. |
17586 | return Shift < 3; |
17587 | } |
17588 | |
bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                    unsigned Index) const {
17591 | if (!isOperationLegalOrCustom(Op: ISD::EXTRACT_SUBVECTOR, VT: ResVT)) |
17592 | return false; |
17593 | |
17594 | return (Index == 0 || Index == ResVT.getVectorMinNumElements()); |
17595 | } |
17596 | |
17597 | /// Turn vector tests of the signbit in the form of: |
17598 | /// xor (sra X, elt_size(X)-1), -1 |
17599 | /// into: |
17600 | /// cmge X, X, #0 |
17601 | static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, |
17602 | const AArch64Subtarget *Subtarget) { |
17603 | EVT VT = N->getValueType(ResNo: 0); |
17604 | if (!Subtarget->hasNEON() || !VT.isVector()) |
17605 | return SDValue(); |
17606 | |
17607 | // There must be a shift right algebraic before the xor, and the xor must be a |
17608 | // 'not' operation. |
17609 | SDValue Shift = N->getOperand(Num: 0); |
17610 | SDValue Ones = N->getOperand(Num: 1); |
17611 | if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || |
17612 | !ISD::isBuildVectorAllOnes(N: Ones.getNode())) |
17613 | return SDValue(); |
17614 | |
17615 | // The shift should be smearing the sign bit across each vector element. |
17616 | auto *ShiftAmt = dyn_cast<ConstantSDNode>(Val: Shift.getOperand(i: 1)); |
17617 | EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); |
17618 | if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) |
17619 | return SDValue(); |
17620 | |
17621 | return DAG.getNode(Opcode: AArch64ISD::CMGEz, DL: SDLoc(N), VT, Operand: Shift.getOperand(i: 0)); |
17622 | } |
17623 | |
// Given a vecreduce_add node, detect the below pattern and convert it to the
// node sequence with UABDL, [S|U]ABD and UADDLP.
17626 | // |
17627 | // i32 vecreduce_add( |
17628 | // v16i32 abs( |
17629 | // v16i32 sub( |
17630 | // v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b)))) |
17631 | // =================> |
17632 | // i32 vecreduce_add( |
17633 | // v4i32 UADDLP( |
17634 | // v8i16 add( |
17635 | // v8i16 zext( |
17636 | // v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b |
17637 | // v8i16 zext( |
17638 | // v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b |
17639 | static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, |
17640 | SelectionDAG &DAG) { |
17641 | // Assumed i32 vecreduce_add |
17642 | if (N->getValueType(ResNo: 0) != MVT::i32) |
17643 | return SDValue(); |
17644 | |
17645 | SDValue VecReduceOp0 = N->getOperand(Num: 0); |
17646 | unsigned Opcode = VecReduceOp0.getOpcode(); |
17647 | // Assumed v16i32 abs |
17648 | if (Opcode != ISD::ABS || VecReduceOp0->getValueType(ResNo: 0) != MVT::v16i32) |
17649 | return SDValue(); |
17650 | |
17651 | SDValue ABS = VecReduceOp0; |
17652 | // Assumed v16i32 sub |
17653 | if (ABS->getOperand(Num: 0)->getOpcode() != ISD::SUB || |
17654 | ABS->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32) |
17655 | return SDValue(); |
17656 | |
17657 | SDValue SUB = ABS->getOperand(Num: 0); |
17658 | unsigned Opcode0 = SUB->getOperand(Num: 0).getOpcode(); |
17659 | unsigned Opcode1 = SUB->getOperand(Num: 1).getOpcode(); |
17660 | // Assumed v16i32 type |
17661 | if (SUB->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i32 || |
17662 | SUB->getOperand(Num: 1)->getValueType(ResNo: 0) != MVT::v16i32) |
17663 | return SDValue(); |
17664 | |
17665 | // Assumed zext or sext |
17666 | bool IsZExt = false; |
17667 | if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) { |
17668 | IsZExt = true; |
17669 | } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) { |
17670 | IsZExt = false; |
17671 | } else |
17672 | return SDValue(); |
17673 | |
17674 | SDValue EXT0 = SUB->getOperand(Num: 0); |
17675 | SDValue EXT1 = SUB->getOperand(Num: 1); |
17676 | // Assumed zext's operand has v16i8 type |
17677 | if (EXT0->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8 || |
17678 | EXT1->getOperand(Num: 0)->getValueType(ResNo: 0) != MVT::v16i8) |
17679 | return SDValue(); |
17680 | |
  // Pattern is detected. Let's convert it to a sequence of nodes.
17682 | SDLoc DL(N); |
17683 | |
17684 | // First, create the node pattern of UABD/SABD. |
17685 | SDValue UABDHigh8Op0 = |
17686 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0), |
17687 | N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64)); |
17688 | SDValue UABDHigh8Op1 = |
17689 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0), |
17690 | N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64)); |
17691 | SDValue UABDHigh8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8, |
17692 | N1: UABDHigh8Op0, N2: UABDHigh8Op1); |
17693 | SDValue UABDL = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDHigh8); |
17694 | |
17695 | // Second, create the node pattern of UABAL. |
17696 | SDValue UABDLo8Op0 = |
17697 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT0->getOperand(Num: 0), |
17698 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
17699 | SDValue UABDLo8Op1 = |
17700 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: EXT1->getOperand(Num: 0), |
17701 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
17702 | SDValue UABDLo8 = DAG.getNode(Opcode: IsZExt ? ISD::ABDU : ISD::ABDS, DL, VT: MVT::v8i8, |
17703 | N1: UABDLo8Op0, N2: UABDLo8Op1); |
17704 | SDValue ZExtUABD = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::v8i16, Operand: UABDLo8); |
17705 | SDValue UABAL = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::v8i16, N1: UABDL, N2: ZExtUABD); |
17706 | |
17707 | // Third, create the node of UADDLP. |
17708 | SDValue UADDLP = DAG.getNode(Opcode: AArch64ISD::UADDLP, DL, VT: MVT::v4i32, Operand: UABAL); |
17709 | |
17710 | // Fourth, create the node of VECREDUCE_ADD. |
17711 | return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i32, Operand: UADDLP); |
17712 | } |
17713 | |
17714 | // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce |
17715 | // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one)) |
17716 | // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B)) |
// If we have vectors larger than v16i8 we extract v16i8 subvectors, follow the
// same steps as above to get DOT instructions, concatenate them, and generate
// vecreduce.add(concat_vector(DOT, DOT2, ..)).
17720 | static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, |
17721 | const AArch64Subtarget *ST) { |
17722 | if (!ST->isNeonAvailable()) |
17723 | return SDValue(); |
17724 | |
17725 | if (!ST->hasDotProd()) |
17726 | return performVecReduceAddCombineWithUADDLP(N, DAG); |
17727 | |
17728 | SDValue Op0 = N->getOperand(Num: 0); |
17729 | if (N->getValueType(ResNo: 0) != MVT::i32 || Op0.getValueType().isScalableVT() || |
17730 | Op0.getValueType().getVectorElementType() != MVT::i32) |
17731 | return SDValue(); |
17732 | |
17733 | unsigned ExtOpcode = Op0.getOpcode(); |
17734 | SDValue A = Op0; |
17735 | SDValue B; |
17736 | if (ExtOpcode == ISD::MUL) { |
17737 | A = Op0.getOperand(i: 0); |
17738 | B = Op0.getOperand(i: 1); |
17739 | if (A.getOpcode() != B.getOpcode() || |
17740 | A.getOperand(i: 0).getValueType() != B.getOperand(i: 0).getValueType()) |
17741 | return SDValue(); |
17742 | ExtOpcode = A.getOpcode(); |
17743 | } |
17744 | if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND) |
17745 | return SDValue(); |
17746 | |
17747 | EVT Op0VT = A.getOperand(i: 0).getValueType(); |
17748 | bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0; |
17749 | bool IsValidSize = Op0VT.getScalarSizeInBits() == 8; |
17750 | if (!IsValidElementCount || !IsValidSize) |
17751 | return SDValue(); |
17752 | |
17753 | SDLoc DL(Op0); |
17754 | // For non-mla reductions B can be set to 1. For MLA we take the operand of |
17755 | // the extend B. |
17756 | if (!B) |
17757 | B = DAG.getConstant(Val: 1, DL, VT: Op0VT); |
17758 | else |
17759 | B = B.getOperand(i: 0); |
17760 | |
  bool IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17762 | unsigned NumOfVecReduce; |
17763 | EVT TargetType; |
17764 | if (IsMultipleOf16) { |
17765 | NumOfVecReduce = Op0VT.getVectorNumElements() / 16; |
17766 | TargetType = MVT::v4i32; |
17767 | } else { |
17768 | NumOfVecReduce = Op0VT.getVectorNumElements() / 8; |
17769 | TargetType = MVT::v2i32; |
17770 | } |
17771 | auto DotOpcode = |
17772 | (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT; |
17773 | // Handle the case where we need to generate only one Dot operation. |
17774 | if (NumOfVecReduce == 1) { |
17775 | SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: TargetType); |
17776 | SDValue Dot = DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, |
17777 | N2: A.getOperand(i: 0), N3: B); |
17778 | return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot); |
17779 | } |
17780 | // Generate Dot instructions that are multiple of 16. |
17781 | unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16; |
17782 | SmallVector<SDValue, 4> SDotVec16; |
17783 | unsigned I = 0; |
17784 | for (; I < VecReduce16Num; I += 1) { |
17785 | SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v4i32); |
17786 | SDValue Op0 = |
17787 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: A.getOperand(i: 0), |
17788 | N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64)); |
17789 | SDValue Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v16i8, N1: B, |
17790 | N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64)); |
17791 | SDValue Dot = |
17792 | DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Op0, N3: Op1); |
17793 | SDotVec16.push_back(Elt: Dot); |
17794 | } |
17795 | // Concatenate dot operations. |
17796 | EVT SDot16EVT = |
17797 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i32, NumElements: 4 * VecReduce16Num); |
17798 | SDValue ConcatSDot16 = |
17799 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: SDot16EVT, Ops: SDotVec16); |
17800 | SDValue VecReduceAdd16 = |
17801 | DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: ConcatSDot16); |
17802 | unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8; |
17803 | if (VecReduce8Num == 0) |
17804 | return VecReduceAdd16; |
17805 | |
17806 | // Generate the remainder Dot operation that is multiple of 8. |
17807 | SmallVector<SDValue, 4> SDotVec8; |
17808 | SDValue Zeros = DAG.getConstant(Val: 0, DL, VT: MVT::v2i32); |
17809 | SDValue Vec8Op0 = |
17810 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: A.getOperand(i: 0), |
17811 | N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64)); |
17812 | SDValue Vec8Op1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v8i8, N1: B, |
17813 | N2: DAG.getConstant(Val: I * 16, DL, VT: MVT::i64)); |
17814 | SDValue Dot = |
17815 | DAG.getNode(Opcode: DotOpcode, DL, VT: Zeros.getValueType(), N1: Zeros, N2: Vec8Op0, N3: Vec8Op1); |
  SDValue VecReduceAdd8 =
      DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: N->getValueType(ResNo: 0), Operand: Dot);
  return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: VecReduceAdd16,
                     N2: VecReduceAdd8);
17820 | } |
17821 | |
17822 | // Given an (integer) vecreduce, we know the order of the inputs does not |
17823 | // matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x)))) |
17824 | // into UADDV(UADDLP(x)). This can also happen through an extra add, where we |
17825 | // transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))). |
17826 | static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) { |
  auto DetectAddExtract = [&](SDValue A) {
17828 | // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning |
17829 | // UADDLP(x) if found. |
17830 | assert(A.getOpcode() == ISD::ADD); |
17831 | EVT VT = A.getValueType(); |
17832 | SDValue Op0 = A.getOperand(i: 0); |
17833 | SDValue Op1 = A.getOperand(i: 1); |
    if (Op0.getOpcode() != Op1.getOpcode() ||
17835 | (Op0.getOpcode() != ISD::ZERO_EXTEND && |
17836 | Op0.getOpcode() != ISD::SIGN_EXTEND)) |
17837 | return SDValue(); |
17838 | SDValue Ext0 = Op0.getOperand(i: 0); |
17839 | SDValue Ext1 = Op1.getOperand(i: 0); |
17840 | if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR || |
17841 | Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR || |
17842 | Ext0.getOperand(i: 0) != Ext1.getOperand(i: 0)) |
17843 | return SDValue(); |
    // Check that the extract source type is twice the add type, and that the
    // extracts are from the upper/lower halves of the same source.
17846 | if (Ext0.getOperand(i: 0).getValueType().getVectorNumElements() != |
17847 | VT.getVectorNumElements() * 2) |
17848 | return SDValue(); |
17849 | if ((Ext0.getConstantOperandVal(i: 1) != 0 || |
17850 | Ext1.getConstantOperandVal(i: 1) != VT.getVectorNumElements()) && |
17851 | (Ext1.getConstantOperandVal(i: 1) != 0 || |
17852 | Ext0.getConstantOperandVal(i: 1) != VT.getVectorNumElements())) |
17853 | return SDValue(); |
17854 | unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP |
17855 | : AArch64ISD::SADDLP; |
17856 | return DAG.getNode(Opcode, DL: SDLoc(A), VT, Operand: Ext0.getOperand(i: 0)); |
17857 | }; |
17858 | |
17859 | if (SDValue R = DetectAddExtract(A)) |
17860 | return R; |
17861 | |
17862 | if (A.getOperand(i: 0).getOpcode() == ISD::ADD && A.getOperand(i: 0).hasOneUse()) |
17863 | if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 0), DAG)) |
17864 | return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R, |
17865 | N2: A.getOperand(i: 1)); |
17866 | if (A.getOperand(i: 1).getOpcode() == ISD::ADD && A.getOperand(i: 1).hasOneUse()) |
17867 | if (SDValue R = performUADDVAddCombine(A: A.getOperand(i: 1), DAG)) |
17868 | return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(A), VT: A.getValueType(), N1: R, |
17869 | N2: A.getOperand(i: 0)); |
17870 | return SDValue(); |
17871 | } |
17872 | |
17873 | // We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into |
17874 | // UADDLV(concat), where the concat represents the 64-bit zext sources. |
17875 | static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) { |
17876 | // Look for add(zext(64-bit source), zext(64-bit source)), returning |
17877 | // UADDLV(concat(zext, zext)) if found. |
17878 | assert(A.getOpcode() == ISD::ADD); |
17879 | EVT VT = A.getValueType(); |
17880 | if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) |
17881 | return SDValue(); |
17882 | SDValue Op0 = A.getOperand(i: 0); |
17883 | SDValue Op1 = A.getOperand(i: 1); |
17884 | if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode()) |
17885 | return SDValue(); |
17886 | SDValue Ext0 = Op0.getOperand(i: 0); |
17887 | SDValue Ext1 = Op1.getOperand(i: 0); |
17888 | EVT ExtVT0 = Ext0.getValueType(); |
17889 | EVT ExtVT1 = Ext1.getValueType(); |
  // Check that the zext source VTs are the same and are 64 bits in length.
17891 | if (ExtVT0 != ExtVT1 || |
17892 | VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits())) |
17893 | return SDValue(); |
17894 | // Get VT for concat of zext sources. |
17895 | EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
17896 | SDValue Concat = |
17897 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(A), VT: PairVT, N1: Ext0, N2: Ext1); |
17898 | |
17899 | switch (VT.getSimpleVT().SimpleTy) { |
17900 | case MVT::v2i64: |
17901 | case MVT::v4i32: |
17902 | return DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT, Operand: Concat); |
17903 | case MVT::v8i16: { |
17904 | SDValue Uaddlv = |
17905 | DAG.getNode(Opcode: AArch64ISD::UADDLV, DL: SDLoc(A), VT: MVT::v4i32, Operand: Concat); |
17906 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(A), VT: MVT::v8i16, Operand: Uaddlv); |
17907 | } |
17908 | default: |
17909 | llvm_unreachable("Unhandled vector type" ); |
17910 | } |
17911 | } |
17912 | |
17913 | static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { |
17914 | SDValue A = N->getOperand(Num: 0); |
17915 | if (A.getOpcode() == ISD::ADD) { |
17916 | if (SDValue R = performUADDVAddCombine(A, DAG)) |
17917 | return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: R); |
17918 | else if (SDValue R = performUADDVZextCombine(A, DAG)) |
17919 | return R; |
17920 | } |
17921 | return SDValue(); |
17922 | } |
17923 | |
17924 | static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, |
17925 | TargetLowering::DAGCombinerInfo &DCI, |
17926 | const AArch64Subtarget *Subtarget) { |
17927 | if (DCI.isBeforeLegalizeOps()) |
17928 | return SDValue(); |
17929 | |
17930 | return foldVectorXorShiftIntoCmp(N, DAG, Subtarget); |
17931 | } |
17932 | |
17933 | SDValue |
17934 | AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, |
17935 | SelectionDAG &DAG, |
17936 | SmallVectorImpl<SDNode *> &Created) const { |
17937 | AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); |
17938 | if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr)) |
17939 | return SDValue(N, 0); // Lower SDIV as SDIV |
17940 | |
17941 | EVT VT = N->getValueType(ResNo: 0); |
17942 | |
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
17945 | if (VT.isScalableVector() || |
17946 | (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors())) |
17947 | return SDValue(N, 0); |
17948 | |
17949 | // fold (sdiv X, pow2) |
17950 | if ((VT != MVT::i32 && VT != MVT::i64) || |
17951 | !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) |
17952 | return SDValue(); |
17953 | |
17954 | // If the divisor is 2 or -2, the default expansion is better. It will add |
17955 | // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right. |
17956 | if (Divisor == 2 || |
17957 | Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true)) |
17958 | return SDValue(); |
17959 | |
17960 | return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created); |
17961 | } |
17962 | |
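// Build an SREM-by-power-of-two sequence around CSNEG. As a rough illustration
// (hypothetical register assignment, not actual compiler output), an i64
// "srem %x, 8" becomes something like:
//   negs  x8, x0
//   and   x9, x0, #0x7
//   and   x8, x8, #0x7
//   csneg x0, x9, x8, mi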
17963 | SDValue |
17964 | AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor, |
17965 | SelectionDAG &DAG, |
17966 | SmallVectorImpl<SDNode *> &Created) const { |
17967 | AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); |
17968 | if (isIntDivCheap(VT: N->getValueType(ResNo: 0), Attr)) |
17969 | return SDValue(N, 0); // Lower SREM as SREM |
17970 | |
17971 | EVT VT = N->getValueType(ResNo: 0); |
17972 | |
  // For scalable and fixed types, mark them as cheap so we can handle them
  // much later. This allows us to handle larger-than-legal types.
17975 | if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors()) |
17976 | return SDValue(N, 0); |
17977 | |
17978 | // fold (srem X, pow2) |
17979 | if ((VT != MVT::i32 && VT != MVT::i64) || |
17980 | !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2())) |
17981 | return SDValue(); |
17982 | |
17983 | unsigned Lg2 = Divisor.countr_zero(); |
17984 | if (Lg2 == 0) |
17985 | return SDValue(); |
17986 | |
17987 | SDLoc DL(N); |
17988 | SDValue N0 = N->getOperand(Num: 0); |
17989 | SDValue Pow2MinusOne = DAG.getConstant(Val: (1ULL << Lg2) - 1, DL, VT); |
17990 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT); |
17991 | SDValue CCVal, CSNeg; |
17992 | if (Lg2 == 1) { |
17993 | SDValue Cmp = getAArch64Cmp(LHS: N0, RHS: Zero, CC: ISD::SETGE, AArch64cc&: CCVal, DAG, dl: DL); |
17994 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne); |
17995 | CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: And, N2: And, N3: CCVal, N4: Cmp); |
17996 | |
17997 | Created.push_back(Elt: Cmp.getNode()); |
17998 | Created.push_back(Elt: And.getNode()); |
17999 | } else { |
18000 | SDValue CCVal = DAG.getConstant(Val: AArch64CC::MI, DL, VT: MVT_CC); |
18001 | SDVTList VTs = DAG.getVTList(VT1: VT, VT2: MVT::i32); |
18002 | |
18003 | SDValue Negs = DAG.getNode(Opcode: AArch64ISD::SUBS, DL, VTList: VTs, N1: Zero, N2: N0); |
18004 | SDValue AndPos = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: N0, N2: Pow2MinusOne); |
18005 | SDValue AndNeg = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Negs, N2: Pow2MinusOne); |
18006 | CSNeg = DAG.getNode(Opcode: AArch64ISD::CSNEG, DL, VT, N1: AndPos, N2: AndNeg, N3: CCVal, |
18007 | N4: Negs.getValue(R: 1)); |
18008 | |
18009 | Created.push_back(Elt: Negs.getNode()); |
18010 | Created.push_back(Elt: AndPos.getNode()); |
18011 | Created.push_back(Elt: AndNeg.getNode()); |
18012 | } |
18013 | |
18014 | return CSNeg; |
18015 | } |
18016 | |
18017 | static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) { |
18018 | switch(getIntrinsicID(N: S.getNode())) { |
18019 | default: |
18020 | break; |
18021 | case Intrinsic::aarch64_sve_cntb: |
18022 | return 8; |
18023 | case Intrinsic::aarch64_sve_cnth: |
18024 | return 16; |
18025 | case Intrinsic::aarch64_sve_cntw: |
18026 | return 32; |
18027 | case Intrinsic::aarch64_sve_cntd: |
18028 | return 64; |
18029 | } |
18030 | return {}; |
18031 | } |
18032 | |
18033 | /// Calculates what the pre-extend type is, based on the extension |
18034 | /// operation node provided by \p Extend. |
18035 | /// |
18036 | /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the |
18037 | /// pre-extend type is pulled directly from the operand, while other extend |
18038 | /// operations need a bit more inspection to get this information. |
18039 | /// |
18040 | /// \param Extend The SDNode from the DAG that represents the extend operation |
18041 | /// |
18042 | /// \returns The type representing the \p Extend source type, or \p MVT::Other |
18043 | /// if no valid type can be determined |
18044 | static EVT calculatePreExtendType(SDValue Extend) { |
18045 | switch (Extend.getOpcode()) { |
18046 | case ISD::SIGN_EXTEND: |
18047 | case ISD::ZERO_EXTEND: |
18048 | return Extend.getOperand(i: 0).getValueType(); |
18049 | case ISD::AssertSext: |
18050 | case ISD::AssertZext: |
18051 | case ISD::SIGN_EXTEND_INREG: { |
18052 | VTSDNode *TypeNode = dyn_cast<VTSDNode>(Val: Extend.getOperand(i: 1)); |
18053 | if (!TypeNode) |
18054 | return MVT::Other; |
18055 | return TypeNode->getVT(); |
18056 | } |
18057 | case ISD::AND: { |
18058 | ConstantSDNode *Constant = |
18059 | dyn_cast<ConstantSDNode>(Val: Extend.getOperand(i: 1).getNode()); |
18060 | if (!Constant) |
18061 | return MVT::Other; |
18062 | |
18063 | uint32_t Mask = Constant->getZExtValue(); |
18064 | |
18065 | if (Mask == UCHAR_MAX) |
18066 | return MVT::i8; |
18067 | else if (Mask == USHRT_MAX) |
18068 | return MVT::i16; |
18069 | else if (Mask == UINT_MAX) |
18070 | return MVT::i32; |
18071 | |
18072 | return MVT::Other; |
18073 | } |
18074 | default: |
18075 | return MVT::Other; |
18076 | } |
18077 | } |
18078 | |
18079 | /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern |
18080 | /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector |
18081 | /// SExt/ZExt rather than the scalar SExt/ZExt |
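/// For example (an illustrative DAG sketch, not actual output):
///   (v8i16 build_vector (sext i8 a), (sext i8 b), ...)
/// becomes
///   (v8i16 sign_extend (v8i8 build_vector a, b, ...))
/// which lets a multiply fed by it be matched as smull/umull later on.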
18082 | static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) { |
18083 | EVT VT = BV.getValueType(); |
18084 | if (BV.getOpcode() != ISD::BUILD_VECTOR && |
18085 | BV.getOpcode() != ISD::VECTOR_SHUFFLE) |
18086 | return SDValue(); |
18087 | |
18088 | // Use the first item in the buildvector/shuffle to get the size of the |
18089 | // extend, and make sure it looks valid. |
18090 | SDValue Extend = BV->getOperand(Num: 0); |
18091 | unsigned ExtendOpcode = Extend.getOpcode(); |
18092 | bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || |
18093 | ExtendOpcode == ISD::SIGN_EXTEND_INREG || |
18094 | ExtendOpcode == ISD::AssertSext; |
18095 | if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && |
18096 | ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) |
18097 | return SDValue(); |
  // Shuffle inputs are vectors; limit to SIGN_EXTEND and ZERO_EXTEND to ensure
  // calculatePreExtendType will work without issue.
18100 | if (BV.getOpcode() == ISD::VECTOR_SHUFFLE && |
18101 | ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND) |
18102 | return SDValue(); |
18103 | |
18104 | // Restrict valid pre-extend data type |
18105 | EVT PreExtendType = calculatePreExtendType(Extend); |
18106 | if (PreExtendType == MVT::Other || |
18107 | PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2) |
18108 | return SDValue(); |
18109 | |
18110 | // Make sure all other operands are equally extended |
18111 | for (SDValue Op : drop_begin(RangeOrContainer: BV->ops())) { |
18112 | if (Op.isUndef()) |
18113 | continue; |
18114 | unsigned Opc = Op.getOpcode(); |
18115 | bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG || |
18116 | Opc == ISD::AssertSext; |
18117 | if (OpcIsSExt != IsSExt || calculatePreExtendType(Extend: Op) != PreExtendType) |
18118 | return SDValue(); |
18119 | } |
18120 | |
18121 | SDValue NBV; |
18122 | SDLoc DL(BV); |
18123 | if (BV.getOpcode() == ISD::BUILD_VECTOR) { |
18124 | EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType); |
18125 | EVT PreExtendLegalType = |
18126 | PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType; |
18127 | SmallVector<SDValue, 8> NewOps; |
18128 | for (SDValue Op : BV->ops()) |
18129 | NewOps.push_back(Elt: Op.isUndef() ? DAG.getUNDEF(VT: PreExtendLegalType) |
18130 | : DAG.getAnyExtOrTrunc(Op: Op.getOperand(i: 0), DL, |
18131 | VT: PreExtendLegalType)); |
18132 | NBV = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: PreExtendVT, Ops: NewOps); |
18133 | } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE |
18134 | EVT PreExtendVT = VT.changeVectorElementType(EltVT: PreExtendType.getScalarType()); |
18135 | NBV = DAG.getVectorShuffle(VT: PreExtendVT, dl: DL, N1: BV.getOperand(i: 0).getOperand(i: 0), |
18136 | N2: BV.getOperand(i: 1).isUndef() |
18137 | ? DAG.getUNDEF(VT: PreExtendVT) |
18138 | : BV.getOperand(i: 1).getOperand(i: 0), |
18139 | Mask: cast<ShuffleVectorSDNode>(Val&: BV)->getMask()); |
18140 | } |
18141 | return DAG.getNode(Opcode: IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, Operand: NBV); |
18142 | } |
18143 | |
18144 | /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) |
18145 | /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt |
18146 | static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { |
18147 | // If the value type isn't a vector, none of the operands are going to be dups |
18148 | EVT VT = Mul->getValueType(ResNo: 0); |
18149 | if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) |
18150 | return SDValue(); |
18151 | |
18152 | SDValue Op0 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 0), DAG); |
18153 | SDValue Op1 = performBuildShuffleExtendCombine(BV: Mul->getOperand(Num: 1), DAG); |
18154 | |
  // Neither operand has been changed; don't make any further changes.
18156 | if (!Op0 && !Op1) |
18157 | return SDValue(); |
18158 | |
18159 | SDLoc DL(Mul); |
18160 | return DAG.getNode(Opcode: Mul->getOpcode(), DL, VT, N1: Op0 ? Op0 : Mul->getOperand(Num: 0), |
18161 | N2: Op1 ? Op1 : Mul->getOperand(Num: 1)); |
18162 | } |
18163 | |
18164 | // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz |
18165 | // Same for other types with equivalent constants. |
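// Why this works (illustrative, for the v4i32 case): after Srl(X, 15), bit 0
// holds the sign bit of the low i16 half and bit 16 holds the sign bit of the
// high i16 half. Masking with 0x10001 keeps just those two bits, and the
// multiply by 0xffff fans each of them out across its own half (e.g.
// 0x10001 * 0xffff == 0xffffffff), which is exactly CMLTz on the v8i16 view.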
18166 | static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { |
18167 | EVT VT = N->getValueType(ResNo: 0); |
18168 | if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 && |
18169 | VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16) |
18170 | return SDValue(); |
18171 | if (N->getOperand(Num: 0).getOpcode() != ISD::AND || |
18172 | N->getOperand(Num: 0).getOperand(i: 0).getOpcode() != ISD::SRL) |
18173 | return SDValue(); |
18174 | |
18175 | SDValue And = N->getOperand(Num: 0); |
18176 | SDValue Srl = And.getOperand(i: 0); |
18177 | |
18178 | APInt V1, V2, V3; |
18179 | if (!ISD::isConstantSplatVector(N: N->getOperand(Num: 1).getNode(), SplatValue&: V1) || |
18180 | !ISD::isConstantSplatVector(N: And.getOperand(i: 1).getNode(), SplatValue&: V2) || |
18181 | !ISD::isConstantSplatVector(N: Srl.getOperand(i: 1).getNode(), SplatValue&: V3)) |
18182 | return SDValue(); |
18183 | |
18184 | unsigned HalfSize = VT.getScalarSizeInBits() / 2; |
18185 | if (!V1.isMask(numBits: HalfSize) || V2 != (1ULL | 1ULL << HalfSize) || |
18186 | V3 != (HalfSize - 1)) |
18187 | return SDValue(); |
18188 | |
18189 | EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(), |
18190 | VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: HalfSize), |
18191 | EC: VT.getVectorElementCount() * 2); |
18192 | |
18193 | SDLoc DL(N); |
18194 | SDValue In = DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: HalfVT, Operand: Srl.getOperand(i: 0)); |
18195 | SDValue CM = DAG.getNode(Opcode: AArch64ISD::CMLTz, DL, VT: HalfVT, Operand: In); |
18196 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT, Operand: CM); |
18197 | } |
18198 | |
18199 | // Transform vector add(zext i8 to i32, zext i8 to i32) |
18200 | // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32) |
// This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
// extends.
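// Worked example (illustrative): for i8 inputs 200 and 100, the original form
// computes 200 + 100 = 300 in i32; the new form computes 300 in i16 (which
// cannot overflow) and sign-extends it to i32, yielding the same value.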
18203 | static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) { |
18204 | EVT VT = N->getValueType(ResNo: 0); |
18205 | if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 || |
18206 | (N->getOperand(Num: 0).getOpcode() != ISD::ZERO_EXTEND && |
18207 | N->getOperand(Num: 0).getOpcode() != ISD::SIGN_EXTEND) || |
18208 | (N->getOperand(Num: 1).getOpcode() != ISD::ZERO_EXTEND && |
18209 | N->getOperand(Num: 1).getOpcode() != ISD::SIGN_EXTEND) || |
18210 | N->getOperand(Num: 0).getOperand(i: 0).getValueType() != |
18211 | N->getOperand(Num: 1).getOperand(i: 0).getValueType()) |
18212 | return SDValue(); |
18213 | |
18214 | if (N->getOpcode() == ISD::MUL && |
18215 | N->getOperand(Num: 0).getOpcode() != N->getOperand(Num: 1).getOpcode()) |
18216 | return SDValue(); |
18217 | |
18218 | SDValue N0 = N->getOperand(Num: 0).getOperand(i: 0); |
18219 | SDValue N1 = N->getOperand(Num: 1).getOperand(i: 0); |
18220 | EVT InVT = N0.getValueType(); |
18221 | |
18222 | EVT S1 = InVT.getScalarType(); |
18223 | EVT S2 = VT.getScalarType(); |
18224 | if ((S2 == MVT::i32 && S1 == MVT::i8) || |
18225 | (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) { |
18226 | SDLoc DL(N); |
18227 | EVT HalfVT = EVT::getVectorVT(Context&: *DAG.getContext(), |
18228 | VT: S2.getHalfSizedIntegerVT(Context&: *DAG.getContext()), |
18229 | EC: VT.getVectorElementCount()); |
18230 | SDValue NewN0 = DAG.getNode(Opcode: N->getOperand(Num: 0).getOpcode(), DL, VT: HalfVT, Operand: N0); |
18231 | SDValue NewN1 = DAG.getNode(Opcode: N->getOperand(Num: 1).getOpcode(), DL, VT: HalfVT, Operand: N1); |
18232 | SDValue NewOp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: HalfVT, N1: NewN0, N2: NewN1); |
18233 | return DAG.getNode(Opcode: N->getOpcode() == ISD::MUL ? N->getOperand(Num: 0).getOpcode() |
18234 | : (unsigned)ISD::SIGN_EXTEND, |
18235 | DL, VT, Operand: NewOp); |
18236 | } |
18237 | return SDValue(); |
18238 | } |
18239 | |
18240 | static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, |
18241 | TargetLowering::DAGCombinerInfo &DCI, |
18242 | const AArch64Subtarget *Subtarget) { |
18243 | |
18244 | if (SDValue Ext = performMulVectorExtendCombine(Mul: N, DAG)) |
18245 | return Ext; |
18246 | if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG)) |
18247 | return Ext; |
18248 | if (SDValue Ext = performVectorExtCombine(N, DAG)) |
18249 | return Ext; |
18250 | |
18251 | if (DCI.isBeforeLegalizeOps()) |
18252 | return SDValue(); |
18253 | |
18254 | // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y, |
18255 | // and in MachineCombiner pass, add+mul will be combined into madd. |
18256 | // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X. |
18257 | SDLoc DL(N); |
18258 | EVT VT = N->getValueType(ResNo: 0); |
18259 | SDValue N0 = N->getOperand(Num: 0); |
18260 | SDValue N1 = N->getOperand(Num: 1); |
18261 | SDValue MulOper; |
18262 | unsigned AddSubOpc; |
18263 | |
18264 | auto IsAddSubWith1 = [&](SDValue V) -> bool { |
18265 | AddSubOpc = V->getOpcode(); |
18266 | if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) { |
18267 | SDValue Opnd = V->getOperand(Num: 1); |
18268 | MulOper = V->getOperand(Num: 0); |
18269 | if (AddSubOpc == ISD::SUB) |
18270 | std::swap(a&: Opnd, b&: MulOper); |
18271 | if (auto C = dyn_cast<ConstantSDNode>(Val&: Opnd)) |
18272 | return C->isOne(); |
18273 | } |
18274 | return false; |
18275 | }; |
18276 | |
18277 | if (IsAddSubWith1(N0)) { |
18278 | SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1, N2: MulOper); |
18279 | return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1, N2: MulVal); |
18280 | } |
18281 | |
18282 | if (IsAddSubWith1(N1)) { |
18283 | SDValue MulVal = DAG.getNode(Opcode: ISD::MUL, DL, VT, N1: N0, N2: MulOper); |
18284 | return DAG.getNode(Opcode: AddSubOpc, DL, VT, N1: N0, N2: MulVal); |
18285 | } |
18286 | |
18287 | // The below optimizations require a constant RHS. |
18288 | if (!isa<ConstantSDNode>(Val: N1)) |
18289 | return SDValue(); |
18290 | |
18291 | ConstantSDNode *C = cast<ConstantSDNode>(Val&: N1); |
18292 | const APInt &ConstValue = C->getAPIntValue(); |
18293 | |
  // Allow the scaling to be folded into the `cnt` instruction by preventing
  // the scaling from being obscured here. This makes it easier to pattern match.
18296 | if (IsSVECntIntrinsic(S: N0) || |
18297 | (N0->getOpcode() == ISD::TRUNCATE && |
18298 | (IsSVECntIntrinsic(S: N0->getOperand(Num: 0))))) |
18299 | if (ConstValue.sge(RHS: 1) && ConstValue.sle(RHS: 16)) |
18300 | return SDValue(); |
18301 | |
18302 | // Multiplication of a power of two plus/minus one can be done more |
  // cheaply as shift+add/sub. For now, this is true unconditionally. If
18304 | // future CPUs have a cheaper MADD instruction, this may need to be |
18305 | // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and |
18306 | // 64-bit is 5 cycles, so this is always a win. |
18307 | // More aggressively, some multiplications N0 * C can be lowered to |
18308 | // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, |
18309 | // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8) |
18310 | // TODO: lower more cases. |
18311 | |
18312 | // TrailingZeroes is used to test if the mul can be lowered to |
18313 | // shift+add+shift. |
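  // For example (illustrative): C = 12 (0b1100) has TrailingZeroes = 2 and
  // ShiftedConstValue = 3, so x*12 lowers to (shl (add (shl x, 1), x), 2).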
18314 | unsigned TrailingZeroes = ConstValue.countr_zero(); |
18315 | if (TrailingZeroes) { |
18316 | // Conservatively do not lower to shift+add+shift if the mul might be |
18317 | // folded into smul or umul. |
18318 | if (N0->hasOneUse() && (isSignExtended(N: N0, DAG) || |
18319 | isZeroExtended(N: N0, DAG))) |
18320 | return SDValue(); |
18321 | // Conservatively do not lower to shift+add+shift if the mul might be |
18322 | // folded into madd or msub. |
18323 | if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || |
18324 | N->use_begin()->getOpcode() == ISD::SUB)) |
18325 | return SDValue(); |
18326 | } |
18327 | // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub |
18328 | // and shift+add+shift. |
18329 | APInt ShiftedConstValue = ConstValue.ashr(ShiftAmt: TrailingZeroes); |
18330 | unsigned ShiftAmt; |
18331 | |
18332 | auto Shl = [&](SDValue N0, unsigned N1) { |
18333 | if (!N0.getNode()) |
18334 | return SDValue(); |
18335 | // If shift causes overflow, ignore this combine. |
18336 | if (N1 >= N0.getValueSizeInBits()) |
18337 | return SDValue(); |
18338 | SDValue RHS = DAG.getConstant(Val: N1, DL, VT: MVT::i64); |
18339 | return DAG.getNode(Opcode: ISD::SHL, DL, VT, N1: N0, N2: RHS); |
18340 | }; |
18341 | auto Add = [&](SDValue N0, SDValue N1) { |
18342 | if (!N0.getNode() || !N1.getNode()) |
18343 | return SDValue(); |
18344 | return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: N0, N2: N1); |
18345 | }; |
18346 | auto Sub = [&](SDValue N0, SDValue N1) { |
18347 | if (!N0.getNode() || !N1.getNode()) |
18348 | return SDValue(); |
18349 | return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: N0, N2: N1); |
18350 | }; |
  auto Negate = [&](SDValue N) {
    if (!N.getNode())
      return SDValue();
18354 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT); |
18355 | return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: N); |
18356 | }; |
18357 | |
  // Can the const C be decomposed into (1 + 2^M) * (1 + 2^N), e.g.:
  // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1) as
  // the (2^N - 1) can't be executed via a single instruction.
18361 | auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) { |
18362 | unsigned BitWidth = C.getBitWidth(); |
18363 | for (unsigned i = 1; i < BitWidth / 2; i++) { |
18364 | APInt Rem; |
      APInt X(BitWidth, (1ULL << i) + 1);
18366 | APInt::sdivrem(LHS: C, RHS: X, Quotient&: N, Remainder&: Rem); |
18367 | APInt NVMinus1 = N - 1; |
18368 | if (Rem == 0 && NVMinus1.isPowerOf2()) { |
18369 | M = X; |
18370 | return true; |
18371 | } |
18372 | } |
18373 | return false; |
18374 | }; |
18375 | |
  // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), e.g.:
  // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1 as
  // the (2^N - 1) can't be executed via a single instruction.
18379 | auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) { |
18380 | APInt CVMinus1 = C - 1; |
18381 | if (CVMinus1.isNegative()) |
18382 | return false; |
18383 | unsigned TrailingZeroes = CVMinus1.countr_zero(); |
18384 | APInt SCVMinus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) - 1; |
18385 | if (SCVMinus1.isPowerOf2()) { |
18386 | unsigned BitWidth = SCVMinus1.getBitWidth(); |
18387 | M = APInt(BitWidth, SCVMinus1.logBase2()); |
18388 | N = APInt(BitWidth, TrailingZeroes); |
18389 | return true; |
18390 | } |
18391 | return false; |
18392 | }; |
18393 | |
18394 | // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg: |
18395 | // C = 29 is equal to 1 - (1 - 2^3) * 2^2. |
18396 | auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) { |
18397 | APInt CVMinus1 = C - 1; |
18398 | if (CVMinus1.isNegative()) |
18399 | return false; |
18400 | unsigned TrailingZeroes = CVMinus1.countr_zero(); |
18401 | APInt CVPlus1 = CVMinus1.ashr(ShiftAmt: TrailingZeroes) + 1; |
18402 | if (CVPlus1.isPowerOf2()) { |
18403 | unsigned BitWidth = CVPlus1.getBitWidth(); |
18404 | M = APInt(BitWidth, CVPlus1.logBase2()); |
18405 | N = APInt(BitWidth, TrailingZeroes); |
18406 | return true; |
18407 | } |
18408 | return false; |
18409 | }; |
18410 | |
18411 | if (ConstValue.isNonNegative()) { |
18412 | // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) |
18413 | // (mul x, 2^N - 1) => (sub (shl x, N), x) |
18414 | // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M)) |
18415 | // (mul x, (2^M + 1) * (2^N + 1)) |
18416 | // => MV = (add (shl x, M), x); (add (shl MV, N), MV) |
    // (mul x, (2^M + 1) * 2^N + 1)
    // => MV = (add (shl x, M), x); (add (shl MV, N), x)
    // (mul x, 1 - (1 - 2^M) * 2^N)
    // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
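    // Worked example (illustrative): C = 45 = (1+4)*(1+8) takes the
    // isPowPlusPlusConst path, giving MV = (add (shl x, 2), x) = 5*x and then
    // (add (shl MV, 3), MV) = 45*x.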
18421 | APInt SCVMinus1 = ShiftedConstValue - 1; |
18422 | APInt SCVPlus1 = ShiftedConstValue + 1; |
18423 | APInt CVPlus1 = ConstValue + 1; |
18424 | APInt CVM, CVN; |
18425 | if (SCVMinus1.isPowerOf2()) { |
18426 | ShiftAmt = SCVMinus1.logBase2(); |
18427 | return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes); |
18428 | } else if (CVPlus1.isPowerOf2()) { |
18429 | ShiftAmt = CVPlus1.logBase2(); |
18430 | return Sub(Shl(N0, ShiftAmt), N0); |
18431 | } else if (SCVPlus1.isPowerOf2()) { |
18432 | ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; |
18433 | return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes)); |
18434 | } |
18435 | if (Subtarget->hasALULSLFast() && |
18436 | isPowPlusPlusConst(ConstValue, CVM, CVN)) { |
18437 | APInt CVMMinus1 = CVM - 1; |
18438 | APInt CVNMinus1 = CVN - 1; |
18439 | unsigned ShiftM1 = CVMMinus1.logBase2(); |
18440 | unsigned ShiftN1 = CVNMinus1.logBase2(); |
      // ALULSLFast implies that shifts of up to 4 places are fast
18442 | if (ShiftM1 <= 4 && ShiftN1 <= 4) { |
18443 | SDValue MVal = Add(Shl(N0, ShiftM1), N0); |
18444 | return Add(Shl(MVal, ShiftN1), MVal); |
18445 | } |
18446 | } |
18447 | if (Subtarget->hasALULSLFast() && |
18448 | isPowPlusPlusOneConst(ConstValue, CVM, CVN)) { |
18449 | unsigned ShiftM = CVM.getZExtValue(); |
18450 | unsigned ShiftN = CVN.getZExtValue(); |
      // ALULSLFast implies that shifts of up to 4 places are fast
      if (ShiftM <= 4 && ShiftN <= 4) {
        SDValue MVal = Add(Shl(N0, ShiftM), N0);
        return Add(Shl(MVal, ShiftN), N0);
18455 | } |
18456 | } |
18457 | |
18458 | if (Subtarget->hasALULSLFast() && |
18459 | isPowMinusMinusOneConst(ConstValue, CVM, CVN)) { |
18460 | unsigned ShiftM = CVM.getZExtValue(); |
18461 | unsigned ShiftN = CVN.getZExtValue(); |
      // ALULSLFast implies that shifts of up to 4 places are fast
      if (ShiftM <= 4 && ShiftN <= 4) {
        SDValue MVal = Sub(N0, Shl(N0, ShiftM));
        return Sub(N0, Shl(MVal, ShiftN));
18466 | } |
18467 | } |
18468 | } else { |
18469 | // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) |
18470 | // (mul x, -(2^N + 1)) => - (add (shl x, N), x) |
18471 | // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N)) |
18472 | APInt SCVPlus1 = -ShiftedConstValue + 1; |
18473 | APInt CVNegPlus1 = -ConstValue + 1; |
18474 | APInt CVNegMinus1 = -ConstValue - 1; |
18475 | if (CVNegPlus1.isPowerOf2()) { |
18476 | ShiftAmt = CVNegPlus1.logBase2(); |
18477 | return Sub(N0, Shl(N0, ShiftAmt)); |
18478 | } else if (CVNegMinus1.isPowerOf2()) { |
18479 | ShiftAmt = CVNegMinus1.logBase2(); |
18480 | return Negate(Add(Shl(N0, ShiftAmt), N0)); |
18481 | } else if (SCVPlus1.isPowerOf2()) { |
18482 | ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes; |
18483 | return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt)); |
18484 | } |
18485 | } |
18486 | |
18487 | return SDValue(); |
18488 | } |
18489 | |
18490 | static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, |
18491 | SelectionDAG &DAG) { |
18492 | // Take advantage of vector comparisons producing 0 or -1 in each lane to |
18493 | // optimize away operation when it's from a constant. |
18494 | // |
18495 | // The general transformation is: |
18496 | // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> |
18497 | // AND(VECTOR_CMP(x,y), constant2) |
18498 | // constant2 = UNARYOP(constant) |
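  // For example (illustrative), with UNARYOP = uint_to_fp and constant = 1:
  //   uint_to_fp(and(setcc(x,y), splat(1)))
  // becomes
  //   and(setcc(x,y), bitcast(splat(1.0)))
  // since each lane of the setcc is all-ones or all-zeros, so the AND picks
  // either the bit pattern of 1.0 or +0.0.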
18499 | |
18500 | // Early exit if this isn't a vector operation, the operand of the |
18501 | // unary operation isn't a bitwise AND, or if the sizes of the operations |
18502 | // aren't the same. |
18503 | EVT VT = N->getValueType(ResNo: 0); |
18504 | if (!VT.isVector() || N->getOperand(Num: 0)->getOpcode() != ISD::AND || |
18505 | N->getOperand(Num: 0)->getOperand(Num: 0)->getOpcode() != ISD::SETCC || |
18506 | VT.getSizeInBits() != N->getOperand(Num: 0)->getValueType(ResNo: 0).getSizeInBits()) |
18507 | return SDValue(); |
18508 | |
18509 | // Now check that the other operand of the AND is a constant. We could |
18510 | // make the transformation for non-constant splats as well, but it's unclear |
18511 | // that would be a benefit as it would not eliminate any operations, just |
18512 | // perform one more step in scalar code before moving to the vector unit. |
18513 | if (BuildVectorSDNode *BV = |
18514 | dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 0)->getOperand(Num: 1))) { |
18515 | // Bail out if the vector isn't a constant. |
18516 | if (!BV->isConstant()) |
18517 | return SDValue(); |
18518 | |
18519 | // Everything checks out. Build up the new and improved node. |
18520 | SDLoc DL(N); |
18521 | EVT IntVT = BV->getValueType(ResNo: 0); |
18522 | // Create a new constant of the appropriate type for the transformed |
18523 | // DAG. |
18524 | SDValue SourceConst = DAG.getNode(Opcode: N->getOpcode(), DL, VT, Operand: SDValue(BV, 0)); |
18525 | // The AND node needs bitcasts to/from an integer vector type around it. |
18526 | SDValue MaskConst = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IntVT, Operand: SourceConst); |
18527 | SDValue NewAnd = DAG.getNode(Opcode: ISD::AND, DL, VT: IntVT, |
18528 | N1: N->getOperand(Num: 0)->getOperand(Num: 0), N2: MaskConst); |
18529 | SDValue Res = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewAnd); |
18530 | return Res; |
18531 | } |
18532 | |
18533 | return SDValue(); |
18534 | } |
18535 | |
18536 | static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, |
18537 | const AArch64Subtarget *Subtarget) { |
18538 | // First try to optimize away the conversion when it's conditionally from |
18539 | // a constant. Vectors only. |
18540 | if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) |
18541 | return Res; |
18542 | |
18543 | EVT VT = N->getValueType(ResNo: 0); |
18544 | if (VT != MVT::f32 && VT != MVT::f64) |
18545 | return SDValue(); |
18546 | |
18547 | // Only optimize when the source and destination types have the same width. |
18548 | if (VT.getSizeInBits() != N->getOperand(Num: 0).getValueSizeInBits()) |
18549 | return SDValue(); |
18550 | |
18551 | // If the result of an integer load is only used by an integer-to-float |
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
18553 | // This eliminates an "integer-to-vector-move" UOP and improves throughput. |
18554 | SDValue N0 = N->getOperand(Num: 0); |
18555 | if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N: N0.getNode()) && |
18556 | N0.hasOneUse() && |
18557 | // Do not change the width of a volatile load. |
18558 | !cast<LoadSDNode>(Val&: N0)->isVolatile()) { |
18559 | LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0); |
18560 | SDValue Load = DAG.getLoad(VT, dl: SDLoc(N), Chain: LN0->getChain(), Ptr: LN0->getBasePtr(), |
18561 | PtrInfo: LN0->getPointerInfo(), Alignment: LN0->getAlign(), |
18562 | MMOFlags: LN0->getMemOperand()->getFlags()); |
18563 | |
18564 | // Make sure successors of the original load stay after it by updating them |
18565 | // to use the new Chain. |
18566 | DAG.ReplaceAllUsesOfValueWith(From: SDValue(LN0, 1), To: Load.getValue(R: 1)); |
18567 | |
18568 | unsigned Opcode = |
18569 | (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; |
18570 | return DAG.getNode(Opcode, DL: SDLoc(N), VT, Operand: Load); |
18571 | } |
18572 | |
18573 | return SDValue(); |
18574 | } |
18575 | |
18576 | /// Fold a floating-point multiply by power of two into floating-point to |
18577 | /// fixed-point conversion. |
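///
/// For example (illustrative): (fp_to_sint (fmul x, splat(8.0))) becomes a
/// fixed-point conversion with 3 fractional bits, via the
/// aarch64.neon.vcvtfp2fxs intrinsic.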
18578 | static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, |
18579 | TargetLowering::DAGCombinerInfo &DCI, |
18580 | const AArch64Subtarget *Subtarget) { |
18581 | if (!Subtarget->isNeonAvailable()) |
18582 | return SDValue(); |
18583 | |
18584 | if (!N->getValueType(ResNo: 0).isSimple()) |
18585 | return SDValue(); |
18586 | |
18587 | SDValue Op = N->getOperand(Num: 0); |
18588 | if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) |
18589 | return SDValue(); |
18590 | |
18591 | if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector()) |
18592 | return SDValue(); |
18593 | |
18594 | SDValue ConstVec = Op->getOperand(Num: 1); |
18595 | if (!isa<BuildVectorSDNode>(Val: ConstVec)) |
18596 | return SDValue(); |
18597 | |
18598 | MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); |
18599 | uint32_t FloatBits = FloatTy.getSizeInBits(); |
18600 | if (FloatBits != 32 && FloatBits != 64 && |
18601 | (FloatBits != 16 || !Subtarget->hasFullFP16())) |
18602 | return SDValue(); |
18603 | |
18604 | MVT IntTy = N->getSimpleValueType(ResNo: 0).getVectorElementType(); |
18605 | uint32_t IntBits = IntTy.getSizeInBits(); |
18606 | if (IntBits != 16 && IntBits != 32 && IntBits != 64) |
18607 | return SDValue(); |
18608 | |
18609 | // Avoid conversions where iN is larger than the float (e.g., float -> i64). |
18610 | if (IntBits > FloatBits) |
18611 | return SDValue(); |
18612 | |
18613 | BitVector UndefElements; |
18614 | BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Val&: ConstVec); |
18615 | int32_t Bits = IntBits == 64 ? 64 : 32; |
18616 | int32_t C = BV->getConstantFPSplatPow2ToLog2Int(UndefElements: &UndefElements, BitWidth: Bits + 1); |
18617 | if (C == -1 || C == 0 || C > Bits) |
18618 | return SDValue(); |
18619 | |
18620 | EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger(); |
18621 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ResTy)) |
18622 | return SDValue(); |
18623 | |
18624 | if (N->getOpcode() == ISD::FP_TO_SINT_SAT || |
18625 | N->getOpcode() == ISD::FP_TO_UINT_SAT) { |
18626 | EVT SatVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT(); |
18627 | if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits) |
18628 | return SDValue(); |
18629 | } |
18630 | |
18631 | SDLoc DL(N); |
18632 | bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT || |
18633 | N->getOpcode() == ISD::FP_TO_SINT_SAT); |
18634 | unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs |
18635 | : Intrinsic::aarch64_neon_vcvtfp2fxu; |
18636 | SDValue FixConv = |
18637 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ResTy, |
18638 | N1: DAG.getConstant(Val: IntrinsicOpcode, DL, VT: MVT::i32), |
18639 | N2: Op->getOperand(Num: 0), N3: DAG.getConstant(Val: C, DL, VT: MVT::i32)); |
18640 | // We can handle smaller integers by generating an extra trunc. |
18641 | if (IntBits < FloatBits) |
18642 | FixConv = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: N->getValueType(ResNo: 0), Operand: FixConv); |
18643 | |
18644 | return FixConv; |
18645 | } |
18646 | |
18647 | static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
18648 | const AArch64TargetLowering &TLI) { |
18649 | EVT VT = N->getValueType(ResNo: 0); |
18650 | SelectionDAG &DAG = DCI.DAG; |
18651 | SDLoc DL(N); |
18652 | const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); |
18653 | |
18654 | if (!VT.isVector()) |
18655 | return SDValue(); |
18656 | |
18657 | if (VT.isScalableVector() && !Subtarget.hasSVE2()) |
18658 | return SDValue(); |
18659 | |
18660 | if (VT.isFixedLengthVector() && |
18661 | (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT))) |
18662 | return SDValue(); |
18663 | |
18664 | SDValue N0 = N->getOperand(Num: 0); |
18665 | if (N0.getOpcode() != ISD::AND) |
18666 | return SDValue(); |
18667 | |
18668 | SDValue N1 = N->getOperand(Num: 1); |
18669 | if (N1.getOpcode() != ISD::AND) |
18670 | return SDValue(); |
18671 | |
18672 | // InstCombine does (not (neg a)) => (add a -1). |
18673 | // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c) |
18674 | // Loop over all combinations of AND operands. |
18675 | for (int i = 1; i >= 0; --i) { |
18676 | for (int j = 1; j >= 0; --j) { |
18677 | SDValue O0 = N0->getOperand(Num: i); |
18678 | SDValue O1 = N1->getOperand(Num: j); |
18679 | SDValue Sub, Add, SubSibling, AddSibling; |
18680 | |
18681 | // Find a SUB and an ADD operand, one from each AND. |
18682 | if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) { |
18683 | Sub = O0; |
18684 | Add = O1; |
18685 | SubSibling = N0->getOperand(Num: 1 - i); |
18686 | AddSibling = N1->getOperand(Num: 1 - j); |
18687 | } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) { |
18688 | Add = O0; |
18689 | Sub = O1; |
18690 | AddSibling = N0->getOperand(Num: 1 - i); |
18691 | SubSibling = N1->getOperand(Num: 1 - j); |
18692 | } else |
18693 | continue; |
18694 | |
18695 | if (!ISD::isConstantSplatVectorAllZeros(N: Sub.getOperand(i: 0).getNode())) |
18696 | continue; |
18697 | |
      // The all-ones constant is always the right-hand operand of the Add.
18699 | if (!ISD::isConstantSplatVectorAllOnes(N: Add.getOperand(i: 1).getNode())) |
18700 | continue; |
18701 | |
18702 | if (Sub.getOperand(i: 1) != Add.getOperand(i: 0)) |
18703 | continue; |
18704 | |
18705 | return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: Sub, N2: SubSibling, N3: AddSibling); |
18706 | } |
18707 | } |
18708 | |
18709 | // (or (and a b) (and (not a) c)) => (bsl a b c) |
18710 | // We only have to look for constant vectors here since the general, variable |
18711 | // case can be handled in TableGen. |
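  // For example (illustrative), with v8i16 elements:
  //   (or (and b, splat(0x00ff)), (and c, splat(0xff00)))
  // becomes (bsp splat(0x00ff), b, c), selecting the low byte of each lane
  // from b and the high byte from c.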
18712 | unsigned Bits = VT.getScalarSizeInBits(); |
18713 | uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); |
18714 | for (int i = 1; i >= 0; --i) |
18715 | for (int j = 1; j >= 0; --j) { |
18716 | APInt Val1, Val2; |
18717 | |
18718 | if (ISD::isConstantSplatVector(N: N0->getOperand(Num: i).getNode(), SplatValue&: Val1) && |
18719 | ISD::isConstantSplatVector(N: N1->getOperand(Num: j).getNode(), SplatValue&: Val2) && |
18720 | (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) { |
18721 | return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i), |
18722 | N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j)); |
18723 | } |
18724 | BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(Val: N0->getOperand(Num: i)); |
18725 | BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(Val: N1->getOperand(Num: j)); |
18726 | if (!BVN0 || !BVN1) |
18727 | continue; |
18728 | |
18729 | bool FoundMatch = true; |
18730 | for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { |
18731 | ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(Val: BVN0->getOperand(Num: k)); |
18732 | ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Val: BVN1->getOperand(Num: k)); |
18733 | if (!CN0 || !CN1 || |
18734 | CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { |
18735 | FoundMatch = false; |
18736 | break; |
18737 | } |
18738 | } |
18739 | if (FoundMatch) |
18740 | return DAG.getNode(Opcode: AArch64ISD::BSP, DL, VT, N1: N0->getOperand(Num: i), |
18741 | N2: N0->getOperand(Num: 1 - i), N3: N1->getOperand(Num: 1 - j)); |
18742 | } |
18743 | |
18744 | return SDValue(); |
18745 | } |
18746 | |
18747 | // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to |
18748 | // convert to csel(ccmp(.., cc0)), depending on cc1: |
18749 | |
18750 | // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) |
18751 | // => |
18752 | // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0)) |
18753 | // |
18754 | // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1))) |
18755 | // => |
18756 | // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0)) |
18757 | static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) { |
18758 | EVT VT = N->getValueType(ResNo: 0); |
18759 | SDValue CSel0 = N->getOperand(Num: 0); |
18760 | SDValue CSel1 = N->getOperand(Num: 1); |
18761 | |
18762 | if (CSel0.getOpcode() != AArch64ISD::CSEL || |
18763 | CSel1.getOpcode() != AArch64ISD::CSEL) |
18764 | return SDValue(); |
18765 | |
18766 | if (!CSel0->hasOneUse() || !CSel1->hasOneUse()) |
18767 | return SDValue(); |
18768 | |
18769 | if (!isNullConstant(V: CSel0.getOperand(i: 0)) || |
18770 | !isOneConstant(V: CSel0.getOperand(i: 1)) || |
18771 | !isNullConstant(V: CSel1.getOperand(i: 0)) || |
18772 | !isOneConstant(V: CSel1.getOperand(i: 1))) |
18773 | return SDValue(); |
18774 | |
18775 | SDValue Cmp0 = CSel0.getOperand(i: 3); |
18776 | SDValue Cmp1 = CSel1.getOperand(i: 3); |
18777 | AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(i: 2); |
18778 | AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(i: 2); |
18779 | if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse()) |
18780 | return SDValue(); |
18781 | if (Cmp1.getOpcode() != AArch64ISD::SUBS && |
18782 | Cmp0.getOpcode() == AArch64ISD::SUBS) { |
18783 | std::swap(a&: Cmp0, b&: Cmp1); |
18784 | std::swap(a&: CC0, b&: CC1); |
18785 | } |
18786 | |
18787 | if (Cmp1.getOpcode() != AArch64ISD::SUBS) |
18788 | return SDValue(); |
18789 | |
18790 | SDLoc DL(N); |
18791 | SDValue CCmp, Condition; |
18792 | unsigned NZCV; |
18793 | |
18794 | if (N->getOpcode() == ISD::AND) { |
18795 | AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(Code: CC0); |
18796 | Condition = DAG.getConstant(Val: InvCC0, DL, VT: MVT_CC); |
18797 | NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: CC1); |
18798 | } else { |
18799 | AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1); |
18800 | Condition = DAG.getConstant(Val: CC0, DL, VT: MVT_CC); |
18801 | NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvCC1); |
18802 | } |
18803 | |
18804 | SDValue NZCVOp = DAG.getConstant(Val: NZCV, DL, VT: MVT::i32); |
18805 | |
18806 | auto *Op1 = dyn_cast<ConstantSDNode>(Val: Cmp1.getOperand(i: 1)); |
18807 | if (Op1 && Op1->getAPIntValue().isNegative() && |
18808 | Op1->getAPIntValue().sgt(RHS: -32)) { |
    // CCMP accepts a constant in the range [0, 31]. If Op1 is a constant in
    // the range [-31, -1], we can select CCMN instead to avoid the extra mov.
18812 | SDValue AbsOp1 = |
18813 | DAG.getConstant(Val: Op1->getAPIntValue().abs(), DL, VT: Op1->getValueType(ResNo: 0)); |
18814 | CCmp = DAG.getNode(Opcode: AArch64ISD::CCMN, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), N2: AbsOp1, |
18815 | N3: NZCVOp, N4: Condition, N5: Cmp0); |
18816 | } else { |
18817 | CCmp = DAG.getNode(Opcode: AArch64ISD::CCMP, DL, VT: MVT_CC, N1: Cmp1.getOperand(i: 0), |
18818 | N2: Cmp1.getOperand(i: 1), N3: NZCVOp, N4: Condition, N5: Cmp0); |
18819 | } |
18820 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: CSel0.getOperand(i: 0), |
18821 | N2: CSel0.getOperand(i: 1), N3: DAG.getConstant(Val: CC1, DL, VT: MVT::i32), |
18822 | N4: CCmp); |
18823 | } |
18824 | |
18825 | static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
18826 | const AArch64Subtarget *Subtarget, |
18827 | const AArch64TargetLowering &TLI) { |
18828 | SelectionDAG &DAG = DCI.DAG; |
18829 | EVT VT = N->getValueType(ResNo: 0); |
18830 | |
18831 | if (SDValue R = performANDORCSELCombine(N, DAG)) |
18832 | return R; |
18833 | |
18834 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) |
18835 | return SDValue(); |
18836 | |
18837 | if (SDValue Res = tryCombineToBSL(N, DCI, TLI)) |
18838 | return Res; |
18839 | |
18840 | return SDValue(); |
18841 | } |
18842 | |
18843 | static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) { |
18844 | if (!MemVT.getVectorElementType().isSimple()) |
18845 | return false; |
18846 | |
18847 | uint64_t MaskForTy = 0ull; |
18848 | switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) { |
18849 | case MVT::i8: |
18850 | MaskForTy = 0xffull; |
18851 | break; |
18852 | case MVT::i16: |
18853 | MaskForTy = 0xffffull; |
18854 | break; |
18855 | case MVT::i32: |
18856 | MaskForTy = 0xffffffffull; |
18857 | break; |
  default:
    return false;
18861 | } |
18862 | |
18863 | if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR) |
18864 | if (auto *Op0 = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0))) |
18865 | return Op0->getAPIntValue().getLimitedValue() == MaskForTy; |
18866 | |
18867 | return false; |
18868 | } |
18869 | |
18870 | static SDValue performReinterpretCastCombine(SDNode *N) { |
18871 | SDValue LeafOp = SDValue(N, 0); |
18872 | SDValue Op = N->getOperand(Num: 0); |
18873 | while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST && |
18874 | LeafOp.getValueType() != Op.getValueType()) |
18875 | Op = Op->getOperand(Num: 0); |
18876 | if (LeafOp.getValueType() == Op.getValueType()) |
18877 | return Op; |
18878 | return SDValue(); |
18879 | } |
18880 | |
18881 | static SDValue performSVEAndCombine(SDNode *N, |
18882 | TargetLowering::DAGCombinerInfo &DCI) { |
18883 | SelectionDAG &DAG = DCI.DAG; |
18884 | SDValue Src = N->getOperand(Num: 0); |
18885 | unsigned Opc = Src->getOpcode(); |
18886 | |
18887 | // Zero/any extend of an unsigned unpack |
18888 | if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { |
18889 | SDValue UnpkOp = Src->getOperand(Num: 0); |
18890 | SDValue Dup = N->getOperand(Num: 1); |
18891 | |
18892 | if (Dup.getOpcode() != ISD::SPLAT_VECTOR) |
18893 | return SDValue(); |
18894 | |
18895 | SDLoc DL(N); |
18896 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val: Dup->getOperand(Num: 0)); |
18897 | if (!C) |
18898 | return SDValue(); |
18899 | |
18900 | uint64_t ExtVal = C->getZExtValue(); |
18901 | |
18902 | auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool { |
18903 | return ((ExtVal == 0xFF && VT == MVT::i8) || |
18904 | (ExtVal == 0xFFFF && VT == MVT::i16) || |
18905 | (ExtVal == 0xFFFFFFFF && VT == MVT::i32)); |
18906 | }; |
18907 | |
18908 | // If the mask is fully covered by the unpack, we don't need to push |
18909 | // a new AND onto the operand |
18910 | EVT EltTy = UnpkOp->getValueType(ResNo: 0).getVectorElementType(); |
18911 | if (MaskAndTypeMatch(EltTy)) |
18912 | return Src; |
18913 | |
18914 | // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check |
18915 | // to see if the mask is all-ones of size MemTy. |
18916 | auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(Val&: UnpkOp); |
18917 | if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD || |
18918 | MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) { |
18919 | EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType(); |
18920 | if (MaskAndTypeMatch(EltTy)) |
18921 | return Src; |
18922 | } |
18923 | |
    // Truncate to prevent a DUP with an over-wide constant
18925 | APInt Mask = C->getAPIntValue().trunc(width: EltTy.getSizeInBits()); |
18926 | |
18927 | // Otherwise, make sure we propagate the AND to the operand |
18928 | // of the unpack |
18929 | Dup = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: UnpkOp->getValueType(ResNo: 0), |
18930 | Operand: DAG.getConstant(Val: Mask.zextOrTrunc(width: 32), DL, VT: MVT::i32)); |
18931 | |
18932 | SDValue And = DAG.getNode(Opcode: ISD::AND, DL, |
18933 | VT: UnpkOp->getValueType(ResNo: 0), N1: UnpkOp, N2: Dup); |
18934 | |
18935 | return DAG.getNode(Opcode: Opc, DL, VT: N->getValueType(ResNo: 0), Operand: And); |
18936 | } |
18937 | |
18938 | if (DCI.isBeforeLegalizeOps()) |
18939 | return SDValue(); |
18940 | |
18941 | // If both sides of AND operations are i1 splat_vectors then |
18942 | // we can produce just i1 splat_vector as the result. |
18943 | if (isAllActivePredicate(DAG, N: N->getOperand(Num: 0))) |
18944 | return N->getOperand(Num: 1); |
18945 | if (isAllActivePredicate(DAG, N: N->getOperand(Num: 1))) |
18946 | return N->getOperand(Num: 0); |
18947 | |
18948 | if (!EnableCombineMGatherIntrinsics) |
18949 | return SDValue(); |
18950 | |
18951 | SDValue Mask = N->getOperand(Num: 1); |
18952 | |
18953 | if (!Src.hasOneUse()) |
18954 | return SDValue(); |
18955 | |
18956 | EVT MemVT; |
18957 | |
18958 | // SVE load instructions perform an implicit zero-extend, which makes them |
18959 | // perfect candidates for combining. |
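  // For example (illustrative): (and (extending SVE load to nxv8i16),
  // splat(0xff)) is redundant because the load has already zeroed the high
  // bits of every element, so we can return the load directly.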
18960 | switch (Opc) { |
18961 | case AArch64ISD::LD1_MERGE_ZERO: |
18962 | case AArch64ISD::LDNF1_MERGE_ZERO: |
18963 | case AArch64ISD::LDFF1_MERGE_ZERO: |
18964 | MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 3))->getVT(); |
18965 | break; |
18966 | case AArch64ISD::GLD1_MERGE_ZERO: |
18967 | case AArch64ISD::GLD1_SCALED_MERGE_ZERO: |
18968 | case AArch64ISD::GLD1_SXTW_MERGE_ZERO: |
18969 | case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: |
18970 | case AArch64ISD::GLD1_UXTW_MERGE_ZERO: |
18971 | case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: |
18972 | case AArch64ISD::GLD1_IMM_MERGE_ZERO: |
18973 | case AArch64ISD::GLDFF1_MERGE_ZERO: |
18974 | case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: |
18975 | case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: |
18976 | case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: |
18977 | case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: |
18978 | case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: |
18979 | case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: |
18980 | case AArch64ISD::GLDNT1_MERGE_ZERO: |
18981 | MemVT = cast<VTSDNode>(Val: Src->getOperand(Num: 4))->getVT(); |
18982 | break; |
18983 | default: |
18984 | return SDValue(); |
18985 | } |
18986 | |
18987 | if (isConstantSplatVectorMaskForType(N: Mask.getNode(), MemVT)) |
18988 | return Src; |
18989 | |
18990 | return SDValue(); |
18991 | } |
18992 | |
18993 | // Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d) |
18994 | static SDValue performANDSETCCCombine(SDNode *N, |
18995 | TargetLowering::DAGCombinerInfo &DCI) { |
18996 | |
18997 | // This function performs an optimization on a specific pattern involving |
18998 | // an AND operation and SETCC (Set Condition Code) node. |
18999 | |
19000 | SDValue SetCC = N->getOperand(Num: 0); |
19001 | EVT VT = N->getValueType(ResNo: 0); |
19002 | SelectionDAG &DAG = DCI.DAG; |
19003 | |
  // If the current node (N) is used by any SELECT instruction, return an empty
  // SDValue to skip the optimization and avoid generating incorrect results.
19007 | for (auto U : N->uses()) |
19008 | if (U->getOpcode() == ISD::SELECT) |
19009 | return SDValue(); |
19010 | |
19011 | // Check if the operand is a SETCC node with floating-point comparison |
19012 | if (SetCC.getOpcode() == ISD::SETCC && |
19013 | SetCC.getOperand(i: 0).getValueType() == MVT::f32) { |
19014 | |
19015 | SDValue Cmp; |
19016 | AArch64CC::CondCode CC; |
19017 | |
19018 | // Check if the DAG is after legalization and if we can emit the conjunction |
19019 | if (!DCI.isBeforeLegalize() && |
19020 | (Cmp = emitConjunction(DAG, Val: SDValue(N, 0), OutCC&: CC))) { |
19021 | |
19022 | AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(Code: CC); |
19023 | |
19024 | SDLoc DL(N); |
19025 | return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: DAG.getConstant(Val: 0, DL, VT), |
19026 | N2: DAG.getConstant(Val: 0, DL, VT), |
19027 | N3: DAG.getConstant(Val: InvertedCC, DL, VT: MVT::i32), N4: Cmp); |
19028 | } |
19029 | } |
19030 | return SDValue(); |
19031 | } |
19032 | |
19033 | static SDValue performANDCombine(SDNode *N, |
19034 | TargetLowering::DAGCombinerInfo &DCI) { |
19035 | SelectionDAG &DAG = DCI.DAG; |
19036 | SDValue LHS = N->getOperand(Num: 0); |
19037 | SDValue RHS = N->getOperand(Num: 1); |
19038 | EVT VT = N->getValueType(ResNo: 0); |
19039 | |
19040 | if (SDValue R = performANDORCSELCombine(N, DAG)) |
19041 | return R; |
19042 | |
  if (SDValue R = performANDSETCCCombine(N, DCI))
19044 | return R; |
19045 | |
19046 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) |
19047 | return SDValue(); |
19048 | |
19049 | if (VT.isScalableVector()) |
19050 | return performSVEAndCombine(N, DCI); |
19051 | |
19052 | // The combining code below works only for NEON vectors. In particular, it |
19053 | // does not work for SVE when dealing with vectors wider than 128 bits. |
19054 | if (!VT.is64BitVector() && !VT.is128BitVector()) |
19055 | return SDValue(); |
19056 | |
19057 | BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: RHS.getNode()); |
19058 | if (!BVN) |
19059 | return SDValue(); |
19060 | |
19061 | // AND does not accept an immediate, so check if we can use a BIC immediate |
19062 | // instruction instead. We do this here instead of using a (and x, (mvni imm)) |
19063 | // pattern in isel, because some immediates may be lowered to the preferred |
19064 | // (and x, (movi imm)) form, even though an mvni representation also exists. |
19065 | APInt DefBits(VT.getSizeInBits(), 0); |
19066 | APInt UndefBits(VT.getSizeInBits(), 0); |
19067 | if (resolveBuildVector(BVN, CnstBits&: DefBits, UndefBits)) { |
19068 | SDValue NewOp; |
19069 | |
19070 | // Any bits known to already be 0 need not be cleared again, which can help |
19071 | // reduce the size of the immediate to one supported by the instruction. |
19072 | KnownBits Known = DAG.computeKnownBits(Op: LHS); |
19073 | APInt ZeroSplat(VT.getSizeInBits(), 0); |
19074 | for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++) |
19075 | ZeroSplat |= Known.Zero.zext(width: VT.getSizeInBits()) |
19076 | << (Known.Zero.getBitWidth() * I); |
19077 | |
19078 | DefBits = ~(DefBits | ZeroSplat); |
19079 | if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG, |
19080 | Bits: DefBits, LHS: &LHS)) || |
19081 | (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG, |
19082 | Bits: DefBits, LHS: &LHS))) |
19083 | return NewOp; |
19084 | |
19085 | UndefBits = ~(UndefBits | ZeroSplat); |
19086 | if ((NewOp = tryAdvSIMDModImm32(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG, |
19087 | Bits: UndefBits, LHS: &LHS)) || |
19088 | (NewOp = tryAdvSIMDModImm16(NewOp: AArch64ISD::BICi, Op: SDValue(N, 0), DAG, |
19089 | Bits: UndefBits, LHS: &LHS))) |
19090 | return NewOp; |
19091 | } |
19092 | |
19093 | return SDValue(); |
19094 | } |
19095 | |
19096 | static SDValue performFADDCombine(SDNode *N, |
19097 | TargetLowering::DAGCombinerInfo &DCI) { |
19098 | SelectionDAG &DAG = DCI.DAG; |
19099 | SDValue LHS = N->getOperand(Num: 0); |
19100 | SDValue RHS = N->getOperand(Num: 1); |
19101 | EVT VT = N->getValueType(ResNo: 0); |
19102 | SDLoc DL(N); |
19103 | |
19104 | if (!N->getFlags().hasAllowReassociation()) |
19105 | return SDValue(); |
19106 | |
  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
19108 | auto ReassocComplex = [&](SDValue A, SDValue B) { |
19109 | if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN) |
19110 | return SDValue(); |
19111 | unsigned Opc = A.getConstantOperandVal(i: 0); |
19112 | if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 && |
19113 | Opc != Intrinsic::aarch64_neon_vcmla_rot90 && |
19114 | Opc != Intrinsic::aarch64_neon_vcmla_rot180 && |
19115 | Opc != Intrinsic::aarch64_neon_vcmla_rot270) |
19116 | return SDValue(); |
19117 | SDValue VCMLA = DAG.getNode( |
19118 | Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT, N1: A.getOperand(i: 0), |
19119 | N2: DAG.getNode(Opcode: ISD::FADD, DL, VT, N1: A.getOperand(i: 1), N2: B, Flags: N->getFlags()), |
19120 | N3: A.getOperand(i: 2), N4: A.getOperand(i: 3)); |
19121 | VCMLA->setFlags(A->getFlags()); |
19122 | return VCMLA; |
19123 | }; |
19124 | if (SDValue R = ReassocComplex(LHS, RHS)) |
19125 | return R; |
19126 | if (SDValue R = ReassocComplex(RHS, LHS)) |
19127 | return R; |
19128 | |
19129 | return SDValue(); |
19130 | } |
19131 | |
19132 | static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) { |
19133 | switch (Opcode) { |
19134 | case ISD::STRICT_FADD: |
19135 | case ISD::FADD: |
19136 | return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64; |
19137 | case ISD::ADD: |
19138 | return VT == MVT::i64; |
19139 | default: |
19140 | return false; |
19141 | } |
19142 | } |
19143 | |
19144 | static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, |
19145 | AArch64CC::CondCode Cond); |
19146 | |
19147 | static bool isPredicateCCSettingOp(SDValue N) { |
19148 | if ((N.getOpcode() == ISD::SETCC) || |
19149 | (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN && |
19150 | (N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilege || |
19151 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilegt || |
19152 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehi || |
19153 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilehs || |
19154 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilele || |
19155 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelo || |
19156 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilels || |
19157 | N.getConstantOperandVal(i: 0) == Intrinsic::aarch64_sve_whilelt || |
19158 | // get_active_lane_mask is lowered to a whilelo instruction. |
19159 | N.getConstantOperandVal(i: 0) == Intrinsic::get_active_lane_mask))) |
19160 | return true; |
19161 | |
19162 | return false; |
19163 | } |
19164 | |
19165 | // Materialize : i1 = extract_vector_elt t37, Constant:i64<0> |
19166 | // ... into: "ptrue p, all" + PTEST |
19167 | static SDValue |
19168 | performFirstTrueTestVectorCombine(SDNode *N, |
19169 | TargetLowering::DAGCombinerInfo &DCI, |
19170 | const AArch64Subtarget *Subtarget) { |
19171 | assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); |
19172 | // Make sure PTEST can be legalised with illegal types. |
19173 | if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) |
19174 | return SDValue(); |
19175 | |
19176 | SDValue N0 = N->getOperand(Num: 0); |
19177 | EVT VT = N0.getValueType(); |
19178 | |
19179 | if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 || |
19180 | !isNullConstant(V: N->getOperand(Num: 1))) |
19181 | return SDValue(); |
19182 | |
  // Restrict the DAG combine to only cases where we're extracting from a
  // flag-setting operation.
19185 | if (!isPredicateCCSettingOp(N: N0)) |
19186 | return SDValue(); |
19187 | |
19188 | // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0 |
19189 | SelectionDAG &DAG = DCI.DAG; |
19190 | SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT, Pattern: AArch64SVEPredPattern::all); |
19191 | return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::FIRST_ACTIVE); |
19192 | } |
19193 | |
19194 | // Materialize : Idx = (add (mul vscale, NumEls), -1) |
19195 | // i1 = extract_vector_elt t37, Constant:i64<Idx> |
19196 | // ... into: "ptrue p, all" + PTEST |
19197 | static SDValue |
19198 | performLastTrueTestVectorCombine(SDNode *N, |
19199 | TargetLowering::DAGCombinerInfo &DCI, |
19200 | const AArch64Subtarget *Subtarget) { |
19201 | assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); |
  // Make sure PTEST can be legalised with illegal types.
19203 | if (!Subtarget->hasSVE() || DCI.isBeforeLegalize()) |
19204 | return SDValue(); |
19205 | |
19206 | SDValue N0 = N->getOperand(Num: 0); |
19207 | EVT OpVT = N0.getValueType(); |
19208 | |
19209 | if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) |
19210 | return SDValue(); |
19211 | |
19212 | // Idx == (add (mul vscale, NumEls), -1) |
19213 | SDValue Idx = N->getOperand(Num: 1); |
19214 | if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(V: Idx.getOperand(i: 1))) |
19215 | return SDValue(); |
19216 | |
19217 | SDValue VS = Idx.getOperand(i: 0); |
19218 | if (VS.getOpcode() != ISD::VSCALE) |
19219 | return SDValue(); |
19220 | |
19221 | unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue(); |
19222 | if (VS.getConstantOperandVal(i: 0) != NumEls) |
19223 | return SDValue(); |
19224 | |
19225 | // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0 |
19226 | SelectionDAG &DAG = DCI.DAG; |
19227 | SDValue Pg = getPTrue(DAG, DL: SDLoc(N), VT: OpVT, Pattern: AArch64SVEPredPattern::all); |
19228 | return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg, Op: N0, Cond: AArch64CC::LAST_ACTIVE); |
19229 | } |
19230 | |
19231 | static SDValue |
performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19233 | const AArch64Subtarget *Subtarget) { |
19234 | assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT); |
19235 | if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget)) |
19236 | return Res; |
19237 | if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget)) |
19238 | return Res; |
19239 | |
19240 | SelectionDAG &DAG = DCI.DAG; |
19241 | SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1); |
19242 | |
19243 | EVT VT = N->getValueType(ResNo: 0); |
19244 | const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16(); |
19245 | bool IsStrict = N0->isStrictFPOpcode(); |
19246 | |
19247 | // extract(dup x) -> x |
19248 | if (N0.getOpcode() == AArch64ISD::DUP) |
19249 | return VT.isInteger() ? DAG.getZExtOrTrunc(Op: N0.getOperand(i: 0), DL: SDLoc(N), VT) |
19250 | : N0.getOperand(i: 0); |
19251 | |
19252 | // Rewrite for pairwise fadd pattern |
19253 | // (f32 (extract_vector_elt |
19254 | // (fadd (vXf32 Other) |
19255 | // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0)) |
19256 | // -> |
19257 | // (f32 (fadd (extract_vector_elt (vXf32 Other) 0) |
19258 | // (extract_vector_elt (vXf32 Other) 1)) |
19259 | // For strict_fadd we need to make sure the old strict_fadd can be deleted, so |
19260 | // we can only do this when it's used only by the extract_vector_elt. |
19261 | if (isNullConstant(V: N1) && hasPairwiseAdd(Opcode: N0->getOpcode(), VT, FullFP16) && |
19262 | (!IsStrict || N0.hasOneUse())) { |
19263 | SDLoc DL(N0); |
19264 | SDValue N00 = N0->getOperand(Num: IsStrict ? 1 : 0); |
19265 | SDValue N01 = N0->getOperand(Num: IsStrict ? 2 : 1); |
19266 | |
19267 | ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N01); |
19268 | SDValue Other = N00; |
19269 | |
19270 | // And handle the commutative case. |
19271 | if (!Shuffle) { |
19272 | Shuffle = dyn_cast<ShuffleVectorSDNode>(Val&: N00); |
19273 | Other = N01; |
19274 | } |
19275 | |
19276 | if (Shuffle && Shuffle->getMaskElt(Idx: 0) == 1 && |
19277 | Other == Shuffle->getOperand(Num: 0)) { |
      SDValue Extract1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
                                     N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64));
      SDValue Extract2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Other,
                                     N2: DAG.getConstant(Val: 1, DL, VT: MVT::i64));
19282 | if (!IsStrict) |
19283 | return DAG.getNode(Opcode: N0->getOpcode(), DL, VT, N1: Extract1, N2: Extract2); |
19284 | |
19285 | // For strict_fadd we need uses of the final extract_vector to be replaced |
19286 | // with the strict_fadd, but we also need uses of the chain output of the |
19287 | // original strict_fadd to use the chain output of the new strict_fadd as |
19288 | // otherwise it may not be deleted. |
19289 | SDValue Ret = DAG.getNode(Opcode: N0->getOpcode(), DL, |
19290 | ResultTys: {VT, MVT::Other}, |
19291 | Ops: {N0->getOperand(Num: 0), Extract1, Extract2}); |
19292 | DAG.ReplaceAllUsesOfValueWith(From: SDValue(N, 0), To: Ret); |
19293 | DAG.ReplaceAllUsesOfValueWith(From: N0.getValue(R: 1), To: Ret.getValue(R: 1)); |
19294 | return SDValue(N, 0); |
19295 | } |
19296 | } |
19297 | |
19298 | return SDValue(); |
19299 | } |
19300 | |
19301 | static SDValue performConcatVectorsCombine(SDNode *N, |
19302 | TargetLowering::DAGCombinerInfo &DCI, |
19303 | SelectionDAG &DAG) { |
19304 | SDLoc dl(N); |
19305 | EVT VT = N->getValueType(ResNo: 0); |
19306 | SDValue N0 = N->getOperand(Num: 0), N1 = N->getOperand(Num: 1); |
19307 | unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode(); |
19308 | |
19309 | if (VT.isScalableVector()) |
19310 | return SDValue(); |
19311 | |
19312 | // Optimize concat_vectors of truncated vectors, where the intermediate |
19313 | // type is illegal, to avoid said illegality, e.g., |
19314 | // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), |
19315 | // (v2i16 (truncate (v2i64))))) |
19316 | // -> |
19317 | // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), |
19318 | // (v4i32 (bitcast (v2i64))), |
19319 | // <0, 2, 4, 6>))) |
19320 | // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed |
19321 | // on both input and result type, so we might generate worse code. |
19322 | // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. |
19323 | if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && |
19324 | N1Opc == ISD::TRUNCATE) { |
19325 | SDValue N00 = N0->getOperand(Num: 0); |
19326 | SDValue N10 = N1->getOperand(Num: 0); |
19327 | EVT N00VT = N00.getValueType(); |
19328 | |
19329 | if (N00VT == N10.getValueType() && |
19330 | (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && |
19331 | N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { |
19332 | MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); |
19333 | SmallVector<int, 8> Mask(MidVT.getVectorNumElements()); |
19334 | for (size_t i = 0; i < Mask.size(); ++i) |
19335 | Mask[i] = i * 2; |
19336 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, |
19337 | Operand: DAG.getVectorShuffle( |
19338 | VT: MidVT, dl, |
19339 | N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N00), |
19340 | N2: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: MidVT, Operand: N10), Mask)); |
19341 | } |
19342 | } |
19343 | |
19344 | if (N->getOperand(Num: 0).getValueType() == MVT::v4i8 || |
19345 | N->getOperand(Num: 0).getValueType() == MVT::v2i16 || |
19346 | N->getOperand(Num: 0).getValueType() == MVT::v2i8) { |
19347 | EVT SrcVT = N->getOperand(Num: 0).getValueType(); |
19348 | // If we have a concat of small (v4i8, v2i16 or v2i8) loads, convert them to a
19349 | // buildvector of f32 (f16 for v2i8) loads to prevent having to go through the
19350 | // small-load legalization that needs to extend each element into a larger type.
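      | // For example (illustrative sketch of the resulting DAG):
      | //   (v8i8 concat_vectors (v4i8 load p), (v4i8 load q))
      | // becomes
      | //   (v8i8 bitcast (v2f32 build_vector (f32 load p), (f32 load q)))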
19351 | if (N->getNumOperands() % 2 == 0 && |
19352 | all_of(Range: N->op_values(), P: [SrcVT](SDValue V) { |
19353 | if (V.getValueType() != SrcVT) |
19354 | return false; |
19355 | if (V.isUndef()) |
19356 | return true; |
19357 | LoadSDNode *LD = dyn_cast<LoadSDNode>(Val&: V); |
19358 | return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() && |
19359 | LD->getExtensionType() == ISD::NON_EXTLOAD; |
19360 | })) { |
19361 | EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32; |
19362 | EVT NVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: FVT, NumElements: N->getNumOperands()); |
19363 | SmallVector<SDValue> Ops; |
19364 | |
19365 | for (unsigned i = 0; i < N->getNumOperands(); i++) { |
19366 | SDValue V = N->getOperand(Num: i); |
19367 | if (V.isUndef()) |
19368 | Ops.push_back(Elt: DAG.getUNDEF(VT: FVT)); |
19369 | else { |
19370 | LoadSDNode *LD = cast<LoadSDNode>(Val&: V); |
19371 | SDValue NewLoad = DAG.getLoad(VT: FVT, dl, Chain: LD->getChain(), |
19372 | Ptr: LD->getBasePtr(), MMO: LD->getMemOperand()); |
19373 | DAG.ReplaceAllUsesOfValueWith(From: SDValue(LD, 1), To: NewLoad.getValue(R: 1)); |
19374 | Ops.push_back(Elt: NewLoad); |
19375 | } |
19376 | } |
19377 | return DAG.getBitcast(VT: N->getValueType(ResNo: 0), |
19378 | V: DAG.getBuildVector(VT: NVT, DL: dl, Ops)); |
19379 | } |
19380 | } |
19381 | |
19382 | // Canonicalise concat_vectors to replace concatenations of truncated nots |
19383 | // with nots of concatenated truncates. This in some cases allows for multiple |
19384 | // redundant negations to be eliminated. |
19385 | // (concat_vectors (v4i16 (truncate (not (v4i32)))), |
19386 | // (v4i16 (truncate (not (v4i32))))) |
19387 | // -> |
19388 | // (not (concat_vectors (v4i16 (truncate (v4i32))), |
19389 | // (v4i16 (truncate (v4i32))))) |
19390 | if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE && |
19391 | N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N: N0.getNode()) && |
19392 | N->isOnlyUserOf(N: N1.getNode())) { |
19393 | auto isBitwiseVectorNegate = [](SDValue V) { |
19394 | return V->getOpcode() == ISD::XOR && |
19395 | ISD::isConstantSplatVectorAllOnes(N: V.getOperand(i: 1).getNode()); |
19396 | }; |
19397 | SDValue N00 = N0->getOperand(Num: 0); |
19398 | SDValue N10 = N1->getOperand(Num: 0); |
19399 | if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N: N00.getNode()) && |
19400 | isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N: N10.getNode())) { |
19401 | return DAG.getNOT( |
19402 | DL: dl, |
19403 | Val: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, |
19404 | N1: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N0.getValueType(), |
19405 | Operand: N00->getOperand(Num: 0)), |
19406 | N2: DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: N1.getValueType(), |
19407 | Operand: N10->getOperand(Num: 0))), |
19408 | VT); |
19409 | } |
19410 | } |
19411 | |
19412 | // Wait till after everything is legalized to try this. That way we have |
19413 | // legal vector types and such. |
19414 | if (DCI.isBeforeLegalizeOps()) |
19415 | return SDValue(); |
19416 | |
19417 | // Optimise concat_vectors of two identical binops with a 128-bit destination
19418 | // size into a binop of two concats of the source vectors, e.g.:
19419 | // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d)) |
19420 | if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() && |
19421 | DAG.getTargetLoweringInfo().isBinOp(Opcode: N0Opc) && N0->hasOneUse() && |
19422 | N1->hasOneUse()) { |
19423 | SDValue N00 = N0->getOperand(Num: 0); |
19424 | SDValue N01 = N0->getOperand(Num: 1); |
19425 | SDValue N10 = N1->getOperand(Num: 0); |
19426 | SDValue N11 = N1->getOperand(Num: 1); |
19427 | |
19428 | if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) { |
19429 | SDValue Concat0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N00, N2: N10); |
19430 | SDValue Concat1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N01, N2: N11); |
19431 | return DAG.getNode(Opcode: N0Opc, DL: dl, VT, N1: Concat0, N2: Concat1); |
19432 | } |
19433 | } |
19434 | |
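      | // Matches the expanded form of a rounding shift right narrow: a VLSHR (by at
      | // most half the element width) of an ADD of a rounding constant equal to
      | // 1 << (ShiftAmt - 1), provided either as a MOVIshift or as a constant DUP.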
19435 | auto IsRSHRN = [](SDValue Shr) { |
19436 | if (Shr.getOpcode() != AArch64ISD::VLSHR) |
19437 | return false; |
19438 | SDValue Op = Shr.getOperand(i: 0); |
19439 | EVT VT = Op.getValueType(); |
19440 | unsigned ShtAmt = Shr.getConstantOperandVal(i: 1); |
19441 | if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD) |
19442 | return false; |
19443 | |
19444 | APInt Imm; |
19445 | if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::MOVIshift) |
19446 | Imm = APInt(VT.getScalarSizeInBits(), |
19447 | Op.getOperand(i: 1).getConstantOperandVal(i: 0) |
19448 | << Op.getOperand(i: 1).getConstantOperandVal(i: 1)); |
19449 | else if (Op.getOperand(i: 1).getOpcode() == AArch64ISD::DUP && |
19450 | isa<ConstantSDNode>(Val: Op.getOperand(i: 1).getOperand(i: 0))) |
19451 | Imm = APInt(VT.getScalarSizeInBits(), |
19452 | Op.getOperand(i: 1).getConstantOperandVal(i: 0)); |
19453 | else |
19454 | return false; |
19455 | |
19456 | if (Imm != 1ULL << (ShtAmt - 1)) |
19457 | return false; |
19458 | return true; |
19459 | }; |
19460 | |
19461 | // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y)) |
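      | // In terms of the matched nodes (illustrative):
      | //   concat(vlshr(add(x, splat(1 << (s-1))), s),
      | //          vlshr(add(y, splat(1 << (s-1))), s))
      | //     -> vlshr(add(concat(x, y), splat(1 << (s-1))), s)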
19462 | if (N->getNumOperands() == 2 && IsRSHRN(N0) && |
19463 | ((IsRSHRN(N1) && |
19464 | N0.getConstantOperandVal(i: 1) == N1.getConstantOperandVal(i: 1)) || |
19465 | N1.isUndef())) { |
19466 | SDValue X = N0.getOperand(i: 0).getOperand(i: 0); |
19467 | SDValue Y = N1.isUndef() ? DAG.getUNDEF(VT: X.getValueType()) |
19468 | : N1.getOperand(i: 0).getOperand(i: 0); |
19469 | EVT BVT = |
19470 | X.getValueType().getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext()); |
19471 | SDValue CC = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: BVT, N1: X, N2: Y); |
19472 | SDValue Add = DAG.getNode( |
19473 | Opcode: ISD::ADD, DL: dl, VT: BVT, N1: CC, |
19474 | N2: DAG.getConstant(Val: 1ULL << (N0.getConstantOperandVal(i: 1) - 1), DL: dl, VT: BVT)); |
19475 | SDValue Shr = |
19476 | DAG.getNode(Opcode: AArch64ISD::VLSHR, DL: dl, VT: BVT, N1: Add, N2: N0.getOperand(i: 1)); |
19477 | return Shr; |
19478 | } |
19479 | |
19480 | // concat(zip1(a, b), zip2(a, b)) is zip1(a, b) |
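      | // e.g. (illustrative): concat(zip1(v8i8 a, b), zip2(v8i8 a, b)) is rebuilt as
      | // zip1(v16i8 concat(a, undef), v16i8 concat(b, undef)), which interleaves all
      | // of a and b.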
19481 | if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 && |
19482 | N1Opc == AArch64ISD::ZIP2 && N0.getOperand(i: 0) == N1.getOperand(i: 0) && |
19483 | N0.getOperand(i: 1) == N1.getOperand(i: 1)) { |
19484 | SDValue E0 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 0), |
19485 | N2: DAG.getUNDEF(VT: N0.getValueType())); |
19486 | SDValue E1 = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, N1: N0.getOperand(i: 1), |
19487 | N2: DAG.getUNDEF(VT: N0.getValueType())); |
19488 | return DAG.getNode(Opcode: AArch64ISD::ZIP1, DL: dl, VT, N1: E0, N2: E1); |
19489 | } |
19490 | |
19491 | // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector |
19492 | // splat. The indexed instructions are going to be expecting a DUPLANE64, so |
19493 | // canonicalise to that. |
19494 | if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) { |
19495 | assert(VT.getScalarSizeInBits() == 64); |
19496 | return DAG.getNode(Opcode: AArch64ISD::DUPLANE64, DL: dl, VT, N1: WidenVector(V64Reg: N0, DAG), |
19497 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
19498 | } |
19499 | |
19500 | // Canonicalise concat_vectors so that the right-hand vector has as few |
19501 | // bit-casts as possible before its real operation. The primary matching |
19502 | // destination for these operations will be the narrowing "2" instructions, |
19503 | // which depend on the operation being performed on this right-hand vector. |
19504 | // For example, |
19505 | // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) |
19506 | // becomes |
19507 | // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) |
19508 | |
19509 | if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST) |
19510 | return SDValue(); |
19511 | SDValue RHS = N1->getOperand(Num: 0); |
19512 | MVT RHSTy = RHS.getValueType().getSimpleVT(); |
19513 | // If the RHS is not a vector, this is not the pattern we're looking for. |
19514 | if (!RHSTy.isVector()) |
19515 | return SDValue(); |
19516 | |
19517 | LLVM_DEBUG( |
19518 | dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n" ); |
19519 | |
19520 | MVT ConcatTy = MVT::getVectorVT(VT: RHSTy.getVectorElementType(), |
19521 | NumElements: RHSTy.getVectorNumElements() * 2); |
19522 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, |
19523 | Operand: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT: ConcatTy, |
19524 | N1: DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: RHSTy, Operand: N0), |
19525 | N2: RHS)); |
19526 | } |
19527 | |
19528 | static SDValue |
19529 | performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19530 | SelectionDAG &DAG) { |
19531 | if (DCI.isBeforeLegalizeOps()) |
19532 | return SDValue(); |
19533 | |
19534 | EVT VT = N->getValueType(ResNo: 0); |
19535 | if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1) |
19536 | return SDValue(); |
19537 | |
19538 | SDValue V = N->getOperand(Num: 0); |
19539 | |
19540 | // NOTE: This combine exists in DAGCombiner, but that version's legality check |
19541 | // blocks this combine because the non-const case requires custom lowering. |
19542 | // |
19543 | // ty1 extract_vector(ty2 splat(const)) -> ty1 splat(const)
19544 | if (V.getOpcode() == ISD::SPLAT_VECTOR) |
19545 | if (isa<ConstantSDNode>(Val: V.getOperand(i: 0))) |
19546 | return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT, Operand: V.getOperand(i: 0)); |
19547 | |
19548 | return SDValue(); |
19549 | } |
19550 | |
19551 | static SDValue |
19552 | performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
19553 | SelectionDAG &DAG) { |
19554 | SDLoc DL(N); |
19555 | SDValue Vec = N->getOperand(Num: 0); |
19556 | SDValue SubVec = N->getOperand(Num: 1); |
19557 | uint64_t IdxVal = N->getConstantOperandVal(Num: 2); |
19558 | EVT VecVT = Vec.getValueType(); |
19559 | EVT SubVT = SubVec.getValueType(); |
19560 | |
19561 | // Only do this for legal fixed vector types. |
19562 | if (!VecVT.isFixedLengthVector() || |
19563 | !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT) || |
19564 | !DAG.getTargetLoweringInfo().isTypeLegal(VT: SubVT)) |
19565 | return SDValue(); |
19566 | |
19567 | // Ignore widening patterns. |
19568 | if (IdxVal == 0 && Vec.isUndef()) |
19569 | return SDValue(); |
19570 | |
19571 | // Subvector must be half the width and an "aligned" insertion. |
19572 | unsigned NumSubElts = SubVT.getVectorNumElements(); |
19573 | if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() || |
19574 | (IdxVal != 0 && IdxVal != NumSubElts)) |
19575 | return SDValue(); |
19576 | |
19577 | // Fold insert_subvector -> concat_vectors |
19578 | // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi)) |
19579 | // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub) |
19580 | SDValue Lo, Hi; |
19581 | if (IdxVal == 0) { |
19582 | Lo = SubVec; |
19583 | Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec, |
19584 | N2: DAG.getVectorIdxConstant(Val: NumSubElts, DL)); |
19585 | } else { |
19586 | Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: SubVT, N1: Vec, |
19587 | N2: DAG.getVectorIdxConstant(Val: 0, DL)); |
19588 | Hi = SubVec; |
19589 | } |
19590 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: VecVT, N1: Lo, N2: Hi); |
19591 | } |
19592 | |
19593 | static SDValue tryCombineFixedPointConvert(SDNode *N, |
19594 | TargetLowering::DAGCombinerInfo &DCI, |
19595 | SelectionDAG &DAG) { |
19596 | // Wait until after everything is legalized to try this. That way we have |
19597 | // legal vector types and such. |
19598 | if (DCI.isBeforeLegalizeOps()) |
19599 | return SDValue(); |
19600 | // Transform a scalar conversion of a value from a lane extract into a |
19601 | // lane extract of a vector conversion. E.g., from foo1 to foo2: |
19602 | // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } |
19603 | // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } |
19604 | // |
19605 | // The second form interacts better with instruction selection and the |
19606 | // register allocator to avoid cross-class register copies that aren't |
19607 | // coalescable due to a lane reference. |
19608 | |
19609 | // Check the operand and see if it originates from a lane extract. |
19610 | SDValue Op1 = N->getOperand(Num: 1); |
19611 | if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
19612 | return SDValue(); |
19613 | |
19614 | // Yep, no additional predication needed. Perform the transform. |
19615 | SDValue IID = N->getOperand(Num: 0); |
19616 | SDValue Shift = N->getOperand(Num: 2); |
19617 | SDValue Vec = Op1.getOperand(i: 0); |
19618 | SDValue Lane = Op1.getOperand(i: 1); |
19619 | EVT ResTy = N->getValueType(ResNo: 0); |
19620 | EVT VecResTy; |
19621 | SDLoc DL(N); |
19622 | |
19623 | // The vector width should be 128 bits by the time we get here, even |
19624 | // if it started as 64 bits (the extract_vector handling will have |
19625 | // done so). Bail if it is not. |
19626 | if (Vec.getValueSizeInBits() != 128) |
19627 | return SDValue(); |
19628 | |
19629 | if (Vec.getValueType() == MVT::v4i32) |
19630 | VecResTy = MVT::v4f32; |
19631 | else if (Vec.getValueType() == MVT::v2i64) |
19632 | VecResTy = MVT::v2f64; |
19633 | else |
19634 | return SDValue(); |
19635 | |
19636 | SDValue Convert = |
19637 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: VecResTy, N1: IID, N2: Vec, N3: Shift); |
19638 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResTy, N1: Convert, N2: Lane); |
19639 | } |
19640 | |
19641 | // AArch64 high-vector "long" operations are formed by performing the non-high |
19642 | // version on an extract_subvector of each operand which gets the high half: |
19643 | // |
19644 | // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) |
19645 | // |
19646 | // However, there are cases which don't have an extract_high explicitly, but |
19647 | // have another operation that can be made compatible with one for free. For |
19648 | // example: |
19649 | // |
19650 | // (dupv64 scalar) --> (extract_high (dup128 scalar)) |
19651 | // |
19652 | // This routine does the actual conversion of such DUPs, once outer routines |
19653 | // have determined that everything else is in order. |
19654 | // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold |
19655 | // similarly here. |
19656 | static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
19657 | MVT VT = N.getSimpleValueType(); |
19658 | if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
19659 | N.getConstantOperandVal(i: 1) == 0) |
19660 | N = N.getOperand(i: 0); |
19661 | |
19662 | switch (N.getOpcode()) { |
19663 | case AArch64ISD::DUP: |
19664 | case AArch64ISD::DUPLANE8: |
19665 | case AArch64ISD::DUPLANE16: |
19666 | case AArch64ISD::DUPLANE32: |
19667 | case AArch64ISD::DUPLANE64: |
19668 | case AArch64ISD::MOVI: |
19669 | case AArch64ISD::MOVIshift: |
19670 | case AArch64ISD::MOVIedit: |
19671 | case AArch64ISD::MOVImsl: |
19672 | case AArch64ISD::MVNIshift: |
19673 | case AArch64ISD::MVNImsl: |
19674 | break; |
19675 | default: |
19676 | // FMOV could be supported, but isn't very useful, as it would only occur |
19677 | // if you passed a bitcasted floating point immediate to an eligible long
19678 | // integer op (addl, smull, ...). |
19679 | return SDValue(); |
19680 | } |
19681 | |
19682 | if (!VT.is64BitVector()) |
19683 | return SDValue(); |
19684 | |
19685 | SDLoc DL(N); |
19686 | unsigned NumElems = VT.getVectorNumElements(); |
19687 | if (N.getValueType().is64BitVector()) { |
19688 | MVT ElementTy = VT.getVectorElementType(); |
19689 | MVT NewVT = MVT::getVectorVT(VT: ElementTy, NumElements: NumElems * 2); |
19690 | N = DAG.getNode(Opcode: N->getOpcode(), DL, VT: NewVT, Ops: N->ops()); |
19691 | } |
19692 | |
19693 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: N, |
19694 | N2: DAG.getConstant(Val: NumElems, DL, VT: MVT::i64)); |
19695 | } |
19696 | |
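      | // Returns true if N is (possibly a bitcast of) an extract_subvector that starts
      | // at the midpoint of a fixed-width vector, i.e. an extract of its high half.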
19697 | static bool isEssentiallyExtractHighSubvector(SDValue N) {
19698 | if (N.getOpcode() == ISD::BITCAST) |
19699 | N = N.getOperand(i: 0); |
19700 | if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR) |
19701 | return false; |
19702 | if (N.getOperand(i: 0).getValueType().isScalableVector()) |
19703 | return false; |
19704 | return N.getConstantOperandAPInt(i: 1) == |
19705 | N.getOperand(i: 0).getValueType().getVectorNumElements() / 2; |
19706 | } |
19707 | |
19708 | /// Helper structure to keep track of ISD::SET_CC operands. |
19709 | struct GenericSetCCInfo { |
19710 | const SDValue *Opnd0; |
19711 | const SDValue *Opnd1; |
19712 | ISD::CondCode CC; |
19713 | }; |
19714 | |
19715 | /// Helper structure to keep track of a SET_CC lowered into AArch64 code. |
19716 | struct AArch64SetCCInfo { |
19717 | const SDValue *Cmp; |
19718 | AArch64CC::CondCode CC; |
19719 | }; |
19720 | |
19721 | /// Helper structure to keep track of SetCC information. |
19722 | union SetCCInfo { |
19723 | GenericSetCCInfo Generic; |
19724 | AArch64SetCCInfo AArch64; |
19725 | }; |
19726 | |
19727 | /// Helper structure to be able to read SetCC information. If the IsAArch64
19728 | /// field is set to true, Info is an AArch64SetCCInfo; otherwise Info is a
19729 | /// GenericSetCCInfo.
19730 | struct SetCCInfoAndKind { |
19731 | SetCCInfo Info; |
19732 | bool IsAArch64; |
19733 | }; |
19734 | |
19735 | /// Check whether or not \p Op is a SET_CC operation, either a generic or an
19736 | /// AArch64 lowered one.
19737 | ///
19738 | /// \p SetCCInfo is filled accordingly.
19739 | /// \post SetCCInfo is meaningful only when this function returns true.
19740 | /// \return True when Op is a kind of SET_CC operation.
19741 | static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { |
19742 | // If this is a setcc, this is straightforward.
19743 | if (Op.getOpcode() == ISD::SETCC) { |
19744 | SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(i: 0); |
19745 | SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(i: 1); |
19746 | SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Val: Op.getOperand(i: 2))->get(); |
19747 | SetCCInfo.IsAArch64 = false; |
19748 | return true; |
19749 | } |
19750 | // Otherwise, check if this is a matching csel instruction. |
19751 | // In other words: |
19752 | // - csel 1, 0, cc |
19753 | // - csel 0, 1, !cc |
19754 | if (Op.getOpcode() != AArch64ISD::CSEL) |
19755 | return false; |
19756 | // Set the information about the operands. |
19757 | // TODO: we want the operands of the Cmp not the csel |
19758 | SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(i: 3); |
19759 | SetCCInfo.IsAArch64 = true; |
19760 | SetCCInfo.Info.AArch64.CC = |
19761 | static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2)); |
19762 | |
19763 | // Check that the operands match the constraints:
19764 | // (1) Both operands must be constants. |
19765 | // (2) One must be 1 and the other must be 0. |
19766 | ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 0)); |
19767 | ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Val: Op.getOperand(i: 1)); |
19768 | |
19769 | // Check (1). |
19770 | if (!TValue || !FValue) |
19771 | return false; |
19772 | |
19773 | // Check (2). |
19774 | if (!TValue->isOne()) { |
19775 | // Update the comparison when we are interested in !cc. |
19776 | std::swap(a&: TValue, b&: FValue); |
19777 | SetCCInfo.Info.AArch64.CC = |
19778 | AArch64CC::getInvertedCondCode(Code: SetCCInfo.Info.AArch64.CC); |
19779 | } |
19780 | return TValue->isOne() && FValue->isZero(); |
19781 | } |
19782 | |
19783 | // Returns true if Op is setcc or zext of setcc. |
19784 | static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { |
19785 | if (isSetCC(Op, SetCCInfo&: Info)) |
19786 | return true; |
19787 | return ((Op.getOpcode() == ISD::ZERO_EXTEND) && |
19788 | isSetCC(Op: Op->getOperand(Num: 0), SetCCInfo&: Info)); |
19789 | } |
19790 | |
19791 | // The folding we want to perform is: |
19792 | // (add x, [zext] (setcc cc ...) ) |
19793 | // --> |
19794 | // (csel x, (add x, 1), !cc ...) |
19795 | // |
19796 | // The latter will get matched to a CSINC instruction. |
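      | // For example (illustrative): for x + (a < b) the combine emits
      | // csel(x, x + 1, !(a < b)) together with the compare, which instruction
      | // selection matches as a cmp followed by a cinc on the original condition.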
19797 | static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { |
19798 | assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!" ); |
19799 | SDValue LHS = Op->getOperand(Num: 0); |
19800 | SDValue RHS = Op->getOperand(Num: 1); |
19801 | SetCCInfoAndKind InfoAndKind; |
19802 | |
19803 | // If both operands are a SET_CC, then we don't want to perform this |
19804 | // folding and create another csel as this results in more instructions |
19805 | // (and higher register usage). |
19806 | if (isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind) && |
19807 | isSetCCOrZExtSetCC(Op: RHS, Info&: InfoAndKind)) |
19808 | return SDValue(); |
19809 | |
19810 | // If neither operand is a SET_CC, give up. |
19811 | if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) { |
19812 | std::swap(a&: LHS, b&: RHS); |
19813 | if (!isSetCCOrZExtSetCC(Op: LHS, Info&: InfoAndKind)) |
19814 | return SDValue(); |
19815 | } |
19816 | |
19817 | // FIXME: This could be generalized to work for FP comparisons.
19818 | EVT CmpVT = InfoAndKind.IsAArch64 |
19819 | ? InfoAndKind.Info.AArch64.Cmp->getOperand(i: 0).getValueType() |
19820 | : InfoAndKind.Info.Generic.Opnd0->getValueType(); |
19821 | if (CmpVT != MVT::i32 && CmpVT != MVT::i64) |
19822 | return SDValue(); |
19823 | |
19824 | SDValue CCVal; |
19825 | SDValue Cmp; |
19826 | SDLoc dl(Op); |
19827 | if (InfoAndKind.IsAArch64) { |
19828 | CCVal = DAG.getConstant( |
19829 | Val: AArch64CC::getInvertedCondCode(Code: InfoAndKind.Info.AArch64.CC), DL: dl, |
19830 | VT: MVT::i32); |
19831 | Cmp = *InfoAndKind.Info.AArch64.Cmp; |
19832 | } else |
19833 | Cmp = getAArch64Cmp( |
19834 | LHS: *InfoAndKind.Info.Generic.Opnd0, RHS: *InfoAndKind.Info.Generic.Opnd1, |
19835 | CC: ISD::getSetCCInverse(Operation: InfoAndKind.Info.Generic.CC, Type: CmpVT), AArch64cc&: CCVal, DAG, |
19836 | dl); |
19837 | |
19838 | EVT VT = Op->getValueType(ResNo: 0); |
19839 | LHS = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT, N1: RHS, N2: DAG.getConstant(Val: 1, DL: dl, VT)); |
19840 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL: dl, VT, N1: RHS, N2: LHS, N3: CCVal, N4: Cmp); |
19841 | } |
19842 | |
19843 | // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b) |
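      | // More precisely (illustrative), the nodes matched are:
      | //   add(extract_elt(UADDV(a), 0), extract_elt(UADDV(b), 0))
      | //     -> extract_elt(UADDV(add(a, b)), 0)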
19844 | static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) { |
19845 | EVT VT = N->getValueType(ResNo: 0); |
19846 | // Only scalar integer and vector types. |
19847 | if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger()) |
19848 | return SDValue(); |
19849 | |
19850 | SDValue LHS = N->getOperand(Num: 0); |
19851 | SDValue RHS = N->getOperand(Num: 1); |
19852 | if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
19853 | RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT) |
19854 | return SDValue(); |
19855 | |
19856 | auto *LHSN1 = dyn_cast<ConstantSDNode>(Val: LHS->getOperand(Num: 1)); |
19857 | auto *RHSN1 = dyn_cast<ConstantSDNode>(Val: RHS->getOperand(Num: 1)); |
19858 | if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero()) |
19859 | return SDValue(); |
19860 | |
19861 | SDValue Op1 = LHS->getOperand(Num: 0); |
19862 | SDValue Op2 = RHS->getOperand(Num: 0); |
19863 | EVT OpVT1 = Op1.getValueType(); |
19864 | EVT OpVT2 = Op2.getValueType(); |
19865 | if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 || |
19866 | Op2.getOpcode() != AArch64ISD::UADDV || |
19867 | OpVT1.getVectorElementType() != VT) |
19868 | return SDValue(); |
19869 | |
19870 | SDValue Val1 = Op1.getOperand(i: 0); |
19871 | SDValue Val2 = Op2.getOperand(i: 0); |
19872 | EVT ValVT = Val1->getValueType(ResNo: 0); |
19873 | SDLoc DL(N); |
19874 | SDValue AddVal = DAG.getNode(Opcode: ISD::ADD, DL, VT: ValVT, N1: Val1, N2: Val2); |
19875 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, |
19876 | N1: DAG.getNode(Opcode: AArch64ISD::UADDV, DL, VT: ValVT, Operand: AddVal), |
19877 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
19878 | } |
19879 | |
19880 | /// Perform the scalar expression combine in the form of: |
19881 | /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc) |
19882 | /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc) |
19883 | static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) { |
19884 | EVT VT = N->getValueType(ResNo: 0); |
19885 | if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD) |
19886 | return SDValue(); |
19887 | |
19888 | SDValue LHS = N->getOperand(Num: 0); |
19889 | SDValue RHS = N->getOperand(Num: 1); |
19890 | |
19891 | // Handle commutativity.
19892 | if (LHS.getOpcode() != AArch64ISD::CSEL && |
19893 | LHS.getOpcode() != AArch64ISD::CSNEG) { |
19894 | std::swap(a&: LHS, b&: RHS); |
19895 | if (LHS.getOpcode() != AArch64ISD::CSEL && |
19896 | LHS.getOpcode() != AArch64ISD::CSNEG) { |
19897 | return SDValue(); |
19898 | } |
19899 | } |
19900 | |
19901 | if (!LHS.hasOneUse()) |
19902 | return SDValue(); |
19903 | |
19904 | AArch64CC::CondCode AArch64CC = |
19905 | static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2)); |
19906 | |
19907 | // The CSEL should include a const one operand, and the CSNEG should include a
19908 | // One or NegOne operand.
19909 | ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 0)); |
19910 | ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(Val: LHS.getOperand(i: 1)); |
19911 | if (!CTVal || !CFVal) |
19912 | return SDValue(); |
19913 | |
19914 | if (!(LHS.getOpcode() == AArch64ISD::CSEL && |
19915 | (CTVal->isOne() || CFVal->isOne())) && |
19916 | !(LHS.getOpcode() == AArch64ISD::CSNEG && |
19917 | (CTVal->isOne() || CFVal->isAllOnes()))) |
19918 | return SDValue(); |
19919 | |
19920 | // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc) |
19921 | if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() && |
19922 | !CFVal->isOne()) { |
19923 | std::swap(a&: CTVal, b&: CFVal); |
19924 | AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC); |
19925 | } |
19926 | |
19927 | SDLoc DL(N); |
19928 | // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc) |
19929 | if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() && |
19930 | !CFVal->isAllOnes()) { |
19931 | APInt C = -1 * CFVal->getAPIntValue(); |
19932 | CTVal = cast<ConstantSDNode>(Val: DAG.getConstant(Val: C, DL, VT)); |
19933 | CFVal = cast<ConstantSDNode>(Val: DAG.getAllOnesConstant(DL, VT)); |
19934 | AArch64CC = AArch64CC::getInvertedCondCode(Code: AArch64CC); |
19935 | } |
19936 | |
19937 | // It might be neutral for larger constants, as the immediate needs to be
19938 | // materialized in a register.
19939 | APInt ADDC = CTVal->getAPIntValue(); |
19940 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
19941 | if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) |
19942 | return SDValue(); |
19943 | |
19944 | assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) || |
19945 | (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) && |
19946 | "Unexpected constant value" ); |
19947 | |
19948 | SDValue NewNode = DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: SDValue(CTVal, 0)); |
19949 | SDValue CCVal = DAG.getConstant(Val: AArch64CC, DL, VT: MVT::i32); |
19950 | SDValue Cmp = LHS.getOperand(i: 3); |
19951 | |
19952 | return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: NewNode, N2: RHS, N3: CCVal, N4: Cmp); |
19953 | } |
19954 | |
19955 | // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y) |
19956 | static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) { |
19957 | EVT VT = N->getValueType(ResNo: 0); |
19958 | if (N->getOpcode() != ISD::ADD) |
19959 | return SDValue(); |
19960 | |
19961 | SDValue Dot = N->getOperand(Num: 0); |
19962 | SDValue A = N->getOperand(Num: 1); |
19963 | // Handle commutativity.
19964 | auto isZeroDot = [](SDValue Dot) { |
19965 | return (Dot.getOpcode() == AArch64ISD::UDOT || |
19966 | Dot.getOpcode() == AArch64ISD::SDOT) && |
19967 | isZerosVector(N: Dot.getOperand(i: 0).getNode()); |
19968 | }; |
19969 | if (!isZeroDot(Dot)) |
19970 | std::swap(a&: Dot, b&: A); |
19971 | if (!isZeroDot(Dot)) |
19972 | return SDValue(); |
19973 | |
19974 | return DAG.getNode(Opcode: Dot.getOpcode(), DL: SDLoc(N), VT, N1: A, N2: Dot.getOperand(i: 1), |
19975 | N3: Dot.getOperand(i: 2)); |
19976 | } |
19977 | |
19978 | static bool isNegatedInteger(SDValue Op) { |
19979 | return Op.getOpcode() == ISD::SUB && isNullConstant(V: Op.getOperand(i: 0)); |
19980 | } |
19981 | |
19982 | static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { |
19983 | SDLoc DL(Op); |
19984 | EVT VT = Op.getValueType(); |
19985 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT); |
19986 | return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Zero, N2: Op); |
19987 | } |
19988 | |
19989 | // Try to fold |
19990 | // |
19991 | // (neg (csel X, Y)) -> (csel (neg X), (neg Y)) |
19992 | // |
19993 | // The folding helps csel to be matched with csneg without generating |
19994 | // redundant neg instruction, which includes negation of the csel expansion |
19995 | // of abs node lowered by lowerABS. |
19996 | static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { |
19997 | if (!isNegatedInteger(Op: SDValue(N, 0))) |
19998 | return SDValue(); |
19999 | |
20000 | SDValue CSel = N->getOperand(Num: 1); |
20001 | if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse()) |
20002 | return SDValue(); |
20003 | |
20004 | SDValue N0 = CSel.getOperand(i: 0); |
20005 | SDValue N1 = CSel.getOperand(i: 1); |
20006 | |
20007 | // If neither of them is a negation, the fold is not worthwhile, as it would
20008 | // introduce two additional negations while removing only one.
20009 | if (!isNegatedInteger(Op: N0) && !isNegatedInteger(Op: N1)) |
20010 | return SDValue(); |
20011 | |
20012 | SDValue N0N = getNegatedInteger(Op: N0, DAG); |
20013 | SDValue N1N = getNegatedInteger(Op: N1, DAG); |
20014 | |
20015 | SDLoc DL(N); |
20016 | EVT VT = CSel.getValueType(); |
20017 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: N0N, N2: N1N, N3: CSel.getOperand(i: 2), |
20018 | N4: CSel.getOperand(i: 3)); |
20019 | } |
20020 | |
20021 | // The basic add/sub long vector instructions have variants with "2" on the end |
20022 | // which act on the high-half of their inputs. They are normally matched by |
20023 | // patterns like: |
20024 | // |
20025 | // (add (zeroext (extract_high LHS)), |
20026 | // (zeroext (extract_high RHS))) |
20027 | // -> uaddl2 vD, vN, vM |
20028 | // |
20029 | // However, if one of the extracts is something like a duplicate, this |
20030 | // instruction can still be used profitably. This function puts the DAG into a |
20031 | // more appropriate form for those patterns to trigger. |
20032 | static SDValue performAddSubLongCombine(SDNode *N, |
20033 | TargetLowering::DAGCombinerInfo &DCI) { |
20034 | SelectionDAG &DAG = DCI.DAG; |
20035 | if (DCI.isBeforeLegalizeOps()) |
20036 | return SDValue(); |
20037 | |
20038 | MVT VT = N->getSimpleValueType(ResNo: 0); |
20039 | if (!VT.is128BitVector()) { |
20040 | if (N->getOpcode() == ISD::ADD) |
20041 | return performSetccAddFolding(Op: N, DAG); |
20042 | return SDValue(); |
20043 | } |
20044 | |
20045 | // Make sure both branches are extended in the same way. |
20046 | SDValue LHS = N->getOperand(Num: 0); |
20047 | SDValue RHS = N->getOperand(Num: 1); |
20048 | if ((LHS.getOpcode() != ISD::ZERO_EXTEND && |
20049 | LHS.getOpcode() != ISD::SIGN_EXTEND) || |
20050 | LHS.getOpcode() != RHS.getOpcode()) |
20051 | return SDValue(); |
20052 | |
20053 | unsigned ExtType = LHS.getOpcode(); |
20054 | |
20055 | // It's not worth doing unless at least one of the inputs is already an
20056 | // extract, but we don't know which it'll be so we have to try both.
20057 | if (isEssentiallyExtractHighSubvector(N: LHS.getOperand(i: 0))) { |
20058 | RHS = tryExtendDUPToExtractHigh(N: RHS.getOperand(i: 0), DAG); |
20059 | if (!RHS.getNode()) |
20060 | return SDValue(); |
20061 | |
20062 | RHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: RHS); |
20063 | } else if (isEssentiallyExtractHighSubvector(N: RHS.getOperand(i: 0))) { |
20064 | LHS = tryExtendDUPToExtractHigh(N: LHS.getOperand(i: 0), DAG); |
20065 | if (!LHS.getNode()) |
20066 | return SDValue(); |
20067 | |
20068 | LHS = DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT, Operand: LHS); |
20069 | } |
20070 | |
20071 | return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT, N1: LHS, N2: RHS); |
20072 | } |
20073 | |
20074 | static bool isCMP(SDValue Op) { |
20075 | return Op.getOpcode() == AArch64ISD::SUBS && |
20076 | !Op.getNode()->hasAnyUseOfValue(Value: 0); |
20077 | } |
20078 | |
20079 | // (CSEL 1 0 CC Cond) => CC |
20080 | // (CSEL 0 1 CC Cond) => !CC |
20081 | static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) { |
20082 | if (Op.getOpcode() != AArch64ISD::CSEL) |
20083 | return std::nullopt; |
20084 | auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(i: 2)); |
20085 | if (CC == AArch64CC::AL || CC == AArch64CC::NV) |
20086 | return std::nullopt; |
20087 | SDValue OpLHS = Op.getOperand(i: 0); |
20088 | SDValue OpRHS = Op.getOperand(i: 1); |
20089 | if (isOneConstant(V: OpLHS) && isNullConstant(V: OpRHS)) |
20090 | return CC; |
20091 | if (isNullConstant(V: OpLHS) && isOneConstant(V: OpRHS)) |
20092 | return getInvertedCondCode(Code: CC); |
20093 | |
20094 | return std::nullopt; |
20095 | } |
20096 | |
20097 | // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry) |
20098 | // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry) |
20099 | static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) { |
20100 | SDValue CmpOp = Op->getOperand(Num: 2); |
20101 | if (!isCMP(Op: CmpOp)) |
20102 | return SDValue(); |
20103 | |
20104 | if (IsAdd) { |
20105 | if (!isOneConstant(V: CmpOp.getOperand(i: 1))) |
20106 | return SDValue(); |
20107 | } else { |
20108 | if (!isNullConstant(V: CmpOp.getOperand(i: 0))) |
20109 | return SDValue(); |
20110 | } |
20111 | |
20112 | SDValue CsetOp = CmpOp->getOperand(Num: IsAdd ? 0 : 1); |
20113 | auto CC = getCSETCondCode(Op: CsetOp); |
20114 | if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO)) |
20115 | return SDValue(); |
20116 | |
20117 | return DAG.getNode(Opcode: Op->getOpcode(), DL: SDLoc(Op), VTList: Op->getVTList(), |
20118 | N1: Op->getOperand(Num: 0), N2: Op->getOperand(Num: 1), |
20119 | N3: CsetOp.getOperand(i: 3)); |
20120 | } |
20121 | |
20122 | // (ADC x 0 cond) => (CINC x HS cond) |
20123 | static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) { |
20124 | SDValue LHS = N->getOperand(Num: 0); |
20125 | SDValue RHS = N->getOperand(Num: 1); |
20126 | SDValue Cond = N->getOperand(Num: 2); |
20127 | |
20128 | if (!isNullConstant(V: RHS)) |
20129 | return SDValue(); |
20130 | |
20131 | EVT VT = N->getValueType(ResNo: 0); |
20132 | SDLoc DL(N); |
20133 | |
20134 | // (CINC x cc cond) <=> (CSINC x x !cc cond) |
20135 | SDValue CC = DAG.getConstant(Val: AArch64CC::LO, DL, VT: MVT::i32); |
20136 | return DAG.getNode(Opcode: AArch64ISD::CSINC, DL, VT, N1: LHS, N2: LHS, N3: CC, N4: Cond); |
20137 | } |
20138 | |
20139 | static SDValue performBuildVectorCombine(SDNode *N, |
20140 | TargetLowering::DAGCombinerInfo &DCI, |
20141 | SelectionDAG &DAG) { |
20142 | SDLoc DL(N); |
20143 | EVT VT = N->getValueType(ResNo: 0); |
20144 | |
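      | // Combine a v4f16/v4bf16 build_vector whose elements are fp_rounds of the
      | // lanes of one or two v2f64 sources into fcvtxn (round-to-odd) conversions of
      | // those sources, a concat to v4f32 and a single fp_round, rather than
      | // converting each lane separately.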
20145 | if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() && |
20146 | (VT == MVT::v4f16 || VT == MVT::v4bf16)) { |
20147 | SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1), |
20148 | Elt2 = N->getOperand(Num: 2), Elt3 = N->getOperand(Num: 3); |
20149 | if (Elt0->getOpcode() == ISD::FP_ROUND && |
20150 | Elt1->getOpcode() == ISD::FP_ROUND && |
20151 | isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) && |
20152 | isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) && |
20153 | Elt0->getConstantOperandVal(Num: 1) == Elt1->getConstantOperandVal(Num: 1) && |
20154 | Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20155 | Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20156 | // Constant index. |
20157 | isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) && |
20158 | isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) && |
20159 | Elt0->getOperand(Num: 0)->getOperand(Num: 0) == |
20160 | Elt1->getOperand(Num: 0)->getOperand(Num: 0) && |
20161 | Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 && |
20162 | Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) { |
20163 | SDValue LowLanesSrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0); |
20164 | if (LowLanesSrcVec.getValueType() == MVT::v2f64) { |
20165 | SDValue HighLanes; |
20166 | if (Elt2->getOpcode() == ISD::UNDEF && |
20167 | Elt3->getOpcode() == ISD::UNDEF) { |
20168 | HighLanes = DAG.getUNDEF(VT: MVT::v2f32); |
20169 | } else if (Elt2->getOpcode() == ISD::FP_ROUND && |
20170 | Elt3->getOpcode() == ISD::FP_ROUND && |
20171 | isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 1)) && |
20172 | isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 1)) && |
20173 | Elt2->getConstantOperandVal(Num: 1) == |
20174 | Elt3->getConstantOperandVal(Num: 1) && |
20175 | Elt2->getOperand(Num: 0)->getOpcode() == |
20176 | ISD::EXTRACT_VECTOR_ELT && |
20177 | Elt3->getOperand(Num: 0)->getOpcode() == |
20178 | ISD::EXTRACT_VECTOR_ELT && |
20179 | // Constant index. |
20180 | isa<ConstantSDNode>(Val: Elt2->getOperand(Num: 0)->getOperand(Num: 1)) && |
20181 | isa<ConstantSDNode>(Val: Elt3->getOperand(Num: 0)->getOperand(Num: 1)) && |
20182 | Elt2->getOperand(Num: 0)->getOperand(Num: 0) == |
20183 | Elt3->getOperand(Num: 0)->getOperand(Num: 0) && |
20184 | Elt2->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 0 && |
20185 | Elt3->getOperand(Num: 0)->getConstantOperandVal(Num: 1) == 1) { |
20186 | SDValue HighLanesSrcVec = Elt2->getOperand(Num: 0)->getOperand(Num: 0); |
20187 | HighLanes = |
20188 | DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: HighLanesSrcVec); |
20189 | } |
20190 | if (HighLanes) { |
20191 | SDValue DoubleToSingleSticky = |
20192 | DAG.getNode(Opcode: AArch64ISD::FCVTXN, DL, VT: MVT::v2f32, Operand: LowLanesSrcVec); |
20193 | SDValue Concat = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: MVT::v4f32, |
20194 | N1: DoubleToSingleSticky, N2: HighLanes); |
20195 | return DAG.getNode(Opcode: ISD::FP_ROUND, DL, VT, N1: Concat, |
20196 | N2: Elt0->getOperand(Num: 1)); |
20197 | } |
20198 | } |
20199 | } |
20200 | } |
20201 | |
20202 | if (VT == MVT::v2f64) { |
20203 | SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1); |
20204 | if (Elt0->getOpcode() == ISD::FP_EXTEND && |
20205 | Elt1->getOpcode() == ISD::FP_EXTEND && |
20206 | Elt0->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20207 | Elt1->getOperand(Num: 0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20208 | Elt0->getOperand(Num: 0)->getOperand(Num: 0) == |
20209 | Elt1->getOperand(Num: 0)->getOperand(Num: 0) && |
20210 | // Constant index. |
20211 | isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 0)->getOperand(Num: 1)) && |
20212 | isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 0)->getOperand(Num: 1)) && |
20213 | Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) + 1 == |
20214 | Elt1->getOperand(Num: 0)->getConstantOperandVal(Num: 1) && |
20215 | // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of |
20216 | // ResultType's known minimum vector length. |
20217 | Elt0->getOperand(Num: 0)->getConstantOperandVal(Num: 1) % |
20218 | VT.getVectorMinNumElements() == |
20219 | 0) { |
20220 | SDValue SrcVec = Elt0->getOperand(Num: 0)->getOperand(Num: 0); |
20221 | if (SrcVec.getValueType() == MVT::v4f16 || |
20222 | SrcVec.getValueType() == MVT::v4bf16) { |
20223 | SDValue HalfToSingle = |
20224 | DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT: MVT::v4f32, Operand: SrcVec); |
20225 | SDValue SubvectorIdx = Elt0->getOperand(Num: 0)->getOperand(Num: 1); |
20226 | SDValue Extract = DAG.getNode(
20227 | Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: VT.changeVectorElementType(EltVT: MVT::f32), |
20228 | N1: HalfToSingle, N2: SubvectorIdx); |
20229 | return DAG.getNode(Opcode: ISD::FP_EXTEND, DL, VT, Operand: Extract); |
20230 | } |
20231 | } |
20232 | } |
20233 | |
20234 | // A build vector of two extracted elements is equivalent to an |
20235 | // extract subvector where the inner vector is any-extended to the |
20236 | // extract_vector_elt VT. |
20237 | // (build_vector (extract_elt_iXX_to_i32 vec Idx+0) |
20238 | // (extract_elt_iXX_to_i32 vec Idx+1)) |
20239 | // => (extract_subvector (anyext_iXX_to_i32 vec) Idx) |
20240 | |
20241 | // For now, only consider the v2i32 case, which arises as a result of |
20242 | // legalization. |
20243 | if (VT != MVT::v2i32) |
20244 | return SDValue(); |
20245 | |
20246 | SDValue Elt0 = N->getOperand(Num: 0), Elt1 = N->getOperand(Num: 1); |
20247 | // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT. |
20248 | if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20249 | Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20250 | // Constant index. |
20251 | isa<ConstantSDNode>(Val: Elt0->getOperand(Num: 1)) && |
20252 | isa<ConstantSDNode>(Val: Elt1->getOperand(Num: 1)) && |
20253 | // Both EXTRACT_VECTOR_ELT from same vector... |
20254 | Elt0->getOperand(Num: 0) == Elt1->getOperand(Num: 0) && |
20255 | // ... and contiguous. First element's index +1 == second element's index. |
20256 | Elt0->getConstantOperandVal(Num: 1) + 1 == Elt1->getConstantOperandVal(Num: 1) && |
20257 | // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of |
20258 | // ResultType's known minimum vector length. |
20259 | Elt0->getConstantOperandVal(Num: 1) % VT.getVectorMinNumElements() == 0) { |
20260 | SDValue VecToExtend = Elt0->getOperand(Num: 0); |
20261 | EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(EltVT: MVT::i32); |
20262 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT: ExtVT)) |
20263 | return SDValue(); |
20264 | |
20265 | SDValue SubvectorIdx = DAG.getVectorIdxConstant(Val: Elt0->getConstantOperandVal(Num: 1), DL); |
20266 | |
20267 | SDValue Ext = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: ExtVT, Operand: VecToExtend); |
20268 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: Ext, |
20269 | N2: SubvectorIdx); |
20270 | } |
20271 | |
20272 | return SDValue(); |
20273 | } |
20274 | |
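      | // Fold truncate(dup(x)) into a dup of the narrower type, truncating the scalar
      | // operand when going from i64 to i32, e.g. (illustrative):
      | //   v2i32 truncate(v2i64 dup(i64 x)) -> v2i32 dup(i32 truncate(x))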
20275 | static SDValue performTruncateCombine(SDNode *N, |
20276 | SelectionDAG &DAG) { |
20277 | EVT VT = N->getValueType(ResNo: 0); |
20278 | SDValue N0 = N->getOperand(Num: 0); |
20279 | if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() && |
20280 | N0.getOpcode() == AArch64ISD::DUP) { |
20281 | SDValue Op = N0.getOperand(i: 0); |
20282 | if (VT.getScalarType() == MVT::i32 && |
20283 | N0.getOperand(i: 0).getValueType().getScalarType() == MVT::i64) |
20284 | Op = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: MVT::i32, Operand: Op); |
20285 | return DAG.getNode(Opcode: N0.getOpcode(), DL: SDLoc(N), VT, Operand: Op); |
20286 | } |
20287 | |
20288 | return SDValue(); |
20289 | } |
20290 | |
20291 | // Check whether a node is an extend or shift operand.
20292 | static bool isExtendOrShiftOperand(SDValue N) { |
20293 | unsigned Opcode = N.getOpcode(); |
20294 | if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) { |
20295 | EVT SrcVT; |
20296 | if (Opcode == ISD::SIGN_EXTEND_INREG) |
20297 | SrcVT = cast<VTSDNode>(Val: N.getOperand(i: 1))->getVT(); |
20298 | else |
20299 | SrcVT = N.getOperand(i: 0).getValueType(); |
20300 | |
20301 | return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8; |
20302 | } else if (Opcode == ISD::AND) { |
20303 | ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(Val: N.getOperand(i: 1)); |
20304 | if (!CSD) |
20305 | return false; |
20306 | uint64_t AndMask = CSD->getZExtValue(); |
20307 | return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff; |
20308 | } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) { |
20309 | return isa<ConstantSDNode>(Val: N.getOperand(i: 1)); |
20310 | } |
20311 | |
20312 | return false; |
20313 | } |
20314 | |
20315 | // (N - Y) + Z --> (Z - Y) + N |
20316 | // when N is an extend or shift operand |
20317 | static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, |
20318 | SelectionDAG &DAG) { |
20319 | auto IsOneUseExtend = [](SDValue N) { |
20320 | return N.hasOneUse() && isExtendOrShiftOperand(N); |
20321 | }; |
20322 | |
20323 | // DAGCombiner will revert the combination when Z is constant, causing an
20324 | // infinite loop, so don't enable the combination when Z is constant. Likewise,
20325 | // if Z is a one-use extend or shift, we can't do the optimization, as it
20326 | // would also fall into an infinite loop.
20327 | if (isa<ConstantSDNode>(Val: Z) || IsOneUseExtend(Z)) |
20328 | return SDValue(); |
20329 | |
20330 | if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse()) |
20331 | return SDValue(); |
20332 | |
20333 | SDValue Shift = SUB.getOperand(i: 0); |
20334 | if (!IsOneUseExtend(Shift)) |
20335 | return SDValue(); |
20336 | |
20337 | SDLoc DL(N); |
20338 | EVT VT = N->getValueType(ResNo: 0); |
20339 | |
20340 | SDValue Y = SUB.getOperand(i: 1); |
20341 | SDValue NewSub = DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Z, N2: Y); |
20342 | return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: NewSub, N2: Shift); |
20343 | } |
20344 | |
20345 | static SDValue performAddCombineForShiftedOperands(SDNode *N, |
20346 | SelectionDAG &DAG) { |
20347 | // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not |
20348 | // commutative. |
20349 | if (N->getOpcode() != ISD::ADD) |
20350 | return SDValue(); |
20351 | |
20352 | // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with |
20353 | // shifted register is only available for i32 and i64. |
20354 | EVT VT = N->getValueType(ResNo: 0); |
20355 | if (VT != MVT::i32 && VT != MVT::i64) |
20356 | return SDValue(); |
20357 | |
20358 | SDLoc DL(N); |
20359 | SDValue LHS = N->getOperand(Num: 0); |
20360 | SDValue RHS = N->getOperand(Num: 1); |
20361 | |
20362 | if (SDValue Val = performAddCombineSubShift(N, SUB: LHS, Z: RHS, DAG)) |
20363 | return Val; |
20364 | if (SDValue Val = performAddCombineSubShift(N, SUB: RHS, Z: LHS, DAG)) |
20365 | return Val; |
20366 | |
20367 | uint64_t LHSImm = 0, RHSImm = 0; |
20368 | // If both operands are shifted by an immediate and the shift amount is not
20369 | // greater than 4 for one operand, swap LHS and RHS to put the operand with the
20370 | // smaller shift amount on the RHS.
20371 | // |
20372 | // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with |
20373 | // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD |
20374 | // with LSL (shift > 4). For other processors, this is a no-op for
20375 | // performance or correctness. |
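      | // e.g. (illustrative): add(shl(x, 2), shl(y, 12)) is rewritten as
      | // add(shl(y, 12), shl(x, 2)) so that the cheap (<= 4) shift is the one folded
      | // into the ADD's shifted-register operand.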
20376 | if (isOpcWithIntImmediate(N: LHS.getNode(), Opc: ISD::SHL, Imm&: LHSImm) && |
20377 | isOpcWithIntImmediate(N: RHS.getNode(), Opc: ISD::SHL, Imm&: RHSImm) && LHSImm <= 4 && |
20378 | RHSImm > 4 && LHS.hasOneUse()) |
20379 | return DAG.getNode(Opcode: ISD::ADD, DL, VT, N1: RHS, N2: LHS); |
20380 | |
20381 | return SDValue(); |
20382 | } |
20383 | |
20384 | // The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2)).
20385 | // This reassociates it back to allow the creation of more mls instructions. |
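      | // i.e. (illustrative): sub(x, add(m1, m2)) -> sub(sub(x, m1), m2) when m1 and
      | // m2 are (possibly extending) multiplies.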
20386 | static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) { |
20387 | if (N->getOpcode() != ISD::SUB) |
20388 | return SDValue(); |
20389 | |
20390 | SDValue Add = N->getOperand(Num: 1); |
20391 | SDValue X = N->getOperand(Num: 0); |
20392 | if (Add.getOpcode() != ISD::ADD) |
20393 | return SDValue(); |
20394 | |
20395 | if (!Add.hasOneUse()) |
20396 | return SDValue(); |
20397 | if (DAG.isConstantIntBuildVectorOrConstantInt(N: peekThroughBitcasts(V: X))) |
20398 | return SDValue(); |
20399 | |
20400 | SDValue M1 = Add.getOperand(i: 0); |
20401 | SDValue M2 = Add.getOperand(i: 1); |
20402 | if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL && |
20403 | M1.getOpcode() != AArch64ISD::UMULL) |
20404 | return SDValue(); |
20405 | if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL && |
20406 | M2.getOpcode() != AArch64ISD::UMULL) |
20407 | return SDValue(); |
20408 | |
20409 | EVT VT = N->getValueType(ResNo: 0); |
20410 | SDValue Sub = DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: X, N2: M1); |
20411 | return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT, N1: Sub, N2: M2); |
20412 | } |
20413 | |
20414 | // Combine into mla/mls. |
20415 | // This works on the patterns of: |
20416 | // add v1, (mul v2, v3) |
20417 | // sub v1, (mul v2, v3) |
20418 | // for vectors of type <1 x i64> and <2 x i64> when SVE is available. |
20419 | // It will transform the add/sub to a scalable version, so that we can |
20420 | // make use of SVE's MLA/MLS that will be generated for that pattern.
20421 | static SDValue |
20422 | performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { |
20423 | SelectionDAG &DAG = DCI.DAG; |
20424 | // Make sure that the types are legal |
20425 | if (!DCI.isAfterLegalizeDAG()) |
20426 | return SDValue(); |
20427 | // Before using SVE's features, check first if it's available. |
20428 | if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE()) |
20429 | return SDValue(); |
20430 | |
20431 | if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB) |
20432 | return SDValue(); |
20433 | |
20434 | if (!N->getValueType(ResNo: 0).isFixedLengthVector()) |
20435 | return SDValue(); |
20436 | |
20437 | auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue { |
20438 | if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR) |
20439 | return SDValue(); |
20440 | |
20441 | if (!cast<ConstantSDNode>(Val: Op1->getOperand(Num: 1))->isZero()) |
20442 | return SDValue(); |
20443 | |
20444 | SDValue MulValue = Op1->getOperand(Num: 0); |
20445 | if (MulValue.getOpcode() != AArch64ISD::MUL_PRED) |
20446 | return SDValue(); |
20447 | |
20448 | if (!Op1.hasOneUse() || !MulValue.hasOneUse()) |
20449 | return SDValue(); |
20450 | |
20451 | EVT ScalableVT = MulValue.getValueType(); |
20452 | if (!ScalableVT.isScalableVector()) |
20453 | return SDValue(); |
20454 | |
20455 | SDValue ScaledOp = convertToScalableVector(DAG, VT: ScalableVT, V: Op0); |
20456 | SDValue NewValue = |
20457 | DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: ScalableVT, Ops: {ScaledOp, MulValue}); |
20458 | return convertFromScalableVector(DAG, VT: N->getValueType(ResNo: 0), V: NewValue); |
20459 | }; |
20460 | |
20461 | if (SDValue res = performOpt(N->getOperand(Num: 0), N->getOperand(Num: 1))) |
20462 | return res; |
20463 | else if (N->getOpcode() == ISD::ADD) |
20464 | return performOpt(N->getOperand(Num: 1), N->getOperand(Num: 0)); |
20465 | |
20466 | return SDValue(); |
20467 | } |
20468 | |
20469 | // Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
20470 | // help, for example, to produce ssra from sshr+add. |
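      | // e.g. (illustrative):
      | //   add(extract_elt(v1i64 X, 0), i64 load p)
      | //     -> extract_elt(add(v1i64 X, scalar_to_vector(load p)), 0)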
20471 | static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) { |
20472 | EVT VT = N->getValueType(ResNo: 0); |
20473 | if (VT != MVT::i64 || |
20474 | DAG.getTargetLoweringInfo().isOperationExpand(Op: N->getOpcode(), VT: MVT::v1i64)) |
20475 | return SDValue(); |
20476 | SDValue Op0 = N->getOperand(Num: 0); |
20477 | SDValue Op1 = N->getOperand(Num: 1); |
20478 | |
20479 | // At least one of the operands should be an extract, and the other should be |
20480 | // something that is easy to convert to v1i64 type (in this case a load). |
20481 | if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT && |
20482 | Op0.getOpcode() != ISD::LOAD) |
20483 | return SDValue(); |
20484 | if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT && |
20485 | Op1.getOpcode() != ISD::LOAD) |
20486 | return SDValue(); |
20487 | |
20488 | SDLoc DL(N); |
20489 | if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20490 | Op0.getOperand(i: 0).getValueType() == MVT::v1i64) { |
20491 | Op0 = Op0.getOperand(i: 0); |
20492 | Op1 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op1); |
20493 | } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && |
20494 | Op1.getOperand(i: 0).getValueType() == MVT::v1i64) { |
20495 | Op0 = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: MVT::v1i64, Operand: Op0); |
20496 | Op1 = Op1.getOperand(i: 0); |
20497 | } else |
20498 | return SDValue(); |
20499 | |
20500 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i64, |
20501 | N1: DAG.getNode(Opcode: N->getOpcode(), DL, VT: MVT::v1i64, N1: Op0, N2: Op1), |
20502 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
20503 | } |
20504 | |
20505 | static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) { |
20506 | SDValue BV = peekThroughOneUseBitcasts(V: B); |
20507 | if (!BV->hasOneUse()) |
20508 | return false; |
20509 | if (auto *Ld = dyn_cast<LoadSDNode>(Val&: BV)) { |
20510 | if (!Ld || !Ld->isSimple()) |
20511 | return false; |
20512 | Loads.push_back(Elt: Ld); |
20513 | return true; |
20514 | } else if (BV.getOpcode() == ISD::BUILD_VECTOR || |
20515 | BV.getOpcode() == ISD::CONCAT_VECTORS) { |
20516 | for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) { |
20517 | auto *Ld = dyn_cast<LoadSDNode>(Val: BV.getOperand(i: Op)); |
20518 | if (!Ld || !Ld->isSimple() || !BV.getOperand(i: Op).hasOneUse()) |
20519 | return false; |
20520 | Loads.push_back(Elt: Ld); |
20521 | } |
20522 | return true; |
20523 | } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) { |
20524 | // Try to find a tree of shuffles and concats from how IR shuffles of loads |
20525 | // are lowered. Note that this only comes up because we do not always visit |
20526 | // operands before uses. Once that is fixed this can be removed; in the
20527 | // meantime it is fairly specific to the lowering we expect from IR.
20528 | // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45 |
20529 | // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43 |
20530 | // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8 |
20531 | // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64 |
20532 | // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64 |
20533 | // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8 |
20534 | // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64 |
20535 | // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8 |
20536 | // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64 |
20537 | if (B.getOperand(i: 0).getOpcode() != ISD::VECTOR_SHUFFLE || |
20538 | B.getOperand(i: 0).getOperand(i: 0).getOpcode() != ISD::CONCAT_VECTORS || |
20539 | B.getOperand(i: 0).getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS || |
20540 | B.getOperand(i: 1).getOpcode() != ISD::CONCAT_VECTORS || |
20541 | B.getOperand(i: 1).getNumOperands() != 4) |
20542 | return false; |
20543 | auto SV1 = cast<ShuffleVectorSDNode>(Val&: B); |
20544 | auto SV2 = cast<ShuffleVectorSDNode>(Val: B.getOperand(i: 0)); |
20545 | int NumElts = B.getValueType().getVectorNumElements(); |
20546 | int NumSubElts = NumElts / 4; |
20547 | for (int I = 0; I < NumSubElts; I++) { |
20548 | // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> |
20549 | if (SV1->getMaskElt(Idx: I) != I || |
20550 | SV1->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts || |
20551 | SV1->getMaskElt(Idx: I + NumSubElts * 2) != I + NumSubElts * 2 || |
20552 | SV1->getMaskElt(Idx: I + NumSubElts * 3) != I + NumElts) |
20553 | return false; |
20554 | // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> |
20555 | if (SV2->getMaskElt(Idx: I) != I || |
20556 | SV2->getMaskElt(Idx: I + NumSubElts) != I + NumSubElts || |
20557 | SV2->getMaskElt(Idx: I + NumSubElts * 2) != I + NumElts) |
20558 | return false; |
20559 | } |
20560 | auto *Ld0 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 0)); |
20561 | auto *Ld1 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 0).getOperand(i: 1)); |
20562 | auto *Ld2 = dyn_cast<LoadSDNode>(Val: SV2->getOperand(Num: 1).getOperand(i: 0)); |
20563 | auto *Ld3 = dyn_cast<LoadSDNode>(Val: B.getOperand(i: 1).getOperand(i: 0)); |
20564 | if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() || |
20565 | !Ld2->isSimple() || !Ld3->isSimple()) |
20566 | return false; |
20567 | Loads.push_back(Elt: Ld0); |
20568 | Loads.push_back(Elt: Ld1); |
20569 | Loads.push_back(Elt: Ld2); |
20570 | Loads.push_back(Elt: Ld3); |
20571 | return true; |
20572 | } |
20573 | return false; |
20574 | } |
20575 | |
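// Returns true if Op0 and Op1 are identical trees of operations (add/sub and
// extends of 8/16/32-bit elements) whose leaves are loads, where each leaf
// load in Op1 is the non-volatile load immediately following the corresponding
// leaf load in Op0. NumSubLoads records how many leaf loads make up each
// build_vector/concat leaf, and must be consistent across the tree.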
20576 | static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, |
20577 | SelectionDAG &DAG, |
20578 | unsigned &NumSubLoads) { |
20579 | if (!Op0.hasOneUse() || !Op1.hasOneUse()) |
20580 | return false; |
20581 | |
20582 | SmallVector<LoadSDNode *> Loads0, Loads1; |
20583 | if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) && |
20584 | isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) { |
20585 | if (NumSubLoads && Loads0.size() != NumSubLoads) |
20586 | return false; |
20587 | NumSubLoads = Loads0.size(); |
20588 | return Loads0.size() == Loads1.size() && |
20589 | all_of(Range: zip(t&: Loads0, u&: Loads1), P: [&DAG](auto L) { |
20590 | unsigned Size = get<0>(L)->getValueType(0).getSizeInBits(); |
20591 | return Size == get<1>(L)->getValueType(0).getSizeInBits() && |
20592 | DAG.areNonVolatileConsecutiveLoads(LD: get<1>(L), Base: get<0>(L), |
20593 | Bytes: Size / 8, Dist: 1); |
20594 | }); |
20595 | } |
20596 | |
20597 | if (Op0.getOpcode() != Op1.getOpcode()) |
20598 | return false; |
20599 | |
20600 | switch (Op0.getOpcode()) { |
20601 | case ISD::ADD: |
20602 | case ISD::SUB: |
20603 | return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0), |
20604 | DAG, NumSubLoads) && |
20605 | areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 1), Op1: Op1.getOperand(i: 1), |
20606 | DAG, NumSubLoads); |
20607 | case ISD::SIGN_EXTEND: |
20608 | case ISD::ANY_EXTEND: |
20609 | case ISD::ZERO_EXTEND: |
20610 | EVT XVT = Op0.getOperand(i: 0).getValueType(); |
20611 | if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 && |
20612 | XVT.getScalarSizeInBits() != 32) |
20613 | return false; |
20614 | return areLoadedOffsetButOtherwiseSame(Op0: Op0.getOperand(i: 0), Op1: Op1.getOperand(i: 0), |
20615 | DAG, NumSubLoads); |
20616 | } |
20617 | return false; |
20618 | } |
20619 | |
20620 | // This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
20621 | // into a single load of twice the size, from which we extract the bottom and
20622 | // top parts so that the shl can use a shll2 instruction. The two loads in that
20623 | // example can also be larger trees of instructions, which are identical except
20624 | // for the leaves, which are all loads offset from the LHS, including
20625 | // buildvectors of multiple loads. For example the RHS tree could be
20626 | // sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4))).
20627 | // Whilst it can be common for the larger loads to replace LDP instructions
20628 | // (which doesn't gain anything on its own), the larger loads can help create
20629 | // more efficient code, and in buildvectors they prevent the need for ld1 lane
20630 | // inserts, which can be slower than normal loads.
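// As an illustrative sketch (the types below are just one case that satisfies
// the checks in this function):
//   add(zext(v8i8 load p), shl(zext(v8i8 load p+8), splat(8)))
// can instead perform a single v16i8 load from p, split the result into its
// low and high halves, extend each half and keep the shl on the high half,
// where it can be selected by shll2-style patterns.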
20631 | static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) { |
20632 | EVT VT = N->getValueType(ResNo: 0); |
20633 | if (!VT.isFixedLengthVector() || |
20634 | (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 && |
20635 | VT.getScalarSizeInBits() != 64)) |
20636 | return SDValue(); |
20637 | |
20638 | SDValue Other = N->getOperand(Num: 0); |
20639 | SDValue Shift = N->getOperand(Num: 1); |
20640 | if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB) |
20641 | std::swap(a&: Shift, b&: Other); |
20642 | APInt ShiftAmt; |
20643 | if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() || |
20644 | !ISD::isConstantSplatVector(N: Shift.getOperand(i: 1).getNode(), SplatValue&: ShiftAmt)) |
20645 | return SDValue(); |
20646 | |
20647 | if (!ISD::isExtOpcode(Opcode: Shift.getOperand(i: 0).getOpcode()) || |
20648 | !ISD::isExtOpcode(Opcode: Other.getOpcode()) || |
20649 | Shift.getOperand(i: 0).getOperand(i: 0).getValueType() != |
20650 | Other.getOperand(i: 0).getValueType() || |
20651 | !Other.hasOneUse() || !Shift.getOperand(i: 0).hasOneUse()) |
20652 | return SDValue(); |
20653 | |
20654 | SDValue Op0 = Other.getOperand(i: 0); |
20655 | SDValue Op1 = Shift.getOperand(i: 0).getOperand(i: 0); |
20656 | |
20657 | unsigned NumSubLoads = 0; |
20658 | if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads)) |
20659 | return SDValue(); |
20660 | |
20661 | // Attempt to rule out some unprofitable cases using heuristics (some working
20662 | // around suboptimal code generation), notably if the extend would not be able
20663 | // to use ushll2 instructions as the types are not large enough. Otherwise zips
20664 | // will need to be created, which can increase the instruction count.
20665 | unsigned NumElts = Op0.getValueType().getVectorNumElements(); |
20666 | unsigned NumSubElts = NumElts / NumSubLoads; |
20667 | if (NumSubElts * VT.getScalarSizeInBits() < 128 || |
20668 | (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode() && |
20669 | Op0.getValueType().getSizeInBits() < 128 && |
20670 | !DAG.getTargetLoweringInfo().isTypeLegal(VT: Op0.getValueType()))) |
20671 | return SDValue(); |
20672 | |
20673 | // Recreate the tree with the new combined loads. |
20674 | std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree = |
20675 | [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) { |
20676 | EVT DVT = |
20677 | Op0.getValueType().getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
20678 | |
20679 | SmallVector<LoadSDNode *> Loads0, Loads1; |
20680 | if (isLoadOrMultipleLoads(B: Op0, Loads&: Loads0) && |
20681 | isLoadOrMultipleLoads(B: Op1, Loads&: Loads1)) { |
20682 | EVT LoadVT = EVT::getVectorVT( |
20683 | Context&: *DAG.getContext(), VT: Op0.getValueType().getScalarType(), |
20684 | NumElements: Op0.getValueType().getVectorNumElements() / Loads0.size()); |
20685 | EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
20686 | |
20687 | SmallVector<SDValue> NewLoads; |
20688 | for (const auto &[L0, L1] : zip(t&: Loads0, u&: Loads1)) { |
20689 | SDValue Load = DAG.getLoad(VT: DLoadVT, dl: SDLoc(L0), Chain: L0->getChain(), |
20690 | Ptr: L0->getBasePtr(), PtrInfo: L0->getPointerInfo(), |
20691 | Alignment: L0->getOriginalAlign()); |
20692 | DAG.makeEquivalentMemoryOrdering(OldLoad: L0, NewMemOp: Load.getValue(R: 1)); |
20693 | DAG.makeEquivalentMemoryOrdering(OldLoad: L1, NewMemOp: Load.getValue(R: 1)); |
20694 | NewLoads.push_back(Elt: Load); |
20695 | } |
20696 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(Op0), VT: DVT, Ops: NewLoads); |
20697 | } |
20698 | |
20699 | SmallVector<SDValue> Ops; |
20700 | for (const auto &[O0, O1] : zip(t: Op0->op_values(), u: Op1->op_values())) |
20701 | Ops.push_back(Elt: GenCombinedTree(O0, O1, DAG)); |
20702 | return DAG.getNode(Opcode: Op0.getOpcode(), DL: SDLoc(Op0), VT: DVT, Ops); |
20703 | }; |
20704 | SDValue NewOp = GenCombinedTree(Op0, Op1, DAG); |
20705 | |
20706 | SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0); |
20707 | int Hi = NumSubElts, Lo = 0; |
20708 | for (unsigned i = 0; i < NumSubLoads; i++) { |
20709 | for (unsigned j = 0; j < NumSubElts; j++) { |
20710 | LowMask[i * NumSubElts + j] = Lo++; |
20711 | HighMask[i * NumSubElts + j] = Hi++; |
20712 | } |
20713 | Lo += NumSubElts; |
20714 | Hi += NumSubElts; |
20715 | } |
20716 | SDLoc DL(N); |
20717 | SDValue Ext0, Ext1; |
20718 | // Extract the top and bottom lanes, then extend the result. Alternatively,
20719 | // extend the result first and then extract the lanes if the two extend opcodes
20720 | // match, as that produces slightly smaller code.
20721 | if (Other.getOpcode() != Shift.getOperand(i: 0).getOpcode()) { |
20722 | SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(), |
20723 | N1: NewOp, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
20724 | SDValue SubH = |
20725 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: Op0.getValueType(), N1: NewOp, |
20726 | N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64)); |
20727 | SDValue Extr0 = |
20728 | DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: LowMask); |
20729 | SDValue Extr1 = |
20730 | DAG.getVectorShuffle(VT: Op0.getValueType(), dl: DL, N1: SubL, N2: SubH, Mask: HighMask); |
20731 | Ext0 = DAG.getNode(Opcode: Other.getOpcode(), DL, VT, Operand: Extr0); |
20732 | Ext1 = DAG.getNode(Opcode: Shift.getOperand(i: 0).getOpcode(), DL, VT, Operand: Extr1); |
20733 | } else { |
20734 | EVT DVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
20735 | SDValue Ext = DAG.getNode(Opcode: Other.getOpcode(), DL, VT: DVT, Operand: NewOp); |
20736 | SDValue SubL = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext, |
20737 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
20738 | SDValue SubH = |
20739 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: Ext, |
20740 | N2: DAG.getConstant(Val: NumSubElts * NumSubLoads, DL, VT: MVT::i64)); |
20741 | Ext0 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: LowMask); |
20742 | Ext1 = DAG.getVectorShuffle(VT, dl: DL, N1: SubL, N2: SubH, Mask: HighMask); |
20743 | } |
20744 | SDValue NShift = |
20745 | DAG.getNode(Opcode: Shift.getOpcode(), DL, VT, N1: Ext1, N2: Shift.getOperand(i: 1)); |
20746 | return DAG.getNode(Opcode: N->getOpcode(), DL, VT, N1: Ext0, N2: NShift); |
20747 | } |
20748 | |
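// Combine ADD and SUB nodes by trying each of the more specific add/sub
// combines in turn, falling back to the generic long-operation combine if
// none of them fire.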
20749 | static SDValue performAddSubCombine(SDNode *N, |
20750 | TargetLowering::DAGCombinerInfo &DCI) { |
20751 | // Try to change sum of two reductions. |
20752 | if (SDValue Val = performAddUADDVCombine(N, DAG&: DCI.DAG)) |
20753 | return Val; |
20754 | if (SDValue Val = performAddDotCombine(N, DAG&: DCI.DAG)) |
20755 | return Val; |
20756 | if (SDValue Val = performAddCSelIntoCSinc(N, DAG&: DCI.DAG)) |
20757 | return Val; |
20758 | if (SDValue Val = performNegCSelCombine(N, DAG&: DCI.DAG)) |
20759 | return Val; |
20760 | if (SDValue Val = performVectorExtCombine(N, DAG&: DCI.DAG)) |
20761 | return Val; |
20762 | if (SDValue Val = performAddCombineForShiftedOperands(N, DAG&: DCI.DAG)) |
20763 | return Val; |
20764 | if (SDValue Val = performSubAddMULCombine(N, DAG&: DCI.DAG)) |
20765 | return Val; |
20766 | if (SDValue Val = performSVEMulAddSubCombine(N, DCI)) |
20767 | return Val; |
20768 | if (SDValue Val = performAddSubIntoVectorOp(N, DAG&: DCI.DAG)) |
20769 | return Val; |
20770 | |
20771 | if (SDValue Val = performExtBinopLoadFold(N, DAG&: DCI.DAG)) |
20772 | return Val; |
20773 | |
20774 | return performAddSubLongCombine(N, DCI); |
20775 | } |
20776 | |
20777 | // Massage DAGs which we can use the high-half "long" operations on into |
20778 | // something isel will recognize better. E.g. |
20779 | // |
20780 | // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> |
20781 | // (aarch64_neon_umull (extract_high (v2i64 vec))
20782 | //                     (extract_high (v2i64 (dup128 scalar))))
20783 | // |
20784 | static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, |
20785 | TargetLowering::DAGCombinerInfo &DCI, |
20786 | SelectionDAG &DAG) { |
20787 | if (DCI.isBeforeLegalizeOps()) |
20788 | return SDValue(); |
20789 | |
20790 | SDValue LHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 0 : 1); |
20791 | SDValue RHS = N->getOperand(Num: (IID == Intrinsic::not_intrinsic) ? 1 : 2); |
20792 | assert(LHS.getValueType().is64BitVector() && |
20793 | RHS.getValueType().is64BitVector() && |
20794 | "unexpected shape for long operation" ); |
20795 | |
20796 | // Either node could be a DUP, but it's not worth doing both of them (you might
20797 | // just as well use the non-high version), so look for a corresponding extract
20798 | // operation on the other "wing".
20799 | if (isEssentiallyExtractHighSubvector(N: LHS)) { |
20800 | RHS = tryExtendDUPToExtractHigh(N: RHS, DAG); |
20801 | if (!RHS.getNode()) |
20802 | return SDValue(); |
20803 | } else if (isEssentiallyExtractHighSubvector(N: RHS)) { |
20804 | LHS = tryExtendDUPToExtractHigh(N: LHS, DAG); |
20805 | if (!LHS.getNode()) |
20806 | return SDValue(); |
20807 | } else |
20808 | return SDValue(); |
20809 | |
20810 | if (IID == Intrinsic::not_intrinsic) |
20811 | return DAG.getNode(Opcode: N->getOpcode(), DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: LHS, N2: RHS); |
20812 | |
20813 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
20814 | N1: N->getOperand(Num: 0), N2: LHS, N3: RHS); |
20815 | } |
20816 | |
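// Try to lower a NEON shift intrinsic with a constant (splat or scalar) shift
// amount to the equivalent target-specific immediate-shift node. A shift by
// zero simply returns the input (except for sqshlu, which saturates), and
// sshl/ushl by a negative constant becomes a right shift by the negated
// amount.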
20817 | static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { |
20818 | MVT ElemTy = N->getSimpleValueType(ResNo: 0).getScalarType(); |
20819 | unsigned ElemBits = ElemTy.getSizeInBits(); |
20820 | |
20821 | int64_t ShiftAmount; |
20822 | if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Val: N->getOperand(Num: 2))) { |
20823 | APInt SplatValue, SplatUndef; |
20824 | unsigned SplatBitSize; |
20825 | bool HasAnyUndefs; |
20826 | if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, |
20827 | HasAnyUndefs, MinSplatBits: ElemBits) || |
20828 | SplatBitSize != ElemBits) |
20829 | return SDValue(); |
20830 | |
20831 | ShiftAmount = SplatValue.getSExtValue(); |
20832 | } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 2))) { |
20833 | ShiftAmount = CVN->getSExtValue(); |
20834 | } else |
20835 | return SDValue(); |
20836 | |
20837 | // If the shift amount is zero, remove the shift intrinsic. |
20838 | if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu) |
20839 | return N->getOperand(Num: 1); |
20840 | |
20841 | unsigned Opcode; |
20842 | bool IsRightShift; |
20843 | switch (IID) { |
20844 | default: |
20845 | llvm_unreachable("Unknown shift intrinsic" ); |
20846 | case Intrinsic::aarch64_neon_sqshl: |
20847 | Opcode = AArch64ISD::SQSHL_I; |
20848 | IsRightShift = false; |
20849 | break; |
20850 | case Intrinsic::aarch64_neon_uqshl: |
20851 | Opcode = AArch64ISD::UQSHL_I; |
20852 | IsRightShift = false; |
20853 | break; |
20854 | case Intrinsic::aarch64_neon_srshl: |
20855 | Opcode = AArch64ISD::SRSHR_I; |
20856 | IsRightShift = true; |
20857 | break; |
20858 | case Intrinsic::aarch64_neon_urshl: |
20859 | Opcode = AArch64ISD::URSHR_I; |
20860 | IsRightShift = true; |
20861 | break; |
20862 | case Intrinsic::aarch64_neon_sqshlu: |
20863 | Opcode = AArch64ISD::SQSHLU_I; |
20864 | IsRightShift = false; |
20865 | break; |
20866 | case Intrinsic::aarch64_neon_sshl: |
20867 | case Intrinsic::aarch64_neon_ushl: |
20868 | // For positive shift amounts we can use VSHL, as ushl/sshl perform a regular
20869 | // left shift for positive amounts. For negative shifts we can use VASHR/VLSHR
20870 | // as appropriate.
20871 | if (ShiftAmount < 0) { |
20872 | Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR |
20873 | : AArch64ISD::VLSHR; |
20874 | ShiftAmount = -ShiftAmount; |
20875 | } else |
20876 | Opcode = AArch64ISD::VSHL; |
20877 | IsRightShift = false; |
20878 | break; |
20879 | } |
20880 | |
20881 | EVT VT = N->getValueType(ResNo: 0); |
20882 | SDValue Op = N->getOperand(Num: 1); |
20883 | SDLoc dl(N); |
20884 | if (VT == MVT::i64) { |
20885 | Op = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL: dl, VT: MVT::v1i64, Operand: Op); |
20886 | VT = MVT::v1i64; |
20887 | } |
20888 | |
20889 | if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { |
20890 | Op = DAG.getNode(Opcode, DL: dl, VT, N1: Op, |
20891 | N2: DAG.getConstant(Val: -ShiftAmount, DL: dl, VT: MVT::i32)); |
20892 | if (N->getValueType(ResNo: 0) == MVT::i64) |
20893 | Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op, |
20894 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
20895 | return Op; |
20896 | } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { |
20897 | Op = DAG.getNode(Opcode, DL: dl, VT, N1: Op, |
20898 | N2: DAG.getConstant(Val: ShiftAmount, DL: dl, VT: MVT::i32)); |
20899 | if (N->getValueType(ResNo: 0) == MVT::i64) |
20900 | Op = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: MVT::i64, N1: Op, |
20901 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
20902 | return Op; |
20903 | } |
20904 | |
20905 | return SDValue(); |
20906 | } |
20907 | |
20908 | // The CRC32[BH] instructions ignore the high bits of their data operand. Since |
20909 | // the intrinsics must be legal and take an i32, this means there's almost |
20910 | // certainly going to be a zext in the DAG which we can eliminate. |
20911 | static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { |
20912 | SDValue AndN = N->getOperand(Num: 2); |
20913 | if (AndN.getOpcode() != ISD::AND) |
20914 | return SDValue(); |
20915 | |
20916 | ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Val: AndN.getOperand(i: 1)); |
20917 | if (!CMask || CMask->getZExtValue() != Mask) |
20918 | return SDValue(); |
20919 | |
20920 | return DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL: SDLoc(N), VT: MVT::i32, |
20921 | N1: N->getOperand(Num: 0), N2: N->getOperand(Num: 1), N3: AndN.getOperand(i: 0)); |
20922 | } |
20923 | |
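// Lower an across-vector-lanes reduction intrinsic to the corresponding
// AArch64ISD node Opc followed by an extract of element 0, which is where the
// reduction result is placed.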
20924 | static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, |
20925 | SelectionDAG &DAG) { |
20926 | SDLoc dl(N); |
20927 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: N->getValueType(ResNo: 0), |
20928 | N1: DAG.getNode(Opcode: Opc, DL: dl, |
20929 | VT: N->getOperand(Num: 1).getSimpleValueType(), |
20930 | Operand: N->getOperand(Num: 1)), |
20931 | N2: DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64)); |
20932 | } |
20933 | |
20934 | static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) { |
20935 | SDLoc DL(N); |
20936 | SDValue Op1 = N->getOperand(Num: 1); |
20937 | SDValue Op2 = N->getOperand(Num: 2); |
20938 | EVT ScalarTy = Op2.getValueType(); |
20939 | if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) |
20940 | ScalarTy = MVT::i32; |
20941 | |
20942 | // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
20943 | SDValue StepVector = DAG.getStepVector(DL, ResVT: N->getValueType(ResNo: 0)); |
20944 | SDValue Step = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op2); |
20945 | SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: N->getValueType(ResNo: 0), N1: StepVector, N2: Step); |
20946 | SDValue Base = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: N->getValueType(ResNo: 0), Operand: Op1); |
20947 | return DAG.getNode(Opcode: ISD::ADD, DL, VT: N->getValueType(ResNo: 0), N1: Mul, N2: Base); |
20948 | } |
20949 | |
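// Lower a predicated SVE dup intrinsic to DUP_MERGE_PASSTHRU, any-extending
// i8/i16 scalars to i32 first.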
20950 | static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) { |
20951 | SDLoc dl(N); |
20952 | SDValue Scalar = N->getOperand(Num: 3); |
20953 | EVT ScalarTy = Scalar.getValueType(); |
20954 | |
20955 | if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) |
20956 | Scalar = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL: dl, VT: MVT::i32, Operand: Scalar); |
20957 | |
20958 | SDValue Passthru = N->getOperand(Num: 1); |
20959 | SDValue Pred = N->getOperand(Num: 2); |
20960 | return DAG.getNode(Opcode: AArch64ISD::DUP_MERGE_PASSTHRU, DL: dl, VT: N->getValueType(ResNo: 0), |
20961 | N1: Pred, N2: Scalar, N3: Passthru); |
20962 | } |
20963 | |
20964 | static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { |
20965 | SDLoc dl(N); |
20966 | LLVMContext &Ctx = *DAG.getContext(); |
20967 | EVT VT = N->getValueType(ResNo: 0); |
20968 | |
20969 | assert(VT.isScalableVector() && "Expected a scalable vector." ); |
20970 | |
20971 | // Current lowering only supports the SVE-ACLE types. |
20972 | if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock) |
20973 | return SDValue(); |
20974 | |
20975 | unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; |
20976 | unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8; |
20977 | EVT ByteVT = |
20978 | EVT::getVectorVT(Context&: Ctx, VT: MVT::i8, EC: ElementCount::getScalable(MinVal: ByteSize)); |
20979 | |
20980 | // Convert everything to the domain of EXT (i.e. bytes).
20981 | SDValue Op0 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 1)); |
20982 | SDValue Op1 = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ByteVT, Operand: N->getOperand(Num: 2)); |
20983 | SDValue Op2 = DAG.getNode(Opcode: ISD::MUL, DL: dl, VT: MVT::i32, N1: N->getOperand(Num: 3), |
20984 | N2: DAG.getConstant(Val: ElemSize, DL: dl, VT: MVT::i32)); |
20985 | |
20986 | SDValue EXT = DAG.getNode(Opcode: AArch64ISD::EXT, DL: dl, VT: ByteVT, N1: Op0, N2: Op1, N3: Op2); |
20987 | return DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT, Operand: EXT); |
20988 | } |
20989 | |
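// Try to convert an SVE wide compare intrinsic whose splatted comparator is a
// constant that fits the instruction's immediate range into a
// SETCC_MERGE_ZERO against a splat of the narrower element type.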
20990 | static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, |
20991 | TargetLowering::DAGCombinerInfo &DCI, |
20992 | SelectionDAG &DAG) { |
20993 | if (DCI.isBeforeLegalize()) |
20994 | return SDValue(); |
20995 | |
20996 | SDValue Comparator = N->getOperand(Num: 3); |
20997 | if (Comparator.getOpcode() == AArch64ISD::DUP || |
20998 | Comparator.getOpcode() == ISD::SPLAT_VECTOR) { |
20999 | unsigned IID = getIntrinsicID(N); |
21000 | EVT VT = N->getValueType(ResNo: 0); |
21001 | EVT CmpVT = N->getOperand(Num: 2).getValueType(); |
21002 | SDValue Pred = N->getOperand(Num: 1); |
21003 | SDValue Imm; |
21004 | SDLoc DL(N); |
21005 | |
21006 | switch (IID) { |
21007 | default: |
21008 | llvm_unreachable("Called with wrong intrinsic!" ); |
21009 | break; |
21010 | |
21011 | // Signed comparisons |
21012 | case Intrinsic::aarch64_sve_cmpeq_wide: |
21013 | case Intrinsic::aarch64_sve_cmpne_wide: |
21014 | case Intrinsic::aarch64_sve_cmpge_wide: |
21015 | case Intrinsic::aarch64_sve_cmpgt_wide: |
21016 | case Intrinsic::aarch64_sve_cmplt_wide: |
21017 | case Intrinsic::aarch64_sve_cmple_wide: { |
21018 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) { |
21019 | int64_t ImmVal = CN->getSExtValue(); |
21020 | if (ImmVal >= -16 && ImmVal <= 15) |
21021 | Imm = DAG.getConstant(Val: ImmVal, DL, VT: MVT::i32); |
21022 | else |
21023 | return SDValue(); |
21024 | } |
21025 | break; |
21026 | } |
21027 | // Unsigned comparisons |
21028 | case Intrinsic::aarch64_sve_cmphs_wide: |
21029 | case Intrinsic::aarch64_sve_cmphi_wide: |
21030 | case Intrinsic::aarch64_sve_cmplo_wide: |
21031 | case Intrinsic::aarch64_sve_cmpls_wide: { |
21032 | if (auto *CN = dyn_cast<ConstantSDNode>(Val: Comparator.getOperand(i: 0))) { |
21033 | uint64_t ImmVal = CN->getZExtValue(); |
21034 | if (ImmVal <= 127) |
21035 | Imm = DAG.getConstant(Val: ImmVal, DL, VT: MVT::i32); |
21036 | else |
21037 | return SDValue(); |
21038 | } |
21039 | break; |
21040 | } |
21041 | } |
21042 | |
21043 | if (!Imm) |
21044 | return SDValue(); |
21045 | |
21046 | SDValue Splat = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: CmpVT, Operand: Imm); |
21047 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT, N1: Pred, |
21048 | N2: N->getOperand(Num: 2), N3: Splat, N4: DAG.getCondCode(Cond: CC)); |
21049 | } |
21050 | |
21051 | return SDValue(); |
21052 | } |
21053 | |
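// Emit a PTEST of Op governed by predicate Pg and materialise the requested
// condition Cond as a boolean of type VT using a CSEL on the resulting flags.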
21054 | static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, |
21055 | AArch64CC::CondCode Cond) { |
21056 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
21057 | |
21058 | SDLoc DL(Op); |
21059 | assert(Op.getValueType().isScalableVector() && |
21060 | TLI.isTypeLegal(Op.getValueType()) && |
21061 | "Expected legal scalable vector type!" ); |
21062 | assert(Op.getValueType() == Pg.getValueType() && |
21063 | "Expected same type for PTEST operands" ); |
21064 | |
21065 | // Ensure target-specific opcodes are using a legal type.
21066 | EVT OutVT = TLI.getTypeToTransformTo(Context&: *DAG.getContext(), VT); |
21067 | SDValue TVal = DAG.getConstant(Val: 1, DL, VT: OutVT); |
21068 | SDValue FVal = DAG.getConstant(Val: 0, DL, VT: OutVT); |
21069 | |
21070 | // Ensure operands have type nxv16i1. |
21071 | if (Op.getValueType() != MVT::nxv16i1) { |
21072 | if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) && |
21073 | isZeroingInactiveLanes(Op)) |
21074 | Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Pg); |
21075 | else |
21076 | Pg = getSVEPredicateBitCast(VT: MVT::nxv16i1, Op: Pg, DAG); |
21077 | Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv16i1, Operand: Op); |
21078 | } |
21079 | |
21080 | // Set condition code (CC) flags. |
21081 | SDValue Test = DAG.getNode( |
21082 | Opcode: Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST, |
21083 | DL, VT: MVT::Other, N1: Pg, N2: Op); |
21084 | |
21085 | // Convert CC to integer based on requested condition. |
21086 | // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. |
21087 | SDValue CC = DAG.getConstant(Val: getInvertedCondCode(Code: Cond), DL, VT: MVT::i32); |
21088 | SDValue Res = DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: OutVT, N1: FVal, N2: TVal, N3: CC, N4: Test); |
21089 | return DAG.getZExtOrTrunc(Op: Res, DL, VT); |
21090 | } |
21091 | |
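// Lower an SVE integer reduction intrinsic to the predicated reduction node
// Opc, extracting the scalar result from the first element of the packed
// result vector.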
21092 | static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, |
21093 | SelectionDAG &DAG) { |
21094 | SDLoc DL(N); |
21095 | |
21096 | SDValue Pred = N->getOperand(Num: 1); |
21097 | SDValue VecToReduce = N->getOperand(Num: 2); |
21098 | |
21099 | // NOTE: The integer reduction's result type is not always linked to the |
21100 | // operand's element type so we construct it from the intrinsic's result type. |
21101 | EVT ReduceVT = getPackedSVEVectorVT(VT: N->getValueType(ResNo: 0)); |
21102 | SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce); |
21103 | |
21104 | // SVE reductions set the whole vector register with the first element |
21105 | // containing the reduction result, which we'll now extract. |
21106 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
21107 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce, |
21108 | N2: Zero); |
21109 | } |
21110 | |
21111 | static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, |
21112 | SelectionDAG &DAG) { |
21113 | SDLoc DL(N); |
21114 | |
21115 | SDValue Pred = N->getOperand(Num: 1); |
21116 | SDValue VecToReduce = N->getOperand(Num: 2); |
21117 | |
21118 | EVT ReduceVT = VecToReduce.getValueType(); |
21119 | SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: VecToReduce); |
21120 | |
21121 | // SVE reductions set the whole vector register with the first element |
21122 | // containing the reduction result, which we'll now extract. |
21123 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
21124 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce, |
21125 | N2: Zero); |
21126 | } |
21127 | |
21128 | static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, |
21129 | SelectionDAG &DAG) { |
21130 | SDLoc DL(N); |
21131 | |
21132 | SDValue Pred = N->getOperand(Num: 1); |
21133 | SDValue InitVal = N->getOperand(Num: 2); |
21134 | SDValue VecToReduce = N->getOperand(Num: 3); |
21135 | EVT ReduceVT = VecToReduce.getValueType(); |
21136 | |
21137 | // Ordered reductions use the first lane of the result vector as the |
21138 | // reduction's initial value. |
21139 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
21140 | InitVal = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ReduceVT, |
21141 | N1: DAG.getUNDEF(VT: ReduceVT), N2: InitVal, N3: Zero); |
21142 | |
21143 | SDValue Reduce = DAG.getNode(Opcode: Opc, DL, VT: ReduceVT, N1: Pred, N2: InitVal, N3: VecToReduce); |
21144 | |
21145 | // SVE reductions set the whole vector register with the first element |
21146 | // containing the reduction result, which we'll now extract. |
21147 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: N->getValueType(ResNo: 0), N1: Reduce, |
21148 | N2: Zero); |
21149 | } |
21150 | |
21151 | // If a merged operation has no inactive lanes we can relax it to a predicated
21152 | // or unpredicated operation, which potentially allows better isel (perhaps
21153 | // using immediate forms) or relaxes register reuse requirements.
21154 | static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, |
21155 | SelectionDAG &DAG, bool UnpredOp = false, |
21156 | bool SwapOperands = false) { |
21157 | assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!" ); |
21158 | assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!" ); |
21159 | SDValue Pg = N->getOperand(Num: 1); |
21160 | SDValue Op1 = N->getOperand(Num: SwapOperands ? 3 : 2); |
21161 | SDValue Op2 = N->getOperand(Num: SwapOperands ? 2 : 3); |
21162 | |
21163 | // ISD way to specify an all active predicate. |
21164 | if (isAllActivePredicate(DAG, N: Pg)) { |
21165 | if (UnpredOp) |
21166 | return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Op1, N2: Op2); |
21167 | |
21168 | return DAG.getNode(Opcode: Opc, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: Pg, N2: Op1, N3: Op2); |
21169 | } |
21170 | |
21171 | // FUTURE: SplatVector(true) |
21172 | return SDValue(); |
21173 | } |
21174 | |
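// With SVE2.1, a whilelo whose result is only used to extract its low and
// high halves can be replaced by the two-result whilelo_x2 intrinsic, so that
// the two half-sized predicates come directly from one operation.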
21175 | static SDValue tryCombineWhileLo(SDNode *N, |
21176 | TargetLowering::DAGCombinerInfo &DCI, |
21177 | const AArch64Subtarget *Subtarget) { |
21178 | if (DCI.isBeforeLegalize()) |
21179 | return SDValue(); |
21180 | |
21181 | if (!Subtarget->hasSVE2p1()) |
21182 | return SDValue(); |
21183 | |
21184 | if (!N->hasNUsesOfValue(NUses: 2, Value: 0)) |
21185 | return SDValue(); |
21186 | |
21187 | const uint64_t HalfSize = N->getValueType(ResNo: 0).getVectorMinNumElements() / 2; |
21188 | if (HalfSize < 2) |
21189 | return SDValue(); |
21190 | |
21191 | auto It = N->use_begin(); |
21192 | SDNode *Lo = *It++; |
21193 | SDNode *Hi = *It; |
21194 | |
21195 | if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR || |
21196 | Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR) |
21197 | return SDValue(); |
21198 | |
21199 | uint64_t OffLo = Lo->getConstantOperandVal(Num: 1); |
21200 | uint64_t OffHi = Hi->getConstantOperandVal(Num: 1); |
21201 | |
21202 | if (OffLo > OffHi) { |
21203 | std::swap(a&: Lo, b&: Hi); |
21204 | std::swap(a&: OffLo, b&: OffHi); |
21205 | } |
21206 | |
21207 | if (OffLo != 0 || OffHi != HalfSize) |
21208 | return SDValue(); |
21209 | |
21210 | EVT HalfVec = Lo->getValueType(ResNo: 0); |
21211 | if (HalfVec != Hi->getValueType(ResNo: 0) || |
21212 | HalfVec.getVectorElementCount() != ElementCount::getScalable(MinVal: HalfSize)) |
21213 | return SDValue(); |
21214 | |
21215 | SelectionDAG &DAG = DCI.DAG; |
21216 | SDLoc DL(N); |
21217 | SDValue ID = |
21218 | DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_whilelo_x2, DL, VT: MVT::i64); |
21219 | SDValue Idx = N->getOperand(Num: 1); |
21220 | SDValue TC = N->getOperand(Num: 2); |
21221 | if (Idx.getValueType() != MVT::i64) { |
21222 | Idx = DAG.getZExtOrTrunc(Op: Idx, DL, VT: MVT::i64); |
21223 | TC = DAG.getZExtOrTrunc(Op: TC, DL, VT: MVT::i64); |
21224 | } |
21225 | auto R = |
21226 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, |
21227 | ResultTys: {Lo->getValueType(ResNo: 0), Hi->getValueType(ResNo: 0)}, Ops: {ID, Idx, TC}); |
21228 | |
21229 | DCI.CombineTo(N: Lo, Res: R.getValue(R: 0)); |
21230 | DCI.CombineTo(N: Hi, Res: R.getValue(R: 1)); |
21231 | |
21232 | return SDValue(N, 0); |
21233 | } |
21234 | |
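// Perform DAG combines on intrinsic nodes, lowering NEON and SVE intrinsics
// to generic ISD or AArch64ISD equivalents where one exists.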
21235 | static SDValue performIntrinsicCombine(SDNode *N, |
21236 | TargetLowering::DAGCombinerInfo &DCI, |
21237 | const AArch64Subtarget *Subtarget) { |
21238 | SelectionDAG &DAG = DCI.DAG; |
21239 | unsigned IID = getIntrinsicID(N); |
21240 | switch (IID) { |
21241 | default: |
21242 | break; |
21243 | case Intrinsic::aarch64_neon_vcvtfxs2fp: |
21244 | case Intrinsic::aarch64_neon_vcvtfxu2fp: |
21245 | return tryCombineFixedPointConvert(N, DCI, DAG); |
21246 | case Intrinsic::aarch64_neon_saddv: |
21247 | return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SADDV, N, DAG); |
21248 | case Intrinsic::aarch64_neon_uaddv: |
21249 | return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UADDV, N, DAG); |
21250 | case Intrinsic::aarch64_neon_sminv: |
21251 | return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMINV, N, DAG); |
21252 | case Intrinsic::aarch64_neon_uminv: |
21253 | return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMINV, N, DAG); |
21254 | case Intrinsic::aarch64_neon_smaxv: |
21255 | return combineAcrossLanesIntrinsic(Opc: AArch64ISD::SMAXV, N, DAG); |
21256 | case Intrinsic::aarch64_neon_umaxv: |
21257 | return combineAcrossLanesIntrinsic(Opc: AArch64ISD::UMAXV, N, DAG); |
21258 | case Intrinsic::aarch64_neon_fmax: |
21259 | return DAG.getNode(Opcode: ISD::FMAXIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21260 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21261 | case Intrinsic::aarch64_neon_fmin: |
21262 | return DAG.getNode(Opcode: ISD::FMINIMUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21263 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21264 | case Intrinsic::aarch64_neon_fmaxnm: |
21265 | return DAG.getNode(Opcode: ISD::FMAXNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21266 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21267 | case Intrinsic::aarch64_neon_fminnm: |
21268 | return DAG.getNode(Opcode: ISD::FMINNUM, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21269 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21270 | case Intrinsic::aarch64_neon_smull: |
21271 | return DAG.getNode(Opcode: AArch64ISD::SMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21272 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21273 | case Intrinsic::aarch64_neon_umull: |
21274 | return DAG.getNode(Opcode: AArch64ISD::UMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21275 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21276 | case Intrinsic::aarch64_neon_pmull: |
21277 | return DAG.getNode(Opcode: AArch64ISD::PMULL, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21278 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21279 | case Intrinsic::aarch64_neon_sqdmull: |
21280 | return tryCombineLongOpWithDup(IID, N, DCI, DAG); |
21281 | case Intrinsic::aarch64_neon_sqshl: |
21282 | case Intrinsic::aarch64_neon_uqshl: |
21283 | case Intrinsic::aarch64_neon_sqshlu: |
21284 | case Intrinsic::aarch64_neon_srshl: |
21285 | case Intrinsic::aarch64_neon_urshl: |
21286 | case Intrinsic::aarch64_neon_sshl: |
21287 | case Intrinsic::aarch64_neon_ushl: |
21288 | return tryCombineShiftImm(IID, N, DAG); |
21289 | case Intrinsic::aarch64_neon_sabd: |
21290 | return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21291 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21292 | case Intrinsic::aarch64_neon_uabd: |
21293 | return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21294 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21295 | case Intrinsic::aarch64_crc32b: |
21296 | case Intrinsic::aarch64_crc32cb: |
21297 | return tryCombineCRC32(Mask: 0xff, N, DAG); |
21298 | case Intrinsic::aarch64_crc32h: |
21299 | case Intrinsic::aarch64_crc32ch: |
21300 | return tryCombineCRC32(Mask: 0xffff, N, DAG); |
21301 | case Intrinsic::aarch64_sve_saddv: |
21302 | // There is no i64 version of SADDV because the sign is irrelevant. |
21303 | if (N->getOperand(Num: 2)->getValueType(ResNo: 0).getVectorElementType() == MVT::i64) |
21304 | return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG); |
21305 | else |
21306 | return combineSVEReductionInt(N, Opc: AArch64ISD::SADDV_PRED, DAG); |
21307 | case Intrinsic::aarch64_sve_uaddv: |
21308 | return combineSVEReductionInt(N, Opc: AArch64ISD::UADDV_PRED, DAG); |
21309 | case Intrinsic::aarch64_sve_smaxv: |
21310 | return combineSVEReductionInt(N, Opc: AArch64ISD::SMAXV_PRED, DAG); |
21311 | case Intrinsic::aarch64_sve_umaxv: |
21312 | return combineSVEReductionInt(N, Opc: AArch64ISD::UMAXV_PRED, DAG); |
21313 | case Intrinsic::aarch64_sve_sminv: |
21314 | return combineSVEReductionInt(N, Opc: AArch64ISD::SMINV_PRED, DAG); |
21315 | case Intrinsic::aarch64_sve_uminv: |
21316 | return combineSVEReductionInt(N, Opc: AArch64ISD::UMINV_PRED, DAG); |
21317 | case Intrinsic::aarch64_sve_orv: |
21318 | return combineSVEReductionInt(N, Opc: AArch64ISD::ORV_PRED, DAG); |
21319 | case Intrinsic::aarch64_sve_eorv: |
21320 | return combineSVEReductionInt(N, Opc: AArch64ISD::EORV_PRED, DAG); |
21321 | case Intrinsic::aarch64_sve_andv: |
21322 | return combineSVEReductionInt(N, Opc: AArch64ISD::ANDV_PRED, DAG); |
21323 | case Intrinsic::aarch64_sve_index: |
21324 | return LowerSVEIntrinsicIndex(N, DAG); |
21325 | case Intrinsic::aarch64_sve_dup: |
21326 | return LowerSVEIntrinsicDUP(N, DAG); |
21327 | case Intrinsic::aarch64_sve_dup_x: |
21328 | return DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21329 | Operand: N->getOperand(Num: 1)); |
21330 | case Intrinsic::aarch64_sve_ext: |
21331 | return LowerSVEIntrinsicEXT(N, DAG); |
21332 | case Intrinsic::aarch64_sve_mul_u: |
21333 | return DAG.getNode(Opcode: AArch64ISD::MUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21334 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21335 | case Intrinsic::aarch64_sve_smulh_u: |
21336 | return DAG.getNode(Opcode: AArch64ISD::MULHS_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21337 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21338 | case Intrinsic::aarch64_sve_umulh_u: |
21339 | return DAG.getNode(Opcode: AArch64ISD::MULHU_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21340 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21341 | case Intrinsic::aarch64_sve_smin_u: |
21342 | return DAG.getNode(Opcode: AArch64ISD::SMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21343 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21344 | case Intrinsic::aarch64_sve_umin_u: |
21345 | return DAG.getNode(Opcode: AArch64ISD::UMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21346 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21347 | case Intrinsic::aarch64_sve_smax_u: |
21348 | return DAG.getNode(Opcode: AArch64ISD::SMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21349 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21350 | case Intrinsic::aarch64_sve_umax_u: |
21351 | return DAG.getNode(Opcode: AArch64ISD::UMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21352 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21353 | case Intrinsic::aarch64_sve_lsl_u: |
21354 | return DAG.getNode(Opcode: AArch64ISD::SHL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21355 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21356 | case Intrinsic::aarch64_sve_lsr_u: |
21357 | return DAG.getNode(Opcode: AArch64ISD::SRL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21358 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21359 | case Intrinsic::aarch64_sve_asr_u: |
21360 | return DAG.getNode(Opcode: AArch64ISD::SRA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21361 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21362 | case Intrinsic::aarch64_sve_fadd_u: |
21363 | return DAG.getNode(Opcode: AArch64ISD::FADD_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21364 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21365 | case Intrinsic::aarch64_sve_fdiv_u: |
21366 | return DAG.getNode(Opcode: AArch64ISD::FDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21367 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21368 | case Intrinsic::aarch64_sve_fmax_u: |
21369 | return DAG.getNode(Opcode: AArch64ISD::FMAX_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21370 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21371 | case Intrinsic::aarch64_sve_fmaxnm_u: |
21372 | return DAG.getNode(Opcode: AArch64ISD::FMAXNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21373 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21374 | case Intrinsic::aarch64_sve_fmla_u: |
21375 | return DAG.getNode(Opcode: AArch64ISD::FMA_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21376 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 3), N3: N->getOperand(Num: 4), |
21377 | N4: N->getOperand(Num: 2)); |
21378 | case Intrinsic::aarch64_sve_fmin_u: |
21379 | return DAG.getNode(Opcode: AArch64ISD::FMIN_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21380 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21381 | case Intrinsic::aarch64_sve_fminnm_u: |
21382 | return DAG.getNode(Opcode: AArch64ISD::FMINNM_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21383 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21384 | case Intrinsic::aarch64_sve_fmul_u: |
21385 | return DAG.getNode(Opcode: AArch64ISD::FMUL_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21386 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21387 | case Intrinsic::aarch64_sve_fsub_u: |
21388 | return DAG.getNode(Opcode: AArch64ISD::FSUB_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21389 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21390 | case Intrinsic::aarch64_sve_add_u: |
21391 | return DAG.getNode(Opcode: ISD::ADD, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2), |
21392 | N2: N->getOperand(Num: 3)); |
21393 | case Intrinsic::aarch64_sve_sub_u: |
21394 | return DAG.getNode(Opcode: ISD::SUB, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2), |
21395 | N2: N->getOperand(Num: 3)); |
21396 | case Intrinsic::aarch64_sve_subr: |
21397 | return convertMergedOpToPredOp(N, Opc: ISD::SUB, DAG, UnpredOp: true, SwapOperands: true); |
21398 | case Intrinsic::aarch64_sve_and_u: |
21399 | return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2), |
21400 | N2: N->getOperand(Num: 3)); |
21401 | case Intrinsic::aarch64_sve_bic_u: |
21402 | return DAG.getNode(Opcode: AArch64ISD::BIC, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21403 | N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3)); |
21404 | case Intrinsic::aarch64_sve_eor_u: |
21405 | return DAG.getNode(Opcode: ISD::XOR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2), |
21406 | N2: N->getOperand(Num: 3)); |
21407 | case Intrinsic::aarch64_sve_orr_u: |
21408 | return DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 2), |
21409 | N2: N->getOperand(Num: 3)); |
21410 | case Intrinsic::aarch64_sve_sabd_u: |
21411 | return DAG.getNode(Opcode: ISD::ABDS, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21412 | N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3)); |
21413 | case Intrinsic::aarch64_sve_uabd_u: |
21414 | return DAG.getNode(Opcode: ISD::ABDU, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21415 | N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3)); |
21416 | case Intrinsic::aarch64_sve_sdiv_u: |
21417 | return DAG.getNode(Opcode: AArch64ISD::SDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21418 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21419 | case Intrinsic::aarch64_sve_udiv_u: |
21420 | return DAG.getNode(Opcode: AArch64ISD::UDIV_PRED, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21421 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21422 | case Intrinsic::aarch64_sve_sqadd: |
21423 | return convertMergedOpToPredOp(N, Opc: ISD::SADDSAT, DAG, UnpredOp: true); |
21424 | case Intrinsic::aarch64_sve_sqsub_u: |
21425 | return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21426 | N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3)); |
21427 | case Intrinsic::aarch64_sve_uqadd: |
21428 | return convertMergedOpToPredOp(N, Opc: ISD::UADDSAT, DAG, UnpredOp: true); |
21429 | case Intrinsic::aarch64_sve_uqsub_u: |
21430 | return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21431 | N1: N->getOperand(Num: 2), N2: N->getOperand(Num: 3)); |
21432 | case Intrinsic::aarch64_sve_sqadd_x: |
21433 | return DAG.getNode(Opcode: ISD::SADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21434 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21435 | case Intrinsic::aarch64_sve_sqsub_x: |
21436 | return DAG.getNode(Opcode: ISD::SSUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21437 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21438 | case Intrinsic::aarch64_sve_uqadd_x: |
21439 | return DAG.getNode(Opcode: ISD::UADDSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21440 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21441 | case Intrinsic::aarch64_sve_uqsub_x: |
21442 | return DAG.getNode(Opcode: ISD::USUBSAT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21443 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
21444 | case Intrinsic::aarch64_sve_asrd: |
21445 | return DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21446 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21447 | case Intrinsic::aarch64_sve_cmphs: |
21448 | if (!N->getOperand(Num: 2).getValueType().isFloatingPoint()) |
21449 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21450 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21451 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGE)); |
21452 | break; |
21453 | case Intrinsic::aarch64_sve_cmphi: |
21454 | if (!N->getOperand(Num: 2).getValueType().isFloatingPoint()) |
21455 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21456 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21457 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUGT)); |
21458 | break; |
21459 | case Intrinsic::aarch64_sve_fcmpge: |
21460 | case Intrinsic::aarch64_sve_cmpge: |
21461 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21462 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21463 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGE)); |
21464 | break; |
21465 | case Intrinsic::aarch64_sve_fcmpgt: |
21466 | case Intrinsic::aarch64_sve_cmpgt: |
21467 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21468 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21469 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETGT)); |
21470 | break; |
21471 | case Intrinsic::aarch64_sve_fcmpeq: |
21472 | case Intrinsic::aarch64_sve_cmpeq: |
21473 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21474 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21475 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETEQ)); |
21476 | break; |
21477 | case Intrinsic::aarch64_sve_fcmpne: |
21478 | case Intrinsic::aarch64_sve_cmpne: |
21479 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21480 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21481 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETNE)); |
21482 | break; |
21483 | case Intrinsic::aarch64_sve_fcmpuo: |
21484 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL: SDLoc(N), |
21485 | VT: N->getValueType(ResNo: 0), N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), |
21486 | N3: N->getOperand(Num: 3), N4: DAG.getCondCode(Cond: ISD::SETUO)); |
21487 | break; |
21488 | case Intrinsic::aarch64_sve_fadda: |
21489 | return combineSVEReductionOrderedFP(N, Opc: AArch64ISD::FADDA_PRED, DAG); |
21490 | case Intrinsic::aarch64_sve_faddv: |
21491 | return combineSVEReductionFP(N, Opc: AArch64ISD::FADDV_PRED, DAG); |
21492 | case Intrinsic::aarch64_sve_fmaxnmv: |
21493 | return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXNMV_PRED, DAG); |
21494 | case Intrinsic::aarch64_sve_fmaxv: |
21495 | return combineSVEReductionFP(N, Opc: AArch64ISD::FMAXV_PRED, DAG); |
21496 | case Intrinsic::aarch64_sve_fminnmv: |
21497 | return combineSVEReductionFP(N, Opc: AArch64ISD::FMINNMV_PRED, DAG); |
21498 | case Intrinsic::aarch64_sve_fminv: |
21499 | return combineSVEReductionFP(N, Opc: AArch64ISD::FMINV_PRED, DAG); |
21500 | case Intrinsic::aarch64_sve_sel: |
21501 | return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
21502 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
21503 | case Intrinsic::aarch64_sve_cmpeq_wide: |
21504 | return tryConvertSVEWideCompare(N, CC: ISD::SETEQ, DCI, DAG); |
21505 | case Intrinsic::aarch64_sve_cmpne_wide: |
21506 | return tryConvertSVEWideCompare(N, CC: ISD::SETNE, DCI, DAG); |
21507 | case Intrinsic::aarch64_sve_cmpge_wide: |
21508 | return tryConvertSVEWideCompare(N, CC: ISD::SETGE, DCI, DAG); |
21509 | case Intrinsic::aarch64_sve_cmpgt_wide: |
21510 | return tryConvertSVEWideCompare(N, CC: ISD::SETGT, DCI, DAG); |
21511 | case Intrinsic::aarch64_sve_cmplt_wide: |
21512 | return tryConvertSVEWideCompare(N, CC: ISD::SETLT, DCI, DAG); |
21513 | case Intrinsic::aarch64_sve_cmple_wide: |
21514 | return tryConvertSVEWideCompare(N, CC: ISD::SETLE, DCI, DAG); |
21515 | case Intrinsic::aarch64_sve_cmphs_wide: |
21516 | return tryConvertSVEWideCompare(N, CC: ISD::SETUGE, DCI, DAG); |
21517 | case Intrinsic::aarch64_sve_cmphi_wide: |
21518 | return tryConvertSVEWideCompare(N, CC: ISD::SETUGT, DCI, DAG); |
21519 | case Intrinsic::aarch64_sve_cmplo_wide: |
21520 | return tryConvertSVEWideCompare(N, CC: ISD::SETULT, DCI, DAG); |
21521 | case Intrinsic::aarch64_sve_cmpls_wide: |
21522 | return tryConvertSVEWideCompare(N, CC: ISD::SETULE, DCI, DAG); |
21523 | case Intrinsic::aarch64_sve_ptest_any: |
21524 | return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2), |
21525 | Cond: AArch64CC::ANY_ACTIVE); |
21526 | case Intrinsic::aarch64_sve_ptest_first: |
21527 | return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2), |
21528 | Cond: AArch64CC::FIRST_ACTIVE); |
21529 | case Intrinsic::aarch64_sve_ptest_last: |
21530 | return getPTest(DAG, VT: N->getValueType(ResNo: 0), Pg: N->getOperand(Num: 1), Op: N->getOperand(Num: 2), |
21531 | Cond: AArch64CC::LAST_ACTIVE); |
21532 | case Intrinsic::aarch64_sve_whilelo: |
21533 | return tryCombineWhileLo(N, DCI, Subtarget); |
21534 | } |
21535 | return SDValue(); |
21536 | } |
21537 | |
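// An operand is cheap to extend if it is a (masked) load, which can be turned
// into an extending load, or an all-zeros constant splat.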
21538 | static bool isCheapToExtend(const SDValue &N) { |
21539 | unsigned OC = N->getOpcode(); |
21540 | return OC == ISD::LOAD || OC == ISD::MLOAD || |
21541 | ISD::isConstantSplatVectorAllZeros(N: N.getNode()); |
21542 | } |
21543 | |
21544 | static SDValue |
21545 | performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
21546 | SelectionDAG &DAG) { |
21547 | // If we have (sext (setcc A B)) and A and B are cheap to extend, |
21548 | // we can move the sext into the arguments and have the same result. For |
21549 | // example, if A and B are both loads, we can make those extending loads and |
21550 | // avoid an extra instruction. This pattern appears often in VLS code |
21551 | // generation where the inputs to the setcc have a different size to the |
21552 | // instruction that wants to use the result of the setcc. |
21553 | assert(N->getOpcode() == ISD::SIGN_EXTEND && |
21554 | N->getOperand(0)->getOpcode() == ISD::SETCC); |
21555 | const SDValue SetCC = N->getOperand(Num: 0); |
21556 | |
21557 | const SDValue CCOp0 = SetCC.getOperand(i: 0); |
21558 | const SDValue CCOp1 = SetCC.getOperand(i: 1); |
21559 | if (!CCOp0->getValueType(ResNo: 0).isInteger() || |
21560 | !CCOp1->getValueType(ResNo: 0).isInteger()) |
21561 | return SDValue(); |
21562 | |
21563 | ISD::CondCode Code = |
21564 | cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get(); |
21565 | |
21566 | ISD::NodeType ExtType = |
21567 | isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
21568 | |
21569 | if (isCheapToExtend(N: SetCC.getOperand(i: 0)) && |
21570 | isCheapToExtend(N: SetCC.getOperand(i: 1))) { |
21571 | const SDValue Ext1 = |
21572 | DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp0); |
21573 | const SDValue Ext2 = |
21574 | DAG.getNode(Opcode: ExtType, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: CCOp1); |
21575 | |
21576 | return DAG.getSetCC( |
21577 | DL: SDLoc(SetCC), VT: N->getValueType(ResNo: 0), LHS: Ext1, RHS: Ext2, |
21578 | Cond: cast<CondCodeSDNode>(Val: SetCC->getOperand(Num: 2).getNode())->get()); |
21579 | } |
21580 | |
21581 | return SDValue(); |
21582 | } |
21583 | |
21584 | static SDValue performExtendCombine(SDNode *N, |
21585 | TargetLowering::DAGCombinerInfo &DCI, |
21586 | SelectionDAG &DAG) { |
21587 | // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then |
21588 | // we can convert that DUP into another extract_high (of a bigger DUP), which |
21589 | // helps the backend to decide that an sabdl2 would be useful, saving a real |
21590 | // extract_high operation. |
21591 | if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && |
21592 | (N->getOperand(Num: 0).getOpcode() == ISD::ABDU || |
21593 | N->getOperand(Num: 0).getOpcode() == ISD::ABDS)) { |
21594 | SDNode *ABDNode = N->getOperand(Num: 0).getNode(); |
21595 | SDValue NewABD = |
21596 | tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N: ABDNode, DCI, DAG); |
21597 | if (!NewABD.getNode()) |
21598 | return SDValue(); |
21599 | |
21600 | return DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: NewABD); |
21601 | } |
21602 | |
21603 | if (N->getValueType(ResNo: 0).isFixedLengthVector() && |
21604 | N->getOpcode() == ISD::SIGN_EXTEND && |
21605 | N->getOperand(Num: 0)->getOpcode() == ISD::SETCC) |
21606 | return performSignExtendSetCCCombine(N, DCI, DAG); |
21607 | |
21608 | return SDValue(); |
21609 | } |
21610 | |
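// Replace a store of a splatted vector with NumVecElts scalar stores of
// SplatVal at consecutive offsets, folding any constant offset already
// present on the base pointer.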
21611 | static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, |
21612 | SDValue SplatVal, unsigned NumVecElts) { |
21613 | assert(!St.isTruncatingStore() && "cannot split truncating vector store" ); |
21614 | Align OrigAlignment = St.getAlign(); |
21615 | unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; |
21616 | |
21617 | // Create scalar stores. This is at least as good as the code sequence for a |
21618 | // split unaligned store, which is a dup.s, ext.b, and two stores.
21619 | // Most of the time the three stores should be replaced by store pair |
21620 | // instructions (stp). |
21621 | SDLoc DL(&St); |
21622 | SDValue BasePtr = St.getBasePtr(); |
21623 | uint64_t BaseOffset = 0; |
21624 | |
21625 | const MachinePointerInfo &PtrInfo = St.getPointerInfo(); |
21626 | SDValue NewST1 = |
21627 | DAG.getStore(Chain: St.getChain(), dl: DL, Val: SplatVal, Ptr: BasePtr, PtrInfo, |
21628 | Alignment: OrigAlignment, MMOFlags: St.getMemOperand()->getFlags()); |
21629 | |
21630 | // As this is in ISel, we will not merge this add, which may degrade results.
21631 | if (BasePtr->getOpcode() == ISD::ADD && |
21632 | isa<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))) { |
21633 | BaseOffset = cast<ConstantSDNode>(Val: BasePtr->getOperand(Num: 1))->getSExtValue(); |
21634 | BasePtr = BasePtr->getOperand(Num: 0); |
21635 | } |
21636 | |
21637 | unsigned Offset = EltOffset; |
21638 | while (--NumVecElts) { |
21639 | Align Alignment = commonAlignment(A: OrigAlignment, Offset); |
21640 | SDValue OffsetPtr = |
21641 | DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, |
21642 | N2: DAG.getConstant(Val: BaseOffset + Offset, DL, VT: MVT::i64)); |
21643 | NewST1 = DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SplatVal, Ptr: OffsetPtr, |
21644 | PtrInfo: PtrInfo.getWithOffset(O: Offset), Alignment, |
21645 | MMOFlags: St.getMemOperand()->getFlags()); |
21646 | Offset += EltOffset; |
21647 | } |
21648 | return NewST1; |
21649 | } |
21650 | |
21651 | // Returns an SVE type that ContentTy can be trivially sign or zero extended |
21652 | // into. |
21653 | static MVT getSVEContainerType(EVT ContentTy) { |
assert(ContentTy.isSimple() && "No SVE containers for extended types");
21655 | |
21656 | switch (ContentTy.getSimpleVT().SimpleTy) { |
21657 | default: |
21658 | llvm_unreachable("No known SVE container for this MVT type" ); |
21659 | case MVT::nxv2i8: |
21660 | case MVT::nxv2i16: |
21661 | case MVT::nxv2i32: |
21662 | case MVT::nxv2i64: |
21663 | case MVT::nxv2f32: |
21664 | case MVT::nxv2f64: |
21665 | return MVT::nxv2i64; |
21666 | case MVT::nxv4i8: |
21667 | case MVT::nxv4i16: |
21668 | case MVT::nxv4i32: |
21669 | case MVT::nxv4f32: |
21670 | return MVT::nxv4i32; |
21671 | case MVT::nxv8i8: |
21672 | case MVT::nxv8i16: |
21673 | case MVT::nxv8f16: |
21674 | case MVT::nxv8bf16: |
21675 | return MVT::nxv8i16; |
21676 | case MVT::nxv16i8: |
21677 | return MVT::nxv16i8; |
21678 | } |
21679 | } |
21680 | |
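// Combine an SVE load intrinsic into the given LD1-style node (Opc). The load
// is performed in a legal SVE container type, and for integer types narrower
// than the container the result is truncated back to the original type.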
21681 | static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) { |
21682 | SDLoc DL(N); |
21683 | EVT VT = N->getValueType(ResNo: 0); |
21684 | |
21685 | if (VT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) |
21686 | return SDValue(); |
21687 | |
21688 | EVT ContainerVT = VT; |
21689 | if (ContainerVT.isInteger()) |
21690 | ContainerVT = getSVEContainerType(ContentTy: ContainerVT); |
21691 | |
21692 | SDVTList VTs = DAG.getVTList(VT1: ContainerVT, VT2: MVT::Other); |
21693 | SDValue Ops[] = { N->getOperand(Num: 0), // Chain |
21694 | N->getOperand(Num: 2), // Pg |
21695 | N->getOperand(Num: 3), // Base |
21696 | DAG.getValueType(VT) }; |
21697 | |
21698 | SDValue Load = DAG.getNode(Opcode: Opc, DL, VTList: VTs, Ops); |
21699 | SDValue LoadChain = SDValue(Load.getNode(), 1); |
21700 | |
21701 | if (ContainerVT.isInteger() && (VT != ContainerVT)) |
21702 | Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Load.getValue(R: 0)); |
21703 | |
21704 | return DAG.getMergeValues(Ops: { Load, LoadChain }, dl: DL); |
21705 | } |
21706 | |
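// Combine an SVE non-temporal load intrinsic into an equivalent masked load
// with a zero passthru, bitcasting the result back for floating-point types.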
21707 | static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { |
21708 | SDLoc DL(N); |
21709 | EVT VT = N->getValueType(ResNo: 0); |
21710 | EVT PtrTy = N->getOperand(Num: 3).getValueType(); |
21711 | |
21712 | EVT LoadVT = VT; |
21713 | if (VT.isFloatingPoint()) |
21714 | LoadVT = VT.changeTypeToInteger(); |
21715 | |
21716 | auto *MINode = cast<MemIntrinsicSDNode>(Val: N); |
21717 | SDValue PassThru = DAG.getConstant(Val: 0, DL, VT: LoadVT); |
21718 | SDValue L = DAG.getMaskedLoad(VT: LoadVT, dl: DL, Chain: MINode->getChain(), |
21719 | Base: MINode->getOperand(Num: 3), Offset: DAG.getUNDEF(VT: PtrTy), |
21720 | Mask: MINode->getOperand(Num: 2), Src0: PassThru, |
21721 | MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(), |
21722 | AM: ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding: false); |
21723 | |
21724 | if (VT.isFloatingPoint()) { |
21725 | SDValue Ops[] = { DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: L), L.getValue(R: 1) }; |
21726 | return DAG.getMergeValues(Ops, dl: DL); |
21727 | } |
21728 | |
21729 | return L; |
21730 | } |
21731 | |
21732 | template <unsigned Opcode> |
21733 | static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) { |
21734 | static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO || |
21735 | Opcode == AArch64ISD::LD1RO_MERGE_ZERO, |
21736 | "Unsupported opcode." ); |
21737 | SDLoc DL(N); |
21738 | EVT VT = N->getValueType(ResNo: 0); |
21739 | |
21740 | EVT LoadVT = VT; |
21741 | if (VT.isFloatingPoint()) |
21742 | LoadVT = VT.changeTypeToInteger(); |
21743 | |
21744 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 2), N->getOperand(Num: 3)}; |
21745 | SDValue Load = DAG.getNode(Opcode, DL, ResultTys: {LoadVT, MVT::Other}, Ops); |
21746 | SDValue LoadChain = SDValue(Load.getNode(), 1); |
21747 | |
21748 | if (VT.isFloatingPoint()) |
21749 | Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Load.getValue(R: 0)); |
21750 | |
21751 | return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL); |
21752 | } |
21753 | |
21754 | static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { |
21755 | SDLoc DL(N); |
21756 | SDValue Data = N->getOperand(Num: 2); |
21757 | EVT DataVT = Data.getValueType(); |
21758 | EVT HwSrcVt = getSVEContainerType(ContentTy: DataVT); |
21759 | SDValue InputVT = DAG.getValueType(DataVT); |
21760 | |
21761 | if (DataVT.isFloatingPoint()) |
21762 | InputVT = DAG.getValueType(HwSrcVt); |
21763 | |
21764 | SDValue SrcNew; |
21765 | if (Data.getValueType().isFloatingPoint()) |
21766 | SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Data); |
21767 | else |
21768 | SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Data); |
21769 | |
21770 | SDValue Ops[] = { N->getOperand(Num: 0), // Chain |
21771 | SrcNew, |
21772 | N->getOperand(Num: 4), // Base |
21773 | N->getOperand(Num: 3), // Pg |
21774 | InputVT |
21775 | }; |
21776 | |
21777 | return DAG.getNode(Opcode: AArch64ISD::ST1_PRED, DL, VT: N->getValueType(ResNo: 0), Ops); |
21778 | } |
21779 | |
21780 | static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { |
21781 | SDLoc DL(N); |
21782 | |
21783 | SDValue Data = N->getOperand(Num: 2); |
21784 | EVT DataVT = Data.getValueType(); |
21785 | EVT PtrTy = N->getOperand(Num: 4).getValueType(); |
21786 | |
21787 | if (DataVT.isFloatingPoint()) |
21788 | Data = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: DataVT.changeTypeToInteger(), Operand: Data); |
21789 | |
21790 | auto *MINode = cast<MemIntrinsicSDNode>(Val: N); |
21791 | return DAG.getMaskedStore(Chain: MINode->getChain(), dl: DL, Val: Data, Base: MINode->getOperand(Num: 4), |
21792 | Offset: DAG.getUNDEF(VT: PtrTy), Mask: MINode->getOperand(Num: 3), |
21793 | MemVT: MINode->getMemoryVT(), MMO: MINode->getMemOperand(), |
21794 | AM: ISD::UNINDEXED, IsTruncating: false, IsCompressing: false); |
21795 | } |
21796 | |
21797 | /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The |
21798 | /// load store optimizer pass will merge them to store pair stores. This should |
21799 | /// be better than a movi to create the vector zero followed by a vector store |
/// if the zero constant is not re-used, since one instruction and one register
21801 | /// live range will be removed. |
21802 | /// |
21803 | /// For example, the final generated code should be: |
21804 | /// |
21805 | /// stp xzr, xzr, [x0] |
21806 | /// |
21807 | /// instead of: |
21808 | /// |
21809 | /// movi v0.2d, #0 |
21810 | /// str q0, [x0] |
21811 | /// |
21812 | static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { |
21813 | SDValue StVal = St.getValue(); |
21814 | EVT VT = StVal.getValueType(); |
21815 | |
21816 | // Avoid scalarizing zero splat stores for scalable vectors. |
21817 | if (VT.isScalableVector()) |
21818 | return SDValue(); |
21819 | |
21820 | // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or |
21821 | // 2, 3 or 4 i32 elements. |
21822 | int NumVecElts = VT.getVectorNumElements(); |
21823 | if (!(((NumVecElts == 2 || NumVecElts == 3) && |
21824 | VT.getVectorElementType().getSizeInBits() == 64) || |
21825 | ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && |
21826 | VT.getVectorElementType().getSizeInBits() == 32))) |
21827 | return SDValue(); |
21828 | |
21829 | if (StVal.getOpcode() != ISD::BUILD_VECTOR) |
21830 | return SDValue(); |
21831 | |
21832 | // If the zero constant has more than one use then the vector store could be |
21833 | // better since the constant mov will be amortized and stp q instructions |
21834 | // should be able to be formed. |
21835 | if (!StVal.hasOneUse()) |
21836 | return SDValue(); |
21837 | |
21838 | // If the store is truncating then it's going down to i16 or smaller, which |
21839 | // means it can be implemented in a single store anyway. |
21840 | if (St.isTruncatingStore()) |
21841 | return SDValue(); |
21842 | |
21843 | // If the immediate offset of the address operand is too large for the stp |
21844 | // instruction, then bail out. |
21845 | if (DAG.isBaseWithConstantOffset(Op: St.getBasePtr())) { |
21846 | int64_t Offset = St.getBasePtr()->getConstantOperandVal(Num: 1); |
21847 | if (Offset < -512 || Offset > 504) |
21848 | return SDValue(); |
21849 | } |
21850 | |
21851 | for (int I = 0; I < NumVecElts; ++I) { |
21852 | SDValue EltVal = StVal.getOperand(i: I); |
21853 | if (!isNullConstant(V: EltVal) && !isNullFPConstant(V: EltVal)) |
21854 | return SDValue(); |
21855 | } |
21856 | |
21857 | // Use a CopyFromReg WZR/XZR here to prevent |
21858 | // DAGCombiner::MergeConsecutiveStores from undoing this transformation. |
21859 | SDLoc DL(&St); |
21860 | unsigned ZeroReg; |
21861 | EVT ZeroVT; |
21862 | if (VT.getVectorElementType().getSizeInBits() == 32) { |
21863 | ZeroReg = AArch64::WZR; |
21864 | ZeroVT = MVT::i32; |
21865 | } else { |
21866 | ZeroReg = AArch64::XZR; |
21867 | ZeroVT = MVT::i64; |
21868 | } |
21869 | SDValue SplatVal = |
21870 | DAG.getCopyFromReg(Chain: DAG.getEntryNode(), dl: DL, Reg: ZeroReg, VT: ZeroVT); |
21871 | return splitStoreSplat(DAG, St, SplatVal, NumVecElts); |
21872 | } |
21873 | |
21874 | /// Replace a splat of a scalar to a vector store by scalar stores of the scalar |
21875 | /// value. The load store optimizer pass will merge them to store pair stores. |
21876 | /// This has better performance than a splat of the scalar followed by a split |
21877 | /// vector store. Even if the stores are not merged it is four stores vs a dup, |
21878 | /// followed by an ext.b and two stores. |
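///
/// A rough example for a v4i32 splat of the value in w1 stored at [x0]:
///
///   str w1, [x0]
///   str w1, [x0, #4]
///   str w1, [x0, #8]
///   str w1, [x0, #12]
///
/// which the load/store optimizer is expected to turn into two stp
/// instructions.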
21879 | static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { |
21880 | SDValue StVal = St.getValue(); |
21881 | EVT VT = StVal.getValueType(); |
21882 | |
21883 | // Don't replace floating point stores, they possibly won't be transformed to |
21884 | // stp because of the store pair suppress pass. |
21885 | if (VT.isFloatingPoint()) |
21886 | return SDValue(); |
21887 | |
21888 | // We can express a splat as store pair(s) for 2 or 4 elements. |
21889 | unsigned NumVecElts = VT.getVectorNumElements(); |
21890 | if (NumVecElts != 4 && NumVecElts != 2) |
21891 | return SDValue(); |
21892 | |
21893 | // If the store is truncating then it's going down to i16 or smaller, which |
21894 | // means it can be implemented in a single store anyway. |
21895 | if (St.isTruncatingStore()) |
21896 | return SDValue(); |
21897 | |
21898 | // Check that this is a splat. |
21899 | // Make sure that each of the relevant vector element locations are inserted |
21900 | // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. |
21901 | std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); |
21902 | SDValue SplatVal; |
21903 | for (unsigned I = 0; I < NumVecElts; ++I) { |
21904 | // Check for insert vector elements. |
21905 | if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) |
21906 | return SDValue(); |
21907 | |
21908 | // Check that same value is inserted at each vector element. |
21909 | if (I == 0) |
21910 | SplatVal = StVal.getOperand(i: 1); |
21911 | else if (StVal.getOperand(i: 1) != SplatVal) |
21912 | return SDValue(); |
21913 | |
21914 | // Check insert element index. |
21915 | ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(Val: StVal.getOperand(i: 2)); |
21916 | if (!CIndex) |
21917 | return SDValue(); |
21918 | uint64_t IndexVal = CIndex->getZExtValue(); |
21919 | if (IndexVal >= NumVecElts) |
21920 | return SDValue(); |
21921 | IndexNotInserted.reset(position: IndexVal); |
21922 | |
21923 | StVal = StVal.getOperand(i: 0); |
21924 | } |
21925 | // Check that all vector element locations were inserted to. |
21926 | if (IndexNotInserted.any()) |
21927 | return SDValue(); |
21928 | |
21929 | return splitStoreSplat(DAG, St, SplatVal, NumVecElts); |
21930 | } |
21931 | |
21932 | static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
21933 | SelectionDAG &DAG, |
21934 | const AArch64Subtarget *Subtarget) { |
21935 | |
21936 | StoreSDNode *S = cast<StoreSDNode>(Val: N); |
21937 | if (S->isVolatile() || S->isIndexed()) |
21938 | return SDValue(); |
21939 | |
21940 | SDValue StVal = S->getValue(); |
21941 | EVT VT = StVal.getValueType(); |
21942 | |
21943 | if (!VT.isFixedLengthVector()) |
21944 | return SDValue(); |
21945 | |
21946 | // If we get a splat of zeros, convert this vector store to a store of |
21947 | // scalars. They will be merged into store pairs of xzr thereby removing one |
21948 | // instruction and one register. |
21949 | if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, St&: *S)) |
21950 | return ReplacedZeroSplat; |
21951 | |
21952 | // FIXME: The logic for deciding if an unaligned store should be split should |
21953 | // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be |
21954 | // a call to that function here. |
21955 | |
21956 | if (!Subtarget->isMisaligned128StoreSlow()) |
21957 | return SDValue(); |
21958 | |
21959 | // Don't split at -Oz. |
21960 | if (DAG.getMachineFunction().getFunction().hasMinSize()) |
21961 | return SDValue(); |
21962 | |
21963 | // Don't split v2i64 vectors. Memcpy lowering produces those and splitting |
21964 | // those up regresses performance on micro-benchmarks and olden/bh. |
21965 | if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) |
21966 | return SDValue(); |
21967 | |
21968 | // Split unaligned 16B stores. They are terrible for performance. |
21969 | // Don't split stores with alignment of 1 or 2. Code that uses clang vector |
21970 | // extensions can use this to mark that it does not want splitting to happen |
21971 | // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of |
21972 | // eliminating alignment hazards is only 1 in 8 for alignment of 2. |
21973 | if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) || |
21974 | S->getAlign() <= Align(2)) |
21975 | return SDValue(); |
21976 | |
21977 | // If we get a splat of a scalar convert this vector store to a store of |
21978 | // scalars. They will be merged into store pairs thereby removing two |
21979 | // instructions. |
21980 | if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, St&: *S)) |
21981 | return ReplacedSplat; |
21982 | |
21983 | SDLoc DL(S); |
21984 | |
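// Illustrative sketch: a misaligned 128-bit store of a v4i32 is rewritten
// below as two 64-bit stores of its low and high halves at offsets 0 and 8;
// the exact instructions are chosen later during selection.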
21985 | // Split VT into two. |
21986 | EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
21987 | unsigned NumElts = HalfVT.getVectorNumElements(); |
21988 | SDValue SubVector0 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal, |
21989 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
21990 | SDValue SubVector1 = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: HalfVT, N1: StVal, |
21991 | N2: DAG.getConstant(Val: NumElts, DL, VT: MVT::i64)); |
21992 | SDValue BasePtr = S->getBasePtr(); |
21993 | SDValue NewST1 = |
21994 | DAG.getStore(Chain: S->getChain(), dl: DL, Val: SubVector0, Ptr: BasePtr, PtrInfo: S->getPointerInfo(), |
21995 | Alignment: S->getAlign(), MMOFlags: S->getMemOperand()->getFlags()); |
21996 | SDValue OffsetPtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, |
21997 | N2: DAG.getConstant(Val: 8, DL, VT: MVT::i64)); |
21998 | return DAG.getStore(Chain: NewST1.getValue(R: 0), dl: DL, Val: SubVector1, Ptr: OffsetPtr, |
21999 | PtrInfo: S->getPointerInfo(), Alignment: S->getAlign(), |
22000 | MMOFlags: S->getMemOperand()->getFlags()); |
22001 | } |
22002 | |
22003 | static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) { |
assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
22005 | |
22006 | // splice(pg, op1, undef) -> op1 |
22007 | if (N->getOperand(Num: 2).isUndef()) |
22008 | return N->getOperand(Num: 1); |
22009 | |
22010 | return SDValue(); |
22011 | } |
22012 | |
22013 | static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, |
22014 | const AArch64Subtarget *Subtarget) { |
22015 | assert((N->getOpcode() == AArch64ISD::UUNPKHI || |
22016 | N->getOpcode() == AArch64ISD::UUNPKLO) && |
22017 | "Unexpected Opcode!" ); |
22018 | |
22019 | // uunpklo/hi undef -> undef |
22020 | if (N->getOperand(Num: 0).isUndef()) |
22021 | return DAG.getUNDEF(VT: N->getValueType(ResNo: 0)); |
22022 | |
22023 | // If this is a masked load followed by an UUNPKLO, fold this into a masked |
22024 | // extending load. We can do this even if this is already a masked |
22025 | // {z,}extload. |
22026 | if (N->getOperand(Num: 0).getOpcode() == ISD::MLOAD && |
22027 | N->getOpcode() == AArch64ISD::UUNPKLO) { |
22028 | MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(Val: N->getOperand(Num: 0)); |
22029 | SDValue Mask = MLD->getMask(); |
22030 | SDLoc DL(N); |
22031 | |
22032 | if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD && |
22033 | SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE && |
22034 | (MLD->getPassThru()->isUndef() || |
22035 | isZerosVector(N: MLD->getPassThru().getNode()))) { |
22036 | unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); |
22037 | unsigned PgPattern = Mask->getConstantOperandVal(Num: 0); |
22038 | EVT VT = N->getValueType(ResNo: 0); |
22039 | |
22040 | // Ensure we can double the size of the predicate pattern |
22041 | unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern); |
22042 | if (NumElts && |
22043 | NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) { |
22044 | Mask = |
22045 | getPTrue(DAG, DL, VT: VT.changeVectorElementType(EltVT: MVT::i1), Pattern: PgPattern); |
22046 | SDValue PassThru = DAG.getConstant(Val: 0, DL, VT); |
22047 | SDValue NewLoad = DAG.getMaskedLoad( |
22048 | VT, dl: DL, Chain: MLD->getChain(), Base: MLD->getBasePtr(), Offset: MLD->getOffset(), Mask, |
22049 | Src0: PassThru, MemVT: MLD->getMemoryVT(), MMO: MLD->getMemOperand(), |
22050 | AM: MLD->getAddressingMode(), ISD::ZEXTLOAD); |
22051 | |
22052 | DAG.ReplaceAllUsesOfValueWith(From: SDValue(MLD, 1), To: NewLoad.getValue(R: 1)); |
22053 | |
22054 | return NewLoad; |
22055 | } |
22056 | } |
22057 | } |
22058 | |
22059 | return SDValue(); |
22060 | } |
22061 | |
22062 | static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) { |
22063 | if (N->getOpcode() != AArch64ISD::UZP1) |
22064 | return false; |
22065 | SDValue Op0 = N->getOperand(Num: 0); |
22066 | EVT SrcVT = Op0->getValueType(ResNo: 0); |
22067 | EVT DstVT = N->getValueType(ResNo: 0); |
22068 | return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) || |
22069 | (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) || |
22070 | (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32); |
22071 | } |
22072 | |
22073 | // Try to combine rounding shifts where the operands come from an extend, and |
22074 | // the result is truncated and combined into one vector. |
22075 | // uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C) |
22076 | static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) { |
assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
22078 | SDValue Op0 = N->getOperand(Num: 0); |
22079 | SDValue Op1 = N->getOperand(Num: 1); |
22080 | EVT ResVT = N->getValueType(ResNo: 0); |
22081 | |
22082 | unsigned RshOpc = Op0.getOpcode(); |
22083 | if (RshOpc != AArch64ISD::RSHRNB_I) |
22084 | return SDValue(); |
22085 | |
22086 | // Same op code and imm value? |
22087 | SDValue ShiftValue = Op0.getOperand(i: 1); |
22088 | if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(i: 1)) |
22089 | return SDValue(); |
22090 | |
22091 | // Same unextended operand value? |
22092 | SDValue Lo = Op0.getOperand(i: 0); |
22093 | SDValue Hi = Op1.getOperand(i: 0); |
22094 | if (Lo.getOpcode() != AArch64ISD::UUNPKLO && |
22095 | Hi.getOpcode() != AArch64ISD::UUNPKHI) |
22096 | return SDValue(); |
22097 | SDValue OrigArg = Lo.getOperand(i: 0); |
22098 | if (OrigArg != Hi.getOperand(i: 0)) |
22099 | return SDValue(); |
22100 | |
22101 | SDLoc DL(N); |
22102 | return DAG.getNode(Opcode: AArch64ISD::URSHR_I_PRED, DL, VT: ResVT, |
22103 | N1: getPredicateForVector(DAG, DL, VT: ResVT), N2: OrigArg, |
22104 | N3: ShiftValue); |
22105 | } |
22106 | |
22107 | // Try to simplify: |
22108 | // t1 = nxv8i16 add(X, 1 << (ShiftValue - 1)) |
22109 | // t2 = nxv8i16 srl(t1, ShiftValue) |
22110 | // to |
22111 | // t1 = nxv8i16 rshrnb(X, shiftvalue). |
22112 | // rshrnb will zero the top half bits of each element. Therefore, this combine |
22113 | // should only be performed when a following instruction with the rshrnb |
22114 | // as an operand does not care about the top half of each element. For example, |
22115 | // a uzp1 or a truncating store. |
22116 | static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, |
22117 | const AArch64Subtarget *Subtarget) { |
22118 | EVT VT = Srl->getValueType(ResNo: 0); |
22119 | if (!VT.isScalableVector() || !Subtarget->hasSVE2()) |
22120 | return SDValue(); |
22121 | |
22122 | EVT ResVT; |
22123 | if (VT == MVT::nxv8i16) |
22124 | ResVT = MVT::nxv16i8; |
22125 | else if (VT == MVT::nxv4i32) |
22126 | ResVT = MVT::nxv8i16; |
22127 | else if (VT == MVT::nxv2i64) |
22128 | ResVT = MVT::nxv4i32; |
22129 | else |
22130 | return SDValue(); |
22131 | |
22132 | SDLoc DL(Srl); |
22133 | unsigned ShiftValue; |
22134 | SDValue RShOperand; |
22135 | if (!canLowerSRLToRoundingShiftForVT(Shift: Srl, ResVT, DAG, ShiftValue, RShOperand)) |
22136 | return SDValue(); |
22137 | SDValue Rshrnb = DAG.getNode( |
22138 | Opcode: AArch64ISD::RSHRNB_I, DL, VT: ResVT, |
22139 | Ops: {RShOperand, DAG.getTargetConstant(Val: ShiftValue, DL, VT: MVT::i32)}); |
22140 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Rshrnb); |
22141 | } |
22142 | |
22143 | static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, |
22144 | const AArch64Subtarget *Subtarget) { |
22145 | SDLoc DL(N); |
22146 | SDValue Op0 = N->getOperand(Num: 0); |
22147 | SDValue Op1 = N->getOperand(Num: 1); |
22148 | EVT ResVT = N->getValueType(ResNo: 0); |
22149 | |
22150 | // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x) |
22151 | if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
22152 | Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
22153 | Op0.getOperand(i: 0) == Op1.getOperand(i: 0)) { |
22154 | |
22155 | SDValue SourceVec = Op0.getOperand(i: 0); |
22156 | uint64_t ExtIdx0 = Op0.getConstantOperandVal(i: 1); |
22157 | uint64_t ExtIdx1 = Op1.getConstantOperandVal(i: 1); |
22158 | uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements(); |
22159 | if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) { |
22160 | EVT OpVT = Op0.getOperand(i: 1).getValueType(); |
22161 | EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
22162 | SDValue Uzp = DAG.getNode(Opcode: N->getOpcode(), DL, VT: WidenedResVT, N1: SourceVec, |
22163 | N2: DAG.getUNDEF(VT: WidenedResVT)); |
22164 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: ResVT, N1: Uzp, |
22165 | N2: DAG.getConstant(Val: 0, DL, VT: OpVT)); |
22166 | } |
22167 | } |
22168 | |
22169 | // Following optimizations only work with uzp1. |
22170 | if (N->getOpcode() == AArch64ISD::UZP2) |
22171 | return SDValue(); |
22172 | |
22173 | // uzp1(x, undef) -> concat(truncate(x), undef) |
22174 | if (Op1.getOpcode() == ISD::UNDEF) { |
22175 | EVT BCVT = MVT::Other, HalfVT = MVT::Other; |
22176 | switch (ResVT.getSimpleVT().SimpleTy) { |
22177 | default: |
22178 | break; |
22179 | case MVT::v16i8: |
22180 | BCVT = MVT::v8i16; |
22181 | HalfVT = MVT::v8i8; |
22182 | break; |
22183 | case MVT::v8i16: |
22184 | BCVT = MVT::v4i32; |
22185 | HalfVT = MVT::v4i16; |
22186 | break; |
22187 | case MVT::v4i32: |
22188 | BCVT = MVT::v2i64; |
22189 | HalfVT = MVT::v2i32; |
22190 | break; |
22191 | } |
22192 | if (BCVT != MVT::Other) { |
22193 | SDValue BC = DAG.getBitcast(VT: BCVT, V: Op0); |
22194 | SDValue Trunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: HalfVT, Operand: BC); |
22195 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ResVT, N1: Trunc, |
22196 | N2: DAG.getUNDEF(VT: HalfVT)); |
22197 | } |
22198 | } |
22199 | |
22200 | if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG)) |
22201 | return Urshr; |
22202 | |
22203 | if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op0, DAG, Subtarget)) |
22204 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Rshrnb, N2: Op1); |
22205 | |
22206 | if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Op1, DAG, Subtarget)) |
22207 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Rshrnb); |
22208 | |
22209 | // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z) |
22210 | if (Op0.getOpcode() == AArch64ISD::UUNPKLO) { |
22211 | if (Op0.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) { |
22212 | SDValue X = Op0.getOperand(i: 0).getOperand(i: 0); |
22213 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: X, N2: Op1); |
22214 | } |
22215 | } |
22216 | |
22217 | // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z) |
22218 | if (Op1.getOpcode() == AArch64ISD::UUNPKHI) { |
22219 | if (Op1.getOperand(i: 0).getOpcode() == AArch64ISD::UZP1) { |
22220 | SDValue Z = Op1.getOperand(i: 0).getOperand(i: 1); |
22221 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0, N2: Z); |
22222 | } |
22223 | } |
22224 | |
22225 | // These optimizations only work on little endian. |
22226 | if (!DAG.getDataLayout().isLittleEndian()) |
22227 | return SDValue(); |
22228 | |
22229 | // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y) |
22230 | // Example: |
22231 | // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64) |
22232 | // to |
22233 | // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y |
22234 | if (isHalvingTruncateAndConcatOfLegalIntScalableType(N) && |
22235 | Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) { |
22236 | if (Op0.getOperand(i: 0).getValueType() == Op1.getOperand(i: 0).getValueType()) { |
22237 | return DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: ResVT, N1: Op0.getOperand(i: 0), |
22238 | N2: Op1.getOperand(i: 0)); |
22239 | } |
22240 | } |
22241 | |
22242 | if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8) |
22243 | return SDValue(); |
22244 | |
22245 | SDValue SourceOp0 = peekThroughBitcasts(V: Op0); |
22246 | SDValue SourceOp1 = peekThroughBitcasts(V: Op1); |
22247 | |
22248 | // truncating uzp1(x, y) -> xtn(concat (x, y)) |
22249 | if (SourceOp0.getValueType() == SourceOp1.getValueType()) { |
22250 | EVT Op0Ty = SourceOp0.getValueType(); |
22251 | if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) || |
22252 | (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) { |
22253 | SDValue Concat = |
22254 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, |
22255 | VT: Op0Ty.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()), |
22256 | N1: SourceOp0, N2: SourceOp1); |
22257 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, Operand: Concat); |
22258 | } |
22259 | } |
22260 | |
22261 | // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y)) |
22262 | if (SourceOp0.getOpcode() != ISD::TRUNCATE || |
22263 | SourceOp1.getOpcode() != ISD::TRUNCATE) |
22264 | return SDValue(); |
22265 | SourceOp0 = SourceOp0.getOperand(i: 0); |
22266 | SourceOp1 = SourceOp1.getOperand(i: 0); |
22267 | |
22268 | if (SourceOp0.getValueType() != SourceOp1.getValueType() || |
22269 | !SourceOp0.getValueType().isSimple()) |
22270 | return SDValue(); |
22271 | |
22272 | EVT ResultTy; |
22273 | |
22274 | switch (SourceOp0.getSimpleValueType().SimpleTy) { |
22275 | case MVT::v2i64: |
22276 | ResultTy = MVT::v4i32; |
22277 | break; |
22278 | case MVT::v4i32: |
22279 | ResultTy = MVT::v8i16; |
22280 | break; |
22281 | case MVT::v8i16: |
22282 | ResultTy = MVT::v16i8; |
22283 | break; |
22284 | default: |
22285 | return SDValue(); |
22286 | } |
22287 | |
22288 | SDValue UzpOp0 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp0); |
22289 | SDValue UzpOp1 = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ResultTy, Operand: SourceOp1); |
22290 | SDValue UzpResult = |
22291 | DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UzpOp0.getValueType(), N1: UzpOp0, N2: UzpOp1); |
22292 | |
22293 | EVT BitcastResultTy; |
22294 | |
22295 | switch (ResVT.getSimpleVT().SimpleTy) { |
22296 | case MVT::v2i32: |
22297 | BitcastResultTy = MVT::v2i64; |
22298 | break; |
22299 | case MVT::v4i16: |
22300 | BitcastResultTy = MVT::v4i32; |
22301 | break; |
22302 | case MVT::v8i8: |
22303 | BitcastResultTy = MVT::v8i16; |
22304 | break; |
22305 | default: |
22306 | llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}" ); |
22307 | } |
22308 | |
22309 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT, |
22310 | Operand: DAG.getNode(Opcode: ISD::BITCAST, DL, VT: BitcastResultTy, Operand: UzpResult)); |
22311 | } |
22312 | |
22313 | static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) { |
22314 | unsigned Opc = N->getOpcode(); |
22315 | |
22316 | assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads |
22317 | Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) || |
22318 | (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads |
22319 | Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) && |
22320 | "Invalid opcode." ); |
22321 | |
22322 | const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO || |
22323 | Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; |
22324 | const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO || |
22325 | Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO; |
22326 | const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO || |
22327 | Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO || |
22328 | Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO || |
22329 | Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO; |
22330 | |
22331 | SDLoc DL(N); |
22332 | SDValue Chain = N->getOperand(Num: 0); |
22333 | SDValue Pg = N->getOperand(Num: 1); |
22334 | SDValue Base = N->getOperand(Num: 2); |
22335 | SDValue Offset = N->getOperand(Num: 3); |
22336 | SDValue Ty = N->getOperand(Num: 4); |
22337 | |
22338 | EVT ResVT = N->getValueType(ResNo: 0); |
22339 | |
22340 | const auto OffsetOpc = Offset.getOpcode(); |
22341 | const bool OffsetIsZExt = |
22342 | OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU; |
22343 | const bool OffsetIsSExt = |
22344 | OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU; |
22345 | |
22346 | // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible. |
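// Roughly:
//   GLD1_MERGE_ZERO(pg, base, sext_inreg(pg, offsets, i32), ty)
// becomes
//   GLD1_SXTW_MERGE_ZERO(pg, base, offsets, ty)
// and similarly for zero-extension and the scaled/signed variants.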
22347 | if (!Extended && (OffsetIsSExt || OffsetIsZExt)) { |
22348 | SDValue ExtPg = Offset.getOperand(i: 0); |
22349 | VTSDNode *ExtFrom = cast<VTSDNode>(Val: Offset.getOperand(i: 2).getNode()); |
22350 | EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType(); |
22351 | |
22352 | // If the predicate for the sign- or zero-extended offset is the |
22353 | // same as the predicate used for this load and the sign-/zero-extension |
// was from 32 bits...
22355 | if (ExtPg == Pg && ExtFromEVT == MVT::i32) { |
22356 | SDValue UnextendedOffset = Offset.getOperand(i: 1); |
22357 | |
22358 | unsigned NewOpc = getGatherVecOpcode(IsScaled: Scaled, IsSigned: OffsetIsSExt, NeedsExtend: true); |
22359 | if (Signed) |
22360 | NewOpc = getSignExtendedGatherOpcode(Opcode: NewOpc); |
22361 | |
22362 | return DAG.getNode(Opcode: NewOpc, DL, ResultTys: {ResVT, MVT::Other}, |
22363 | Ops: {Chain, Pg, Base, UnextendedOffset, Ty}); |
22364 | } |
22365 | } |
22366 | |
22367 | return SDValue(); |
22368 | } |
22369 | |
22370 | /// Optimize a vector shift instruction and its operand if shifted out |
22371 | /// bits are not used. |
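///
/// For example, (vashr (vshl X, imm), imm) is folded to X outright when X is
/// already known to have more than imm sign bits; otherwise the demanded-bits
/// machinery may still simplify the shifted operand.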
22372 | static SDValue performVectorShiftCombine(SDNode *N, |
22373 | const AArch64TargetLowering &TLI, |
22374 | TargetLowering::DAGCombinerInfo &DCI) { |
22375 | assert(N->getOpcode() == AArch64ISD::VASHR || |
22376 | N->getOpcode() == AArch64ISD::VLSHR); |
22377 | |
22378 | SDValue Op = N->getOperand(Num: 0); |
22379 | unsigned OpScalarSize = Op.getScalarValueSizeInBits(); |
22380 | |
22381 | unsigned ShiftImm = N->getConstantOperandVal(Num: 1); |
assert(OpScalarSize > ShiftImm && "Invalid shift imm");
22383 | |
// Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
22385 | if (N->getOpcode() == AArch64ISD::VASHR && |
22386 | Op.getOpcode() == AArch64ISD::VSHL && |
22387 | N->getOperand(Num: 1) == Op.getOperand(i: 1)) |
22388 | if (DCI.DAG.ComputeNumSignBits(Op: Op.getOperand(i: 0)) > ShiftImm) |
22389 | return Op.getOperand(i: 0); |
22390 | |
22391 | // If the shift is exact, the shifted out bits matter. |
22392 | if (N->getFlags().hasExact()) |
22393 | return SDValue(); |
22394 | |
22395 | APInt ShiftedOutBits = APInt::getLowBitsSet(numBits: OpScalarSize, loBitsSet: ShiftImm); |
22396 | APInt DemandedMask = ~ShiftedOutBits; |
22397 | |
22398 | if (TLI.SimplifyDemandedBits(Op, DemandedBits: DemandedMask, DCI)) |
22399 | return SDValue(N, 0); |
22400 | |
22401 | return SDValue(); |
22402 | } |
22403 | |
22404 | static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) { |
22405 | // sunpklo(sext(pred)) -> sext(extract_low_half(pred)) |
22406 | // This transform works in partnership with performSetCCPunpkCombine to |
22407 | // remove unnecessary transfer of predicates into standard registers and back |
22408 | if (N->getOperand(Num: 0).getOpcode() == ISD::SIGN_EXTEND && |
22409 | N->getOperand(Num: 0)->getOperand(Num: 0)->getValueType(ResNo: 0).getScalarType() == |
22410 | MVT::i1) { |
22411 | SDValue CC = N->getOperand(Num: 0)->getOperand(Num: 0); |
22412 | auto VT = CC->getValueType(ResNo: 0).getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
22413 | SDValue Unpk = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: SDLoc(N), VT, N1: CC, |
22414 | N2: DAG.getVectorIdxConstant(Val: 0, DL: SDLoc(N))); |
22415 | return DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), Operand: Unpk); |
22416 | } |
22417 | |
22418 | return SDValue(); |
22419 | } |
22420 | |
22421 | /// Target-specific DAG combine function for post-increment LD1 (lane) and |
22422 | /// post-increment LD1R. |
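///
/// A rough example of the lane form: a scalar load feeding an
/// insert_vector_elt, whose address is also incremented by the element size,
/// is combined into a single AArch64ISD::LD1LANEpost node, which selects to a
/// post-incremented "ld1 { v0.s }[lane], [x0], #4" style instruction.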
22423 | static SDValue performPostLD1Combine(SDNode *N, |
22424 | TargetLowering::DAGCombinerInfo &DCI, |
22425 | bool IsLaneOp) { |
22426 | if (DCI.isBeforeLegalizeOps()) |
22427 | return SDValue(); |
22428 | |
22429 | SelectionDAG &DAG = DCI.DAG; |
22430 | EVT VT = N->getValueType(ResNo: 0); |
22431 | |
22432 | if (!VT.is128BitVector() && !VT.is64BitVector()) |
22433 | return SDValue(); |
22434 | |
22435 | unsigned LoadIdx = IsLaneOp ? 1 : 0; |
22436 | SDNode *LD = N->getOperand(Num: LoadIdx).getNode(); |
// If it is not a LOAD, we cannot do this combine.
22438 | if (LD->getOpcode() != ISD::LOAD) |
22439 | return SDValue(); |
22440 | |
22441 | // The vector lane must be a constant in the LD1LANE opcode. |
22442 | SDValue Lane; |
22443 | if (IsLaneOp) { |
22444 | Lane = N->getOperand(Num: 2); |
22445 | auto *LaneC = dyn_cast<ConstantSDNode>(Val&: Lane); |
22446 | if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) |
22447 | return SDValue(); |
22448 | } |
22449 | |
22450 | LoadSDNode *LoadSDN = cast<LoadSDNode>(Val: LD); |
22451 | EVT MemVT = LoadSDN->getMemoryVT(); |
22452 | // Check if memory operand is the same type as the vector element. |
22453 | if (MemVT != VT.getVectorElementType()) |
22454 | return SDValue(); |
22455 | |
22456 | // Check if there are other uses. If so, do not combine as it will introduce |
22457 | // an extra load. |
22458 | for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; |
22459 | ++UI) { |
22460 | if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. |
22461 | continue; |
22462 | if (*UI != N) |
22463 | return SDValue(); |
22464 | } |
22465 | |
22466 | // If there is one use and it can splat the value, prefer that operation. |
22467 | // TODO: This could be expanded to more operations if they reliably use the |
22468 | // index variants. |
22469 | if (N->hasOneUse()) { |
22470 | unsigned UseOpc = N->use_begin()->getOpcode(); |
22471 | if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA) |
22472 | return SDValue(); |
22473 | } |
22474 | |
22475 | SDValue Addr = LD->getOperand(Num: 1); |
22476 | SDValue Vector = N->getOperand(Num: 0); |
22477 | // Search for a use of the address operand that is an increment. |
22478 | for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = |
22479 | Addr.getNode()->use_end(); UI != UE; ++UI) { |
22480 | SDNode *User = *UI; |
22481 | if (User->getOpcode() != ISD::ADD |
22482 | || UI.getUse().getResNo() != Addr.getResNo()) |
22483 | continue; |
22484 | |
22485 | // If the increment is a constant, it must match the memory ref size. |
22486 | SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0); |
22487 | if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) { |
22488 | uint32_t IncVal = CInc->getZExtValue(); |
22489 | unsigned NumBytes = VT.getScalarSizeInBits() / 8; |
22490 | if (IncVal != NumBytes) |
22491 | continue; |
22492 | Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64); |
22493 | } |
22494 | |
22495 | // To avoid cycle construction make sure that neither the load nor the add |
22496 | // are predecessors to each other or the Vector. |
22497 | SmallPtrSet<const SDNode *, 32> Visited; |
22498 | SmallVector<const SDNode *, 16> Worklist; |
22499 | Visited.insert(Ptr: Addr.getNode()); |
22500 | Worklist.push_back(Elt: User); |
22501 | Worklist.push_back(Elt: LD); |
22502 | Worklist.push_back(Elt: Vector.getNode()); |
22503 | if (SDNode::hasPredecessorHelper(N: LD, Visited, Worklist) || |
22504 | SDNode::hasPredecessorHelper(N: User, Visited, Worklist)) |
22505 | continue; |
22506 | |
22507 | SmallVector<SDValue, 8> Ops; |
22508 | Ops.push_back(Elt: LD->getOperand(Num: 0)); // Chain |
22509 | if (IsLaneOp) { |
22510 | Ops.push_back(Elt: Vector); // The vector to be inserted |
22511 | Ops.push_back(Elt: Lane); // The lane to be inserted in the vector |
22512 | } |
22513 | Ops.push_back(Elt: Addr); |
22514 | Ops.push_back(Elt: Inc); |
22515 | |
22516 | EVT Tys[3] = { VT, MVT::i64, MVT::Other }; |
22517 | SDVTList SDTys = DAG.getVTList(VTs: Tys); |
22518 | unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; |
22519 | SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOp, dl: SDLoc(N), VTList: SDTys, Ops, |
22520 | MemVT, |
22521 | MMO: LoadSDN->getMemOperand()); |
22522 | |
22523 | // Update the uses. |
22524 | SDValue NewResults[] = { |
22525 | SDValue(LD, 0), // The result of load |
22526 | SDValue(UpdN.getNode(), 2) // Chain |
22527 | }; |
22528 | DCI.CombineTo(N: LD, To: NewResults); |
22529 | DCI.CombineTo(N, Res: SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result |
22530 | DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), 1)); // Write back register |
22531 | |
22532 | break; |
22533 | } |
22534 | return SDValue(); |
22535 | } |
22536 | |
22537 | /// Simplify ``Addr`` given that the top byte of it is ignored by HW during |
22538 | /// address translation. |
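///
/// In practice this lets an explicit masking of the top byte (e.g. an AND of
/// the pointer with 0x00ffffffffffffff) be removed from the address
/// computation, since only the low 56 bits are demanded here.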
22539 | static bool performTBISimplification(SDValue Addr, |
22540 | TargetLowering::DAGCombinerInfo &DCI, |
22541 | SelectionDAG &DAG) { |
22542 | APInt DemandedMask = APInt::getLowBitsSet(numBits: 64, loBitsSet: 56); |
22543 | KnownBits Known; |
22544 | TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), |
22545 | !DCI.isBeforeLegalizeOps()); |
22546 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); |
22547 | if (TLI.SimplifyDemandedBits(Op: Addr, DemandedBits: DemandedMask, Known, TLO)) { |
22548 | DCI.CommitTargetLoweringOpt(TLO); |
22549 | return true; |
22550 | } |
22551 | return false; |
22552 | } |
22553 | |
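// Fold a truncating store of a zero/sign/any-extended value into a plain
// store of the unextended value when the store's memory type matches the
// original (pre-extension) type, e.g. truncstore i16 (zext i16 X to i32)
// becomes a plain i16 store of X.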
22554 | static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { |
22555 | assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) && |
22556 | "Expected STORE dag node in input!" ); |
22557 | |
22558 | if (auto Store = dyn_cast<StoreSDNode>(Val: N)) { |
22559 | if (!Store->isTruncatingStore() || Store->isIndexed()) |
22560 | return SDValue(); |
22561 | SDValue Ext = Store->getValue(); |
22562 | auto ExtOpCode = Ext.getOpcode(); |
22563 | if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND && |
22564 | ExtOpCode != ISD::ANY_EXTEND) |
22565 | return SDValue(); |
22566 | SDValue Orig = Ext->getOperand(Num: 0); |
22567 | if (Store->getMemoryVT() != Orig.getValueType()) |
22568 | return SDValue(); |
22569 | return DAG.getStore(Chain: Store->getChain(), dl: SDLoc(Store), Val: Orig, |
22570 | Ptr: Store->getBasePtr(), MMO: Store->getMemOperand()); |
22571 | } |
22572 | |
22573 | return SDValue(); |
22574 | } |
22575 | |
22576 | // A custom combine to lower load <3 x i8> as the more efficient sequence |
22577 | // below: |
22578 | // ldrb wX, [x0, #2] |
22579 | // ldrh wY, [x0] |
22580 | // orr wX, wY, wX, lsl #16 |
22581 | // fmov s0, wX |
22582 | // |
22583 | // Note that an alternative sequence with even fewer (although usually more |
22584 | // complex/expensive) instructions would be: |
22585 | // ld1r.4h { v0 }, [x0], #2 |
22586 | // ld1.b { v0 }[2], [x0] |
22587 | // |
22588 | // Generating this sequence unfortunately results in noticeably worse codegen |
22589 | // for code that extends the loaded v3i8, due to legalization breaking vector |
22590 | // shuffle detection in a way that is very difficult to work around. |
22591 | // TODO: Revisit once v3i8 legalization has been improved in general. |
22592 | static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { |
22593 | EVT MemVT = LD->getMemoryVT(); |
22594 | if (MemVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3) || |
22595 | LD->getOriginalAlign() >= 4) |
22596 | return SDValue(); |
22597 | |
22598 | SDLoc DL(LD); |
22599 | MachineFunction &MF = DAG.getMachineFunction(); |
22600 | SDValue Chain = LD->getChain(); |
22601 | SDValue BasePtr = LD->getBasePtr(); |
22602 | MachineMemOperand *MMO = LD->getMemOperand(); |
assert(LD->getOffset().isUndef() && "undef offset expected");
22604 | |
22605 | // Load 2 x i8, then 1 x i8. |
22606 | SDValue L16 = DAG.getLoad(VT: MVT::i16, dl: DL, Chain, Ptr: BasePtr, MMO); |
22607 | TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2); |
22608 | SDValue L8 = DAG.getLoad(VT: MVT::i8, dl: DL, Chain, |
22609 | Ptr: DAG.getMemBasePlusOffset(Base: BasePtr, Offset: Offset2, DL), |
22610 | MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1)); |
22611 | |
22612 | // Extend to i32. |
22613 | SDValue Ext16 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L16); |
22614 | SDValue Ext8 = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: MVT::i32, Operand: L8); |
22615 | |
22616 | // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. |
22617 | SDValue Shl = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i32, N1: Ext8, |
22618 | N2: DAG.getConstant(Val: 16, DL, VT: MVT::i32)); |
22619 | SDValue Or = DAG.getNode(Opcode: ISD::OR, DL, VT: MVT::i32, N1: Ext16, N2: Shl); |
22620 | SDValue Cast = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v4i8, Operand: Or); |
22621 | |
22622 | // Extract v3i8 again. |
SDValue Extract = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT, N1: Cast,
22624 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
22625 | SDValue TokenFactor = DAG.getNode( |
22626 | Opcode: ISD::TokenFactor, DL, VT: MVT::Other, |
22627 | Ops: {SDValue(cast<SDNode>(Val&: L16), 1), SDValue(cast<SDNode>(Val&: L8), 1)}); |
22628 | return DAG.getMergeValues(Ops: {Extract, TokenFactor}, dl: DL); |
22629 | } |
22630 | |
// Perform TBI simplification if supported by the target, and try to break up
// nontemporal loads larger than 256 bits for odd types so that 256-bit LDNP
// (Q-register) load instructions can be selected.
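//
// A rough example: a nontemporal v12i32 (384-bit) load is rewritten as one
// v8i32 (256-bit) load plus one v4i32 load; the results are concatenated and
// the original value type is extracted from the concatenation.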
22634 | static SDValue performLOADCombine(SDNode *N, |
22635 | TargetLowering::DAGCombinerInfo &DCI, |
22636 | SelectionDAG &DAG, |
22637 | const AArch64Subtarget *Subtarget) { |
22638 | if (Subtarget->supportsAddressTopByteIgnored()) |
22639 | performTBISimplification(Addr: N->getOperand(Num: 1), DCI, DAG); |
22640 | |
22641 | LoadSDNode *LD = cast<LoadSDNode>(Val: N); |
22642 | if (LD->isVolatile() || !Subtarget->isLittleEndian()) |
22643 | return SDValue(N, 0); |
22644 | |
22645 | if (SDValue Res = combineV3I8LoadExt(LD, DAG)) |
22646 | return Res; |
22647 | |
22648 | if (!LD->isNonTemporal()) |
22649 | return SDValue(N, 0); |
22650 | |
22651 | EVT MemVT = LD->getMemoryVT(); |
22652 | if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 || |
22653 | MemVT.getSizeInBits() % 256 == 0 || |
22654 | 256 % MemVT.getScalarSizeInBits() != 0) |
22655 | return SDValue(N, 0); |
22656 | |
22657 | SDLoc DL(LD); |
22658 | SDValue Chain = LD->getChain(); |
22659 | SDValue BasePtr = LD->getBasePtr(); |
22660 | SDNodeFlags Flags = LD->getFlags(); |
22661 | SmallVector<SDValue, 4> LoadOps; |
22662 | SmallVector<SDValue, 4> LoadOpsChain; |
22663 | // Replace any non temporal load over 256-bit with a series of 256 bit loads |
22664 | // and a scalar/vector load less than 256. This way we can utilize 256-bit |
22665 | // loads and reduce the amount of load instructions generated. |
22666 | MVT NewVT = |
22667 | MVT::getVectorVT(VT: MemVT.getVectorElementType().getSimpleVT(), |
22668 | NumElements: 256 / MemVT.getVectorElementType().getSizeInBits()); |
22669 | unsigned Num256Loads = MemVT.getSizeInBits() / 256; |
// Create all 256-bit loads, starting at offset 0 and going up to offset
// (Num256Loads - 1) * 32.
22671 | for (unsigned I = 0; I < Num256Loads; I++) { |
22672 | unsigned PtrOffset = I * 32; |
22673 | SDValue NewPtr = DAG.getMemBasePlusOffset( |
22674 | Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags); |
22675 | Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset); |
22676 | SDValue NewLoad = DAG.getLoad( |
22677 | VT: NewVT, dl: DL, Chain, Ptr: NewPtr, PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), |
22678 | Alignment: NewAlign, MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo()); |
22679 | LoadOps.push_back(Elt: NewLoad); |
22680 | LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: NewLoad), 1)); |
22681 | } |
22682 | |
22683 | // Process remaining bits of the load operation. |
22684 | // This is done by creating an UNDEF vector to match the size of the |
22685 | // 256-bit loads and inserting the remaining load to it. We extract the |
22686 | // original load type at the end using EXTRACT_SUBVECTOR instruction. |
22687 | unsigned BitsRemaining = MemVT.getSizeInBits() % 256; |
22688 | unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8; |
22689 | MVT RemainingVT = MVT::getVectorVT( |
22690 | VT: MemVT.getVectorElementType().getSimpleVT(), |
22691 | NumElements: BitsRemaining / MemVT.getVectorElementType().getSizeInBits()); |
22692 | SDValue NewPtr = DAG.getMemBasePlusOffset( |
22693 | Base: BasePtr, Offset: TypeSize::getFixed(ExactSize: PtrOffset), DL, Flags); |
22694 | Align NewAlign = commonAlignment(A: LD->getAlign(), Offset: PtrOffset); |
22695 | SDValue RemainingLoad = |
22696 | DAG.getLoad(VT: RemainingVT, dl: DL, Chain, Ptr: NewPtr, |
22697 | PtrInfo: LD->getPointerInfo().getWithOffset(O: PtrOffset), Alignment: NewAlign, |
22698 | MMOFlags: LD->getMemOperand()->getFlags(), AAInfo: LD->getAAInfo()); |
22699 | SDValue UndefVector = DAG.getUNDEF(VT: NewVT); |
22700 | SDValue InsertIdx = DAG.getVectorIdxConstant(Val: 0, DL); |
SDValue ExtendedRemainingLoad =
DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewVT,
Ops: {UndefVector, RemainingLoad, InsertIdx});
LoadOps.push_back(Elt: ExtendedRemainingLoad);
22705 | LoadOpsChain.push_back(Elt: SDValue(cast<SDNode>(Val&: RemainingLoad), 1)); |
22706 | EVT ConcatVT = |
22707 | EVT::getVectorVT(Context&: *DAG.getContext(), VT: MemVT.getScalarType(), |
22708 | NumElements: LoadOps.size() * NewVT.getVectorNumElements()); |
22709 | SDValue ConcatVectors = |
22710 | DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: ConcatVT, Ops: LoadOps); |
22711 | // Extract the original vector type size. |
SDValue ExtractSubVector =
22713 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MemVT, |
22714 | Ops: {ConcatVectors, DAG.getVectorIdxConstant(Val: 0, DL)}); |
22715 | SDValue TokenFactor = |
22716 | DAG.getNode(Opcode: ISD::TokenFactor, DL, VT: MVT::Other, Ops: LoadOpsChain); |
22717 | return DAG.getMergeValues(Ops: {ExtractSubVector, TokenFactor}, dl: DL); |
22718 | } |
22719 | |
22720 | static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) { |
22721 | EVT VecVT = Op.getValueType(); |
22722 | assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 && |
22723 | "Need boolean vector type." ); |
22724 | |
22725 | if (Depth > 3) |
22726 | return MVT::INVALID_SIMPLE_VALUE_TYPE; |
22727 | |
22728 | // We can get the base type from a vector compare or truncate. |
22729 | if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE) |
22730 | return Op.getOperand(i: 0).getValueType(); |
22731 | |
22732 | // If an operand is a bool vector, continue looking. |
22733 | EVT BaseVT = MVT::INVALID_SIMPLE_VALUE_TYPE; |
22734 | for (SDValue Operand : Op->op_values()) { |
22735 | if (Operand.getValueType() != VecVT) |
22736 | continue; |
22737 | |
22738 | EVT OperandVT = tryGetOriginalBoolVectorType(Op: Operand, Depth: Depth + 1); |
22739 | if (!BaseVT.isSimple()) |
22740 | BaseVT = OperandVT; |
22741 | else if (OperandVT != BaseVT) |
22742 | return MVT::INVALID_SIMPLE_VALUE_TYPE; |
22743 | } |
22744 | |
22745 | return BaseVT; |
22746 | } |
22747 | |
22748 | // When converting a <N x iX> vector to <N x i1> to store or use as a scalar |
22749 | // iN, we can use a trick that extracts the i^th bit from the i^th element and |
22750 | // then performs a vector add to get a scalar bitmask. This requires that each |
22751 | // element's bits are either all 1 or all 0. |
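//
// A rough example for a v4i32 comparison result: AND each lane with the mask
// <1, 2, 4, 8>, then VECREDUCE_ADD the lanes, yielding a scalar whose low four
// bits hold the per-lane results.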
22752 | static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { |
22753 | SDLoc DL(N); |
22754 | SDValue ComparisonResult(N, 0); |
22755 | EVT VecVT = ComparisonResult.getValueType(); |
assert(VecVT.isVector() && "Must be a vector type");
22757 | |
22758 | unsigned NumElts = VecVT.getVectorNumElements(); |
22759 | if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) |
22760 | return SDValue(); |
22761 | |
22762 | if (VecVT.getVectorElementType() != MVT::i1 && |
22763 | !DAG.getTargetLoweringInfo().isTypeLegal(VT: VecVT)) |
22764 | return SDValue(); |
22765 | |
22766 | // If we can find the original types to work on instead of a vector of i1, |
22767 | // we can avoid extend/extract conversion instructions. |
22768 | if (VecVT.getVectorElementType() == MVT::i1) { |
22769 | VecVT = tryGetOriginalBoolVectorType(Op: ComparisonResult); |
22770 | if (!VecVT.isSimple()) { |
22771 | unsigned BitsPerElement = std::max(a: 64 / NumElts, b: 8u); // >= 64-bit vector |
22772 | VecVT = MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: BitsPerElement), NumElements: NumElts); |
22773 | } |
22774 | } |
22775 | VecVT = VecVT.changeVectorElementTypeToInteger(); |
22776 | |
22777 | // Large vectors don't map directly to this conversion, so to avoid too many |
22778 | // edge cases, we don't apply it here. The conversion will likely still be |
22779 | // applied later via multiple smaller vectors, whose results are concatenated. |
22780 | if (VecVT.getSizeInBits() > 128) |
22781 | return SDValue(); |
22782 | |
22783 | // Ensure that all elements' bits are either 0s or 1s. |
22784 | ComparisonResult = DAG.getSExtOrTrunc(Op: ComparisonResult, DL, VT: VecVT); |
22785 | |
22786 | SmallVector<SDValue, 16> MaskConstants; |
22787 | if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() && |
22788 | VecVT == MVT::v16i8) { |
22789 | // v16i8 is a special case, as we have 16 entries but only 8 positional bits |
22790 | // per entry. We split it into two halves, apply the mask, zip the halves to |
// create 8x 16-bit values, and then perform the vector reduce.
22792 | for (unsigned Half = 0; Half < 2; ++Half) { |
22793 | for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) { |
22794 | MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i32)); |
22795 | } |
22796 | } |
22797 | SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants); |
22798 | SDValue RepresentativeBits = |
22799 | DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask); |
22800 | |
22801 | SDValue UpperRepresentativeBits = |
22802 | DAG.getNode(Opcode: AArch64ISD::EXT, DL, VT: VecVT, N1: RepresentativeBits, |
22803 | N2: RepresentativeBits, N3: DAG.getConstant(Val: 8, DL, VT: MVT::i32)); |
22804 | SDValue Zipped = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: VecVT, |
22805 | N1: RepresentativeBits, N2: UpperRepresentativeBits); |
22806 | Zipped = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::v8i16, Operand: Zipped); |
22807 | return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: MVT::i16, Operand: Zipped); |
22808 | } |
22809 | |
22810 | // All other vector sizes. |
22811 | unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1); |
22812 | for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) { |
22813 | MaskConstants.push_back(Elt: DAG.getConstant(Val: MaskBit, DL, VT: MVT::i64)); |
22814 | } |
22815 | |
22816 | SDValue Mask = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, Ops: MaskConstants); |
22817 | SDValue RepresentativeBits = |
22818 | DAG.getNode(Opcode: ISD::AND, DL, VT: VecVT, N1: ComparisonResult, N2: Mask); |
22819 | EVT ResultVT = MVT::getIntegerVT(BitWidth: std::max<unsigned>( |
22820 | a: NumElts, b: VecVT.getVectorElementType().getSizeInBits())); |
22821 | return DAG.getNode(Opcode: ISD::VECREDUCE_ADD, DL, VT: ResultVT, Operand: RepresentativeBits); |
22822 | } |
22823 | |
22824 | static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, |
22825 | StoreSDNode *Store) { |
22826 | if (!Store->isTruncatingStore()) |
22827 | return SDValue(); |
22828 | |
22829 | SDLoc DL(Store); |
22830 | SDValue VecOp = Store->getValue(); |
22831 | EVT VT = VecOp.getValueType(); |
22832 | EVT MemVT = Store->getMemoryVT(); |
22833 | |
22834 | if (!MemVT.isVector() || !VT.isVector() || |
22835 | MemVT.getVectorElementType() != MVT::i1) |
22836 | return SDValue(); |
22837 | |
22838 | // If we are storing a vector that we are currently building, let |
22839 | // `scalarizeVectorStore()` handle this more efficiently. |
22840 | if (VecOp.getOpcode() == ISD::BUILD_VECTOR) |
22841 | return SDValue(); |
22842 | |
22843 | VecOp = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MemVT, Operand: VecOp); |
22844 | SDValue VectorBits = vectorToScalarBitmask(N: VecOp.getNode(), DAG); |
22845 | if (!VectorBits) |
22846 | return SDValue(); |
22847 | |
22848 | EVT StoreVT = |
22849 | EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: MemVT.getStoreSizeInBits()); |
22850 | SDValue ExtendedBits = DAG.getZExtOrTrunc(Op: VectorBits, DL, VT: StoreVT); |
22851 | return DAG.getStore(Chain: Store->getChain(), dl: DL, Val: ExtendedBits, Ptr: Store->getBasePtr(), |
22852 | MMO: Store->getMemOperand()); |
22853 | } |
22854 | |
22855 | bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { |
22856 | return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || |
22857 | (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || |
22858 | (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32); |
22859 | } |
22860 | |
22861 | // Combine store (trunc X to <3 x i8>) to sequence of ST1.b. |
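// In practice this emits three single-byte stores of the extracted elements at
// offsets 2, 1 and 0, avoiding a vector store of the awkward v3i8 type.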
22862 | static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, |
22863 | const AArch64Subtarget *Subtarget) { |
22864 | SDValue Value = ST->getValue(); |
22865 | EVT ValueVT = Value.getValueType(); |
22866 | |
22867 | if (ST->isVolatile() || !Subtarget->isLittleEndian() || |
22868 | Value.getOpcode() != ISD::TRUNCATE || |
22869 | ValueVT != EVT::getVectorVT(Context&: *DAG.getContext(), VT: MVT::i8, NumElements: 3)) |
22870 | return SDValue(); |
22871 | |
assert(ST->getOffset().isUndef() && "undef offset expected");
22873 | SDLoc DL(ST); |
22874 | auto WideVT = EVT::getVectorVT( |
22875 | Context&: *DAG.getContext(), |
22876 | VT: Value->getOperand(Num: 0).getValueType().getVectorElementType(), NumElements: 4); |
22877 | SDValue UndefVector = DAG.getUNDEF(VT: WideVT); |
22878 | SDValue WideTrunc = DAG.getNode( |
22879 | Opcode: ISD::INSERT_SUBVECTOR, DL, VT: WideVT, |
22880 | Ops: {UndefVector, Value->getOperand(Num: 0), DAG.getVectorIdxConstant(Val: 0, DL)}); |
22881 | SDValue Cast = DAG.getNode( |
22882 | Opcode: ISD::BITCAST, DL, VT: WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8, |
22883 | Operand: WideTrunc); |
22884 | |
22885 | MachineFunction &MF = DAG.getMachineFunction(); |
22886 | SDValue Chain = ST->getChain(); |
22887 | MachineMemOperand *MMO = ST->getMemOperand(); |
22888 | unsigned IdxScale = WideVT.getScalarSizeInBits() / 8; |
22889 | SDValue E2 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast, |
22890 | N2: DAG.getConstant(Val: 2 * IdxScale, DL, VT: MVT::i64)); |
22891 | TypeSize Offset2 = TypeSize::getFixed(ExactSize: 2); |
22892 | SDValue Ptr2 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset2, DL); |
22893 | Chain = DAG.getStore(Chain, dl: DL, Val: E2, Ptr: Ptr2, MMO: MF.getMachineMemOperand(MMO, Offset: 2, Size: 1)); |
22894 | |
22895 | SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast, |
22896 | N2: DAG.getConstant(Val: 1 * IdxScale, DL, VT: MVT::i64)); |
22897 | TypeSize Offset1 = TypeSize::getFixed(ExactSize: 1); |
22898 | SDValue Ptr1 = DAG.getMemBasePlusOffset(Base: ST->getBasePtr(), Offset: Offset1, DL); |
22899 | Chain = DAG.getStore(Chain, dl: DL, Val: E1, Ptr: Ptr1, MMO: MF.getMachineMemOperand(MMO, Offset: 1, Size: 1)); |
22900 | |
22901 | SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: MVT::i8, N1: Cast, |
22902 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
22903 | Chain = DAG.getStore(Chain, dl: DL, Val: E0, Ptr: ST->getBasePtr(), |
22904 | MMO: MF.getMachineMemOperand(MMO, Offset: 0, Size: 1)); |
22905 | return Chain; |
22906 | } |
22907 | |
22908 | static SDValue performSTORECombine(SDNode *N, |
22909 | TargetLowering::DAGCombinerInfo &DCI, |
22910 | SelectionDAG &DAG, |
22911 | const AArch64Subtarget *Subtarget) { |
22912 | StoreSDNode *ST = cast<StoreSDNode>(Val: N); |
22913 | SDValue Chain = ST->getChain(); |
22914 | SDValue Value = ST->getValue(); |
22915 | SDValue Ptr = ST->getBasePtr(); |
22916 | EVT ValueVT = Value.getValueType(); |
22917 | |
22918 | auto hasValidElementTypeForFPTruncStore = [](EVT VT) { |
22919 | EVT EltVT = VT.getVectorElementType(); |
22920 | return EltVT == MVT::f32 || EltVT == MVT::f64; |
22921 | }; |
22922 | |
22923 | if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget)) |
22924 | return Res; |
22925 | |
22926 | // If this is an FP_ROUND followed by a store, fold this into a truncating |
22927 | // store. We can do this even if this is already a truncstore. |
22928 | // We purposefully don't care about legality of the nodes here as we know |
22929 | // they can be split down into something legal. |
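      | // For example, with a 256-bit minimum SVE vector length,
      | //   store (fp_round v8f64 X), addr
      | // becomes
      | //   truncstore v8f64 X, addr, v8f32
      | // so the rounding is folded into an SVE truncating store rather than being
      | // lowered on its own.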
22930 | if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && |
22931 | Value.getNode()->hasOneUse() && ST->isUnindexed() && |
22932 | Subtarget->useSVEForFixedLengthVectors() && |
22933 | ValueVT.isFixedLengthVector() && |
22934 | ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() && |
22935 | hasValidElementTypeForFPTruncStore(Value.getOperand(i: 0).getValueType())) |
22936 | return DAG.getTruncStore(Chain, dl: SDLoc(N), Val: Value.getOperand(i: 0), Ptr, |
22937 | SVT: ST->getMemoryVT(), MMO: ST->getMemOperand()); |
22938 | |
22939 | if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) |
22940 | return Split; |
22941 | |
22942 | if (Subtarget->supportsAddressTopByteIgnored() && |
22943 | performTBISimplification(Addr: N->getOperand(Num: 2), DCI, DAG)) |
22944 | return SDValue(N, 0); |
22945 | |
22946 | if (SDValue Store = foldTruncStoreOfExt(DAG, N)) |
22947 | return Store; |
22948 | |
22949 | if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, Store: ST)) |
22950 | return Store; |
22951 | |
22952 | if (ST->isTruncatingStore()) { |
22953 | EVT StoreVT = ST->getMemoryVT(); |
22954 | if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: StoreVT)) |
22955 | return SDValue(); |
22956 | if (SDValue Rshrnb = |
22957 | trySimplifySrlAddToRshrnb(Srl: ST->getOperand(Num: 1), DAG, Subtarget)) { |
22958 | return DAG.getTruncStore(Chain: ST->getChain(), dl: ST, Val: Rshrnb, Ptr: ST->getBasePtr(), |
22959 | SVT: StoreVT, MMO: ST->getMemOperand()); |
22960 | } |
22961 | } |
22962 | |
22963 | return SDValue(); |
22964 | } |
22965 | |
22966 | static SDValue performMSTORECombine(SDNode *N, |
22967 | TargetLowering::DAGCombinerInfo &DCI, |
22968 | SelectionDAG &DAG, |
22969 | const AArch64Subtarget *Subtarget) { |
22970 | MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(Val: N); |
22971 | SDValue Value = MST->getValue(); |
22972 | SDValue Mask = MST->getMask(); |
22973 | SDLoc DL(N); |
22974 | |
22975 | // If this is a UZP1 followed by a masked store, fold this into a masked |
22976 | // truncating store. We can do this even if this is already a masked |
22977 | // truncstore. |
22978 | if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() && |
22979 | MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE && |
22980 | Value.getValueType().isInteger()) { |
22981 | Value = Value.getOperand(i: 0); |
22982 | if (Value.getOpcode() == ISD::BITCAST) { |
22983 | EVT HalfVT = |
22984 | Value.getValueType().getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
22985 | EVT InVT = Value.getOperand(i: 0).getValueType(); |
22986 | |
22987 | if (HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()) == InVT) { |
22988 | unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); |
22989 | unsigned PgPattern = Mask->getConstantOperandVal(Num: 0); |
22990 | |
22991 | // Ensure we can double the size of the predicate pattern |
22992 | unsigned NumElts = getNumElementsFromSVEPredPattern(Pattern: PgPattern); |
22993 | if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <= |
22994 | MinSVESize) { |
22995 | Mask = getPTrue(DAG, DL, VT: InVT.changeVectorElementType(EltVT: MVT::i1), |
22996 | Pattern: PgPattern); |
22997 | return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Value.getOperand(i: 0), |
22998 | Base: MST->getBasePtr(), Offset: MST->getOffset(), Mask, |
22999 | MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(), |
23000 | AM: MST->getAddressingMode(), |
23001 | /*IsTruncating=*/true); |
23002 | } |
23003 | } |
23004 | } |
23005 | } |
23006 | |
23007 | if (MST->isTruncatingStore()) { |
23008 | EVT ValueVT = Value->getValueType(ResNo: 0); |
23009 | EVT MemVT = MST->getMemoryVT(); |
23010 | if (!isHalvingTruncateOfLegalScalableType(SrcVT: ValueVT, DstVT: MemVT)) |
23011 | return SDValue(); |
23012 | if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Srl: Value, DAG, Subtarget)) { |
23013 | return DAG.getMaskedStore(Chain: MST->getChain(), dl: DL, Val: Rshrnb, Base: MST->getBasePtr(), |
23014 | Offset: MST->getOffset(), Mask: MST->getMask(), |
23015 | MemVT: MST->getMemoryVT(), MMO: MST->getMemOperand(), |
23016 | AM: MST->getAddressingMode(), IsTruncating: true); |
23017 | } |
23018 | } |
23019 | |
23020 | return SDValue(); |
23021 | } |
23022 | |
23023 | /// \return true if part of the index was folded into the Base. |
23024 | static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, |
23025 | SDLoc DL, SelectionDAG &DAG) { |
23026 | // This function assumes a vector of i64 indices. |
23027 | EVT IndexVT = Index.getValueType(); |
23028 | if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64) |
23029 | return false; |
23030 | |
23031 | // Simplify: |
23032 | // BasePtr = Ptr |
23033 | // Index = X + splat(Offset) |
23034 | // -> |
23035 | // BasePtr = Ptr + Offset * scale. |
23036 | // Index = X |
23037 | if (Index.getOpcode() == ISD::ADD) { |
23038 | if (auto Offset = DAG.getSplatValue(V: Index.getOperand(i: 1))) { |
23039 | Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale); |
23040 | BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset); |
23041 | Index = Index.getOperand(i: 0); |
23042 | return true; |
23043 | } |
23044 | } |
23045 | |
23046 | // Simplify: |
23047 | // BasePtr = Ptr |
23048 | // Index = (X + splat(Offset)) << splat(Shift) |
23049 | // -> |
23050 | //   BasePtr = Ptr + (Offset << Shift) * scale
23051 | //   Index = X << splat(Shift)
23052 | if (Index.getOpcode() == ISD::SHL && |
23053 | Index.getOperand(i: 0).getOpcode() == ISD::ADD) { |
23054 | SDValue Add = Index.getOperand(i: 0); |
23055 | SDValue ShiftOp = Index.getOperand(i: 1); |
23056 | SDValue OffsetOp = Add.getOperand(i: 1); |
23057 | if (auto Shift = DAG.getSplatValue(V: ShiftOp)) |
23058 | if (auto Offset = DAG.getSplatValue(V: OffsetOp)) { |
23059 | Offset = DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::i64, N1: Offset, N2: Shift); |
23060 | Offset = DAG.getNode(Opcode: ISD::MUL, DL, VT: MVT::i64, N1: Offset, N2: Scale); |
23061 | BasePtr = DAG.getNode(Opcode: ISD::ADD, DL, VT: MVT::i64, N1: BasePtr, N2: Offset); |
23062 | Index = DAG.getNode(Opcode: ISD::SHL, DL, VT: Index.getValueType(), |
23063 | N1: Add.getOperand(i: 0), N2: ShiftOp); |
23064 | return true; |
23065 | } |
23066 | } |
23067 | |
23068 | return false; |
23069 | } |
23070 | |
23071 | // Analyse the specified address, returning true if a more optimal addressing
23072 | // mode is available. When returning true, all parameters are updated to
23073 | // reflect their recommended values.
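      | // For example, a gather whose index is step_vector(8) over nxv2i64 can instead
      | // use an i32 index vector, provided the index of the last element
      | // (minimum element count * stride * maximum vscale) still fits in an i32;
      | // narrower indices are cheaper to legalise.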
23074 | static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, |
23075 | SDValue &BasePtr, SDValue &Index, |
23076 | SelectionDAG &DAG) { |
23077 | // Try to iteratively fold parts of the index into the base pointer to |
23078 | // simplify the index as much as possible. |
23079 | bool Changed = false; |
23080 | while (foldIndexIntoBase(BasePtr, Index, Scale: N->getScale(), DL: SDLoc(N), DAG)) |
23081 | Changed = true; |
23082 | |
23083 | // Only consider element types that are pointer sized as smaller types can |
23084 | // be easily promoted. |
23085 | EVT IndexVT = Index.getValueType(); |
23086 | if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64) |
23087 | return Changed; |
23088 | |
23089 | // Can indices be trivially shrunk? |
23090 | EVT DataVT = N->getOperand(Num: 1).getValueType(); |
23091 | // Don't attempt to shrink the index for fixed vectors of 64-bit data since it
23092 | // will later be re-extended to 64 bits in legalization.
23093 | if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64) |
23094 | return Changed; |
23095 | if (ISD::isVectorShrinkable(N: Index.getNode(), NewEltSize: 32, Signed: N->isIndexSigned())) { |
23096 | EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32); |
23097 | Index = DAG.getNode(Opcode: ISD::TRUNCATE, DL: SDLoc(N), VT: NewIndexVT, Operand: Index); |
23098 | return true; |
23099 | } |
23100 | |
23101 | // Match: |
23102 | // Index = step(const) |
23103 | int64_t Stride = 0; |
23104 | if (Index.getOpcode() == ISD::STEP_VECTOR) { |
23105 | Stride = cast<ConstantSDNode>(Val: Index.getOperand(i: 0))->getSExtValue(); |
23106 | } |
23107 | // Match: |
23108 | // Index = step(const) << shift(const) |
23109 | else if (Index.getOpcode() == ISD::SHL && |
23110 | Index.getOperand(i: 0).getOpcode() == ISD::STEP_VECTOR) { |
23111 | SDValue RHS = Index.getOperand(i: 1); |
23112 | if (auto *Shift = |
23113 | dyn_cast_or_null<ConstantSDNode>(Val: DAG.getSplatValue(V: RHS))) { |
23114 | int64_t Step = (int64_t)Index.getOperand(i: 0).getConstantOperandVal(i: 1); |
23115 | Stride = Step << Shift->getZExtValue(); |
23116 | } |
23117 | } |
23118 | |
23119 | // Return early because no supported pattern is found. |
23120 | if (Stride == 0) |
23121 | return Changed; |
23122 | |
23123 | if (Stride < std::numeric_limits<int32_t>::min() || |
23124 | Stride > std::numeric_limits<int32_t>::max()) |
23125 | return Changed; |
23126 | |
23127 | const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); |
23128 | unsigned MaxVScale = |
23129 | Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock; |
23130 | int64_t LastElementOffset = |
23131 | IndexVT.getVectorMinNumElements() * Stride * MaxVScale; |
23132 | |
23133 | if (LastElementOffset < std::numeric_limits<int32_t>::min() || |
23134 | LastElementOffset > std::numeric_limits<int32_t>::max()) |
23135 | return Changed; |
23136 | |
23137 | EVT NewIndexVT = IndexVT.changeVectorElementType(EltVT: MVT::i32); |
23138 | // Stride does not scale explicitly by 'Scale', because it happens in |
23139 | // the gather/scatter addressing mode. |
23140 | Index = DAG.getStepVector(DL: SDLoc(N), ResVT: NewIndexVT, StepVal: APInt(32, Stride)); |
23141 | return true; |
23142 | } |
23143 | |
23144 | static SDValue performMaskedGatherScatterCombine( |
23145 | SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { |
23146 | MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(Val: N); |
23147 | assert(MGS && "Can only combine gather load or scatter store nodes");
23148 | |
23149 | if (!DCI.isBeforeLegalize()) |
23150 | return SDValue(); |
23151 | |
23152 | SDLoc DL(MGS); |
23153 | SDValue Chain = MGS->getChain(); |
23154 | SDValue Scale = MGS->getScale(); |
23155 | SDValue Index = MGS->getIndex(); |
23156 | SDValue Mask = MGS->getMask(); |
23157 | SDValue BasePtr = MGS->getBasePtr(); |
23158 | ISD::MemIndexType IndexType = MGS->getIndexType(); |
23159 | |
23160 | if (!findMoreOptimalIndexType(N: MGS, BasePtr, Index, DAG)) |
23161 | return SDValue(); |
23162 | |
23163 | // Here we catch such cases early and change MGATHER's IndexType to allow |
23164 | // the use of an Index that's more legalisation friendly. |
23165 | if (auto *MGT = dyn_cast<MaskedGatherSDNode>(Val: MGS)) { |
23166 | SDValue PassThru = MGT->getPassThru(); |
23167 | SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale}; |
23168 | return DAG.getMaskedGather( |
23169 | VTs: DAG.getVTList(VT1: N->getValueType(ResNo: 0), VT2: MVT::Other), MemVT: MGT->getMemoryVT(), dl: DL, |
23170 | Ops, MMO: MGT->getMemOperand(), IndexType, ExtTy: MGT->getExtensionType()); |
23171 | } |
23172 | auto *MSC = cast<MaskedScatterSDNode>(Val: MGS); |
23173 | SDValue Data = MSC->getValue(); |
23174 | SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale}; |
23175 | return DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT: MSC->getMemoryVT(), dl: DL, |
23176 | Ops, MMO: MSC->getMemOperand(), IndexType, |
23177 | IsTruncating: MSC->isTruncatingStore()); |
23178 | } |
23179 | |
23180 | /// Target-specific DAG combine function for NEON load/store intrinsics |
23181 | /// to merge base address updates. |
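      | /// For example, an aarch64_neon_ld2 of two v4i32 vectors followed by
      | ///   NewAddr = ADD Addr, #32
      | /// is rewritten as a single LD2post node that also produces the incremented
      | /// address, provided the constant increment matches the total size accessed
      | /// (2 x 16 bytes here); non-constant increments keep the register write-back
      | /// form.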
23182 | static SDValue performNEONPostLDSTCombine(SDNode *N, |
23183 | TargetLowering::DAGCombinerInfo &DCI, |
23184 | SelectionDAG &DAG) { |
23185 | if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) |
23186 | return SDValue(); |
23187 | |
23188 | unsigned AddrOpIdx = N->getNumOperands() - 1; |
23189 | SDValue Addr = N->getOperand(Num: AddrOpIdx); |
23190 | |
23191 | // Search for a use of the address operand that is an increment. |
23192 | for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), |
23193 | UE = Addr.getNode()->use_end(); UI != UE; ++UI) { |
23194 | SDNode *User = *UI; |
23195 | if (User->getOpcode() != ISD::ADD || |
23196 | UI.getUse().getResNo() != Addr.getResNo()) |
23197 | continue; |
23198 | |
23199 | // Check that the add is independent of the load/store. Otherwise, folding |
23200 | // it would create a cycle. |
23201 | SmallPtrSet<const SDNode *, 32> Visited; |
23202 | SmallVector<const SDNode *, 16> Worklist; |
23203 | Visited.insert(Ptr: Addr.getNode()); |
23204 | Worklist.push_back(Elt: N); |
23205 | Worklist.push_back(Elt: User); |
23206 | if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || |
23207 | SDNode::hasPredecessorHelper(N: User, Visited, Worklist)) |
23208 | continue; |
23209 | |
23210 | // Find the new opcode for the updating load/store. |
23211 | bool IsStore = false; |
23212 | bool IsLaneOp = false; |
23213 | bool IsDupOp = false; |
23214 | unsigned NewOpc = 0; |
23215 | unsigned NumVecs = 0; |
23216 | unsigned IntNo = N->getConstantOperandVal(Num: 1); |
23217 | switch (IntNo) { |
23218 | default: llvm_unreachable("unexpected intrinsic for Neon base update");
23219 | case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; |
23220 | NumVecs = 2; break; |
23221 | case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; |
23222 | NumVecs = 3; break; |
23223 | case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; |
23224 | NumVecs = 4; break; |
23225 | case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; |
23226 | NumVecs = 2; IsStore = true; break; |
23227 | case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; |
23228 | NumVecs = 3; IsStore = true; break; |
23229 | case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; |
23230 | NumVecs = 4; IsStore = true; break; |
23231 | case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; |
23232 | NumVecs = 2; break; |
23233 | case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; |
23234 | NumVecs = 3; break; |
23235 | case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; |
23236 | NumVecs = 4; break; |
23237 | case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; |
23238 | NumVecs = 2; IsStore = true; break; |
23239 | case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; |
23240 | NumVecs = 3; IsStore = true; break; |
23241 | case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; |
23242 | NumVecs = 4; IsStore = true; break; |
23243 | case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; |
23244 | NumVecs = 2; IsDupOp = true; break; |
23245 | case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; |
23246 | NumVecs = 3; IsDupOp = true; break; |
23247 | case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; |
23248 | NumVecs = 4; IsDupOp = true; break; |
23249 | case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; |
23250 | NumVecs = 2; IsLaneOp = true; break; |
23251 | case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; |
23252 | NumVecs = 3; IsLaneOp = true; break; |
23253 | case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; |
23254 | NumVecs = 4; IsLaneOp = true; break; |
23255 | case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; |
23256 | NumVecs = 2; IsStore = true; IsLaneOp = true; break; |
23257 | case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; |
23258 | NumVecs = 3; IsStore = true; IsLaneOp = true; break; |
23259 | case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; |
23260 | NumVecs = 4; IsStore = true; IsLaneOp = true; break; |
23261 | } |
23262 | |
23263 | EVT VecTy; |
23264 | if (IsStore) |
23265 | VecTy = N->getOperand(Num: 2).getValueType(); |
23266 | else |
23267 | VecTy = N->getValueType(ResNo: 0); |
23268 | |
23269 | // If the increment is a constant, it must match the memory ref size. |
23270 | SDValue Inc = User->getOperand(Num: User->getOperand(Num: 0) == Addr ? 1 : 0); |
23271 | if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Val: Inc.getNode())) { |
23272 | uint32_t IncVal = CInc->getZExtValue(); |
23273 | unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; |
23274 | if (IsLaneOp || IsDupOp) |
23275 | NumBytes /= VecTy.getVectorNumElements(); |
23276 | if (IncVal != NumBytes) |
23277 | continue; |
23278 | Inc = DAG.getRegister(Reg: AArch64::XZR, VT: MVT::i64); |
23279 | } |
23280 | SmallVector<SDValue, 8> Ops; |
23281 | Ops.push_back(Elt: N->getOperand(Num: 0)); // Incoming chain |
23282 | // Load lane and store have vector list as input. |
23283 | if (IsLaneOp || IsStore) |
23284 | for (unsigned i = 2; i < AddrOpIdx; ++i) |
23285 | Ops.push_back(Elt: N->getOperand(Num: i)); |
23286 | Ops.push_back(Elt: Addr); // Base register |
23287 | Ops.push_back(Elt: Inc); |
23288 | |
23289 | // Return Types. |
23290 | EVT Tys[6]; |
23291 | unsigned NumResultVecs = (IsStore ? 0 : NumVecs); |
23292 | unsigned n; |
23293 | for (n = 0; n < NumResultVecs; ++n) |
23294 | Tys[n] = VecTy; |
23295 | Tys[n++] = MVT::i64; // Type of write back register |
23296 | Tys[n] = MVT::Other; // Type of the chain |
23297 | SDVTList SDTys = DAG.getVTList(VTs: ArrayRef(Tys, NumResultVecs + 2)); |
23298 | |
23299 | MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(Val: N); |
23300 | SDValue UpdN = DAG.getMemIntrinsicNode(Opcode: NewOpc, dl: SDLoc(N), VTList: SDTys, Ops, |
23301 | MemVT: MemInt->getMemoryVT(), |
23302 | MMO: MemInt->getMemOperand()); |
23303 | |
23304 | // Update the uses. |
23305 | std::vector<SDValue> NewResults; |
23306 | for (unsigned i = 0; i < NumResultVecs; ++i) { |
23307 | NewResults.push_back(x: SDValue(UpdN.getNode(), i)); |
23308 | } |
23309 | NewResults.push_back(x: SDValue(UpdN.getNode(), NumResultVecs + 1)); |
23310 | DCI.CombineTo(N, To: NewResults); |
23311 | DCI.CombineTo(N: User, Res: SDValue(UpdN.getNode(), NumResultVecs)); |
23312 | |
23313 | break; |
23314 | } |
23315 | return SDValue(); |
23316 | } |
23317 | |
23318 | // Checks to see if the value is the prescribed width and returns information |
23319 | // about its extension mode. |
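      | // For example, an extending i8 load reports its extension type, an AssertZext
      | // to i8 reports ZEXTLOAD, and a constant is accepted if its magnitude is below
      | // 2^(width - 1); anything else is rejected.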
23320 | static |
23321 | bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { |
23322 | ExtType = ISD::NON_EXTLOAD; |
23323 | switch(V.getNode()->getOpcode()) { |
23324 | default: |
23325 | return false; |
23326 | case ISD::LOAD: { |
23327 | LoadSDNode *LoadNode = cast<LoadSDNode>(Val: V.getNode()); |
23328 | if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) |
23329 | || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { |
23330 | ExtType = LoadNode->getExtensionType(); |
23331 | return true; |
23332 | } |
23333 | return false; |
23334 | } |
23335 | case ISD::AssertSext: { |
23336 | VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1)); |
23337 | if ((TypeNode->getVT() == MVT::i8 && width == 8) |
23338 | || (TypeNode->getVT() == MVT::i16 && width == 16)) { |
23339 | ExtType = ISD::SEXTLOAD; |
23340 | return true; |
23341 | } |
23342 | return false; |
23343 | } |
23344 | case ISD::AssertZext: { |
23345 | VTSDNode *TypeNode = cast<VTSDNode>(Val: V.getNode()->getOperand(Num: 1)); |
23346 | if ((TypeNode->getVT() == MVT::i8 && width == 8) |
23347 | || (TypeNode->getVT() == MVT::i16 && width == 16)) { |
23348 | ExtType = ISD::ZEXTLOAD; |
23349 | return true; |
23350 | } |
23351 | return false; |
23352 | } |
23353 | case ISD::Constant: |
23354 | case ISD::TargetConstant: { |
23355 | return std::abs(i: cast<ConstantSDNode>(Val: V.getNode())->getSExtValue()) < |
23356 | 1LL << (width - 1); |
23357 | } |
23358 | } |
23359 | |
23360 | return true; |
23361 | } |
23362 | |
23363 | // This function does a whole lot of voodoo to determine if the tests are |
23364 | // equivalent without and with a mask. Essentially what happens is that given a |
23365 | // DAG resembling: |
23366 | // |
23367 | //  +-------------+   +-------------+   +-------------+   +-------------+
23368 | //  |    Input    |   | AddConstant |   | CompConstant|   |     CC      |
23369 | //  +-------------+   +-------------+   +-------------+   +-------------+
23370 | //         |                 |                 |                 |
23371 | //         V                 V                 |                 |
23372 | //  +-------------------------------+ +------+ |                 |
23373 | //  |              ADD              | | 0xff | |                 |
23374 | //  +-------------------------------+ +------+ |                 |
23375 | //                  |                    |     |                 |
23376 | //                  V                    V     |                 |
23377 | //              +------------------------------+ |                 |
23378 | //              |             AND              | |                 |
23379 | //              +------------------------------+ |                 |
23380 | //                             |                 |                 |
23381 | //                             |                 |                 |
23382 | //                             |                 |                 |
23383 | //                             V                 V                 V
23384 | //                         +-------------------------------------------+
23385 | //                         |                    CMP                    |
23386 | //                         +-------------------------------------------+
23387 | // |
23388 | // The AND node may be safely removed for some combinations of inputs. In |
23389 | // particular we need to take into account the extension type of the Input, |
23390 | // the exact values of AddConstant, CompConstant, and CC, along with the nominal |
23391 | // width of the input (this can work for any width inputs; the above graph is
23392 | // specific to 8 bits).
23393 | // |
23394 | // The specific equations were worked out by generating output tables for each |
23395 | // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
23396 | // problem was simplified by working with 4 bit inputs, which means we only |
23397 | // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero |
23398 | // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 |
23399 | // patterns present in both extensions (0,7). For every distinct set of |
23400 | // AddConstant and CompConstant bit patterns we can consider the masked and
23401 | // unmasked versions to be equivalent if the result of this function is true for
23402 | // all 16 distinct bit patterns of the current extension type of Input (w0).
23403 | // |
23404 | // sub w8, w0, w1 |
23405 | // and w10, w8, #0x0f |
23406 | // cmp w8, w2 |
23407 | // cset w9, AArch64CC |
23408 | // cmp w10, w2 |
23409 | // cset w11, AArch64CC |
23410 | // cmp w9, w11 |
23411 | // cset w0, eq |
23412 | // ret |
23413 | // |
23414 | // Since the above function shows when the outputs are equivalent it defines
23415 | // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
23416 | // would be expensive to run during compiles. The equations below were written
23417 | // in a test harness that confirmed they gave outputs equivalent to the above
23418 | // function for all inputs, so they can be used to determine whether the removal
23419 | // is legal instead.
23420 | //
23421 | // isEquivalentMaskless() is the code for testing whether the AND can be removed,
23422 | // factored out of the DAG recognition because the DAG can take several forms.
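      | //
      | // For example, with a zero-extended 8-bit input and AddConstant == 0, the ADD
      | // result already fits in 8 bits, so ANDing with 0xff cannot change the value
      | // being compared and the mask is removable for the condition codes handled
      | // below.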
23423 | |
23424 | static bool isEquivalentMaskless(unsigned CC, unsigned width, |
23425 | ISD::LoadExtType ExtType, int AddConstant, |
23426 | int CompConstant) { |
23427 | // By being careful about our equations and only writing them in terms of
23428 | // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
23429 | // make them generally applicable to all bit widths. |
23430 | int MaxUInt = (1 << width); |
23431 | |
23432 | // For the purposes of these comparisons sign extending the type is |
23433 | // equivalent to zero extending the add and displacing it by half the integer |
23434 | // width. Provided we are careful and make sure our equations are valid over |
23435 | // the whole range we can just adjust the input and avoid writing equations |
23436 | // for sign extended inputs. |
23437 | if (ExtType == ISD::SEXTLOAD) |
23438 | AddConstant -= (1 << (width-1)); |
23439 | |
23440 | switch(CC) { |
23441 | case AArch64CC::LE: |
23442 | case AArch64CC::GT: |
23443 | if ((AddConstant == 0) || |
23444 | (CompConstant == MaxUInt - 1 && AddConstant < 0) || |
23445 | (AddConstant >= 0 && CompConstant < 0) || |
23446 | (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) |
23447 | return true; |
23448 | break; |
23449 | case AArch64CC::LT: |
23450 | case AArch64CC::GE: |
23451 | if ((AddConstant == 0) || |
23452 | (AddConstant >= 0 && CompConstant <= 0) || |
23453 | (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) |
23454 | return true; |
23455 | break; |
23456 | case AArch64CC::HI: |
23457 | case AArch64CC::LS: |
23458 | if ((AddConstant >= 0 && CompConstant < 0) || |
23459 | (AddConstant <= 0 && CompConstant >= -1 && |
23460 | CompConstant < AddConstant + MaxUInt)) |
23461 | return true; |
23462 | break; |
23463 | case AArch64CC::PL: |
23464 | case AArch64CC::MI: |
23465 | if ((AddConstant == 0) || |
23466 | (AddConstant > 0 && CompConstant <= 0) || |
23467 | (AddConstant < 0 && CompConstant <= AddConstant)) |
23468 | return true; |
23469 | break; |
23470 | case AArch64CC::LO: |
23471 | case AArch64CC::HS: |
23472 | if ((AddConstant >= 0 && CompConstant <= 0) || |
23473 | (AddConstant <= 0 && CompConstant >= 0 && |
23474 | CompConstant <= AddConstant + MaxUInt)) |
23475 | return true; |
23476 | break; |
23477 | case AArch64CC::EQ: |
23478 | case AArch64CC::NE: |
23479 | if ((AddConstant > 0 && CompConstant < 0) || |
23480 | (AddConstant < 0 && CompConstant >= 0 && |
23481 | CompConstant < AddConstant + MaxUInt) || |
23482 | (AddConstant >= 0 && CompConstant >= 0 && |
23483 | CompConstant >= AddConstant) || |
23484 | (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) |
23485 | return true; |
23486 | break; |
23487 | case AArch64CC::VS: |
23488 | case AArch64CC::VC: |
23489 | case AArch64CC::AL: |
23490 | case AArch64CC::NV: |
23491 | return true; |
23492 | case AArch64CC::Invalid: |
23493 | break; |
23494 | } |
23495 | |
23496 | return false; |
23497 | } |
23498 | |
23499 | // (X & C) >u Mask --> (X & (C & ~Mask)) != 0
23500 | // (X & C) <u Pow2 --> (X & (C & ~(Pow2 - 1))) == 0
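      | // For example, (X & 0xf0) >u 0x7 is true iff any of bits 7..4 of X are set,
      | // because X & 0xf0 is always a multiple of 16; it therefore becomes
      | // (ANDS X, #0xf0) followed by an NE flag check.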
23501 | static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, |
23502 | SDNode *AndNode, SelectionDAG &DAG, |
23503 | unsigned CCIndex, unsigned CmpIndex, |
23504 | unsigned CC) { |
23505 | ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(Val: SubsNode->getOperand(Num: 1)); |
23506 | if (!SubsC) |
23507 | return SDValue(); |
23508 | |
23509 | APInt SubsAP = SubsC->getAPIntValue(); |
23510 | if (CC == AArch64CC::HI) { |
23511 | if (!SubsAP.isMask()) |
23512 | return SDValue(); |
23513 | } else if (CC == AArch64CC::LO) { |
23514 | if (!SubsAP.isPowerOf2()) |
23515 | return SDValue(); |
23516 | } else |
23517 | return SDValue(); |
23518 | |
23519 | ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1)); |
23520 | if (!AndC) |
23521 | return SDValue(); |
23522 | |
23523 | APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1); |
23524 | |
23525 | SDLoc DL(N); |
23526 | APInt AndSMask = (~MaskAP) & AndC->getAPIntValue(); |
23527 | SDValue ANDS = DAG.getNode( |
23528 | Opcode: AArch64ISD::ANDS, DL, VTList: SubsNode->getVTList(), N1: AndNode->getOperand(Num: 0), |
23529 | N2: DAG.getConstant(Val: AndSMask, DL, VT: SubsC->getValueType(ResNo: 0))); |
23530 | SDValue AArch64_CC = |
23531 | DAG.getConstant(Val: CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL, |
23532 | VT: N->getOperand(Num: CCIndex)->getValueType(ResNo: 0)); |
23533 | |
23534 | // For now, only performCSELCombine and performBRCONDCombine call this
23535 | // function, and both of them pass 2 for CCIndex and 3 for CmpIndex with 4
23536 | // operands. So just initialize the ops directly to simplify the code. If we
23537 | // ever have a case with a different CCIndex or CmpIndex, this will need to be
23538 | // rewritten with a loop.
23539 | // TODO: Do we need to assert that the number of operands is 4 here?
23540 | assert((CCIndex == 2 && CmpIndex == 3) && |
23541 | "Expected CCIndex to be 2 and CmpIndex to be 3.");
23542 | SDValue Ops[] = {N->getOperand(Num: 0), N->getOperand(Num: 1), AArch64_CC, |
23543 | ANDS.getValue(R: 1)}; |
23544 | return DAG.getNode(Opcode: N->getOpcode(), DL: N, VTList: N->getVTList(), Ops); |
23545 | } |
23546 | |
23547 | static |
23548 | SDValue performCONDCombine(SDNode *N, |
23549 | TargetLowering::DAGCombinerInfo &DCI, |
23550 | SelectionDAG &DAG, unsigned CCIndex, |
23551 | unsigned CmpIndex) { |
23552 | unsigned CC = cast<ConstantSDNode>(Val: N->getOperand(Num: CCIndex))->getSExtValue(); |
23553 | SDNode *SubsNode = N->getOperand(Num: CmpIndex).getNode(); |
23554 | unsigned CondOpcode = SubsNode->getOpcode(); |
23555 | |
23556 | if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(Value: 0) || |
23557 | !SubsNode->hasOneUse()) |
23558 | return SDValue(); |
23559 | |
23560 | // There is a SUBS feeding this condition. Is it fed by a mask we can |
23561 | // use? |
23562 | |
23563 | SDNode *AndNode = SubsNode->getOperand(Num: 0).getNode(); |
23564 | unsigned MaskBits = 0; |
23565 | |
23566 | if (AndNode->getOpcode() != ISD::AND) |
23567 | return SDValue(); |
23568 | |
23569 | if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex, |
23570 | CmpIndex, CC)) |
23571 | return Val; |
23572 | |
23573 | if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val: AndNode->getOperand(Num: 1))) { |
23574 | uint32_t CNV = CN->getZExtValue(); |
23575 | if (CNV == 255) |
23576 | MaskBits = 8; |
23577 | else if (CNV == 65535) |
23578 | MaskBits = 16; |
23579 | } |
23580 | |
23581 | if (!MaskBits) |
23582 | return SDValue(); |
23583 | |
23584 | SDValue AddValue = AndNode->getOperand(Num: 0); |
23585 | |
23586 | if (AddValue.getOpcode() != ISD::ADD) |
23587 | return SDValue(); |
23588 | |
23589 | // The basic dag structure is correct, grab the inputs and validate them. |
23590 | |
23591 | SDValue AddInputValue1 = AddValue.getNode()->getOperand(Num: 0); |
23592 | SDValue AddInputValue2 = AddValue.getNode()->getOperand(Num: 1); |
23593 | SDValue SubsInputValue = SubsNode->getOperand(Num: 1); |
23594 | |
23595 | // The mask is present and the provenance of all the values is a smaller type,
23596 | // so let's see if the mask is superfluous.
23597 | |
23598 | if (!isa<ConstantSDNode>(Val: AddInputValue2.getNode()) || |
23599 | !isa<ConstantSDNode>(Val: SubsInputValue.getNode())) |
23600 | return SDValue(); |
23601 | |
23602 | ISD::LoadExtType ExtType; |
23603 | |
23604 | if (!checkValueWidth(V: SubsInputValue, width: MaskBits, ExtType) || |
23605 | !checkValueWidth(V: AddInputValue2, width: MaskBits, ExtType) || |
23606 | !checkValueWidth(V: AddInputValue1, width: MaskBits, ExtType))
23607 | return SDValue(); |
23608 | |
23609 | if(!isEquivalentMaskless(CC, width: MaskBits, ExtType, |
23610 | AddConstant: cast<ConstantSDNode>(Val: AddInputValue2.getNode())->getSExtValue(), |
23611 | CompConstant: cast<ConstantSDNode>(Val: SubsInputValue.getNode())->getSExtValue())) |
23612 | return SDValue(); |
23613 | |
23614 | // The AND is not necessary, remove it. |
23615 | |
23616 | SDVTList VTs = DAG.getVTList(VT1: SubsNode->getValueType(ResNo: 0), |
23617 | VT2: SubsNode->getValueType(ResNo: 1)); |
23618 | SDValue Ops[] = { AddValue, SubsNode->getOperand(Num: 1) }; |
23619 | |
23620 | SDValue NewValue = DAG.getNode(Opcode: CondOpcode, DL: SDLoc(SubsNode), VTList: VTs, Ops); |
23621 | DAG.ReplaceAllUsesWith(From: SubsNode, To: NewValue.getNode()); |
23622 | |
23623 | return SDValue(N, 0); |
23624 | } |
23625 | |
23626 | // Optimize compare with zero and branch. |
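      | // For example, a conditional branch on the NE flag of (SUBS x, #0) becomes
      | // (CBNZ x, dest), and the EQ case becomes (CBZ x, dest), removing the
      | // flag-setting compare.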
23627 | static SDValue performBRCONDCombine(SDNode *N, |
23628 | TargetLowering::DAGCombinerInfo &DCI, |
23629 | SelectionDAG &DAG) { |
23630 | MachineFunction &MF = DAG.getMachineFunction(); |
23631 | // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions |
23632 | // will not be produced, as they are conditional branch instructions that do |
23633 | // not set flags. |
23634 | if (MF.getFunction().hasFnAttribute(Kind: Attribute::SpeculativeLoadHardening)) |
23635 | return SDValue(); |
23636 | |
23637 | if (SDValue NV = performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3)) |
23638 | N = NV.getNode(); |
23639 | SDValue Chain = N->getOperand(Num: 0); |
23640 | SDValue Dest = N->getOperand(Num: 1); |
23641 | SDValue CCVal = N->getOperand(Num: 2); |
23642 | SDValue Cmp = N->getOperand(Num: 3); |
23643 | |
23644 | assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
23645 | unsigned CC = CCVal->getAsZExtVal(); |
23646 | if (CC != AArch64CC::EQ && CC != AArch64CC::NE) |
23647 | return SDValue(); |
23648 | |
23649 | unsigned CmpOpc = Cmp.getOpcode(); |
23650 | if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) |
23651 | return SDValue(); |
23652 | |
23653 | // Only attempt folding if there is only one use of the flag and no use of the |
23654 | // value. |
23655 | if (!Cmp->hasNUsesOfValue(NUses: 0, Value: 0) || !Cmp->hasNUsesOfValue(NUses: 1, Value: 1)) |
23656 | return SDValue(); |
23657 | |
23658 | SDValue LHS = Cmp.getOperand(i: 0); |
23659 | SDValue RHS = Cmp.getOperand(i: 1); |
23660 | |
23661 | assert(LHS.getValueType() == RHS.getValueType() && |
23662 | "Expected the value type to be the same for both operands!");
23663 | if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) |
23664 | return SDValue(); |
23665 | |
23666 | if (isNullConstant(V: LHS)) |
23667 | std::swap(a&: LHS, b&: RHS); |
23668 | |
23669 | if (!isNullConstant(V: RHS)) |
23670 | return SDValue(); |
23671 | |
23672 | if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || |
23673 | LHS.getOpcode() == ISD::SRL) |
23674 | return SDValue(); |
23675 | |
23676 | // Fold the compare into the branch instruction. |
23677 | SDValue BR; |
23678 | if (CC == AArch64CC::EQ) |
23679 | BR = DAG.getNode(Opcode: AArch64ISD::CBZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest); |
23680 | else |
23681 | BR = DAG.getNode(Opcode: AArch64ISD::CBNZ, DL: SDLoc(N), VT: MVT::Other, N1: Chain, N2: LHS, N3: Dest); |
23682 | |
23683 | // Do not add new nodes to DAG combiner worklist. |
23684 | DCI.CombineTo(N, Res: BR, AddTo: false); |
23685 | |
23686 | return SDValue(); |
23687 | } |
23688 | |
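      | // Fold (CSEL 0, cttz(X), eq(X, 0)) and (CSEL cttz(X), 0, ne(X, 0)) into
      | // (AND cttz(X), BitWidth - 1). AArch64 lowers CTTZ as CLZ(RBIT(X)), which
      | // already produces BitWidth for a zero input, so masking with BitWidth - 1
      | // yields 0 for X == 0 and leaves every other result unchanged.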
23689 | static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) { |
23690 | unsigned CC = N->getConstantOperandVal(Num: 2); |
23691 | SDValue SUBS = N->getOperand(Num: 3); |
23692 | SDValue Zero, CTTZ; |
23693 | |
23694 | if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) { |
23695 | Zero = N->getOperand(Num: 0); |
23696 | CTTZ = N->getOperand(Num: 1); |
23697 | } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) { |
23698 | Zero = N->getOperand(Num: 1); |
23699 | CTTZ = N->getOperand(Num: 0); |
23700 | } else |
23701 | return SDValue(); |
23702 | |
23703 | if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) || |
23704 | (CTTZ.getOpcode() == ISD::TRUNCATE && |
23705 | CTTZ.getOperand(i: 0).getOpcode() != ISD::CTTZ)) |
23706 | return SDValue(); |
23707 | |
23708 | assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) && |
23709 | "Illegal type in CTTZ folding");
23710 | |
23711 | if (!isNullConstant(V: Zero) || !isNullConstant(V: SUBS.getOperand(i: 1))) |
23712 | return SDValue(); |
23713 | |
23714 | SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE |
23715 | ? CTTZ.getOperand(i: 0).getOperand(i: 0) |
23716 | : CTTZ.getOperand(i: 0); |
23717 | |
23718 | if (X != SUBS.getOperand(i: 0)) |
23719 | return SDValue(); |
23720 | |
23721 | unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE |
23722 | ? CTTZ.getOperand(i: 0).getValueSizeInBits() |
23723 | : CTTZ.getValueSizeInBits(); |
23724 | SDValue BitWidthMinusOne = |
23725 | DAG.getConstant(Val: BitWidth - 1, DL: SDLoc(N), VT: CTTZ.getValueType()); |
23726 | return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: CTTZ.getValueType(), N1: CTTZ, |
23727 | N2: BitWidthMinusOne); |
23728 | } |
23729 | |
23730 | // (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond) |
23731 | // (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond) |
23732 | // Where x and y are constants and x != y |
23733 | |
23734 | // (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond) |
23735 | // (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond) |
23736 | // Where x and y are constants and x != y |
23737 | static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) { |
23738 | SDValue L = Op->getOperand(Num: 0); |
23739 | SDValue R = Op->getOperand(Num: 1); |
23740 | AArch64CC::CondCode OpCC = |
23741 | static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(Num: 2)); |
23742 | |
23743 | SDValue OpCmp = Op->getOperand(Num: 3); |
23744 | if (!isCMP(Op: OpCmp)) |
23745 | return SDValue(); |
23746 | |
23747 | SDValue CmpLHS = OpCmp.getOperand(i: 0); |
23748 | SDValue CmpRHS = OpCmp.getOperand(i: 1); |
23749 | |
23750 | if (CmpRHS.getOpcode() == AArch64ISD::CSEL) |
23751 | std::swap(a&: CmpLHS, b&: CmpRHS); |
23752 | else if (CmpLHS.getOpcode() != AArch64ISD::CSEL) |
23753 | return SDValue(); |
23754 | |
23755 | SDValue X = CmpLHS->getOperand(Num: 0); |
23756 | SDValue Y = CmpLHS->getOperand(Num: 1); |
23757 | if (!isa<ConstantSDNode>(Val: X) || !isa<ConstantSDNode>(Val: Y) || X == Y) { |
23758 | return SDValue(); |
23759 | } |
23760 | |
23761 | // If one of the constants is an opaque constant, the x and y SDNodes are still
23762 | // different but the real values may be the same. So check the APInt values here
23763 | // to make sure the code is correct.
23764 | ConstantSDNode *CX = cast<ConstantSDNode>(Val&: X); |
23765 | ConstantSDNode *CY = cast<ConstantSDNode>(Val&: Y); |
23766 | if (CX->getAPIntValue() == CY->getAPIntValue()) |
23767 | return SDValue(); |
23768 | |
23769 | AArch64CC::CondCode CC = |
23770 | static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(Num: 2)); |
23771 | SDValue Cond = CmpLHS->getOperand(Num: 3); |
23772 | |
23773 | if (CmpRHS == Y) |
23774 | CC = AArch64CC::getInvertedCondCode(Code: CC); |
23775 | else if (CmpRHS != X) |
23776 | return SDValue(); |
23777 | |
23778 | if (OpCC == AArch64CC::NE) |
23779 | CC = AArch64CC::getInvertedCondCode(Code: CC); |
23780 | else if (OpCC != AArch64CC::EQ) |
23781 | return SDValue(); |
23782 | |
23783 | SDLoc DL(Op); |
23784 | EVT VT = Op->getValueType(ResNo: 0); |
23785 | |
23786 | SDValue CCValue = DAG.getConstant(Val: CC, DL, VT: MVT::i32); |
23787 | return DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT, N1: L, N2: R, N3: CCValue, N4: Cond); |
23788 | } |
23789 | |
23790 | // Optimize CSEL instructions |
23791 | static SDValue performCSELCombine(SDNode *N, |
23792 | TargetLowering::DAGCombinerInfo &DCI, |
23793 | SelectionDAG &DAG) { |
23794 | // CSEL x, x, cc -> x |
23795 | if (N->getOperand(Num: 0) == N->getOperand(Num: 1)) |
23796 | return N->getOperand(Num: 0); |
23797 | |
23798 | if (SDValue R = foldCSELOfCSEL(Op: N, DAG)) |
23799 | return R; |
23800 | |
23801 | // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1 |
23802 | // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1 |
23803 | if (SDValue Folded = foldCSELofCTTZ(N, DAG)) |
23804 | return Folded; |
23805 | |
23806 | return performCONDCombine(N, DCI, DAG, CCIndex: 2, CmpIndex: 3); |
23807 | } |
23808 | |
23809 | // Try to re-use an already extended operand of a vector SetCC feeding an
23810 | // extended select. Doing so avoids requiring another full extension of the
23811 | // SET_CC result when lowering the select. |
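      | //
      | // For example, for (vselect (setcc v8i8 X, splat(C)), v8i16 A, v8i16 B) where
      | // (sign_extend X to v8i16) already exists in the DAG, the setcc is rebuilt to
      | // compare the extended operands, so its result no longer needs to be widened
      | // separately when the vselect is lowered.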
23812 | static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { |
23813 | EVT Op0MVT = Op->getOperand(Num: 0).getValueType(); |
23814 | if (!Op0MVT.isVector() || Op->use_empty()) |
23815 | return SDValue(); |
23816 | |
23817 | // Make sure that all uses of Op are VSELECTs with matching result types where
23818 | // the result type has a larger element type than the SetCC operand. |
23819 | SDNode *FirstUse = *Op->use_begin(); |
23820 | if (FirstUse->getOpcode() != ISD::VSELECT) |
23821 | return SDValue(); |
23822 | EVT UseMVT = FirstUse->getValueType(ResNo: 0); |
23823 | if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits()) |
23824 | return SDValue(); |
23825 | if (any_of(Range: Op->uses(), P: [&UseMVT](const SDNode *N) { |
23826 | return N->getOpcode() != ISD::VSELECT || N->getValueType(ResNo: 0) != UseMVT; |
23827 | })) |
23828 | return SDValue(); |
23829 | |
23830 | APInt V; |
23831 | if (!ISD::isConstantSplatVector(N: Op->getOperand(Num: 1).getNode(), SplatValue&: V)) |
23832 | return SDValue(); |
23833 | |
23834 | SDLoc DL(Op); |
23835 | SDValue Op0ExtV; |
23836 | SDValue Op1ExtV; |
23837 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: Op->getOperand(Num: 2))->get(); |
23838 | // Check if the first operand of the SET_CC is already extended. If it is, |
23839 | // split the SET_CC and re-use the extended version of the operand. |
23840 | SDNode *Op0SExt = DAG.getNodeIfExists(Opcode: ISD::SIGN_EXTEND, VTList: DAG.getVTList(VT: UseMVT), |
23841 | Ops: Op->getOperand(Num: 0)); |
23842 | SDNode *Op0ZExt = DAG.getNodeIfExists(Opcode: ISD::ZERO_EXTEND, VTList: DAG.getVTList(VT: UseMVT), |
23843 | Ops: Op->getOperand(Num: 0)); |
23844 | if (Op0SExt && (isSignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) { |
23845 | Op0ExtV = SDValue(Op0SExt, 0); |
23846 | Op1ExtV = DAG.getNode(Opcode: ISD::SIGN_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1)); |
23847 | } else if (Op0ZExt && (isUnsignedIntSetCC(Code: CC) || isIntEqualitySetCC(Code: CC))) { |
23848 | Op0ExtV = SDValue(Op0ZExt, 0); |
23849 | Op1ExtV = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL, VT: UseMVT, Operand: Op->getOperand(Num: 1)); |
23850 | } else |
23851 | return SDValue(); |
23852 | |
23853 | return DAG.getNode(Opcode: ISD::SETCC, DL, VT: UseMVT.changeVectorElementType(EltVT: MVT::i1), |
23854 | N1: Op0ExtV, N2: Op1ExtV, N3: Op->getOperand(Num: 2)); |
23855 | } |
23856 | |
23857 | static SDValue |
23858 | performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
23859 | SelectionDAG &DAG) { |
23860 | SDValue Vec = N->getOperand(Num: 0); |
23861 | if (DCI.isBeforeLegalize() && |
23862 | Vec.getValueType().getVectorElementType() == MVT::i1 && |
23863 | Vec.getValueType().isFixedLengthVector() && |
23864 | Vec.getValueType().isPow2VectorType()) { |
23865 | SDLoc DL(N); |
23866 | return getVectorBitwiseReduce(Opcode: N->getOpcode(), Vec, VT: N->getValueType(ResNo: 0), DL, |
23867 | DAG); |
23868 | } |
23869 | |
23870 | return SDValue(); |
23871 | } |
23872 | |
23873 | static SDValue performSETCCCombine(SDNode *N, |
23874 | TargetLowering::DAGCombinerInfo &DCI, |
23875 | SelectionDAG &DAG) { |
23876 | assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23877 | SDValue LHS = N->getOperand(Num: 0); |
23878 | SDValue RHS = N->getOperand(Num: 1); |
23879 | ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 2))->get(); |
23880 | SDLoc DL(N); |
23881 | EVT VT = N->getValueType(ResNo: 0); |
23882 | |
23883 | if (SDValue V = tryToWidenSetCCOperands(Op: N, DAG)) |
23884 | return V; |
23885 | |
23886 | // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X |
23887 | if (Cond == ISD::SETNE && isOneConstant(V: RHS) && |
23888 | LHS->getOpcode() == AArch64ISD::CSEL && |
23889 | isNullConstant(V: LHS->getOperand(Num: 0)) && isOneConstant(V: LHS->getOperand(Num: 1)) && |
23890 | LHS->hasOneUse()) { |
23891 | // Invert CSEL's condition. |
23892 | auto OldCond = |
23893 | static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(i: 2)); |
23894 | auto NewCond = getInvertedCondCode(Code: OldCond); |
23895 | |
23896 | // csel 0, 1, !cond, X |
23897 | SDValue CSEL = |
23898 | DAG.getNode(Opcode: AArch64ISD::CSEL, DL, VT: LHS.getValueType(), N1: LHS.getOperand(i: 0), |
23899 | N2: LHS.getOperand(i: 1), N3: DAG.getConstant(Val: NewCond, DL, VT: MVT::i32), |
23900 | N4: LHS.getOperand(i: 3)); |
23901 | return DAG.getZExtOrTrunc(Op: CSEL, DL, VT); |
23902 | } |
23903 | |
23904 | // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne |
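      | // For example, on i32, setcc (srl x, 16), 0, ne becomes
      | // setcc (and x, 0xffff0000), 0, ne, which the comparison lowering can
      | // typically emit as a single TST with a logical immediate.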
23905 | if (Cond == ISD::SETNE && isNullConstant(V: RHS) && |
23906 | LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Val: LHS->getOperand(Num: 1)) && |
23907 | LHS->getConstantOperandVal(Num: 1) < VT.getScalarSizeInBits() && |
23908 | LHS->hasOneUse()) { |
23909 | EVT TstVT = LHS->getValueType(ResNo: 0); |
23910 | if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) { |
23911 | // This pattern will be optimized better in emitComparison.
23912 | uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(Num: 1); |
23913 | SDValue TST = DAG.getNode(Opcode: ISD::AND, DL, VT: TstVT, N1: LHS->getOperand(Num: 0), |
23914 | N2: DAG.getConstant(Val: TstImm, DL, VT: TstVT)); |
23915 | return DAG.getNode(Opcode: ISD::SETCC, DL, VT, N1: TST, N2: RHS, N3: N->getOperand(Num: 2)); |
23916 | } |
23917 | } |
23918 | |
23919 | // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne) |
23920 | // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne) |
23921 | // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne) |
23922 | // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne) |
23923 | if (DCI.isBeforeLegalize() && VT.isScalarInteger() && |
23924 | (Cond == ISD::SETEQ || Cond == ISD::SETNE) && |
23925 | (isNullConstant(V: RHS) || isAllOnesConstant(V: RHS)) && |
23926 | LHS->getOpcode() == ISD::BITCAST) { |
23927 | EVT ToVT = LHS->getValueType(ResNo: 0); |
23928 | EVT FromVT = LHS->getOperand(Num: 0).getValueType(); |
23929 | if (FromVT.isFixedLengthVector() && |
23930 | FromVT.getVectorElementType() == MVT::i1) { |
23931 | bool IsNull = isNullConstant(V: RHS); |
23932 | LHS = DAG.getNode(Opcode: IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND, |
23933 | DL, VT: MVT::i1, Operand: LHS->getOperand(Num: 0)); |
23934 | LHS = DAG.getNode(Opcode: IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT: ToVT, |
23935 | Operand: LHS); |
23936 | return DAG.getSetCC(DL, VT, LHS, RHS, Cond); |
23937 | } |
23938 | } |
23939 | |
23940 | // Try to perform the memcmp when the result is tested for [in]equality with 0 |
23941 | if (SDValue V = performOrXorChainCombine(N, DAG)) |
23942 | return V; |
23943 | |
23944 | return SDValue(); |
23945 | } |
23946 | |
23947 | // Replace a flag-setting operator (eg ANDS) with the generic version |
23948 | // (eg AND) if the flag is unused. |
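      | // For example, (ANDS x, y) whose flag result is unused becomes (AND x, y);
      | // conversely, if an identical plain AND already exists, its uses are redirected
      | // to this node's value result so only the flag-setting form remains.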
23949 | static SDValue performFlagSettingCombine(SDNode *N, |
23950 | TargetLowering::DAGCombinerInfo &DCI, |
23951 | unsigned GenericOpcode) { |
23952 | SDLoc DL(N); |
23953 | SDValue LHS = N->getOperand(Num: 0); |
23954 | SDValue RHS = N->getOperand(Num: 1); |
23955 | EVT VT = N->getValueType(ResNo: 0); |
23956 | |
23957 | // If the flag result isn't used, convert back to a generic opcode. |
23958 | if (!N->hasAnyUseOfValue(Value: 1)) { |
23959 | SDValue Res = DCI.DAG.getNode(Opcode: GenericOpcode, DL, VT, Ops: N->ops()); |
23960 | return DCI.DAG.getMergeValues(Ops: {Res, DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i32)}, |
23961 | dl: DL); |
23962 | } |
23963 | |
23964 | // Combine identical generic nodes into this node, re-using the result. |
23965 | if (SDNode *Generic = DCI.DAG.getNodeIfExists( |
23966 | Opcode: GenericOpcode, VTList: DCI.DAG.getVTList(VT), Ops: {LHS, RHS})) |
23967 | DCI.CombineTo(N: Generic, Res: SDValue(N, 0)); |
23968 | |
23969 | return SDValue(); |
23970 | } |
23971 | |
23972 | static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { |
23973 | // setcc_merge_zero pred |
23974 | // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne |
23975 | // => extract_subvector (inner setcc_merge_zero) |
23976 | SDValue Pred = N->getOperand(Num: 0); |
23977 | SDValue LHS = N->getOperand(Num: 1); |
23978 | SDValue RHS = N->getOperand(Num: 2); |
23979 | ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get(); |
23980 | |
23981 | if (Cond != ISD::SETNE || !isZerosVector(N: RHS.getNode()) || |
23982 | LHS->getOpcode() != ISD::SIGN_EXTEND) |
23983 | return SDValue(); |
23984 | |
23985 | SDValue Extract = LHS->getOperand(Num: 0);
23986 | if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || |
23987 | Extract->getValueType(ResNo: 0) != N->getValueType(ResNo: 0) || |
23988 | Extract->getConstantOperandVal(Num: 1) != 0) |
23989 | return SDValue(); |
23990 | |
23991 | SDValue InnerSetCC = Extract->getOperand(Num: 0); |
23992 | if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) |
23993 | return SDValue(); |
23994 | |
23995 | // By this point we've effectively got |
23996 | // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive |
23997 | // lanes are already zero then the trunc(sext()) sequence is redundant and we |
23998 | // can operate on A directly. |
23999 | SDValue InnerPred = InnerSetCC.getOperand(i: 0); |
24000 | if (Pred.getOpcode() == AArch64ISD::PTRUE && |
24001 | InnerPred.getOpcode() == AArch64ISD::PTRUE && |
24002 | Pred.getConstantOperandVal(i: 0) == InnerPred.getConstantOperandVal(i: 0) && |
24003 | Pred->getConstantOperandVal(Num: 0) >= AArch64SVEPredPattern::vl1 && |
24004 | Pred->getConstantOperandVal(Num: 0) <= AArch64SVEPredPattern::vl256) |
24005 | return Extract; |
24006 | |
24007 | return SDValue(); |
24008 | } |
24009 | |
24010 | static SDValue |
24011 | performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { |
24012 | assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && |
24013 | "Unexpected opcode!");
24014 | |
24015 | SelectionDAG &DAG = DCI.DAG; |
24016 | SDValue Pred = N->getOperand(Num: 0); |
24017 | SDValue LHS = N->getOperand(Num: 1); |
24018 | SDValue RHS = N->getOperand(Num: 2); |
24019 | ISD::CondCode Cond = cast<CondCodeSDNode>(Val: N->getOperand(Num: 3))->get(); |
24020 | |
24021 | if (SDValue V = performSetCCPunpkCombine(N, DAG)) |
24022 | return V; |
24023 | |
24024 | if (Cond == ISD::SETNE && isZerosVector(N: RHS.getNode()) && |
24025 | LHS->getOpcode() == ISD::SIGN_EXTEND && |
24026 | LHS->getOperand(Num: 0)->getValueType(ResNo: 0) == N->getValueType(ResNo: 0)) { |
24027 | // setcc_merge_zero( |
24028 | // pred, extend(setcc_merge_zero(pred, ...)), != splat(0)) |
24029 | // => setcc_merge_zero(pred, ...) |
24030 | if (LHS->getOperand(Num: 0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && |
24031 | LHS->getOperand(Num: 0)->getOperand(Num: 0) == Pred) |
24032 | return LHS->getOperand(Num: 0); |
24033 | |
24034 | // setcc_merge_zero( |
24035 | // all_active, extend(nxvNi1 ...), != splat(0)) |
24036 | // -> nxvNi1 ... |
24037 | if (isAllActivePredicate(DAG, N: Pred)) |
24038 | return LHS->getOperand(Num: 0); |
24039 | |
24040 | // setcc_merge_zero( |
24041 | // pred, extend(nxvNi1 ...), != splat(0)) |
24042 | // -> nxvNi1 and(pred, ...) |
24043 | if (DCI.isAfterLegalizeDAG()) |
24044 | // Do this after legalization to allow more folds on setcc_merge_zero |
24045 | // to be recognized. |
24046 | return DAG.getNode(Opcode: ISD::AND, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
24047 | N1: LHS->getOperand(Num: 0), N2: Pred); |
24048 | } |
24049 | |
24050 | return SDValue(); |
24051 | } |
24052 | |
24053 | // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test |
24054 | // as well as whether the test should be inverted. This code is required to |
24055 | // catch these cases (as opposed to standard dag combines) because |
24056 | // AArch64ISD::TBZ is matched during legalization. |
24057 | static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, |
24058 | SelectionDAG &DAG) { |
24059 | |
24060 | if (!Op->hasOneUse()) |
24061 | return Op; |
24062 | |
24063 | // We don't handle undef/constant-fold cases below, as they should have |
24064 | // already been taken care of (e.g. and of 0, test of undefined shifted bits, |
24065 | // etc.) |
24066 | |
24067 | // (tbz (trunc x), b) -> (tbz x, b) |
24068 | // This case is just here to enable more of the below cases to be caught. |
24069 | if (Op->getOpcode() == ISD::TRUNCATE && |
24070 | Bit < Op->getValueType(ResNo: 0).getSizeInBits()) { |
24071 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24072 | } |
24073 | |
24074 | // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits. |
24075 | if (Op->getOpcode() == ISD::ANY_EXTEND && |
24076 | Bit < Op->getOperand(Num: 0).getValueSizeInBits()) { |
24077 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24078 | } |
24079 | |
24080 | if (Op->getNumOperands() != 2) |
24081 | return Op; |
24082 | |
24083 | auto *C = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1)); |
24084 | if (!C) |
24085 | return Op; |
24086 | |
24087 | switch (Op->getOpcode()) { |
24088 | default: |
24089 | return Op; |
24090 | |
24091 | // (tbz (and x, m), b) -> (tbz x, b) |
24092 | case ISD::AND: |
24093 | if ((C->getZExtValue() >> Bit) & 1) |
24094 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24095 | return Op; |
24096 | |
24097 | // (tbz (shl x, c), b) -> (tbz x, b-c) |
24098 | case ISD::SHL: |
24099 | if (C->getZExtValue() <= Bit && |
24100 | (Bit - C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) { |
24101 | Bit = Bit - C->getZExtValue(); |
24102 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24103 | } |
24104 | return Op; |
24105 | |
24106 | // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x |
24107 | case ISD::SRA: |
24108 | Bit = Bit + C->getZExtValue(); |
24109 | if (Bit >= Op->getValueType(ResNo: 0).getSizeInBits()) |
24110 | Bit = Op->getValueType(ResNo: 0).getSizeInBits() - 1; |
24111 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24112 | |
24113 | // (tbz (srl x, c), b) -> (tbz x, b+c) |
24114 | case ISD::SRL: |
24115 | if ((Bit + C->getZExtValue()) < Op->getValueType(ResNo: 0).getSizeInBits()) { |
24116 | Bit = Bit + C->getZExtValue(); |
24117 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24118 | } |
24119 | return Op; |
24120 | |
24121 | // (tbz (xor x, -1), b) -> (tbnz x, b) |
24122 | case ISD::XOR: |
24123 | if ((C->getZExtValue() >> Bit) & 1) |
24124 | Invert = !Invert; |
24125 | return getTestBitOperand(Op: Op->getOperand(Num: 0), Bit, Invert, DAG); |
24126 | } |
24127 | } |
24128 | |
24129 | // Optimize test single bit zero/non-zero and branch. |
24130 | static SDValue performTBZCombine(SDNode *N, |
24131 | TargetLowering::DAGCombinerInfo &DCI, |
24132 | SelectionDAG &DAG) { |
24133 | unsigned Bit = N->getConstantOperandVal(Num: 2); |
24134 | bool Invert = false; |
24135 | SDValue TestSrc = N->getOperand(Num: 1); |
24136 | SDValue NewTestSrc = getTestBitOperand(Op: TestSrc, Bit, Invert, DAG); |
24137 | |
24138 | if (TestSrc == NewTestSrc) |
24139 | return SDValue(); |
24140 | |
24141 | unsigned NewOpc = N->getOpcode(); |
24142 | if (Invert) { |
24143 | if (NewOpc == AArch64ISD::TBZ) |
24144 | NewOpc = AArch64ISD::TBNZ; |
24145 | else { |
24146 | assert(NewOpc == AArch64ISD::TBNZ); |
24147 | NewOpc = AArch64ISD::TBZ; |
24148 | } |
24149 | } |
24150 | |
24151 | SDLoc DL(N); |
24152 | return DAG.getNode(Opcode: NewOpc, DL, VT: MVT::Other, N1: N->getOperand(Num: 0), N2: NewTestSrc, |
24153 | N3: DAG.getConstant(Val: Bit, DL, VT: MVT::i64), N4: N->getOperand(Num: 3)); |
24154 | } |
24155 | |
24156 | // Swap vselect operands where it may allow a predicated operation to achieve |
24157 | // the `sel`. |
24158 | // |
24159 | // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b))) |
24160 | // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a)) |
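//
// For example (illustrative):
//   (vselect (setcc (cc) (x) (y)) (a) (fadd (a) (b)))
//   => (vselect (setcc (!cc) (x) (y)) (fadd (a) (b)) (a))
// so that the predicated operation can supply `a` for the inactive lanes.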
24161 | static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) { |
24162 | auto SelectA = N->getOperand(Num: 1); |
24163 | auto SelectB = N->getOperand(Num: 2); |
24164 | auto NTy = N->getValueType(ResNo: 0); |
24165 | |
24166 | if (!NTy.isScalableVector()) |
24167 | return SDValue(); |
24168 | SDValue SetCC = N->getOperand(Num: 0); |
24169 | if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse()) |
24170 | return SDValue(); |
24171 | |
24172 | switch (SelectB.getOpcode()) { |
24173 | default: |
24174 | return SDValue(); |
24175 | case ISD::FMUL: |
24176 | case ISD::FSUB: |
24177 | case ISD::FADD: |
24178 | break; |
24179 | } |
24180 | if (SelectA != SelectB.getOperand(i: 0)) |
24181 | return SDValue(); |
24182 | |
24183 | ISD::CondCode CC = cast<CondCodeSDNode>(Val: SetCC.getOperand(i: 2))->get(); |
24184 | ISD::CondCode InverseCC = |
24185 | ISD::getSetCCInverse(Operation: CC, Type: SetCC.getOperand(i: 0).getValueType()); |
24186 | auto InverseSetCC = |
24187 | DAG.getSetCC(DL: SDLoc(SetCC), VT: SetCC.getValueType(), LHS: SetCC.getOperand(i: 0), |
24188 | RHS: SetCC.getOperand(i: 1), Cond: InverseCC); |
24189 | |
24190 | return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: NTy, |
24191 | Ops: {InverseSetCC, SelectB, SelectA}); |
24192 | } |
24193 | |
24194 | // vselect (v1i1 setcc) -> |
24195 | // vselect (v1iXX setcc) (XX is the size of the compared operand type) |
24196 | // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as |
24197 | // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine |
24198 | // such VSELECT. |
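//
// For example (illustrative):
//   (vselect (v1i1 (setcc v1i64 x, y, cc)) a, b)
//   -> (vselect (v1i64 (setcc v1i64 x, y, cc)) a, b)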
24199 | static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { |
24200 | if (auto SwapResult = trySwapVSelectOperands(N, DAG)) |
24201 | return SwapResult; |
24202 | |
24203 | SDValue N0 = N->getOperand(Num: 0); |
24204 | EVT CCVT = N0.getValueType(); |
24205 | |
24206 | if (isAllActivePredicate(DAG, N: N0)) |
24207 | return N->getOperand(Num: 1); |
24208 | |
24209 | if (isAllInactivePredicate(N: N0)) |
24210 | return N->getOperand(Num: 2); |
24211 | |
24212 | // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform |
  // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
24214 | // supported types. |
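  // For example, with v4i32 operands (illustrative):
  //   (vselect (setgt x, splat(-1)), splat(1), splat(-1))
  //   -> (or (sra x, splat(31)), splat(1))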
24215 | SDValue SetCC = N->getOperand(Num: 0); |
24216 | if (SetCC.getOpcode() == ISD::SETCC && |
24217 | SetCC.getOperand(i: 2) == DAG.getCondCode(Cond: ISD::SETGT)) { |
24218 | SDValue CmpLHS = SetCC.getOperand(i: 0); |
24219 | EVT VT = CmpLHS.getValueType(); |
24220 | SDNode *CmpRHS = SetCC.getOperand(i: 1).getNode(); |
24221 | SDNode *SplatLHS = N->getOperand(Num: 1).getNode(); |
24222 | SDNode *SplatRHS = N->getOperand(Num: 2).getNode(); |
24223 | APInt SplatLHSVal; |
24224 | if (CmpLHS.getValueType() == N->getOperand(Num: 1).getValueType() && |
24225 | VT.isSimple() && |
24226 | is_contained(Range: ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, |
24227 | MVT::v2i32, MVT::v4i32, MVT::v2i64}), |
24228 | Element: VT.getSimpleVT().SimpleTy) && |
24229 | ISD::isConstantSplatVector(N: SplatLHS, SplatValue&: SplatLHSVal) && |
24230 | SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(N: CmpRHS) && |
24231 | ISD::isConstantSplatVectorAllOnes(N: SplatRHS)) { |
24232 | unsigned NumElts = VT.getVectorNumElements(); |
24233 | SmallVector<SDValue, 8> Ops( |
24234 | NumElts, DAG.getConstant(Val: VT.getScalarSizeInBits() - 1, DL: SDLoc(N), |
24235 | VT: VT.getScalarType())); |
24236 | SDValue Val = DAG.getBuildVector(VT, DL: SDLoc(N), Ops); |
24237 | |
24238 | auto Shift = DAG.getNode(Opcode: ISD::SRA, DL: SDLoc(N), VT, N1: CmpLHS, N2: Val); |
24239 | auto Or = DAG.getNode(Opcode: ISD::OR, DL: SDLoc(N), VT, N1: Shift, N2: N->getOperand(Num: 1)); |
24240 | return Or; |
24241 | } |
24242 | } |
24243 | |
24244 | EVT CmpVT = N0.getOperand(i: 0).getValueType(); |
24245 | if (N0.getOpcode() != ISD::SETCC || |
24246 | CCVT.getVectorElementCount() != ElementCount::getFixed(MinVal: 1) || |
24247 | CCVT.getVectorElementType() != MVT::i1 || |
24248 | CmpVT.getVectorElementType().isFloatingPoint()) |
24249 | return SDValue(); |
24250 | |
24251 | EVT ResVT = N->getValueType(ResNo: 0); |
24252 | // Only combine when the result type is of the same size as the compared |
24253 | // operands. |
24254 | if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) |
24255 | return SDValue(); |
24256 | |
24257 | SDValue IfTrue = N->getOperand(Num: 1); |
24258 | SDValue IfFalse = N->getOperand(Num: 2); |
24259 | SetCC = DAG.getSetCC(DL: SDLoc(N), VT: CmpVT.changeVectorElementTypeToInteger(), |
24260 | LHS: N0.getOperand(i: 0), RHS: N0.getOperand(i: 1), |
24261 | Cond: cast<CondCodeSDNode>(Val: N0.getOperand(i: 2))->get()); |
24262 | return DAG.getNode(Opcode: ISD::VSELECT, DL: SDLoc(N), VT: ResVT, N1: SetCC, |
24263 | N2: IfTrue, N3: IfFalse); |
24264 | } |
24265 | |
24266 | /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with |
24267 | /// the compare-mask instructions rather than going via NZCV, even if LHS and |
24268 | /// RHS are really scalar. This replaces any scalar setcc in the above pattern |
24269 | /// with a vector one followed by a DUP shuffle on the result. |
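///
/// For example (illustrative):
///   (select (setcc i64 x, y, cc), v2i64 a, v2i64 b)
/// becomes a v2i64 setcc of (scalar_to_vector x) and (scalar_to_vector y),
/// followed by a shuffle that duplicates lane 0 of the mask, and a vector
/// select on the resulting mask.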
24270 | static SDValue performSelectCombine(SDNode *N, |
24271 | TargetLowering::DAGCombinerInfo &DCI) { |
24272 | SelectionDAG &DAG = DCI.DAG; |
24273 | SDValue N0 = N->getOperand(Num: 0); |
24274 | EVT ResVT = N->getValueType(ResNo: 0); |
24275 | |
24276 | if (N0.getOpcode() != ISD::SETCC) |
24277 | return SDValue(); |
24278 | |
24279 | if (ResVT.isScalableVT()) |
24280 | return SDValue(); |
24281 | |
24282 | // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered |
24283 | // scalar SetCCResultType. We also don't expect vectors, because we assume |
24284 | // that selects fed by vector SETCCs are canonicalized to VSELECT. |
24285 | assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && |
24286 | "Scalar-SETCC feeding SELECT has unexpected result type!" ); |
24287 | |
24288 | // If NumMaskElts == 0, the comparison is larger than select result. The |
24289 | // largest real NEON comparison is 64-bits per lane, which means the result is |
24290 | // at most 32-bits and an illegal vector. Just bail out for now. |
24291 | EVT SrcVT = N0.getOperand(i: 0).getValueType(); |
24292 | |
24293 | // Don't try to do this optimization when the setcc itself has i1 operands. |
24294 | // There are no legal vectors of i1, so this would be pointless. v1f16 is |
24295 | // ruled out to prevent the creation of setcc that need to be scalarized. |
24296 | if (SrcVT == MVT::i1 || |
24297 | (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16)) |
24298 | return SDValue(); |
24299 | |
24300 | int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); |
24301 | if (!ResVT.isVector() || NumMaskElts == 0) |
24302 | return SDValue(); |
24303 | |
24304 | SrcVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: SrcVT, NumElements: NumMaskElts); |
24305 | EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); |
24306 | |
24307 | // Also bail out if the vector CCVT isn't the same size as ResVT. |
24308 | // This can happen if the SETCC operand size doesn't divide the ResVT size |
24309 | // (e.g., f64 vs v3f32). |
24310 | if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) |
24311 | return SDValue(); |
24312 | |
24313 | // Make sure we didn't create illegal types, if we're not supposed to. |
24314 | assert(DCI.isBeforeLegalize() || |
24315 | DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); |
24316 | |
24317 | // First perform a vector comparison, where lane 0 is the one we're interested |
24318 | // in. |
24319 | SDLoc DL(N0); |
24320 | SDValue LHS = |
24321 | DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 0)); |
24322 | SDValue RHS = |
24323 | DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: SrcVT, Operand: N0.getOperand(i: 1)); |
24324 | SDValue SetCC = DAG.getNode(Opcode: ISD::SETCC, DL, VT: CCVT, N1: LHS, N2: RHS, N3: N0.getOperand(i: 2)); |
24325 | |
24326 | // Now duplicate the comparison mask we want across all other lanes. |
24327 | SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); |
24328 | SDValue Mask = DAG.getVectorShuffle(VT: CCVT, dl: DL, N1: SetCC, N2: SetCC, Mask: DUPMask); |
24329 | Mask = DAG.getNode(Opcode: ISD::BITCAST, DL, |
24330 | VT: ResVT.changeVectorElementTypeToInteger(), Operand: Mask); |
24331 | |
24332 | return DAG.getSelect(DL, VT: ResVT, Cond: Mask, LHS: N->getOperand(Num: 1), RHS: N->getOperand(Num: 2)); |
24333 | } |
24334 | |
24335 | static SDValue performDUPCombine(SDNode *N, |
24336 | TargetLowering::DAGCombinerInfo &DCI) { |
24337 | EVT VT = N->getValueType(ResNo: 0); |
24338 | SDLoc DL(N); |
24339 | // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the |
  // 128-bit vector version.
24341 | if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) { |
24342 | EVT LVT = VT.getDoubleNumVectorElementsVT(Context&: *DCI.DAG.getContext()); |
24343 | SmallVector<SDValue> Ops(N->ops()); |
24344 | if (SDNode *LN = DCI.DAG.getNodeIfExists(Opcode: N->getOpcode(), |
24345 | VTList: DCI.DAG.getVTList(VT: LVT), Ops)) { |
24346 | return DCI.DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: SDValue(LN, 0), |
24347 | N2: DCI.DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
24348 | } |
24349 | } |
24350 | |
24351 | if (N->getOpcode() == AArch64ISD::DUP) { |
24352 | if (DCI.isAfterLegalizeDAG()) { |
24353 | // If scalar dup's operand is extract_vector_elt, try to combine them into |
24354 | // duplane. For example, |
24355 | // |
24356 | // t21: i32 = extract_vector_elt t19, Constant:i64<0> |
24357 | // t18: v4i32 = AArch64ISD::DUP t21 |
24358 | // ==> |
24359 | // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0> |
      SDValue EXTRACT_VEC_ELT = N->getOperand(Num: 0);
24361 | if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { |
24362 | if (VT == EXTRACT_VEC_ELT.getOperand(i: 0).getValueType()) { |
24363 | unsigned Opcode = getDUPLANEOp(EltType: VT.getVectorElementType()); |
24364 | return DCI.DAG.getNode(Opcode, DL, VT, N1: EXTRACT_VEC_ELT.getOperand(i: 0), |
24365 | N2: EXTRACT_VEC_ELT.getOperand(i: 1)); |
24366 | } |
24367 | } |
24368 | } |
24369 | |
24370 | return performPostLD1Combine(N, DCI, IsLaneOp: false); |
24371 | } |
24372 | |
24373 | return SDValue(); |
24374 | } |
24375 | |
24376 | /// Get rid of unnecessary NVCASTs (that don't change the type). |
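/// Also folds nested casts: (nvcast (nvcast x)) is rewritten as a single
/// nvcast from x directly to the outer result type.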
24377 | static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) { |
24378 | if (N->getValueType(ResNo: 0) == N->getOperand(Num: 0).getValueType()) |
24379 | return N->getOperand(Num: 0); |
24380 | if (N->getOperand(Num: 0).getOpcode() == AArch64ISD::NVCAST) |
24381 | return DAG.getNode(Opcode: AArch64ISD::NVCAST, DL: SDLoc(N), VT: N->getValueType(ResNo: 0), |
24382 | Operand: N->getOperand(Num: 0).getOperand(i: 0)); |
24383 | |
24384 | return SDValue(); |
24385 | } |
24386 | |
24387 | // If all users of the globaladdr are of the form (globaladdr + constant), find |
24388 | // the smallest constant, fold it into the globaladdr's offset and rewrite the |
24389 | // globaladdr as (globaladdr + constant) - constant. |
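//
// For example (illustrative), if the only uses are (globaladdr + 4) and
// (globaladdr + 8), then MinOffset is 4 and the node is rewritten as
// ((globaladdr + 4) - 4), letting the smaller offset be folded into the
// relocation.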
24390 | static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, |
24391 | const AArch64Subtarget *Subtarget, |
24392 | const TargetMachine &TM) { |
24393 | auto *GN = cast<GlobalAddressSDNode>(Val: N); |
24394 | if (Subtarget->ClassifyGlobalReference(GV: GN->getGlobal(), TM) != |
24395 | AArch64II::MO_NO_FLAG) |
24396 | return SDValue(); |
24397 | |
24398 | uint64_t MinOffset = -1ull; |
24399 | for (SDNode *N : GN->uses()) { |
24400 | if (N->getOpcode() != ISD::ADD) |
24401 | return SDValue(); |
24402 | auto *C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 0)); |
24403 | if (!C) |
24404 | C = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
24405 | if (!C) |
24406 | return SDValue(); |
24407 | MinOffset = std::min(a: MinOffset, b: C->getZExtValue()); |
24408 | } |
24409 | uint64_t Offset = MinOffset + GN->getOffset(); |
24410 | |
24411 | // Require that the new offset is larger than the existing one. Otherwise, we |
24412 | // can end up oscillating between two possible DAGs, for example, |
24413 | // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). |
24414 | if (Offset <= uint64_t(GN->getOffset())) |
24415 | return SDValue(); |
24416 | |
24417 | // Check whether folding this offset is legal. It must not go out of bounds of |
24418 | // the referenced object to avoid violating the code model, and must be |
24419 | // smaller than 2^20 because this is the largest offset expressible in all |
24420 | // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF |
24421 | // stores an immediate signed 21 bit offset.) |
24422 | // |
24423 | // This check also prevents us from folding negative offsets, which will end |
24424 | // up being treated in the same way as large positive ones. They could also |
24425 | // cause code model violations, and aren't really common enough to matter. |
24426 | if (Offset >= (1 << 20)) |
24427 | return SDValue(); |
24428 | |
24429 | const GlobalValue *GV = GN->getGlobal(); |
24430 | Type *T = GV->getValueType(); |
24431 | if (!T->isSized() || |
24432 | Offset > GV->getDataLayout().getTypeAllocSize(Ty: T)) |
24433 | return SDValue(); |
24434 | |
24435 | SDLoc DL(GN); |
24436 | SDValue Result = DAG.getGlobalAddress(GV, DL, VT: MVT::i64, offset: Offset); |
24437 | return DAG.getNode(Opcode: ISD::SUB, DL, VT: MVT::i64, N1: Result, |
24438 | N2: DAG.getConstant(Val: MinOffset, DL, VT: MVT::i64)); |
24439 | } |
24440 | |
24441 | static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, |
24442 | const AArch64Subtarget *Subtarget) { |
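  // With CSSC the target has a native CTZ instruction, so it is profitable to
  // fold (ctlz (bitreverse x)) -> (cttz x) for scalar integers.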
24443 | SDValue BR = N->getOperand(Num: 0); |
24444 | if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE || |
24445 | !BR.getValueType().isScalarInteger()) |
24446 | return SDValue(); |
24447 | |
24448 | SDLoc DL(N); |
24449 | return DAG.getNode(Opcode: ISD::CTTZ, DL, VT: BR.getValueType(), Operand: BR.getOperand(i: 0)); |
24450 | } |
24451 | |
// Turns the vector of indices into a vector of byte offsets by scaling Offset
24453 | // by (BitWidth / 8). |
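// For example (illustrative), for 32-bit elements each index i becomes
// (i << 2), i.e. i * 4 bytes.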
24454 | static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, |
24455 | SDLoc DL, unsigned BitWidth) { |
24456 | assert(Offset.getValueType().isScalableVector() && |
24457 | "This method is only for scalable vectors of offsets" ); |
24458 | |
24459 | SDValue Shift = DAG.getConstant(Val: Log2_32(Value: BitWidth / 8), DL, VT: MVT::i64); |
24460 | SDValue SplatShift = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MVT::nxv2i64, Operand: Shift); |
24461 | |
24462 | return DAG.getNode(Opcode: ISD::SHL, DL, VT: MVT::nxv2i64, N1: Offset, N2: SplatShift); |
24463 | } |
24464 | |
24465 | /// Check if the value of \p OffsetInBytes can be used as an immediate for |
24466 | /// the gather load/prefetch and scatter store instructions with vector base and |
24467 | /// immediate offset addressing mode: |
24468 | /// |
24469 | /// [<Zn>.[S|D]{, #<imm>}] |
24470 | /// |
24471 | /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. |
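///
/// For example (illustrative), for 4-byte elements the valid immediates are
/// 0, 4, 8, ..., 124.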
24472 | inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, |
24473 | unsigned ScalarSizeInBytes) { |
24474 | // The immediate is not a multiple of the scalar size. |
24475 | if (OffsetInBytes % ScalarSizeInBytes) |
24476 | return false; |
24477 | |
24478 | // The immediate is out of range. |
24479 | if (OffsetInBytes / ScalarSizeInBytes > 31) |
24480 | return false; |
24481 | |
24482 | return true; |
24483 | } |
24484 | |
24485 | /// Check if the value of \p Offset represents a valid immediate for the SVE |
/// gather load/prefetch and scatter store instructions with vector base and
24487 | /// immediate offset addressing mode: |
24488 | /// |
24489 | /// [<Zn>.[S|D]{, #<imm>}] |
24490 | /// |
24491 | /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31. |
24492 | static bool isValidImmForSVEVecImmAddrMode(SDValue Offset, |
24493 | unsigned ScalarSizeInBytes) { |
24494 | ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Val: Offset.getNode()); |
24495 | return OffsetConst && isValidImmForSVEVecImmAddrMode( |
24496 | OffsetInBytes: OffsetConst->getZExtValue(), ScalarSizeInBytes); |
24497 | } |
24498 | |
24499 | static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, |
24500 | unsigned Opcode, |
24501 | bool OnlyPackedOffsets = true) { |
24502 | const SDValue Src = N->getOperand(Num: 2); |
24503 | const EVT SrcVT = Src->getValueType(ResNo: 0); |
24504 | assert(SrcVT.isScalableVector() && |
24505 | "Scatter stores are only possible for SVE vectors" ); |
24506 | |
24507 | SDLoc DL(N); |
24508 | MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT(); |
24509 | |
24510 | // Make sure that source data will fit into an SVE register |
24511 | if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) |
24512 | return SDValue(); |
24513 | |
24514 | // For FPs, ACLE only supports _packed_ single and double precision types. |
24515 | // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes. |
24516 | if (SrcElVT.isFloatingPoint()) |
24517 | if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) && |
24518 | ((Opcode != AArch64ISD::SST1Q_PRED && |
24519 | Opcode != AArch64ISD::SST1Q_INDEX_PRED) || |
24520 | ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16)))) |
24521 | return SDValue(); |
24522 | |
24523 | // Depending on the addressing mode, this is either a pointer or a vector of |
24524 | // pointers (that fits into one register) |
24525 | SDValue Base = N->getOperand(Num: 4); |
24526 | // Depending on the addressing mode, this is either a single offset or a |
24527 | // vector of offsets (that fits into one register) |
24528 | SDValue Offset = N->getOperand(Num: 5); |
24529 | |
24530 | // For "scalar + vector of indices", just scale the indices. This only |
24531 | // applies to non-temporal scatters because there's no instruction that takes |
24532 | // indices. |
24533 | if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) { |
24534 | Offset = |
24535 | getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits()); |
24536 | Opcode = AArch64ISD::SSTNT1_PRED; |
24537 | } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) { |
24538 | Offset = |
24539 | getScaledOffsetForBitWidth(DAG, Offset, DL, BitWidth: SrcElVT.getSizeInBits()); |
24540 | Opcode = AArch64ISD::SST1Q_PRED; |
24541 | } |
24542 | |
  // In the case of non-temporal scatter stores there's only one SVE instruction
  // per data-size: "vector + scalar", i.e.
  //    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
24546 | // Since we do have intrinsics that allow the arguments to be in a different |
24547 | // order, we may need to swap them to match the spec. |
24548 | if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) && |
24549 | Offset.getValueType().isVector()) |
24550 | std::swap(a&: Base, b&: Offset); |
24551 | |
24552 | // SST1_IMM requires that the offset is an immediate that is: |
24553 | // * a multiple of #SizeInBytes, |
24554 | // * in the range [0, 31 x #SizeInBytes], |
24555 | // where #SizeInBytes is the size in bytes of the stored items. For |
24556 | // immediates outside that range and non-immediate scalar offsets use SST1 or |
24557 | // SST1_UXTW instead. |
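  // For example (illustrative), a scatter of i64 elements accepts immediates
  // 0, 8, ..., 248; anything outside that set is rewritten to use the
  // register-offset forms chosen below.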
24558 | if (Opcode == AArch64ISD::SST1_IMM_PRED) { |
24559 | if (!isValidImmForSVEVecImmAddrMode(Offset, |
24560 | ScalarSizeInBytes: SrcVT.getScalarSizeInBits() / 8)) { |
24561 | if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) |
24562 | Opcode = AArch64ISD::SST1_UXTW_PRED; |
24563 | else |
24564 | Opcode = AArch64ISD::SST1_PRED; |
24565 | |
24566 | std::swap(a&: Base, b&: Offset); |
24567 | } |
24568 | } |
24569 | |
24570 | auto &TLI = DAG.getTargetLoweringInfo(); |
24571 | if (!TLI.isTypeLegal(VT: Base.getValueType())) |
24572 | return SDValue(); |
24573 | |
  // Some scatter store variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
  // nxv2i64. Legalize accordingly.
24577 | if (!OnlyPackedOffsets && |
24578 | Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) |
24579 | Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0); |
24580 | |
24581 | if (!TLI.isTypeLegal(VT: Offset.getValueType())) |
24582 | return SDValue(); |
24583 | |
24584 | // Source value type that is representable in hardware |
24585 | EVT HwSrcVt = getSVEContainerType(ContentTy: SrcVT); |
24586 | |
24587 | // Keep the original type of the input data to store - this is needed to be |
24588 | // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For |
24589 | // FP values we want the integer equivalent, so just use HwSrcVt. |
24590 | SDValue InputVT = DAG.getValueType(SrcVT); |
24591 | if (SrcVT.isFloatingPoint()) |
24592 | InputVT = DAG.getValueType(HwSrcVt); |
24593 | |
24594 | SDVTList VTs = DAG.getVTList(VT: MVT::Other); |
24595 | SDValue SrcNew; |
24596 | |
24597 | if (Src.getValueType().isFloatingPoint()) |
24598 | SrcNew = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: HwSrcVt, Operand: Src); |
24599 | else |
24600 | SrcNew = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: HwSrcVt, Operand: Src); |
24601 | |
24602 | SDValue Ops[] = {N->getOperand(Num: 0), // Chain |
24603 | SrcNew, |
24604 | N->getOperand(Num: 3), // Pg |
24605 | Base, |
24606 | Offset, |
24607 | InputVT}; |
24608 | |
24609 | return DAG.getNode(Opcode, DL, VTList: VTs, Ops); |
24610 | } |
24611 | |
24612 | static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, |
24613 | unsigned Opcode, |
24614 | bool OnlyPackedOffsets = true) { |
24615 | const EVT RetVT = N->getValueType(ResNo: 0); |
24616 | assert(RetVT.isScalableVector() && |
24617 | "Gather loads are only possible for SVE vectors" ); |
24618 | |
24619 | SDLoc DL(N); |
24620 | |
24621 | // Make sure that the loaded data will fit into an SVE register |
24622 | if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock) |
24623 | return SDValue(); |
24624 | |
24625 | // Depending on the addressing mode, this is either a pointer or a vector of |
24626 | // pointers (that fits into one register) |
24627 | SDValue Base = N->getOperand(Num: 3); |
24628 | // Depending on the addressing mode, this is either a single offset or a |
24629 | // vector of offsets (that fits into one register) |
24630 | SDValue Offset = N->getOperand(Num: 4); |
24631 | |
24632 | // For "scalar + vector of indices", scale the indices to obtain unscaled |
24633 | // offsets. This applies to non-temporal and quadword gathers, which do not |
24634 | // have an addressing mode with scaled offset. |
24635 | if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) { |
24636 | Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, |
24637 | BitWidth: RetVT.getScalarSizeInBits()); |
24638 | Opcode = AArch64ISD::GLDNT1_MERGE_ZERO; |
24639 | } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) { |
24640 | Offset = getScaledOffsetForBitWidth(DAG, Offset, DL, |
24641 | BitWidth: RetVT.getScalarSizeInBits()); |
24642 | Opcode = AArch64ISD::GLD1Q_MERGE_ZERO; |
24643 | } |
24644 | |
24645 | // In the case of non-temporal gather loads and quadword gather loads there's |
  // only one addressing mode: "vector + scalar", e.g.
24647 | // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] |
24648 | // Since we do have intrinsics that allow the arguments to be in a different |
24649 | // order, we may need to swap them to match the spec. |
24650 | if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO || |
24651 | Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) && |
24652 | Offset.getValueType().isVector()) |
24653 | std::swap(a&: Base, b&: Offset); |
24654 | |
24655 | // GLD{FF}1_IMM requires that the offset is an immediate that is: |
24656 | // * a multiple of #SizeInBytes, |
24657 | // * in the range [0, 31 x #SizeInBytes], |
24658 | // where #SizeInBytes is the size in bytes of the loaded items. For |
24659 | // immediates outside that range and non-immediate scalar offsets use |
24660 | // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead. |
24661 | if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO || |
24662 | Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) { |
24663 | if (!isValidImmForSVEVecImmAddrMode(Offset, |
24664 | ScalarSizeInBytes: RetVT.getScalarSizeInBits() / 8)) { |
24665 | if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) |
24666 | Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) |
24667 | ? AArch64ISD::GLD1_UXTW_MERGE_ZERO |
24668 | : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO; |
24669 | else |
24670 | Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO) |
24671 | ? AArch64ISD::GLD1_MERGE_ZERO |
24672 | : AArch64ISD::GLDFF1_MERGE_ZERO; |
24673 | |
24674 | std::swap(a&: Base, b&: Offset); |
24675 | } |
24676 | } |
24677 | |
24678 | auto &TLI = DAG.getTargetLoweringInfo(); |
24679 | if (!TLI.isTypeLegal(VT: Base.getValueType())) |
24680 | return SDValue(); |
24681 | |
  // Some gather load variants allow unpacked offsets, but only as nxv2i32
  // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
  // nxv2i64. Legalize accordingly.
24685 | if (!OnlyPackedOffsets && |
24686 | Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) |
24687 | Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset).getValue(R: 0); |
24688 | |
24689 | // Return value type that is representable in hardware |
24690 | EVT HwRetVt = getSVEContainerType(ContentTy: RetVT); |
24691 | |
24692 | // Keep the original output value type around - this is needed to be able to |
24693 | // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP |
24694 | // values we want the integer equivalent, so just use HwRetVT. |
24695 | SDValue OutVT = DAG.getValueType(RetVT); |
24696 | if (RetVT.isFloatingPoint()) |
24697 | OutVT = DAG.getValueType(HwRetVt); |
24698 | |
24699 | SDVTList VTs = DAG.getVTList(VT1: HwRetVt, VT2: MVT::Other); |
24700 | SDValue Ops[] = {N->getOperand(Num: 0), // Chain |
24701 | N->getOperand(Num: 2), // Pg |
24702 | Base, Offset, OutVT}; |
24703 | |
24704 | SDValue Load = DAG.getNode(Opcode, DL, VTList: VTs, Ops); |
24705 | SDValue LoadChain = SDValue(Load.getNode(), 1); |
24706 | |
24707 | if (RetVT.isInteger() && (RetVT != HwRetVt)) |
24708 | Load = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: RetVT, Operand: Load.getValue(R: 0)); |
24709 | |
24710 | // If the original return value was FP, bitcast accordingly. Doing it here |
24711 | // means that we can avoid adding TableGen patterns for FPs. |
24712 | if (RetVT.isFloatingPoint()) |
24713 | Load = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: RetVT, Operand: Load.getValue(R: 0)); |
24714 | |
24715 | return DAG.getMergeValues(Ops: {Load, LoadChain}, dl: DL); |
24716 | } |
24717 | |
24718 | static SDValue |
24719 | performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
24720 | SelectionDAG &DAG) { |
24721 | SDLoc DL(N); |
24722 | SDValue Src = N->getOperand(Num: 0); |
24723 | unsigned Opc = Src->getOpcode(); |
24724 | |
24725 | // Sign extend of an unsigned unpack -> signed unpack |
24726 | if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) { |
24727 | |
24728 | unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI |
24729 | : AArch64ISD::SUNPKLO; |
24730 | |
24731 | // Push the sign extend to the operand of the unpack |
24732 | // This is necessary where, for example, the operand of the unpack |
24733 | // is another unpack: |
24734 | // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8) |
24735 | // -> |
24736 | // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8) |
24737 | // -> |
24738 | // 4i32 sunpklo(8i16 sunpklo(16i8 opnd)) |
24739 | SDValue ExtOp = Src->getOperand(Num: 0); |
24740 | auto VT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT(); |
24741 | EVT EltTy = VT.getVectorElementType(); |
24742 | (void)EltTy; |
24743 | |
24744 | assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) && |
24745 | "Sign extending from an invalid type" ); |
24746 | |
24747 | EVT ExtVT = VT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
24748 | |
24749 | SDValue Ext = DAG.getNode(Opcode: ISD::SIGN_EXTEND_INREG, DL, VT: ExtOp.getValueType(), |
24750 | N1: ExtOp, N2: DAG.getValueType(ExtVT)); |
24751 | |
24752 | return DAG.getNode(Opcode: SOpc, DL, VT: N->getValueType(ResNo: 0), Operand: Ext); |
24753 | } |
24754 | |
24755 | if (DCI.isBeforeLegalizeOps()) |
24756 | return SDValue(); |
24757 | |
24758 | if (!EnableCombineMGatherIntrinsics) |
24759 | return SDValue(); |
24760 | |
24761 | // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates |
24762 | // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. |
24763 | unsigned NewOpc; |
24764 | unsigned MemVTOpNum = 4; |
24765 | switch (Opc) { |
24766 | case AArch64ISD::LD1_MERGE_ZERO: |
24767 | NewOpc = AArch64ISD::LD1S_MERGE_ZERO; |
24768 | MemVTOpNum = 3; |
24769 | break; |
24770 | case AArch64ISD::LDNF1_MERGE_ZERO: |
24771 | NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO; |
24772 | MemVTOpNum = 3; |
24773 | break; |
24774 | case AArch64ISD::LDFF1_MERGE_ZERO: |
24775 | NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO; |
24776 | MemVTOpNum = 3; |
24777 | break; |
24778 | case AArch64ISD::GLD1_MERGE_ZERO: |
24779 | NewOpc = AArch64ISD::GLD1S_MERGE_ZERO; |
24780 | break; |
24781 | case AArch64ISD::GLD1_SCALED_MERGE_ZERO: |
24782 | NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO; |
24783 | break; |
24784 | case AArch64ISD::GLD1_SXTW_MERGE_ZERO: |
24785 | NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO; |
24786 | break; |
24787 | case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: |
24788 | NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO; |
24789 | break; |
24790 | case AArch64ISD::GLD1_UXTW_MERGE_ZERO: |
24791 | NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO; |
24792 | break; |
24793 | case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: |
24794 | NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO; |
24795 | break; |
24796 | case AArch64ISD::GLD1_IMM_MERGE_ZERO: |
24797 | NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO; |
24798 | break; |
24799 | case AArch64ISD::GLDFF1_MERGE_ZERO: |
24800 | NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO; |
24801 | break; |
24802 | case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO: |
24803 | NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO; |
24804 | break; |
24805 | case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO: |
24806 | NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO; |
24807 | break; |
24808 | case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO: |
24809 | NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO; |
24810 | break; |
24811 | case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO: |
24812 | NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO; |
24813 | break; |
24814 | case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO: |
24815 | NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO; |
24816 | break; |
24817 | case AArch64ISD::GLDFF1_IMM_MERGE_ZERO: |
24818 | NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO; |
24819 | break; |
24820 | case AArch64ISD::GLDNT1_MERGE_ZERO: |
24821 | NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO; |
24822 | break; |
24823 | default: |
24824 | return SDValue(); |
24825 | } |
24826 | |
24827 | EVT SignExtSrcVT = cast<VTSDNode>(Val: N->getOperand(Num: 1))->getVT(); |
24828 | EVT SrcMemVT = cast<VTSDNode>(Val: Src->getOperand(Num: MemVTOpNum))->getVT(); |
24829 | |
24830 | if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse()) |
24831 | return SDValue(); |
24832 | |
24833 | EVT DstVT = N->getValueType(ResNo: 0); |
24834 | SDVTList VTs = DAG.getVTList(VT1: DstVT, VT2: MVT::Other); |
24835 | |
24836 | SmallVector<SDValue, 5> Ops; |
24837 | for (unsigned I = 0; I < Src->getNumOperands(); ++I) |
24838 | Ops.push_back(Elt: Src->getOperand(Num: I)); |
24839 | |
24840 | SDValue ExtLoad = DAG.getNode(Opcode: NewOpc, DL: SDLoc(N), VTList: VTs, Ops); |
24841 | DCI.CombineTo(N, Res: ExtLoad); |
24842 | DCI.CombineTo(N: Src.getNode(), Res0: ExtLoad, Res1: ExtLoad.getValue(R: 1)); |
24843 | |
24844 | // Return N so it doesn't get rechecked |
24845 | return SDValue(N, 0); |
24846 | } |
24847 | |
24848 | /// Legalize the gather prefetch (scalar + vector addressing mode) when the |
24849 | /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset |
24850 | /// != nxv2i32) do not need legalization. |
24851 | static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { |
24852 | const unsigned OffsetPos = 4; |
24853 | SDValue Offset = N->getOperand(Num: OffsetPos); |
24854 | |
24855 | // Not an unpacked vector, bail out. |
24856 | if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32) |
24857 | return SDValue(); |
24858 | |
24859 | // Extend the unpacked offset vector to 64-bit lanes. |
24860 | SDLoc DL(N); |
24861 | Offset = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::nxv2i64, Operand: Offset); |
24862 | SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); |
24863 | // Replace the offset operand with the 64-bit one. |
24864 | Ops[OffsetPos] = Offset; |
24865 | |
24866 | return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops); |
24867 | } |
24868 | |
24869 | /// Combines a node carrying the intrinsic |
24870 | /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses |
24871 | /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to |
24872 | /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the |
24873 | /// sve gather prefetch instruction with vector plus immediate addressing mode. |
24874 | static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, |
24875 | unsigned ScalarSizeInBytes) { |
24876 | const unsigned ImmPos = 4, OffsetPos = 3; |
24877 | // No need to combine the node if the immediate is valid... |
24878 | if (isValidImmForSVEVecImmAddrMode(Offset: N->getOperand(Num: ImmPos), ScalarSizeInBytes)) |
24879 | return SDValue(); |
24880 | |
24881 | // ...otherwise swap the offset base with the offset... |
24882 | SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end()); |
24883 | std::swap(a&: Ops[ImmPos], b&: Ops[OffsetPos]); |
24884 | // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to |
24885 | // `aarch64_sve_prfb_gather_uxtw_index`. |
24886 | SDLoc DL(N); |
24887 | Ops[1] = DAG.getConstant(Val: Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL, |
24888 | VT: MVT::i64); |
24889 | |
24890 | return DAG.getNode(Opcode: N->getOpcode(), DL, VTList: DAG.getVTList(VT: MVT::Other), Ops); |
24891 | } |
24892 | |
// Return true if the vector operation can guarantee that only the first lane
// of its result contains data, with all bits in the other lanes set to zero.
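// This holds for the SVE horizontal reductions handled below, which produce
// their scalar result in lane 0 and zero the remaining lanes.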
24895 | static bool isLanes1toNKnownZero(SDValue Op) { |
24896 | switch (Op.getOpcode()) { |
24897 | default: |
24898 | return false; |
24899 | case AArch64ISD::ANDV_PRED: |
24900 | case AArch64ISD::EORV_PRED: |
24901 | case AArch64ISD::FADDA_PRED: |
24902 | case AArch64ISD::FADDV_PRED: |
24903 | case AArch64ISD::FMAXNMV_PRED: |
24904 | case AArch64ISD::FMAXV_PRED: |
24905 | case AArch64ISD::FMINNMV_PRED: |
24906 | case AArch64ISD::FMINV_PRED: |
24907 | case AArch64ISD::ORV_PRED: |
24908 | case AArch64ISD::SADDV_PRED: |
24909 | case AArch64ISD::SMAXV_PRED: |
24910 | case AArch64ISD::SMINV_PRED: |
24911 | case AArch64ISD::UADDV_PRED: |
24912 | case AArch64ISD::UMAXV_PRED: |
24913 | case AArch64ISD::UMINV_PRED: |
24914 | return true; |
24915 | } |
24916 | } |
24917 | |
24918 | static SDValue removeRedundantInsertVectorElt(SDNode *N) { |
24919 | assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!" ); |
24920 | SDValue InsertVec = N->getOperand(Num: 0); |
24921 | SDValue InsertElt = N->getOperand(Num: 1); |
24922 | SDValue InsertIdx = N->getOperand(Num: 2); |
24923 | |
24924 | // We only care about inserts into the first element... |
24925 | if (!isNullConstant(V: InsertIdx)) |
24926 | return SDValue(); |
24927 | // ...of a zero'd vector... |
24928 | if (!ISD::isConstantSplatVectorAllZeros(N: InsertVec.getNode())) |
24929 | return SDValue(); |
24930 | // ...where the inserted data was previously extracted... |
24931 | if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) |
24932 | return SDValue(); |
24933 | |
  SDValue ExtractVec = InsertElt.getOperand(i: 0);
  SDValue ExtractIdx = InsertElt.getOperand(i: 1);
24936 | |
24937 | // ...from the first element of a vector. |
24938 | if (!isNullConstant(V: ExtractIdx)) |
24939 | return SDValue(); |
24940 | |
24941 | // If we get here we are effectively trying to zero lanes 1-N of a vector. |
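  // For example (illustrative), inserting
  //   (extract_vector_elt (AArch64ISD::UADDV_PRED pg, z), 0)
  // into lane 0 of a zero vector of the same type can simply return the
  // UADDV_PRED result, as its lanes 1-N are already known to be zero.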
24942 | |
24943 | // Ensure there's no type conversion going on. |
24944 | if (N->getValueType(ResNo: 0) != ExtractVec.getValueType()) |
24945 | return SDValue(); |
24946 | |
24947 | if (!isLanes1toNKnownZero(Op: ExtractVec)) |
24948 | return SDValue(); |
24949 | |
24950 | // The explicit zeroing is redundant. |
24951 | return ExtractVec; |
24952 | } |
24953 | |
24954 | static SDValue |
24955 | performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { |
24956 | if (SDValue Res = removeRedundantInsertVectorElt(N)) |
24957 | return Res; |
24958 | |
24959 | return performPostLD1Combine(N, DCI, IsLaneOp: true); |
24960 | } |
24961 | |
24962 | static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, |
24963 | TargetLowering::DAGCombinerInfo &DCI, |
24964 | const AArch64Subtarget *Subtarget) { |
24965 | SDValue N0 = N->getOperand(Num: 0); |
24966 | EVT VT = N->getValueType(ResNo: 0); |
24967 | |
  // If this is fp_round(fpextend), don't fold it; allow ourselves to be folded.
24969 | if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) |
24970 | return SDValue(); |
24971 | |
24972 | auto hasValidElementTypeForFPExtLoad = [](EVT VT) { |
24973 | EVT EltVT = VT.getVectorElementType(); |
24974 | return EltVT == MVT::f32 || EltVT == MVT::f64; |
24975 | }; |
24976 | |
24977 | // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) |
24978 | // We purposefully don't care about legality of the nodes here as we know |
24979 | // they can be split down into something legal. |
24980 | if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N: N0.getNode()) && |
24981 | N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && |
24982 | VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) && |
24983 | VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) { |
24984 | LoadSDNode *LN0 = cast<LoadSDNode>(Val&: N0); |
24985 | SDValue ExtLoad = DAG.getExtLoad(ExtType: ISD::EXTLOAD, dl: SDLoc(N), VT, |
24986 | Chain: LN0->getChain(), Ptr: LN0->getBasePtr(), |
24987 | MemVT: N0.getValueType(), MMO: LN0->getMemOperand()); |
24988 | DCI.CombineTo(N, Res: ExtLoad); |
24989 | DCI.CombineTo( |
24990 | N: N0.getNode(), |
24991 | Res0: DAG.getNode(Opcode: ISD::FP_ROUND, DL: SDLoc(N0), VT: N0.getValueType(), N1: ExtLoad, |
24992 | N2: DAG.getIntPtrConstant(Val: 1, DL: SDLoc(N0), /*isTarget=*/true)), |
24993 | Res1: ExtLoad.getValue(R: 1)); |
24994 | return SDValue(N, 0); // Return N so it doesn't get rechecked! |
24995 | } |
24996 | |
24997 | return SDValue(); |
24998 | } |
24999 | |
25000 | static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, |
25001 | const AArch64Subtarget *Subtarget) { |
25002 | EVT VT = N->getValueType(ResNo: 0); |
25003 | |
25004 | // Don't expand for NEON, SVE2 or SME |
25005 | if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME()) |
25006 | return SDValue(); |
25007 | |
25008 | SDLoc DL(N); |
25009 | |
25010 | SDValue Mask = N->getOperand(Num: 0); |
25011 | SDValue In1 = N->getOperand(Num: 1); |
25012 | SDValue In2 = N->getOperand(Num: 2); |
25013 | |
25014 | SDValue InvMask = DAG.getNOT(DL, Val: Mask, VT); |
25015 | SDValue Sel = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: Mask, N2: In1); |
25016 | SDValue SelInv = DAG.getNode(Opcode: ISD::AND, DL, VT, N1: InvMask, N2: In2); |
25017 | return DAG.getNode(Opcode: ISD::OR, DL, VT, N1: Sel, N2: SelInv); |
25018 | } |
25019 | |
25020 | static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) { |
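  // Rewrite (duplane128 (insert_subvector undef, (bitcast X), 0), 0) so that
  // the DUPLANE128 is performed on X's own element type (in its packed SVE
  // container), leaving a single bitcast on the result.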
25021 | EVT VT = N->getValueType(ResNo: 0); |
25022 | |
25023 | SDValue Insert = N->getOperand(Num: 0); |
25024 | if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR) |
25025 | return SDValue(); |
25026 | |
25027 | if (!Insert.getOperand(i: 0).isUndef()) |
25028 | return SDValue(); |
25029 | |
25030 | uint64_t IdxInsert = Insert.getConstantOperandVal(i: 2); |
25031 | uint64_t IdxDupLane = N->getConstantOperandVal(Num: 1); |
25032 | if (IdxInsert != 0 || IdxDupLane != 0) |
25033 | return SDValue(); |
25034 | |
25035 | SDValue Bitcast = Insert.getOperand(i: 1); |
25036 | if (Bitcast.getOpcode() != ISD::BITCAST) |
25037 | return SDValue(); |
25038 | |
25039 | SDValue Subvec = Bitcast.getOperand(i: 0); |
25040 | EVT SubvecVT = Subvec.getValueType(); |
25041 | if (!SubvecVT.is128BitVector()) |
25042 | return SDValue(); |
25043 | EVT NewSubvecVT = |
25044 | getPackedSVEVectorVT(VT: Subvec.getValueType().getVectorElementType()); |
25045 | |
25046 | SDLoc DL(N); |
25047 | SDValue NewInsert = |
25048 | DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT: NewSubvecVT, |
25049 | N1: DAG.getUNDEF(VT: NewSubvecVT), N2: Subvec, N3: Insert->getOperand(Num: 2)); |
25050 | SDValue NewDuplane128 = DAG.getNode(Opcode: AArch64ISD::DUPLANE128, DL, VT: NewSubvecVT, |
25051 | N1: NewInsert, N2: N->getOperand(Num: 1)); |
25052 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: NewDuplane128); |
25053 | } |
25054 | |
25055 | // Try to combine mull with uzp1. |
25056 | static SDValue tryCombineMULLWithUZP1(SDNode *N, |
25057 | TargetLowering::DAGCombinerInfo &DCI, |
25058 | SelectionDAG &DAG) { |
25059 | if (DCI.isBeforeLegalizeOps()) |
25060 | return SDValue(); |
25061 | |
25062 | SDValue LHS = N->getOperand(Num: 0); |
25063 | SDValue RHS = N->getOperand(Num: 1); |
25064 | |
  SDValue ExtractHigh;
  SDValue ExtractLow;
25067 | SDValue TruncHigh; |
25068 | SDValue TruncLow; |
25069 | SDLoc DL(N); |
25070 | |
25071 | // Check the operands are trunc and extract_high. |
25072 | if (isEssentiallyExtractHighSubvector(N: LHS) && |
25073 | RHS.getOpcode() == ISD::TRUNCATE) { |
25074 | TruncHigh = RHS; |
25075 | if (LHS.getOpcode() == ISD::BITCAST) |
25076 | ExtractHigh = LHS.getOperand(i: 0); |
25077 | else |
25078 | ExtractHigh = LHS; |
25079 | } else if (isEssentiallyExtractHighSubvector(N: RHS) && |
25080 | LHS.getOpcode() == ISD::TRUNCATE) { |
25081 | TruncHigh = LHS; |
25082 | if (RHS.getOpcode() == ISD::BITCAST) |
25083 | ExtractHigh = RHS.getOperand(i: 0); |
25084 | else |
25085 | ExtractHigh = RHS; |
25086 | } else |
25087 | return SDValue(); |
25088 | |
  // If the truncate's operand is a DUP or another splat value, do not combine
  // the op with uzp1; doing so regresses test/CodeGen/AArch64/aarch64-smull.ll.
25092 | SDValue TruncHighOp = TruncHigh.getOperand(i: 0); |
25093 | EVT TruncHighOpVT = TruncHighOp.getValueType(); |
25094 | if (TruncHighOp.getOpcode() == AArch64ISD::DUP || |
25095 | DAG.isSplatValue(V: TruncHighOp, AllowUndefs: false)) |
25096 | return SDValue(); |
25097 | |
25098 | // Check there is other extract_high with same source vector. |
25099 | // For example, |
25100 | // |
25101 | // t18: v4i16 = extract_subvector t2, Constant:i64<0> |
25102 | // t12: v4i16 = truncate t11 |
25103 | // t31: v4i32 = AArch64ISD::SMULL t18, t12 |
25104 | // t23: v4i16 = extract_subvector t2, Constant:i64<4> |
25105 | // t16: v4i16 = truncate t15 |
  // t30: v4i32 = AArch64ISD::SMULL t23, t16
25107 | // |
  // This dagcombine assumes the two extract_high nodes use the same source
  // vector in order to detect the pair of MULLs; if they use different source
  // vectors, the pair is not detected.
25111 | // TODO: Should also try to look through a bitcast. |
25112 | bool HasFoundMULLow = true; |
  SDValue ExtractHighSrcVec = ExtractHigh.getOperand(i: 0);
25114 | if (ExtractHighSrcVec->use_size() != 2) |
25115 | HasFoundMULLow = false; |
25116 | |
25117 | // Find ExtractLow. |
25118 | for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) { |
25119 | if (User == ExtractHigh.getNode()) |
25120 | continue; |
25121 | |
25122 | if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || |
25123 | !isNullConstant(V: User->getOperand(Num: 1))) { |
25124 | HasFoundMULLow = false; |
25125 | break; |
25126 | } |
25127 | |
25128 | ExtractLow.setNode(User); |
25129 | } |
25130 | |
25131 | if (!ExtractLow || !ExtractLow->hasOneUse()) |
25132 | HasFoundMULLow = false; |
25133 | |
25134 | // Check ExtractLow's user. |
25135 | if (HasFoundMULLow) { |
    SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
25137 | if (ExtractLowUser->getOpcode() != N->getOpcode()) { |
25138 | HasFoundMULLow = false; |
25139 | } else { |
25140 | if (ExtractLowUser->getOperand(Num: 0) == ExtractLow) { |
25141 | if (ExtractLowUser->getOperand(Num: 1).getOpcode() == ISD::TRUNCATE) |
25142 | TruncLow = ExtractLowUser->getOperand(Num: 1); |
25143 | else |
25144 | HasFoundMULLow = false; |
25145 | } else { |
25146 | if (ExtractLowUser->getOperand(Num: 0).getOpcode() == ISD::TRUNCATE) |
25147 | TruncLow = ExtractLowUser->getOperand(Num: 0); |
25148 | else |
25149 | HasFoundMULLow = false; |
25150 | } |
25151 | } |
25152 | } |
25153 | |
  // If the truncate's operand is a DUP or another splat value, do not combine
  // the op with uzp1; doing so regresses test/CodeGen/AArch64/aarch64-smull.ll.
25157 | EVT TruncHighVT = TruncHigh.getValueType(); |
25158 | EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
25159 | SDValue TruncLowOp = |
25160 | HasFoundMULLow ? TruncLow.getOperand(i: 0) : DAG.getUNDEF(VT: UZP1VT); |
25161 | EVT TruncLowOpVT = TruncLowOp.getValueType(); |
25162 | if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP || |
25163 | DAG.isSplatValue(V: TruncLowOp, AllowUndefs: false))) |
25164 | return SDValue(); |
25165 | |
25166 | // Create uzp1, extract_high and extract_low. |
25167 | if (TruncHighOpVT != UZP1VT) |
25168 | TruncHighOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncHighOp); |
25169 | if (TruncLowOpVT != UZP1VT) |
25170 | TruncLowOp = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: UZP1VT, Operand: TruncLowOp); |
25171 | |
25172 | SDValue UZP1 = |
25173 | DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: UZP1VT, N1: TruncLowOp, N2: TruncHighOp); |
25174 | SDValue HighIdxCst = |
25175 | DAG.getConstant(Val: TruncHighVT.getVectorNumElements(), DL, VT: MVT::i64); |
25176 | SDValue NewTruncHigh = |
25177 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncHighVT, N1: UZP1, N2: HighIdxCst); |
25178 | DAG.ReplaceAllUsesWith(From: TruncHigh, To: NewTruncHigh); |
25179 | |
25180 | if (HasFoundMULLow) { |
25181 | EVT TruncLowVT = TruncLow.getValueType(); |
25182 | SDValue NewTruncLow = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: TruncLowVT, |
25183 | N1: UZP1, N2: ExtractLow.getOperand(i: 1)); |
25184 | DAG.ReplaceAllUsesWith(From: TruncLow, To: NewTruncLow); |
25185 | } |
25186 | |
25187 | return SDValue(N, 0); |
25188 | } |
25189 | |
25190 | static SDValue performMULLCombine(SDNode *N, |
25191 | TargetLowering::DAGCombinerInfo &DCI, |
25192 | SelectionDAG &DAG) { |
25193 | if (SDValue Val = |
25194 | tryCombineLongOpWithDup(IID: Intrinsic::not_intrinsic, N, DCI, DAG)) |
25195 | return Val; |
25196 | |
25197 | if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG)) |
25198 | return Val; |
25199 | |
25200 | return SDValue(); |
25201 | } |
25202 | |
25203 | static SDValue |
25204 | performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, |
25205 | SelectionDAG &DAG) { |
  // Let's do the following transform.
25207 | // |
25208 | // t34: v4i32 = AArch64ISD::UADDLV t2 |
25209 | // t35: i32 = extract_vector_elt t34, Constant:i64<0> |
25210 | // t7: i64 = zero_extend t35 |
25211 | // t20: v1i64 = scalar_to_vector t7 |
25212 | // ==> |
25213 | // t34: v4i32 = AArch64ISD::UADDLV t2 |
25214 | // t39: v2i32 = extract_subvector t34, Constant:i64<0> |
25215 | // t40: v1i64 = AArch64ISD::NVCAST t39 |
25216 | if (DCI.isBeforeLegalizeOps()) |
25217 | return SDValue(); |
25218 | |
25219 | EVT VT = N->getValueType(ResNo: 0); |
25220 | if (VT != MVT::v1i64) |
25221 | return SDValue(); |
25222 | |
25223 | SDValue ZEXT = N->getOperand(Num: 0); |
25224 | if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64) |
25225 | return SDValue(); |
25226 | |
  SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(i: 0);
25228 | if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT || |
25229 | EXTRACT_VEC_ELT.getValueType() != MVT::i32) |
25230 | return SDValue(); |
25231 | |
25232 | if (!isNullConstant(V: EXTRACT_VEC_ELT.getOperand(i: 1))) |
25233 | return SDValue(); |
25234 | |
25235 | SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(i: 0); |
25236 | if (UADDLV.getOpcode() != AArch64ISD::UADDLV || |
25237 | UADDLV.getValueType() != MVT::v4i32 || |
25238 | UADDLV.getOperand(i: 0).getValueType() != MVT::v8i8) |
25239 | return SDValue(); |
25240 | |
25241 | // Let's generate new sequence with AArch64ISD::NVCAST. |
25242 | SDLoc DL(N); |
  SDValue EXTRACT_SUBVEC =
25244 | DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT: MVT::v2i32, N1: UADDLV, |
25245 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
25246 | SDValue NVCAST = |
25247 | DAG.getNode(Opcode: AArch64ISD::NVCAST, DL, VT: MVT::v1i64, Operand: EXTRACT_SUBVEC); |
25248 | |
25249 | return NVCAST; |
25250 | } |
25251 | |
25252 | SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, |
25253 | DAGCombinerInfo &DCI) const { |
25254 | SelectionDAG &DAG = DCI.DAG; |
25255 | switch (N->getOpcode()) { |
25256 | default: |
25257 | LLVM_DEBUG(dbgs() << "Custom combining: skipping\n" ); |
25258 | break; |
25259 | case ISD::VECREDUCE_AND: |
25260 | case ISD::VECREDUCE_OR: |
25261 | case ISD::VECREDUCE_XOR: |
25262 | return performVecReduceBitwiseCombine(N, DCI, DAG); |
25263 | case ISD::ADD: |
25264 | case ISD::SUB: |
25265 | return performAddSubCombine(N, DCI); |
25266 | case ISD::BUILD_VECTOR: |
25267 | return performBuildVectorCombine(N, DCI, DAG); |
25268 | case ISD::TRUNCATE: |
25269 | return performTruncateCombine(N, DAG); |
25270 | case AArch64ISD::ANDS: |
25271 | return performFlagSettingCombine(N, DCI, GenericOpcode: ISD::AND); |
25272 | case AArch64ISD::ADC: |
25273 | if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true)) |
25274 | return R; |
25275 | return foldADCToCINC(N, DAG); |
25276 | case AArch64ISD::SBC: |
25277 | return foldOverflowCheck(Op: N, DAG, /* IsAdd */ false); |
25278 | case AArch64ISD::ADCS: |
25279 | if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ true)) |
25280 | return R; |
25281 | return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::ADC); |
25282 | case AArch64ISD::SBCS: |
25283 | if (auto R = foldOverflowCheck(Op: N, DAG, /* IsAdd */ false)) |
25284 | return R; |
25285 | return performFlagSettingCombine(N, DCI, GenericOpcode: AArch64ISD::SBC); |
25286 | case AArch64ISD::BICi: { |
25287 | APInt DemandedBits = |
25288 | APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getScalarSizeInBits()); |
25289 | APInt DemandedElts = |
25290 | APInt::getAllOnes(numBits: N->getValueType(ResNo: 0).getVectorNumElements()); |
25291 | |
25292 | if (DAG.getTargetLoweringInfo().SimplifyDemandedBits( |
25293 | Op: SDValue(N, 0), DemandedBits, DemandedElts, DCI)) |
25294 | return SDValue(); |
25295 | |
25296 | break; |
25297 | } |
25298 | case ISD::XOR: |
25299 | return performXorCombine(N, DAG, DCI, Subtarget); |
25300 | case ISD::MUL: |
25301 | return performMulCombine(N, DAG, DCI, Subtarget); |
25302 | case ISD::SINT_TO_FP: |
25303 | case ISD::UINT_TO_FP: |
25304 | return performIntToFpCombine(N, DAG, Subtarget); |
25305 | case ISD::FP_TO_SINT: |
25306 | case ISD::FP_TO_UINT: |
25307 | case ISD::FP_TO_SINT_SAT: |
25308 | case ISD::FP_TO_UINT_SAT: |
25309 | return performFpToIntCombine(N, DAG, DCI, Subtarget); |
25310 | case ISD::OR: |
25311 | return performORCombine(N, DCI, Subtarget, TLI: *this); |
25312 | case ISD::AND: |
25313 | return performANDCombine(N, DCI); |
25314 | case ISD::FADD: |
25315 | return performFADDCombine(N, DCI); |
25316 | case ISD::INTRINSIC_WO_CHAIN: |
25317 | return performIntrinsicCombine(N, DCI, Subtarget); |
25318 | case ISD::ANY_EXTEND: |
25319 | case ISD::ZERO_EXTEND: |
25320 | case ISD::SIGN_EXTEND: |
25321 | return performExtendCombine(N, DCI, DAG); |
25322 | case ISD::SIGN_EXTEND_INREG: |
25323 | return performSignExtendInRegCombine(N, DCI, DAG); |
25324 | case ISD::CONCAT_VECTORS: |
25325 | return performConcatVectorsCombine(N, DCI, DAG); |
25326 | case ISD::EXTRACT_SUBVECTOR: |
25327 | return performExtractSubvectorCombine(N, DCI, DAG); |
25328 | case ISD::INSERT_SUBVECTOR: |
25329 | return performInsertSubvectorCombine(N, DCI, DAG); |
25330 | case ISD::SELECT: |
25331 | return performSelectCombine(N, DCI); |
25332 | case ISD::VSELECT: |
25333 | return performVSelectCombine(N, DAG&: DCI.DAG); |
25334 | case ISD::SETCC: |
25335 | return performSETCCCombine(N, DCI, DAG); |
25336 | case ISD::LOAD: |
25337 | return performLOADCombine(N, DCI, DAG, Subtarget); |
25338 | case ISD::STORE: |
25339 | return performSTORECombine(N, DCI, DAG, Subtarget); |
25340 | case ISD::MSTORE: |
25341 | return performMSTORECombine(N, DCI, DAG, Subtarget); |
25342 | case ISD::MGATHER: |
25343 | case ISD::MSCATTER: |
25344 | return performMaskedGatherScatterCombine(N, DCI, DAG); |
25345 | case ISD::FP_EXTEND: |
25346 | return performFPExtendCombine(N, DAG, DCI, Subtarget); |
25347 | case AArch64ISD::BRCOND: |
25348 | return performBRCONDCombine(N, DCI, DAG); |
25349 | case AArch64ISD::TBNZ: |
25350 | case AArch64ISD::TBZ: |
25351 | return performTBZCombine(N, DCI, DAG); |
25352 | case AArch64ISD::CSEL: |
25353 | return performCSELCombine(N, DCI, DAG); |
25354 | case AArch64ISD::DUP: |
25355 | case AArch64ISD::DUPLANE8: |
25356 | case AArch64ISD::DUPLANE16: |
25357 | case AArch64ISD::DUPLANE32: |
25358 | case AArch64ISD::DUPLANE64: |
25359 | return performDUPCombine(N, DCI); |
25360 | case AArch64ISD::DUPLANE128: |
25361 | return performDupLane128Combine(N, DAG); |
25362 | case AArch64ISD::NVCAST: |
25363 | return performNVCASTCombine(N, DAG); |
25364 | case AArch64ISD::SPLICE: |
25365 | return performSpliceCombine(N, DAG); |
25366 | case AArch64ISD::UUNPKLO: |
25367 | case AArch64ISD::UUNPKHI: |
25368 | return performUnpackCombine(N, DAG, Subtarget); |
25369 | case AArch64ISD::UZP1: |
25370 | case AArch64ISD::UZP2: |
25371 | return performUzpCombine(N, DAG, Subtarget); |
25372 | case AArch64ISD::SETCC_MERGE_ZERO: |
25373 | return performSetccMergeZeroCombine(N, DCI); |
25374 | case AArch64ISD::REINTERPRET_CAST: |
25375 | return performReinterpretCastCombine(N); |
25376 | case AArch64ISD::GLD1_MERGE_ZERO: |
25377 | case AArch64ISD::GLD1_SCALED_MERGE_ZERO: |
25378 | case AArch64ISD::GLD1_UXTW_MERGE_ZERO: |
25379 | case AArch64ISD::GLD1_SXTW_MERGE_ZERO: |
25380 | case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO: |
25381 | case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO: |
25382 | case AArch64ISD::GLD1_IMM_MERGE_ZERO: |
25383 | case AArch64ISD::GLD1S_MERGE_ZERO: |
25384 | case AArch64ISD::GLD1S_SCALED_MERGE_ZERO: |
25385 | case AArch64ISD::GLD1S_UXTW_MERGE_ZERO: |
25386 | case AArch64ISD::GLD1S_SXTW_MERGE_ZERO: |
25387 | case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO: |
25388 | case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO: |
25389 | case AArch64ISD::GLD1S_IMM_MERGE_ZERO: |
25390 | return performGLD1Combine(N, DAG); |
25391 | case AArch64ISD::VASHR: |
25392 | case AArch64ISD::VLSHR: |
25393 | return performVectorShiftCombine(N, TLI: *this, DCI); |
25394 | case AArch64ISD::SUNPKLO: |
25395 | return performSunpkloCombine(N, DAG); |
25396 | case AArch64ISD::BSP: |
25397 | return performBSPExpandForSVE(N, DAG, Subtarget); |
25398 | case ISD::INSERT_VECTOR_ELT: |
25399 | return performInsertVectorEltCombine(N, DCI); |
25400 | case ISD::EXTRACT_VECTOR_ELT: |
25401 | return performExtractVectorEltCombine(N, DCI, Subtarget); |
25402 | case ISD::VECREDUCE_ADD: |
25403 | return performVecReduceAddCombine(N, DAG&: DCI.DAG, ST: Subtarget); |
25404 | case AArch64ISD::UADDV: |
25405 | return performUADDVCombine(N, DAG); |
25406 | case AArch64ISD::SMULL: |
25407 | case AArch64ISD::UMULL: |
25408 | case AArch64ISD::PMULL: |
25409 | return performMULLCombine(N, DCI, DAG); |
25410 | case ISD::INTRINSIC_VOID: |
25411 | case ISD::INTRINSIC_W_CHAIN: |
25412 | switch (N->getConstantOperandVal(Num: 1)) { |
25413 | case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: |
25414 | return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 1 /*=ScalarSizeInBytes*/); |
25415 | case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: |
25416 | return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 2 /*=ScalarSizeInBytes*/); |
25417 | case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: |
25418 | return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 4 /*=ScalarSizeInBytes*/); |
25419 | case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: |
25420 | return combineSVEPrefetchVecBaseImmOff(N, DAG, ScalarSizeInBytes: 8 /*=ScalarSizeInBytes*/); |
25421 | case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: |
25422 | case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: |
25423 | case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: |
25424 | case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: |
25425 | case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: |
25426 | case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: |
25427 | case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: |
25428 | case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: |
25429 | return legalizeSVEGatherPrefetchOffsVec(N, DAG); |
25430 | case Intrinsic::aarch64_neon_ld2: |
25431 | case Intrinsic::aarch64_neon_ld3: |
25432 | case Intrinsic::aarch64_neon_ld4: |
25433 | case Intrinsic::aarch64_neon_ld1x2: |
25434 | case Intrinsic::aarch64_neon_ld1x3: |
25435 | case Intrinsic::aarch64_neon_ld1x4: |
25436 | case Intrinsic::aarch64_neon_ld2lane: |
25437 | case Intrinsic::aarch64_neon_ld3lane: |
25438 | case Intrinsic::aarch64_neon_ld4lane: |
25439 | case Intrinsic::aarch64_neon_ld2r: |
25440 | case Intrinsic::aarch64_neon_ld3r: |
25441 | case Intrinsic::aarch64_neon_ld4r: |
25442 | case Intrinsic::aarch64_neon_st2: |
25443 | case Intrinsic::aarch64_neon_st3: |
25444 | case Intrinsic::aarch64_neon_st4: |
25445 | case Intrinsic::aarch64_neon_st1x2: |
25446 | case Intrinsic::aarch64_neon_st1x3: |
25447 | case Intrinsic::aarch64_neon_st1x4: |
25448 | case Intrinsic::aarch64_neon_st2lane: |
25449 | case Intrinsic::aarch64_neon_st3lane: |
25450 | case Intrinsic::aarch64_neon_st4lane: |
25451 | return performNEONPostLDSTCombine(N, DCI, DAG); |
25452 | case Intrinsic::aarch64_sve_ldnt1: |
25453 | return performLDNT1Combine(N, DAG); |
25454 | case Intrinsic::aarch64_sve_ld1rq: |
25455 | return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG); |
25456 | case Intrinsic::aarch64_sve_ld1ro: |
25457 | return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG); |
25458 | case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: |
25459 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO); |
25460 | case Intrinsic::aarch64_sve_ldnt1_gather: |
25461 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO); |
25462 | case Intrinsic::aarch64_sve_ldnt1_gather_index: |
25463 | return performGatherLoadCombine(N, DAG, |
25464 | Opcode: AArch64ISD::GLDNT1_INDEX_MERGE_ZERO); |
25465 | case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: |
25466 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDNT1_MERGE_ZERO); |
25467 | case Intrinsic::aarch64_sve_ld1: |
25468 | return performLD1Combine(N, DAG, Opc: AArch64ISD::LD1_MERGE_ZERO); |
25469 | case Intrinsic::aarch64_sve_ldnf1: |
25470 | return performLD1Combine(N, DAG, Opc: AArch64ISD::LDNF1_MERGE_ZERO); |
25471 | case Intrinsic::aarch64_sve_ldff1: |
25472 | return performLD1Combine(N, DAG, Opc: AArch64ISD::LDFF1_MERGE_ZERO); |
25473 | case Intrinsic::aarch64_sve_st1: |
25474 | return performST1Combine(N, DAG); |
25475 | case Intrinsic::aarch64_sve_stnt1: |
25476 | return performSTNT1Combine(N, DAG); |
25477 | case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: |
25478 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED); |
25479 | case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: |
25480 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED); |
25481 | case Intrinsic::aarch64_sve_stnt1_scatter: |
25482 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_PRED); |
25483 | case Intrinsic::aarch64_sve_stnt1_scatter_index: |
25484 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SSTNT1_INDEX_PRED); |
25485 | case Intrinsic::aarch64_sve_ld1_gather: |
25486 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_MERGE_ZERO); |
25487 | case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: |
25488 | case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: |
25489 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1Q_MERGE_ZERO); |
25490 | case Intrinsic::aarch64_sve_ld1q_gather_index: |
25491 | return performGatherLoadCombine(N, DAG, |
25492 | Opcode: AArch64ISD::GLD1Q_INDEX_MERGE_ZERO); |
25493 | case Intrinsic::aarch64_sve_ld1_gather_index: |
25494 | return performGatherLoadCombine(N, DAG, |
25495 | Opcode: AArch64ISD::GLD1_SCALED_MERGE_ZERO); |
25496 | case Intrinsic::aarch64_sve_ld1_gather_sxtw: |
25497 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_SXTW_MERGE_ZERO, |
25498 | /*OnlyPackedOffsets=*/false); |
25499 | case Intrinsic::aarch64_sve_ld1_gather_uxtw: |
25500 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_UXTW_MERGE_ZERO, |
25501 | /*OnlyPackedOffsets=*/false); |
25502 | case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: |
25503 | return performGatherLoadCombine(N, DAG, |
25504 | Opcode: AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO, |
25505 | /*OnlyPackedOffsets=*/false); |
25506 | case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: |
25507 | return performGatherLoadCombine(N, DAG, |
25508 | Opcode: AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO, |
25509 | /*OnlyPackedOffsets=*/false); |
25510 | case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: |
25511 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLD1_IMM_MERGE_ZERO); |
25512 | case Intrinsic::aarch64_sve_ldff1_gather: |
25513 | return performGatherLoadCombine(N, DAG, Opcode: AArch64ISD::GLDFF1_MERGE_ZERO); |
25514 | case Intrinsic::aarch64_sve_ldff1_gather_index: |
25515 | return performGatherLoadCombine(N, DAG, |
25516 | Opcode: AArch64ISD::GLDFF1_SCALED_MERGE_ZERO); |
25517 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw: |
25518 | return performGatherLoadCombine(N, DAG, |
25519 | Opcode: AArch64ISD::GLDFF1_SXTW_MERGE_ZERO, |
25520 | /*OnlyPackedOffsets=*/false); |
25521 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw: |
25522 | return performGatherLoadCombine(N, DAG, |
25523 | Opcode: AArch64ISD::GLDFF1_UXTW_MERGE_ZERO, |
25524 | /*OnlyPackedOffsets=*/false); |
25525 | case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: |
25526 | return performGatherLoadCombine(N, DAG, |
25527 | Opcode: AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO, |
25528 | /*OnlyPackedOffsets=*/false); |
25529 | case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: |
25530 | return performGatherLoadCombine(N, DAG, |
25531 | Opcode: AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO, |
25532 | /*OnlyPackedOffsets=*/false); |
25533 | case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: |
25534 | return performGatherLoadCombine(N, DAG, |
25535 | Opcode: AArch64ISD::GLDFF1_IMM_MERGE_ZERO); |
25536 | case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: |
25537 | case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: |
25538 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_PRED); |
25539 | case Intrinsic::aarch64_sve_st1q_scatter_index: |
25540 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1Q_INDEX_PRED); |
25541 | case Intrinsic::aarch64_sve_st1_scatter: |
25542 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_PRED); |
25543 | case Intrinsic::aarch64_sve_st1_scatter_index: |
25544 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SCALED_PRED); |
25545 | case Intrinsic::aarch64_sve_st1_scatter_sxtw: |
25546 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_SXTW_PRED, |
25547 | /*OnlyPackedOffsets=*/false); |
25548 | case Intrinsic::aarch64_sve_st1_scatter_uxtw: |
25549 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_UXTW_PRED, |
25550 | /*OnlyPackedOffsets=*/false); |
25551 | case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: |
25552 | return performScatterStoreCombine(N, DAG, |
25553 | Opcode: AArch64ISD::SST1_SXTW_SCALED_PRED, |
25554 | /*OnlyPackedOffsets=*/false); |
25555 | case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: |
25556 | return performScatterStoreCombine(N, DAG, |
25557 | Opcode: AArch64ISD::SST1_UXTW_SCALED_PRED, |
25558 | /*OnlyPackedOffsets=*/false); |
25559 | case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: |
25560 | return performScatterStoreCombine(N, DAG, Opcode: AArch64ISD::SST1_IMM_PRED); |
25561 | case Intrinsic::aarch64_rndr: |
25562 | case Intrinsic::aarch64_rndrrs: { |
25563 | unsigned IntrinsicID = N->getConstantOperandVal(Num: 1); |
25564 | auto Register = |
25565 | (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR |
25566 | : AArch64SysReg::RNDRRS); |
25567 | SDLoc DL(N); |
25568 | SDValue A = DAG.getNode( |
25569 | Opcode: AArch64ISD::MRS, DL, VTList: DAG.getVTList(VT1: MVT::i64, VT2: MVT::Glue, VT3: MVT::Other), |
25570 | N1: N->getOperand(Num: 0), N2: DAG.getConstant(Val: Register, DL, VT: MVT::i64)); |
25571 | SDValue B = DAG.getNode( |
25572 | Opcode: AArch64ISD::CSINC, DL, VT: MVT::i32, N1: DAG.getConstant(Val: 0, DL, VT: MVT::i32), |
25573 | N2: DAG.getConstant(Val: 0, DL, VT: MVT::i32), |
25574 | N3: DAG.getConstant(Val: AArch64CC::NE, DL, VT: MVT::i32), N4: A.getValue(R: 1)); |
25575 | return DAG.getMergeValues( |
25576 | Ops: {A, DAG.getZExtOrTrunc(Op: B, DL, VT: MVT::i1), A.getValue(R: 2)}, dl: DL); |
25577 | } |
25578 | case Intrinsic::aarch64_sme_ldr_zt: |
25579 | return DAG.getNode(Opcode: AArch64ISD::RESTORE_ZT, DL: SDLoc(N), |
25580 | VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0), |
25581 | N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
25582 | case Intrinsic::aarch64_sme_str_zt: |
25583 | return DAG.getNode(Opcode: AArch64ISD::SAVE_ZT, DL: SDLoc(N), |
25584 | VTList: DAG.getVTList(VT: MVT::Other), N1: N->getOperand(Num: 0), |
25585 | N2: N->getOperand(Num: 2), N3: N->getOperand(Num: 3)); |
25586 | default: |
25587 | break; |
25588 | } |
25589 | break; |
25590 | case ISD::GlobalAddress: |
25591 | return performGlobalAddressCombine(N, DAG, Subtarget, TM: getTargetMachine()); |
25592 | case ISD::CTLZ: |
25593 | return performCTLZCombine(N, DAG, Subtarget); |
25594 | case ISD::SCALAR_TO_VECTOR: |
25595 | return performScalarToVectorCombine(N, DCI, DAG); |
25596 | } |
25597 | return SDValue(); |
25598 | } |
25599 | |
// Check that the return value is used only as a return value, as otherwise
// we can't perform a tail call. In particular, we need to check for target
// ISD nodes that are returns and any other "odd" constructs that the generic
// analysis code won't necessarily catch.
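// For example, the result of a call in "return foo(x);" typically flows
// through a CopyToReg into an AArch64ISD::RET_GLUE node and qualifies; a
// value that is additionally stored or compared does not.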
25604 | bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, |
25605 | SDValue &Chain) const { |
25606 | if (N->getNumValues() != 1) |
25607 | return false; |
25608 | if (!N->hasNUsesOfValue(NUses: 1, Value: 0)) |
25609 | return false; |
25610 | |
25611 | SDValue TCChain = Chain; |
25612 | SDNode *Copy = *N->use_begin(); |
25613 | if (Copy->getOpcode() == ISD::CopyToReg) { |
25614 | // If the copy has a glue operand, we conservatively assume it isn't safe to |
25615 | // perform a tail call. |
25616 | if (Copy->getOperand(Num: Copy->getNumOperands() - 1).getValueType() == |
25617 | MVT::Glue) |
25618 | return false; |
25619 | TCChain = Copy->getOperand(Num: 0); |
25620 | } else if (Copy->getOpcode() != ISD::FP_EXTEND) |
25621 | return false; |
25622 | |
25623 | bool HasRet = false; |
25624 | for (SDNode *Node : Copy->uses()) { |
25625 | if (Node->getOpcode() != AArch64ISD::RET_GLUE) |
25626 | return false; |
25627 | HasRet = true; |
25628 | } |
25629 | |
25630 | if (!HasRet) |
25631 | return false; |
25632 | |
25633 | Chain = TCChain; |
25634 | return true; |
25635 | } |
25636 | |
// Return whether an instruction can potentially be optimized to a tail call.
// This will cause the optimizers to attempt to move, or duplicate, return
// instructions to help enable tail call optimizations for this instruction.
25641 | bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { |
25642 | return CI->isTailCall(); |
25643 | } |
25644 | |
25645 | bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base, |
25646 | Register Offset, bool IsPre, |
25647 | MachineRegisterInfo &MRI) const { |
25648 | auto CstOffset = getIConstantVRegVal(VReg: Offset, MRI); |
25649 | if (!CstOffset || CstOffset->isZero()) |
25650 | return false; |
25651 | |
25652 | // All of the indexed addressing mode instructions take a signed 9 bit |
25653 | // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already |
25654 | // encodes the sign/indexing direction. |
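// For example, an offset in the range [-256, 255] can be folded into a
// pre/post-indexed access, while anything outside that range must remain a
// separate pointer add.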
25655 | return isInt<9>(x: CstOffset->getSExtValue()); |
25656 | } |
25657 | |
25658 | bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op, |
25659 | SDValue &Base, |
25660 | SDValue &Offset, |
25661 | SelectionDAG &DAG) const { |
25662 | if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) |
25663 | return false; |
25664 | |
25665 | // Non-null if there is exactly one user of the loaded value (ignoring chain). |
25666 | SDNode *ValOnlyUser = nullptr; |
25667 | for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; |
25668 | ++UI) { |
25669 | if (UI.getUse().getResNo() == 1) |
25670 | continue; // Ignore chain. |
25671 | if (ValOnlyUser == nullptr) |
25672 | ValOnlyUser = *UI; |
25673 | else { |
25674 | ValOnlyUser = nullptr; // Multiple non-chain uses, bail out. |
25675 | break; |
25676 | } |
25677 | } |
25678 | |
25679 | auto IsUndefOrZero = [](SDValue V) { |
25680 | return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true); |
25681 | }; |
25682 | |
25683 | // If the only user of the value is a scalable vector splat, it is |
25684 | // preferable to do a replicating load (ld1r*). |
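// For example, a load whose sole use is a SPLAT_VECTOR to nxv4i32 is better
// matched as an SVE LD1RW than as an indexed load followed by a broadcast,
// so decline to form an indexed access here.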
25685 | if (ValOnlyUser && ValOnlyUser->getValueType(ResNo: 0).isScalableVector() && |
25686 | (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR || |
25687 | (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU && |
25688 | IsUndefOrZero(ValOnlyUser->getOperand(Num: 2))))) |
25689 | return false; |
25690 | |
25691 | Base = Op->getOperand(Num: 0); |
25692 | // All of the indexed addressing mode instructions take a signed |
25693 | // 9 bit immediate offset. |
25694 | if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Val: Op->getOperand(Num: 1))) { |
25695 | int64_t RHSC = RHS->getSExtValue(); |
25696 | if (Op->getOpcode() == ISD::SUB) |
25697 | RHSC = -(uint64_t)RHSC; |
25698 | if (!isInt<9>(x: RHSC)) |
25699 | return false; |
25700 | // Always emit pre-inc/post-inc addressing mode. Use negated constant offset |
25701 | // when dealing with subtraction. |
25702 | Offset = DAG.getConstant(Val: RHSC, DL: SDLoc(N), VT: RHS->getValueType(ResNo: 0)); |
25703 | return true; |
25704 | } |
25705 | return false; |
25706 | } |
25707 | |
25708 | bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, |
25709 | SDValue &Offset, |
25710 | ISD::MemIndexedMode &AM, |
25711 | SelectionDAG &DAG) const { |
25712 | EVT VT; |
25713 | SDValue Ptr; |
25714 | if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) { |
25715 | VT = LD->getMemoryVT(); |
25716 | Ptr = LD->getBasePtr(); |
25717 | } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) { |
25718 | VT = ST->getMemoryVT(); |
25719 | Ptr = ST->getBasePtr(); |
25720 | } else |
25721 | return false; |
25722 | |
25723 | if (!getIndexedAddressParts(N, Op: Ptr.getNode(), Base, Offset, DAG)) |
25724 | return false; |
25725 | AM = ISD::PRE_INC; |
25726 | return true; |
25727 | } |
25728 | |
25729 | bool AArch64TargetLowering::getPostIndexedAddressParts( |
25730 | SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, |
25731 | ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { |
25732 | EVT VT; |
25733 | SDValue Ptr; |
25734 | if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val: N)) { |
25735 | VT = LD->getMemoryVT(); |
25736 | Ptr = LD->getBasePtr(); |
25737 | } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Val: N)) { |
25738 | VT = ST->getMemoryVT(); |
25739 | Ptr = ST->getBasePtr(); |
25740 | } else |
25741 | return false; |
25742 | |
25743 | if (!getIndexedAddressParts(N, Op, Base, Offset, DAG)) |
25744 | return false; |
25745 | // Post-indexing updates the base, so it's not a valid transform |
25746 | // if that's not the same as the load's pointer. |
25747 | if (Ptr != Base) |
25748 | return false; |
25749 | AM = ISD::POST_INC; |
25750 | return true; |
25751 | } |
25752 | |
25753 | static void replaceBoolVectorBitcast(SDNode *N, |
25754 | SmallVectorImpl<SDValue> &Results, |
25755 | SelectionDAG &DAG) { |
25756 | SDLoc DL(N); |
25757 | SDValue Op = N->getOperand(Num: 0); |
25758 | EVT VT = N->getValueType(ResNo: 0); |
25759 | [[maybe_unused]] EVT SrcVT = Op.getValueType(); |
25760 | assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && |
25761 | "Must be bool vector." ); |
25762 | |
25763 | // Special handling for Clang's __builtin_convertvector. For vectors with <8 |
25764 | // elements, it adds a vector concatenation with undef(s). If we encounter |
25765 | // this here, we can skip the concat. |
25766 | if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(i: 0).isUndef()) { |
25767 | bool AllUndef = true; |
25768 | for (unsigned I = 1; I < Op.getNumOperands(); ++I) |
25769 | AllUndef &= Op.getOperand(i: I).isUndef(); |
25770 | |
25771 | if (AllUndef) |
25772 | Op = Op.getOperand(i: 0); |
25773 | } |
25774 | |
25775 | SDValue VectorBits = vectorToScalarBitmask(N: Op.getNode(), DAG); |
25776 | if (VectorBits) |
25777 | Results.push_back(Elt: DAG.getZExtOrTrunc(Op: VectorBits, DL, VT)); |
25778 | } |
25779 | |
25780 | static void CustomNonLegalBITCASTResults(SDNode *N, |
25781 | SmallVectorImpl<SDValue> &Results, |
25782 | SelectionDAG &DAG, EVT ExtendVT, |
25783 | EVT CastVT) { |
25784 | SDLoc DL(N); |
25785 | SDValue Op = N->getOperand(Num: 0); |
25786 | EVT VT = N->getValueType(ResNo: 0); |
25787 | |
25788 | // Use SCALAR_TO_VECTOR for lane zero |
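// For example, for an i32 -> v2i16 bitcast the value is placed in lane zero
// of a v2i32, bitcast to v4i16, and the low v2i16 subvector is extracted.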
25789 | SDValue Vec = DAG.getNode(Opcode: ISD::SCALAR_TO_VECTOR, DL, VT: ExtendVT, Operand: Op); |
25790 | SDValue CastVal = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: CastVT, Operand: Vec); |
25791 | SDValue IdxZero = DAG.getVectorIdxConstant(Val: 0, DL); |
25792 | Results.push_back( |
25793 | Elt: DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: CastVal, N2: IdxZero)); |
25794 | } |
25795 | |
25796 | void AArch64TargetLowering::ReplaceBITCASTResults( |
25797 | SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { |
25798 | SDLoc DL(N); |
25799 | SDValue Op = N->getOperand(Num: 0); |
25800 | EVT VT = N->getValueType(ResNo: 0); |
25801 | EVT SrcVT = Op.getValueType(); |
25802 | |
25803 | if (VT == MVT::v2i16 && SrcVT == MVT::i32) { |
25804 | CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v4i16); |
25805 | return; |
25806 | } |
25807 | |
25808 | if (VT == MVT::v4i8 && SrcVT == MVT::i32) { |
25809 | CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v2i32, CastVT: MVT::v8i8); |
25810 | return; |
25811 | } |
25812 | |
25813 | if (VT == MVT::v2i8 && SrcVT == MVT::i16) { |
25814 | CustomNonLegalBITCASTResults(N, Results, DAG, ExtendVT: MVT::v4i16, CastVT: MVT::v8i8); |
25815 | return; |
25816 | } |
25817 | |
25818 | if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(VT: SrcVT)) { |
25819 | assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() && |
25820 | "Expected fp->int bitcast!" ); |
25821 | |
25822 | // Bitcasting between unpacked vector types of different element counts is |
25823 | // not a NOP because the live elements are laid out differently. |
25824 | // 01234567 |
25825 | // e.g. nxv2i32 = XX??XX?? |
25826 | // nxv4f16 = X?X?X?X? |
25827 | if (VT.getVectorElementCount() != SrcVT.getVectorElementCount()) |
25828 | return; |
25829 | |
25830 | SDValue CastResult = getSVESafeBitCast(VT: getSVEContainerType(ContentTy: VT), Op, DAG); |
25831 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: CastResult)); |
25832 | return; |
25833 | } |
25834 | |
25835 | if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && |
25836 | !VT.isVector()) |
25837 | return replaceBoolVectorBitcast(N, Results, DAG); |
25838 | |
25839 | if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16)) |
25840 | return; |
25841 | |
25842 | Op = DAG.getTargetInsertSubreg(SRIdx: AArch64::hsub, DL, VT: MVT::f32, |
25843 | Operand: DAG.getUNDEF(VT: MVT::i32), Subreg: Op); |
25844 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::i32, Operand: Op); |
25845 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: MVT::i16, Operand: Op)); |
25846 | } |
25847 | |
25848 | static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results, |
25849 | SelectionDAG &DAG, |
25850 | const AArch64Subtarget *Subtarget) { |
25851 | EVT VT = N->getValueType(ResNo: 0); |
25852 | if (!VT.is256BitVector() || |
25853 | (VT.getScalarType().isFloatingPoint() && |
25854 | !N->getFlags().hasAllowReassociation()) || |
25855 | (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) || |
25856 | VT.getScalarType() == MVT::bf16) |
25857 | return; |
25858 | |
25859 | SDValue X = N->getOperand(Num: 0); |
25860 | auto *Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 1)); |
25861 | if (!Shuf) { |
25862 | Shuf = dyn_cast<ShuffleVectorSDNode>(Val: N->getOperand(Num: 0)); |
25863 | X = N->getOperand(Num: 1); |
25864 | if (!Shuf) |
25865 | return; |
25866 | } |
25867 | |
25868 | if (Shuf->getOperand(Num: 0) != X || !Shuf->getOperand(Num: 1)->isUndef()) |
25869 | return; |
25870 | |
25871 | // Check the mask is 1,0,3,2,5,4,... |
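// e.g. for v8i32 the expected mask is <1, 0, 3, 2, 5, 4, 7, 6>.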
25872 | ArrayRef<int> Mask = Shuf->getMask(); |
25873 | for (int I = 0, E = Mask.size(); I < E; I++) |
25874 | if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1)) |
25875 | return; |
25876 | |
25877 | SDLoc DL(N); |
25878 | auto LoHi = DAG.SplitVector(N: X, DL); |
25879 | assert(LoHi.first.getValueType() == LoHi.second.getValueType()); |
25880 | SDValue Addp = DAG.getNode(Opcode: AArch64ISD::ADDP, DL: N, VT: LoHi.first.getValueType(), |
25881 | N1: LoHi.first, N2: LoHi.second); |
25882 | |
25883 | // Shuffle the elements back into order. |
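// e.g. for v8i32 this builds the mask <0, 0, 1, 1, 2, 2, 3, 3>: lane I of the
// ADDP result holds x[2*I] + x[2*I+1], which the original add-of-shuffle
// produced in both lane 2*I and lane 2*I+1.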
25884 | SmallVector<int> NMask; |
25885 | for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) { |
25886 | NMask.push_back(Elt: I); |
25887 | NMask.push_back(Elt: I); |
25888 | } |
25889 | Results.push_back( |
25890 | Elt: DAG.getVectorShuffle(VT, dl: DL, |
25891 | N1: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, N1: Addp, |
25892 | N2: DAG.getUNDEF(VT: LoHi.first.getValueType())), |
25893 | N2: DAG.getUNDEF(VT), Mask: NMask)); |
25894 | } |
25895 | |
25896 | static void ReplaceReductionResults(SDNode *N, |
25897 | SmallVectorImpl<SDValue> &Results, |
25898 | SelectionDAG &DAG, unsigned InterOp, |
25899 | unsigned AcrossOp) { |
25900 | EVT LoVT, HiVT; |
25901 | SDValue Lo, Hi; |
25902 | SDLoc dl(N); |
25903 | std::tie(args&: LoVT, args&: HiVT) = DAG.GetSplitDestVTs(VT: N->getValueType(ResNo: 0)); |
25904 | std::tie(args&: Lo, args&: Hi) = DAG.SplitVectorOperand(N, OpNo: 0); |
25905 | SDValue InterVal = DAG.getNode(Opcode: InterOp, DL: dl, VT: LoVT, N1: Lo, N2: Hi); |
25906 | SDValue SplitVal = DAG.getNode(Opcode: AcrossOp, DL: dl, VT: LoVT, Operand: InterVal); |
25907 | Results.push_back(Elt: SplitVal); |
25908 | } |
25909 | |
void AArch64TargetLowering::ReplaceExtractSubVectorResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
25912 | SDValue In = N->getOperand(Num: 0); |
25913 | EVT InVT = In.getValueType(); |
25914 | |
25915 | // Common code will handle these just fine. |
25916 | if (!InVT.isScalableVector() || !InVT.isInteger()) |
25917 | return; |
25918 | |
25919 | SDLoc DL(N); |
25920 | EVT VT = N->getValueType(ResNo: 0); |
25921 | |
25922 | // The following checks bail if this is not a halving operation. |
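// That is, the source must have exactly twice as many elements as the
// result, and the extraction index must select either the low or the high
// half.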
25923 | |
25924 | ElementCount ResEC = VT.getVectorElementCount(); |
25925 | |
25926 | if (InVT.getVectorElementCount() != (ResEC * 2)) |
25927 | return; |
25928 | |
25929 | auto *CIndex = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
25930 | if (!CIndex) |
25931 | return; |
25932 | |
25933 | unsigned Index = CIndex->getZExtValue(); |
25934 | if ((Index != 0) && (Index != ResEC.getKnownMinValue())) |
25935 | return; |
25936 | |
25937 | unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI; |
25938 | EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext()); |
25939 | |
25940 | SDValue Half = DAG.getNode(Opcode, DL, VT: ExtendedHalfVT, Operand: N->getOperand(Num: 0)); |
25941 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Half)); |
25942 | } |
25943 | |
25944 | // Create an even/odd pair of X registers holding integer value V. |
25945 | static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { |
25946 | SDLoc dl(V.getNode()); |
25947 | auto [VLo, VHi] = DAG.SplitScalar(N: V, DL: dl, LoVT: MVT::i64, HiVT: MVT::i64); |
25948 | if (DAG.getDataLayout().isBigEndian()) |
std::swap(VLo, VHi);
25950 | SDValue RegClass = |
25951 | DAG.getTargetConstant(Val: AArch64::XSeqPairsClassRegClassID, DL: dl, VT: MVT::i32); |
25952 | SDValue SubReg0 = DAG.getTargetConstant(Val: AArch64::sube64, DL: dl, VT: MVT::i32); |
25953 | SDValue SubReg1 = DAG.getTargetConstant(Val: AArch64::subo64, DL: dl, VT: MVT::i32); |
25954 | const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; |
25955 | return SDValue( |
25956 | DAG.getMachineNode(Opcode: TargetOpcode::REG_SEQUENCE, dl, VT: MVT::Untyped, Ops), 0); |
25957 | } |
25958 | |
25959 | static void ReplaceCMP_SWAP_128Results(SDNode *N, |
25960 | SmallVectorImpl<SDValue> &Results, |
25961 | SelectionDAG &DAG, |
25962 | const AArch64Subtarget *Subtarget) { |
25963 | assert(N->getValueType(0) == MVT::i128 && |
25964 | "AtomicCmpSwap on types less than 128 should be legal" ); |
25965 | |
25966 | MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand(); |
25967 | if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) { |
25968 | // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, |
25969 | // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. |
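// CASP operates on a sequential even/odd register pair (e.g. X0/X1), so the
// compare and store values are materialised as REG_SEQUENCE nodes in the
// XSeqPairs register class.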
25970 | SDValue Ops[] = { |
25971 | createGPRPairNode(DAG, V: N->getOperand(Num: 2)), // Compare value |
25972 | createGPRPairNode(DAG, V: N->getOperand(Num: 3)), // Store value |
25973 | N->getOperand(Num: 1), // Ptr |
25974 | N->getOperand(Num: 0), // Chain in |
25975 | }; |
25976 | |
25977 | unsigned Opcode; |
25978 | switch (MemOp->getMergedOrdering()) { |
25979 | case AtomicOrdering::Monotonic: |
25980 | Opcode = AArch64::CASPX; |
25981 | break; |
25982 | case AtomicOrdering::Acquire: |
25983 | Opcode = AArch64::CASPAX; |
25984 | break; |
25985 | case AtomicOrdering::Release: |
25986 | Opcode = AArch64::CASPLX; |
25987 | break; |
25988 | case AtomicOrdering::AcquireRelease: |
25989 | case AtomicOrdering::SequentiallyConsistent: |
25990 | Opcode = AArch64::CASPALX; |
25991 | break; |
25992 | default: |
25993 | llvm_unreachable("Unexpected ordering!" ); |
25994 | } |
25995 | |
25996 | MachineSDNode *CmpSwap = DAG.getMachineNode( |
25997 | Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::Untyped, VT2: MVT::Other), Ops); |
25998 | DAG.setNodeMemRefs(N: CmpSwap, NewMemRefs: {MemOp}); |
25999 | |
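// The even subregister (sube64) of the result pair holds the low 64 bits on
// little-endian targets; on big-endian the halves are swapped, mirroring
// createGPRPairNode above.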
26000 | unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; |
26001 | if (DAG.getDataLayout().isBigEndian()) |
26002 | std::swap(a&: SubReg1, b&: SubReg2); |
26003 | SDValue Lo = DAG.getTargetExtractSubreg(SRIdx: SubReg1, DL: SDLoc(N), VT: MVT::i64, |
26004 | Operand: SDValue(CmpSwap, 0)); |
26005 | SDValue Hi = DAG.getTargetExtractSubreg(SRIdx: SubReg2, DL: SDLoc(N), VT: MVT::i64, |
26006 | Operand: SDValue(CmpSwap, 0)); |
26007 | Results.push_back( |
26008 | Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi)); |
26009 | Results.push_back(Elt: SDValue(CmpSwap, 1)); // Chain out |
26010 | return; |
26011 | } |
26012 | |
26013 | unsigned Opcode; |
26014 | switch (MemOp->getMergedOrdering()) { |
26015 | case AtomicOrdering::Monotonic: |
26016 | Opcode = AArch64::CMP_SWAP_128_MONOTONIC; |
26017 | break; |
26018 | case AtomicOrdering::Acquire: |
26019 | Opcode = AArch64::CMP_SWAP_128_ACQUIRE; |
26020 | break; |
26021 | case AtomicOrdering::Release: |
26022 | Opcode = AArch64::CMP_SWAP_128_RELEASE; |
26023 | break; |
26024 | case AtomicOrdering::AcquireRelease: |
26025 | case AtomicOrdering::SequentiallyConsistent: |
26026 | Opcode = AArch64::CMP_SWAP_128; |
26027 | break; |
26028 | default: |
26029 | llvm_unreachable("Unexpected ordering!" ); |
26030 | } |
26031 | |
26032 | SDLoc DL(N); |
26033 | auto Desired = DAG.SplitScalar(N: N->getOperand(Num: 2), DL, LoVT: MVT::i64, HiVT: MVT::i64); |
26034 | auto New = DAG.SplitScalar(N: N->getOperand(Num: 3), DL, LoVT: MVT::i64, HiVT: MVT::i64); |
26035 | SDValue Ops[] = {N->getOperand(Num: 1), Desired.first, Desired.second, |
26036 | New.first, New.second, N->getOperand(Num: 0)}; |
26037 | SDNode *CmpSwap = DAG.getMachineNode( |
26038 | Opcode, dl: SDLoc(N), VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::i32, VT4: MVT::Other), |
26039 | Ops); |
26040 | DAG.setNodeMemRefs(N: cast<MachineSDNode>(Val: CmpSwap), NewMemRefs: {MemOp}); |
26041 | |
26042 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128, |
26043 | N1: SDValue(CmpSwap, 0), N2: SDValue(CmpSwap, 1))); |
26044 | Results.push_back(Elt: SDValue(CmpSwap, 3)); |
26045 | } |
26046 | |
26047 | static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, |
26048 | AtomicOrdering Ordering) { |
26049 | // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see |
26050 | // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because |
26051 | // the type is not legal. Therefore we shouldn't expect to see a 128-bit |
26052 | // ATOMIC_LOAD_CLR at any point. |
26053 | assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR && |
26054 | "ATOMIC_LOAD_AND should be lowered to LDCLRP directly" ); |
26055 | assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD" ); |
26056 | assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB" ); |
26057 | |
26058 | if (ISDOpcode == ISD::ATOMIC_LOAD_AND) { |
26059 | // The operand will need to be XORed in a separate step. |
26060 | switch (Ordering) { |
26061 | case AtomicOrdering::Monotonic: |
26062 | return AArch64::LDCLRP; |
26063 | break; |
26064 | case AtomicOrdering::Acquire: |
26065 | return AArch64::LDCLRPA; |
26066 | break; |
26067 | case AtomicOrdering::Release: |
26068 | return AArch64::LDCLRPL; |
26069 | break; |
26070 | case AtomicOrdering::AcquireRelease: |
26071 | case AtomicOrdering::SequentiallyConsistent: |
26072 | return AArch64::LDCLRPAL; |
26073 | break; |
26074 | default: |
26075 | llvm_unreachable("Unexpected ordering!" ); |
26076 | } |
26077 | } |
26078 | |
26079 | if (ISDOpcode == ISD::ATOMIC_LOAD_OR) { |
26080 | switch (Ordering) { |
26081 | case AtomicOrdering::Monotonic: |
26082 | return AArch64::LDSETP; |
26083 | break; |
26084 | case AtomicOrdering::Acquire: |
26085 | return AArch64::LDSETPA; |
26086 | break; |
26087 | case AtomicOrdering::Release: |
26088 | return AArch64::LDSETPL; |
26089 | break; |
26090 | case AtomicOrdering::AcquireRelease: |
26091 | case AtomicOrdering::SequentiallyConsistent: |
26092 | return AArch64::LDSETPAL; |
26093 | break; |
26094 | default: |
26095 | llvm_unreachable("Unexpected ordering!" ); |
26096 | } |
26097 | } |
26098 | |
26099 | if (ISDOpcode == ISD::ATOMIC_SWAP) { |
26100 | switch (Ordering) { |
26101 | case AtomicOrdering::Monotonic: |
26102 | return AArch64::SWPP; |
26103 | break; |
26104 | case AtomicOrdering::Acquire: |
26105 | return AArch64::SWPPA; |
26106 | break; |
26107 | case AtomicOrdering::Release: |
26108 | return AArch64::SWPPL; |
26109 | break; |
26110 | case AtomicOrdering::AcquireRelease: |
26111 | case AtomicOrdering::SequentiallyConsistent: |
26112 | return AArch64::SWPPAL; |
26113 | break; |
26114 | default: |
26115 | llvm_unreachable("Unexpected ordering!" ); |
26116 | } |
26117 | } |
26118 | |
26119 | llvm_unreachable("Unexpected ISDOpcode!" ); |
26120 | } |
26121 | |
26122 | static void ReplaceATOMIC_LOAD_128Results(SDNode *N, |
26123 | SmallVectorImpl<SDValue> &Results, |
26124 | SelectionDAG &DAG, |
26125 | const AArch64Subtarget *Subtarget) { |
// LSE128 provides 128-bit RMW ops, but i128 is not a legal type, so lower it
26127 | // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions |
26128 | // rather than the CASP instructions, because CASP has register classes for |
26129 | // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG |
26130 | // to present them as single operands. LSE128 instructions use the GPR64 |
26131 | // register class (because the pair does not have to be sequential), like |
26132 | // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR. |
26133 | |
26134 | assert(N->getValueType(0) == MVT::i128 && |
26135 | "AtomicLoadXXX on types less than 128 should be legal" ); |
26136 | |
26137 | if (!Subtarget->hasLSE128()) |
26138 | return; |
26139 | |
26140 | MachineMemOperand *MemOp = cast<MemSDNode>(Val: N)->getMemOperand(); |
26141 | const SDValue &Chain = N->getOperand(Num: 0); |
26142 | const SDValue &Ptr = N->getOperand(Num: 1); |
26143 | const SDValue &Val128 = N->getOperand(Num: 2); |
26144 | std::pair<SDValue, SDValue> Val2x64 = |
26145 | DAG.SplitScalar(N: Val128, DL: SDLoc(Val128), LoVT: MVT::i64, HiVT: MVT::i64); |
26146 | |
26147 | const unsigned ISDOpcode = N->getOpcode(); |
26148 | const unsigned MachineOpcode = |
26149 | getAtomicLoad128Opcode(ISDOpcode, Ordering: MemOp->getMergedOrdering()); |
26150 | |
26151 | if (ISDOpcode == ISD::ATOMIC_LOAD_AND) { |
26152 | SDLoc dl(Val128); |
26153 | Val2x64.first = |
26154 | DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i64, |
26155 | N1: DAG.getConstant(Val: -1ULL, DL: dl, VT: MVT::i64), N2: Val2x64.first); |
26156 | Val2x64.second = |
26157 | DAG.getNode(Opcode: ISD::XOR, DL: dl, VT: MVT::i64, |
26158 | N1: DAG.getConstant(Val: -1ULL, DL: dl, VT: MVT::i64), N2: Val2x64.second); |
26159 | } |
26160 | |
26161 | SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain}; |
26162 | if (DAG.getDataLayout().isBigEndian()) |
26163 | std::swap(a&: Ops[0], b&: Ops[1]); |
26164 | |
26165 | MachineSDNode *AtomicInst = |
26166 | DAG.getMachineNode(Opcode: MachineOpcode, dl: SDLoc(N), |
26167 | VTs: DAG.getVTList(VT1: MVT::i64, VT2: MVT::i64, VT3: MVT::Other), Ops); |
26168 | |
26169 | DAG.setNodeMemRefs(N: AtomicInst, NewMemRefs: {MemOp}); |
26170 | |
26171 | SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1); |
26172 | if (DAG.getDataLayout().isBigEndian()) |
26173 | std::swap(a&: Lo, b&: Hi); |
26174 | |
26175 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, N1: Lo, N2: Hi)); |
26176 | Results.push_back(Elt: SDValue(AtomicInst, 2)); // Chain out |
26177 | } |
26178 | |
26179 | void AArch64TargetLowering::ReplaceNodeResults( |
26180 | SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { |
26181 | switch (N->getOpcode()) { |
26182 | default: |
26183 | llvm_unreachable("Don't know how to custom expand this" ); |
26184 | case ISD::BITCAST: |
26185 | ReplaceBITCASTResults(N, Results, DAG); |
26186 | return; |
26187 | case ISD::VECREDUCE_ADD: |
26188 | case ISD::VECREDUCE_SMAX: |
26189 | case ISD::VECREDUCE_SMIN: |
26190 | case ISD::VECREDUCE_UMAX: |
26191 | case ISD::VECREDUCE_UMIN: |
26192 | Results.push_back(Elt: LowerVECREDUCE(Op: SDValue(N, 0), DAG)); |
26193 | return; |
26194 | case ISD::ADD: |
26195 | case ISD::FADD: |
26196 | ReplaceAddWithADDP(N, Results, DAG, Subtarget); |
26197 | return; |
26198 | |
26199 | case ISD::CTPOP: |
26200 | case ISD::PARITY: |
26201 | if (SDValue Result = LowerCTPOP_PARITY(Op: SDValue(N, 0), DAG)) |
26202 | Results.push_back(Elt: Result); |
26203 | return; |
26204 | case AArch64ISD::SADDV: |
26205 | ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::SADDV); |
26206 | return; |
26207 | case AArch64ISD::UADDV: |
26208 | ReplaceReductionResults(N, Results, DAG, InterOp: ISD::ADD, AcrossOp: AArch64ISD::UADDV); |
26209 | return; |
26210 | case AArch64ISD::SMINV: |
26211 | ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMIN, AcrossOp: AArch64ISD::SMINV); |
26212 | return; |
26213 | case AArch64ISD::UMINV: |
26214 | ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMIN, AcrossOp: AArch64ISD::UMINV); |
26215 | return; |
26216 | case AArch64ISD::SMAXV: |
26217 | ReplaceReductionResults(N, Results, DAG, InterOp: ISD::SMAX, AcrossOp: AArch64ISD::SMAXV); |
26218 | return; |
26219 | case AArch64ISD::UMAXV: |
26220 | ReplaceReductionResults(N, Results, DAG, InterOp: ISD::UMAX, AcrossOp: AArch64ISD::UMAXV); |
26221 | return; |
26222 | case ISD::MULHS: |
26223 | if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType())) |
26224 | Results.push_back( |
26225 | Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHS_PRED)); |
26226 | return; |
26227 | case ISD::MULHU: |
26228 | if (useSVEForFixedLengthVectorVT(VT: SDValue(N, 0).getValueType())) |
26229 | Results.push_back( |
26230 | Elt: LowerToPredicatedOp(Op: SDValue(N, 0), DAG, NewOp: AArch64ISD::MULHU_PRED)); |
26231 | return; |
26232 | case ISD::FP_TO_UINT: |
26233 | case ISD::FP_TO_SINT: |
26234 | case ISD::STRICT_FP_TO_SINT: |
26235 | case ISD::STRICT_FP_TO_UINT: |
26236 | assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion" ); |
26237 | // Let normal code take care of it by not adding anything to Results. |
26238 | return; |
26239 | case ISD::ATOMIC_CMP_SWAP: |
26240 | ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); |
26241 | return; |
26242 | case ISD::ATOMIC_LOAD_CLR: |
26243 | assert(N->getValueType(0) != MVT::i128 && |
26244 | "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP" ); |
26245 | break; |
26246 | case ISD::ATOMIC_LOAD_AND: |
26247 | case ISD::ATOMIC_LOAD_OR: |
26248 | case ISD::ATOMIC_SWAP: { |
26249 | assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 && |
26250 | "Expected 128-bit atomicrmw." ); |
26251 | // These need custom type legalisation so we go directly to instruction. |
26252 | ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget); |
26253 | return; |
26254 | } |
26255 | case ISD::ATOMIC_LOAD: |
26256 | case ISD::LOAD: { |
26257 | MemSDNode *LoadNode = cast<MemSDNode>(Val: N); |
26258 | EVT MemVT = LoadNode->getMemoryVT(); |
26259 | // Handle lowering 256 bit non temporal loads into LDNP for little-endian |
26260 | // targets. |
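// e.g. a non-temporal v4i64 load becomes an LDNP producing two v2i64 halves
// that are recombined with CONCAT_VECTORS below.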
26261 | if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() && |
26262 | MemVT.getSizeInBits() == 256u && |
26263 | (MemVT.getScalarSizeInBits() == 8u || |
26264 | MemVT.getScalarSizeInBits() == 16u || |
26265 | MemVT.getScalarSizeInBits() == 32u || |
26266 | MemVT.getScalarSizeInBits() == 64u)) { |
26267 | |
26268 | SDValue Result = DAG.getMemIntrinsicNode( |
26269 | Opcode: AArch64ISD::LDNP, dl: SDLoc(N), |
26270 | VTList: DAG.getVTList(VTs: {MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()), |
26271 | MemVT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()), |
26272 | MVT::Other}), |
26273 | Ops: {LoadNode->getChain(), LoadNode->getBasePtr()}, |
26274 | MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand()); |
26275 | |
26276 | SDValue Pair = DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: SDLoc(N), VT: MemVT, |
26277 | N1: Result.getValue(R: 0), N2: Result.getValue(R: 1)); |
26278 | Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */}); |
26279 | return; |
26280 | } |
26281 | |
26282 | if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) || |
26283 | LoadNode->getMemoryVT() != MVT::i128) { |
// Only volatile or atomic i128 loads need custom lowering here; other loads
// are handled later, e.g. by AArch64's load/store optimizer.
26286 | return; |
26287 | } |
26288 | |
26289 | if (SDValue(N, 0).getValueType() == MVT::i128) { |
26290 | auto *AN = dyn_cast<AtomicSDNode>(Val: LoadNode); |
26291 | bool isLoadAcquire = |
26292 | AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire; |
26293 | unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP; |
26294 | |
26295 | if (isLoadAcquire) |
26296 | assert(Subtarget->hasFeature(AArch64::FeatureRCPC3)); |
26297 | |
26298 | SDValue Result = DAG.getMemIntrinsicNode( |
26299 | Opcode, dl: SDLoc(N), VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}), |
26300 | Ops: {LoadNode->getChain(), LoadNode->getBasePtr()}, |
26301 | MemVT: LoadNode->getMemoryVT(), MMO: LoadNode->getMemOperand()); |
26302 | |
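// The first 64-bit result of LDP/LDIAPP comes from the lower address; on
// big-endian targets that is the most significant half of the i128, so swap
// which result feeds the low half of the BUILD_PAIR.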
26303 | unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0; |
26304 | |
26305 | SDValue Pair = |
26306 | DAG.getNode(Opcode: ISD::BUILD_PAIR, DL: SDLoc(N), VT: MVT::i128, |
26307 | N1: Result.getValue(R: FirstRes), N2: Result.getValue(R: 1 - FirstRes)); |
26308 | Results.append(IL: {Pair, Result.getValue(R: 2) /* Chain */}); |
26309 | } |
26310 | return; |
26311 | } |
26312 | case ISD::EXTRACT_SUBVECTOR: |
26313 | ReplaceExtractSubVectorResults(N, Results, DAG); |
26314 | return; |
26315 | case ISD::INSERT_SUBVECTOR: |
26316 | case ISD::CONCAT_VECTORS: |
26317 | // Custom lowering has been requested for INSERT_SUBVECTOR and |
26318 | // CONCAT_VECTORS -- but delegate to common code for result type |
26319 | // legalisation |
26320 | return; |
26321 | case ISD::INTRINSIC_WO_CHAIN: { |
26322 | EVT VT = N->getValueType(ResNo: 0); |
26323 | |
26324 | Intrinsic::ID IntID = |
26325 | static_cast<Intrinsic::ID>(N->getConstantOperandVal(Num: 0)); |
26326 | switch (IntID) { |
26327 | default: |
26328 | return; |
26329 | case Intrinsic::aarch64_sve_clasta_n: { |
26330 | assert((VT == MVT::i8 || VT == MVT::i16) && |
26331 | "custom lowering for unexpected type" ); |
26332 | SDLoc DL(N); |
26333 | auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2)); |
26334 | auto V = DAG.getNode(Opcode: AArch64ISD::CLASTA_N, DL, VT: MVT::i32, |
26335 | N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3)); |
26336 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V)); |
26337 | return; |
26338 | } |
26339 | case Intrinsic::aarch64_sve_clastb_n: { |
26340 | assert((VT == MVT::i8 || VT == MVT::i16) && |
26341 | "custom lowering for unexpected type" ); |
26342 | SDLoc DL(N); |
26343 | auto Op2 = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: MVT::i32, Operand: N->getOperand(Num: 2)); |
26344 | auto V = DAG.getNode(Opcode: AArch64ISD::CLASTB_N, DL, VT: MVT::i32, |
26345 | N1: N->getOperand(Num: 1), N2: Op2, N3: N->getOperand(Num: 3)); |
26346 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V)); |
26347 | return; |
26348 | } |
26349 | case Intrinsic::aarch64_sve_lasta: { |
26350 | assert((VT == MVT::i8 || VT == MVT::i16) && |
26351 | "custom lowering for unexpected type" ); |
26352 | SDLoc DL(N); |
26353 | auto V = DAG.getNode(Opcode: AArch64ISD::LASTA, DL, VT: MVT::i32, |
26354 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
26355 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V)); |
26356 | return; |
26357 | } |
26358 | case Intrinsic::aarch64_sve_lastb: { |
26359 | assert((VT == MVT::i8 || VT == MVT::i16) && |
26360 | "custom lowering for unexpected type" ); |
26361 | SDLoc DL(N); |
26362 | auto V = DAG.getNode(Opcode: AArch64ISD::LASTB, DL, VT: MVT::i32, |
26363 | N1: N->getOperand(Num: 1), N2: N->getOperand(Num: 2)); |
26364 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V)); |
26365 | return; |
26366 | } |
26367 | case Intrinsic::get_active_lane_mask: { |
26368 | if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1) |
26369 | return; |
26370 | |
26371 | // NOTE: Only trivial type promotion is supported. |
26372 | EVT NewVT = getTypeToTransformTo(Context&: *DAG.getContext(), VT); |
26373 | if (NewVT.getVectorNumElements() != VT.getVectorNumElements()) |
26374 | return; |
26375 | |
26376 | SDLoc DL(N); |
26377 | auto V = DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: NewVT, Ops: N->ops()); |
26378 | Results.push_back(Elt: DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: V)); |
26379 | return; |
26380 | } |
26381 | } |
26382 | } |
26383 | case ISD::READ_REGISTER: { |
26384 | SDLoc DL(N); |
26385 | assert(N->getValueType(0) == MVT::i128 && |
26386 | "READ_REGISTER custom lowering is only for 128-bit sysregs" ); |
26387 | SDValue Chain = N->getOperand(Num: 0); |
26388 | SDValue SysRegName = N->getOperand(Num: 1); |
26389 | |
26390 | SDValue Result = DAG.getNode( |
26391 | Opcode: AArch64ISD::MRRS, DL, VTList: DAG.getVTList(VTs: {MVT::i64, MVT::i64, MVT::Other}), |
26392 | N1: Chain, N2: SysRegName); |
26393 | |
26394 | // Sysregs are not endian. Result.getValue(0) always contains the lower half |
26395 | // of the 128-bit System Register value. |
26396 | SDValue Pair = DAG.getNode(Opcode: ISD::BUILD_PAIR, DL, VT: MVT::i128, |
26397 | N1: Result.getValue(R: 0), N2: Result.getValue(R: 1)); |
26398 | Results.push_back(Elt: Pair); |
26399 | Results.push_back(Elt: Result.getValue(R: 2)); // Chain |
26400 | return; |
26401 | } |
26402 | } |
26403 | } |
26404 | |
26405 | bool AArch64TargetLowering::useLoadStackGuardNode() const { |
26406 | if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) |
26407 | return TargetLowering::useLoadStackGuardNode(); |
26408 | return true; |
26409 | } |
26410 | |
26411 | unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { |
26412 | // Combine multiple FDIVs with the same divisor into multiple FMULs by the |
26413 | // reciprocal if there are three or more FDIVs. |
26414 | return 3; |
26415 | } |
26416 | |
26417 | TargetLoweringBase::LegalizeTypeAction |
26418 | AArch64TargetLowering::getPreferredVectorAction(MVT VT) const { |
// During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
// v4i16, v2i32 (and v1f32 to v2f32) instead of promoting.
26421 | if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 || |
26422 | VT == MVT::v1f32) |
26423 | return TypeWidenVector; |
26424 | |
26425 | return TargetLoweringBase::getPreferredVectorAction(VT); |
26426 | } |
26427 | |
26428 | // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic |
26429 | // provided the address is 16-byte aligned. |
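// (This guarantee comes with FEAT_LSE2, hence the hasLSE2() check below.)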
26430 | bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const { |
26431 | if (!Subtarget->hasLSE2()) |
26432 | return false; |
26433 | |
26434 | if (auto LI = dyn_cast<LoadInst>(Val: I)) |
26435 | return LI->getType()->getPrimitiveSizeInBits() == 128 && |
26436 | LI->getAlign() >= Align(16); |
26437 | |
26438 | if (auto SI = dyn_cast<StoreInst>(Val: I)) |
26439 | return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && |
26440 | SI->getAlign() >= Align(16); |
26441 | |
26442 | return false; |
26443 | } |
26444 | |
26445 | bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const { |
26446 | if (!Subtarget->hasLSE128()) |
26447 | return false; |
26448 | |
26449 | // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP |
26450 | // will clobber the two registers. |
26451 | if (const auto *SI = dyn_cast<StoreInst>(Val: I)) |
26452 | return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && |
26453 | SI->getAlign() >= Align(16) && |
26454 | (SI->getOrdering() == AtomicOrdering::Release || |
26455 | SI->getOrdering() == AtomicOrdering::SequentiallyConsistent); |
26456 | |
26457 | if (const auto *RMW = dyn_cast<AtomicRMWInst>(Val: I)) |
26458 | return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 && |
26459 | RMW->getAlign() >= Align(16) && |
26460 | (RMW->getOperation() == AtomicRMWInst::Xchg || |
26461 | RMW->getOperation() == AtomicRMWInst::And || |
26462 | RMW->getOperation() == AtomicRMWInst::Or); |
26463 | |
26464 | return false; |
26465 | } |
26466 | |
26467 | bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const { |
26468 | if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3()) |
26469 | return false; |
26470 | |
26471 | if (auto LI = dyn_cast<LoadInst>(Val: I)) |
26472 | return LI->getType()->getPrimitiveSizeInBits() == 128 && |
26473 | LI->getAlign() >= Align(16) && |
26474 | LI->getOrdering() == AtomicOrdering::Acquire; |
26475 | |
26476 | if (auto SI = dyn_cast<StoreInst>(Val: I)) |
26477 | return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 && |
26478 | SI->getAlign() >= Align(16) && |
26479 | SI->getOrdering() == AtomicOrdering::Release; |
26480 | |
26481 | return false; |
26482 | } |
26483 | |
26484 | bool AArch64TargetLowering::shouldInsertFencesForAtomic( |
26485 | const Instruction *I) const { |
26486 | if (isOpSuitableForRCPC3(I)) |
26487 | return false; |
26488 | if (isOpSuitableForLSE128(I)) |
26489 | return false; |
26490 | if (isOpSuitableForLDPSTP(I)) |
26491 | return true; |
26492 | return false; |
26493 | } |
26494 | |
26495 | bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore( |
26496 | const Instruction *I) const { |
26497 | // Store-Release instructions only provide seq_cst guarantees when paired with |
26498 | // Load-Acquire instructions. MSVC CRT does not use these instructions to |
26499 | // implement seq_cst loads and stores, so we need additional explicit fences |
26500 | // after memory writes. |
26501 | if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) |
26502 | return false; |
26503 | |
26504 | switch (I->getOpcode()) { |
26505 | default: |
26506 | return false; |
26507 | case Instruction::AtomicCmpXchg: |
26508 | return cast<AtomicCmpXchgInst>(Val: I)->getSuccessOrdering() == |
26509 | AtomicOrdering::SequentiallyConsistent; |
26510 | case Instruction::AtomicRMW: |
26511 | return cast<AtomicRMWInst>(Val: I)->getOrdering() == |
26512 | AtomicOrdering::SequentiallyConsistent; |
26513 | case Instruction::Store: |
26514 | return cast<StoreInst>(Val: I)->getOrdering() == |
26515 | AtomicOrdering::SequentiallyConsistent; |
26516 | } |
26517 | } |
26518 | |
26519 | // Loads and stores less than 128-bits are already atomic; ones above that |
26520 | // are doomed anyway, so defer to the default libcall and blame the OS when |
26521 | // things go wrong. |
26522 | TargetLoweringBase::AtomicExpansionKind |
26523 | AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { |
26524 | unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); |
26525 | if (Size != 128) |
26526 | return AtomicExpansionKind::None; |
26527 | if (isOpSuitableForRCPC3(I: SI)) |
26528 | return AtomicExpansionKind::None; |
26529 | if (isOpSuitableForLSE128(I: SI)) |
26530 | return AtomicExpansionKind::Expand; |
26531 | if (isOpSuitableForLDPSTP(I: SI)) |
26532 | return AtomicExpansionKind::None; |
26533 | return AtomicExpansionKind::Expand; |
26534 | } |
26535 | |
26536 | // Loads and stores less than 128-bits are already atomic; ones above that |
26537 | // are doomed anyway, so defer to the default libcall and blame the OS when |
26538 | // things go wrong. |
26539 | TargetLowering::AtomicExpansionKind |
26540 | AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { |
26541 | unsigned Size = LI->getType()->getPrimitiveSizeInBits(); |
26542 | |
26543 | if (Size != 128) |
26544 | return AtomicExpansionKind::None; |
26545 | if (isOpSuitableForRCPC3(I: LI)) |
26546 | return AtomicExpansionKind::None; |
26547 | // No LSE128 loads |
26548 | if (isOpSuitableForLDPSTP(I: LI)) |
26549 | return AtomicExpansionKind::None; |
26550 | |
26551 | // At -O0, fast-regalloc cannot cope with the live vregs necessary to |
26552 | // implement atomicrmw without spilling. If the target address is also on the |
26553 | // stack and close enough to the spill slot, this can lead to a situation |
26554 | // where the monitor always gets cleared and the atomic operation can never |
26555 | // succeed. So at -O0 lower this operation to a CAS loop. |
26556 | if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) |
26557 | return AtomicExpansionKind::CmpXChg; |
26558 | |
26559 | // Using CAS for an atomic load has a better chance of succeeding under high |
26560 | // contention situations. So use it if available. |
26561 | return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg |
26562 | : AtomicExpansionKind::LLSC; |
26563 | } |
26564 | |
26565 | // The "default" for integer RMW operations is to expand to an LL/SC loop. |
26566 | // However, with the LSE instructions (or outline-atomics mode, which provides |
26567 | // library routines in place of the LSE-instructions), we can directly emit many |
26568 | // operations instead. |
26569 | // |
26570 | // Floating-point operations are always emitted to a cmpxchg loop, because they |
26571 | // may trigger a trap which aborts an LLSC sequence. |
26572 | TargetLowering::AtomicExpansionKind |
26573 | AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { |
26574 | unsigned Size = AI->getType()->getPrimitiveSizeInBits(); |
26575 | assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes." ); |
26576 | |
26577 | if (AI->isFloatingPointOperation()) |
26578 | return AtomicExpansionKind::CmpXChg; |
26579 | |
26580 | bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && |
26581 | (AI->getOperation() == AtomicRMWInst::Xchg || |
26582 | AI->getOperation() == AtomicRMWInst::Or || |
26583 | AI->getOperation() == AtomicRMWInst::And); |
26584 | if (CanUseLSE128) |
26585 | return AtomicExpansionKind::None; |
26586 | |
26587 | // Nand is not supported in LSE. |
26588 | // Leave 128 bits to LLSC or CmpXChg. |
26589 | if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { |
26590 | if (Subtarget->hasLSE()) |
26591 | return AtomicExpansionKind::None; |
26592 | if (Subtarget->outlineAtomics()) { |
// [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
26594 | // Don't outline them unless |
26595 | // (1) high level <atomic> support approved: |
26596 | // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf |
26597 | // (2) low level libgcc and compiler-rt support implemented by: |
26598 | // min/max outline atomics helpers |
26599 | if (AI->getOperation() != AtomicRMWInst::Min && |
26600 | AI->getOperation() != AtomicRMWInst::Max && |
26601 | AI->getOperation() != AtomicRMWInst::UMin && |
26602 | AI->getOperation() != AtomicRMWInst::UMax) { |
26603 | return AtomicExpansionKind::None; |
26604 | } |
26605 | } |
26606 | } |
26607 | |
26608 | // At -O0, fast-regalloc cannot cope with the live vregs necessary to |
26609 | // implement atomicrmw without spilling. If the target address is also on the |
26610 | // stack and close enough to the spill slot, this can lead to a situation |
26611 | // where the monitor always gets cleared and the atomic operation can never |
26612 | // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if |
26613 | // we have a single CAS instruction that can replace the loop. |
26614 | if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None || |
26615 | Subtarget->hasLSE()) |
26616 | return AtomicExpansionKind::CmpXChg; |
26617 | |
26618 | return AtomicExpansionKind::LLSC; |
26619 | } |
26620 | |
26621 | TargetLowering::AtomicExpansionKind |
26622 | AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( |
26623 | AtomicCmpXchgInst *AI) const { |
26624 | // If subtarget has LSE, leave cmpxchg intact for codegen. |
26625 | if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) |
26626 | return AtomicExpansionKind::None; |
26627 | // At -O0, fast-regalloc cannot cope with the live vregs necessary to |
26628 | // implement cmpxchg without spilling. If the address being exchanged is also |
26629 | // on the stack and close enough to the spill slot, this can lead to a |
26630 | // situation where the monitor always gets cleared and the atomic operation |
26631 | // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. |
26632 | if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) |
26633 | return AtomicExpansionKind::None; |
26634 | |
26635 | // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand |
26636 | // it. |
26637 | unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits(); |
26638 | if (Size > 64) |
26639 | return AtomicExpansionKind::None; |
26640 | |
26641 | return AtomicExpansionKind::LLSC; |
26642 | } |
26643 | |
26644 | Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, |
26645 | Type *ValueTy, Value *Addr, |
26646 | AtomicOrdering Ord) const { |
26647 | Module *M = Builder.GetInsertBlock()->getParent()->getParent(); |
26648 | bool IsAcquire = isAcquireOrStronger(AO: Ord); |
26649 | |
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
// intrinsics must return {i64, i64} and we have to recombine the two halves
// into a single i128 here.
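// For an acquire i128 load this produces IR along the lines of (names are
// illustrative):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   %val  = or i128 (zext %lo), (shl (zext %hi), 64)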
26653 | if (ValueTy->getPrimitiveSizeInBits() == 128) { |
26654 | Intrinsic::ID Int = |
26655 | IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; |
26656 | Function *Ldxr = Intrinsic::getDeclaration(M, id: Int); |
26657 | |
26658 | Value *LoHi = Builder.CreateCall(Callee: Ldxr, Args: Addr, Name: "lohi" ); |
26659 | |
26660 | Value *Lo = Builder.CreateExtractValue(Agg: LoHi, Idxs: 0, Name: "lo" ); |
26661 | Value *Hi = Builder.CreateExtractValue(Agg: LoHi, Idxs: 1, Name: "hi" ); |
26662 | Lo = Builder.CreateZExt(V: Lo, DestTy: ValueTy, Name: "lo64" ); |
26663 | Hi = Builder.CreateZExt(V: Hi, DestTy: ValueTy, Name: "hi64" ); |
26664 | return Builder.CreateOr( |
26665 | LHS: Lo, RHS: Builder.CreateShl(LHS: Hi, RHS: ConstantInt::get(Ty: ValueTy, V: 64)), Name: "val64" ); |
26666 | } |
26667 | |
26668 | Type *Tys[] = { Addr->getType() }; |
26669 | Intrinsic::ID Int = |
26670 | IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; |
26671 | Function *Ldxr = Intrinsic::getDeclaration(M, id: Int, Tys); |
26672 | |
26673 | const DataLayout &DL = M->getDataLayout(); |
26674 | IntegerType *IntEltTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: ValueTy)); |
26675 | CallInst *CI = Builder.CreateCall(Callee: Ldxr, Args: Addr); |
26676 | CI->addParamAttr( |
26677 | ArgNo: 0, Attr: Attribute::get(Context&: Builder.getContext(), Kind: Attribute::ElementType, Ty: ValueTy)); |
26678 | Value *Trunc = Builder.CreateTrunc(V: CI, DestTy: IntEltTy); |
26679 | |
26680 | return Builder.CreateBitCast(V: Trunc, DestTy: ValueTy); |
26681 | } |
26682 | |
26683 | void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( |
26684 | IRBuilderBase &Builder) const { |
26685 | Module *M = Builder.GetInsertBlock()->getParent()->getParent(); |
26686 | Builder.CreateCall(Callee: Intrinsic::getDeclaration(M, id: Intrinsic::aarch64_clrex)); |
26687 | } |
26688 | |
26689 | Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, |
26690 | Value *Val, Value *Addr, |
26691 | AtomicOrdering Ord) const { |
26692 | Module *M = Builder.GetInsertBlock()->getParent()->getParent(); |
26693 | bool IsRelease = isReleaseOrStronger(AO: Ord); |
26694 | |
26695 | // Since the intrinsics must have legal type, the i128 intrinsics take two |
26696 | // parameters: "i64, i64". We must marshal Val into the appropriate form |
26697 | // before the call. |
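// For an i128 release store this produces roughly (illustrative):
//   %lo     = trunc i128 %val to i64
//   %hi     = trunc i128 (lshr i128 %val, 64) to i64
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// where a non-zero %status means the store-exclusive failed.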
26698 | if (Val->getType()->getPrimitiveSizeInBits() == 128) { |
26699 | Intrinsic::ID Int = |
26700 | IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; |
26701 | Function *Stxr = Intrinsic::getDeclaration(M, id: Int); |
26702 | Type *Int64Ty = Type::getInt64Ty(C&: M->getContext()); |
26703 | |
26704 | Value *Lo = Builder.CreateTrunc(V: Val, DestTy: Int64Ty, Name: "lo" ); |
26705 | Value *Hi = Builder.CreateTrunc(V: Builder.CreateLShr(LHS: Val, RHS: 64), DestTy: Int64Ty, Name: "hi" ); |
26706 | return Builder.CreateCall(Callee: Stxr, Args: {Lo, Hi, Addr}); |
26707 | } |
26708 | |
26709 | Intrinsic::ID Int = |
26710 | IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; |
26711 | Type *Tys[] = { Addr->getType() }; |
26712 | Function *Stxr = Intrinsic::getDeclaration(M, id: Int, Tys); |
26713 | |
26714 | const DataLayout &DL = M->getDataLayout(); |
26715 | IntegerType *IntValTy = Builder.getIntNTy(N: DL.getTypeSizeInBits(Ty: Val->getType())); |
26716 | Val = Builder.CreateBitCast(V: Val, DestTy: IntValTy); |
26717 | |
26718 | CallInst *CI = Builder.CreateCall( |
26719 | Callee: Stxr, Args: {Builder.CreateZExtOrBitCast( |
26720 | V: Val, DestTy: Stxr->getFunctionType()->getParamType(i: 0)), |
26721 | Addr}); |
26722 | CI->addParamAttr(ArgNo: 1, Attr: Attribute::get(Context&: Builder.getContext(), |
26723 | Kind: Attribute::ElementType, Ty: Val->getType())); |
26724 | return CI; |
26725 | } |
26726 | |
26727 | bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( |
26728 | Type *Ty, CallingConv::ID CallConv, bool isVarArg, |
26729 | const DataLayout &DL) const { |
26730 | if (!Ty->isArrayTy()) { |
26731 | const TypeSize &TySize = Ty->getPrimitiveSizeInBits(); |
26732 | return TySize.isScalable() && TySize.getKnownMinValue() > 128; |
26733 | } |
26734 | |
// All non-aggregate members of the type must have the same type.
26736 | SmallVector<EVT> ValueVTs; |
26737 | ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs); |
26738 | return all_equal(Range&: ValueVTs); |
26739 | } |
26740 | |
26741 | bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, |
26742 | EVT) const { |
26743 | return false; |
26744 | } |
26745 | |
26746 | static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { |
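// This helper produces IR roughly equivalent to (illustrative):
//   %tp   = call ptr @llvm.thread.pointer()
//   %slot = getelementptr i8, ptr %tp, i32 <Offset>
// i.e. a fixed offset from the thread pointer (TPIDR_EL0); the final pointer
// cast is a no-op with opaque pointers.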
26747 | Module *M = IRB.GetInsertBlock()->getParent()->getParent(); |
26748 | Function *ThreadPointerFunc = |
26749 | Intrinsic::getDeclaration(M, id: Intrinsic::thread_pointer); |
26750 | return IRB.CreatePointerCast( |
26751 | V: IRB.CreateConstGEP1_32(Ty: IRB.getInt8Ty(), Ptr: IRB.CreateCall(Callee: ThreadPointerFunc), |
26752 | Idx0: Offset), |
26753 | DestTy: IRB.getPtrTy(AddrSpace: 0)); |
26754 | } |
26755 | |
26756 | Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { |
26757 | // Android provides a fixed TLS slot for the stack cookie. See the definition |
26758 | // of TLS_SLOT_STACK_GUARD in |
26759 | // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h |
26760 | if (Subtarget->isTargetAndroid()) |
26761 | return UseTlsOffset(IRB, Offset: 0x28); |
26762 | |
26763 | // Fuchsia is similar. |
26764 | // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. |
26765 | if (Subtarget->isTargetFuchsia()) |
26766 | return UseTlsOffset(IRB, Offset: -0x10); |
26767 | |
26768 | return TargetLowering::getIRStackGuard(IRB); |
26769 | } |
26770 | |
26771 | void AArch64TargetLowering::insertSSPDeclarations(Module &M) const { |
// The MSVC CRT provides functionality for stack protection.
26773 | if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) { |
// The MSVC CRT has a global variable holding the security cookie.
26775 | M.getOrInsertGlobal(Name: "__security_cookie" , |
26776 | Ty: PointerType::getUnqual(C&: M.getContext())); |
26777 | |
// The MSVC CRT has a function to validate the security cookie.
26779 | FunctionCallee SecurityCheckCookie = |
26780 | M.getOrInsertFunction(Name: Subtarget->getSecurityCheckCookieName(), |
26781 | RetTy: Type::getVoidTy(C&: M.getContext()), |
26782 | Args: PointerType::getUnqual(C&: M.getContext())); |
26783 | if (Function *F = dyn_cast<Function>(Val: SecurityCheckCookie.getCallee())) { |
26784 | F->setCallingConv(CallingConv::Win64); |
26785 | F->addParamAttr(ArgNo: 0, Kind: Attribute::AttrKind::InReg); |
26786 | } |
26787 | return; |
26788 | } |
26789 | TargetLowering::insertSSPDeclarations(M); |
26790 | } |
26791 | |
26792 | Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const { |
// The MSVC CRT has a global variable holding the security cookie.
26794 | if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) |
26795 | return M.getGlobalVariable(Name: "__security_cookie" ); |
26796 | return TargetLowering::getSDagStackGuard(M); |
26797 | } |
26798 | |
26799 | Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const { |
// The MSVC CRT has a function to validate the security cookie.
26801 | if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) |
26802 | return M.getFunction(Name: Subtarget->getSecurityCheckCookieName()); |
26803 | return TargetLowering::getSSPStackGuardCheck(M); |
26804 | } |
26805 | |
26806 | Value * |
26807 | AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { |
26808 | // Android provides a fixed TLS slot for the SafeStack pointer. See the |
26809 | // definition of TLS_SLOT_SAFESTACK in |
26810 | // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h |
26811 | if (Subtarget->isTargetAndroid()) |
26812 | return UseTlsOffset(IRB, Offset: 0x48); |
26813 | |
26814 | // Fuchsia is similar. |
26815 | // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. |
26816 | if (Subtarget->isTargetFuchsia()) |
26817 | return UseTlsOffset(IRB, Offset: -0x8); |
26818 | |
26819 | return TargetLowering::getSafeStackPointerLocation(IRB); |
26820 | } |
26821 | |
26822 | bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( |
26823 | const Instruction &AndI) const { |
// Only sink the 'and' mask to the cmp use block if it is masking a single bit,
// since this likely allows the and/cmp/br to be folded into a single tbz
// instruction. It may be beneficial to sink in other cases, but we would have
// to check that the cmp would not get folded into the br to form a cbz for
// these to be beneficial.
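// For example, "if ((x & 8) == 0)" can become a single "tbz w0, #3, <dest>",
// whereas a multi-bit mask would still need a separate and + cbz/cbnz sequence
// (illustrative).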
26829 | ConstantInt* Mask = dyn_cast<ConstantInt>(Val: AndI.getOperand(i: 1)); |
26830 | if (!Mask) |
26831 | return false; |
26832 | return Mask->getValue().isPowerOf2(); |
26833 | } |
26834 | |
26835 | bool AArch64TargetLowering:: |
26836 | shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( |
26837 | SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, |
26838 | unsigned OldShiftOpcode, unsigned NewShiftOpcode, |
26839 | SelectionDAG &DAG) const { |
26840 | // Does baseline recommend not to perform the fold by default? |
26841 | if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( |
26842 | X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) |
26843 | return false; |
26844 | // Else, if this is a vector shift, prefer 'shl'. |
26845 | return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL; |
26846 | } |
26847 | |
26848 | TargetLowering::ShiftLegalizationStrategy |
26849 | AArch64TargetLowering::preferredShiftLegalizationStrategy( |
26850 | SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const { |
26851 | if (DAG.getMachineFunction().getFunction().hasMinSize() && |
26852 | !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin()) |
26853 | return ShiftLegalizationStrategy::LowerToLibcall; |
26854 | return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, |
26855 | ExpansionFactor); |
26856 | } |
26857 | |
26858 | void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { |
// Update IsSplitCSR in AArch64FunctionInfo.
26860 | AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); |
26861 | AFI->setIsSplitCSR(true); |
26862 | } |
26863 | |
26864 | void AArch64TargetLowering::insertCopiesSplitCSR( |
26865 | MachineBasicBlock *Entry, |
26866 | const SmallVectorImpl<MachineBasicBlock *> &Exits) const { |
26867 | const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); |
26868 | const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(MF: Entry->getParent()); |
26869 | if (!IStart) |
26870 | return; |
26871 | |
26872 | const TargetInstrInfo *TII = Subtarget->getInstrInfo(); |
26873 | MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); |
26874 | MachineBasicBlock::iterator MBBI = Entry->begin(); |
26875 | for (const MCPhysReg *I = IStart; *I; ++I) { |
26876 | const TargetRegisterClass *RC = nullptr; |
26877 | if (AArch64::GPR64RegClass.contains(Reg: *I)) |
26878 | RC = &AArch64::GPR64RegClass; |
26879 | else if (AArch64::FPR64RegClass.contains(Reg: *I)) |
26880 | RC = &AArch64::FPR64RegClass; |
26881 | else |
26882 | llvm_unreachable("Unexpected register class in CSRsViaCopy!" ); |
26883 | |
26884 | Register NewVR = MRI->createVirtualRegister(RegClass: RC); |
26885 | // Create copy from CSR to a virtual register. |
26886 | // FIXME: this currently does not emit CFI pseudo-instructions, it works |
26887 | // fine for CXX_FAST_TLS since the C++-style TLS access functions should be |
26888 | // nounwind. If we want to generalize this later, we may need to emit |
26889 | // CFI pseudo-instructions. |
26890 | assert(Entry->getParent()->getFunction().hasFnAttribute( |
26891 | Attribute::NoUnwind) && |
26892 | "Function should be nounwind in insertCopiesSplitCSR!" ); |
26893 | Entry->addLiveIn(PhysReg: *I); |
26894 | BuildMI(BB&: *Entry, I: MBBI, MIMD: DebugLoc(), MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewVR) |
26895 | .addReg(RegNo: *I); |
26896 | |
26897 | // Insert the copy-back instructions right before the terminator. |
26898 | for (auto *Exit : Exits) |
26899 | BuildMI(BB&: *Exit, I: Exit->getFirstTerminator(), MIMD: DebugLoc(), |
26900 | MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: *I) |
26901 | .addReg(RegNo: NewVR); |
26902 | } |
26903 | } |
26904 | |
26905 | bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { |
26906 | // Integer division on AArch64 is expensive. However, when aggressively |
26907 | // optimizing for code size, we prefer to use a div instruction, as it is |
26908 | // usually smaller than the alternative sequence. |
26909 | // The exception to this is vector division. Since AArch64 doesn't have vector |
26910 | // integer division, leaving the division as-is is a loss even in terms of |
26911 | // size, because it will have to be scalarized, while the alternative code |
26912 | // sequence can be performed in vector form. |
26913 | bool OptSize = Attr.hasFnAttr(Kind: Attribute::MinSize); |
26914 | return OptSize && !VT.isVector(); |
26915 | } |
26916 | |
26917 | bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { |
26918 | // We want inc-of-add for scalars and sub-of-not for vectors. |
26919 | return VT.isScalarInteger(); |
26920 | } |
26921 | |
26922 | bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, |
26923 | EVT VT) const { |
// v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
// legalize.
26926 | if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16()) |
26927 | return false; |
26928 | if (FPVT == MVT::v8bf16) |
26929 | return false; |
26930 | return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT); |
26931 | } |
26932 | |
26933 | MachineInstr * |
26934 | AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, |
26935 | MachineBasicBlock::instr_iterator &MBBI, |
26936 | const TargetInstrInfo *TII) const { |
26937 | assert(MBBI->isCall() && MBBI->getCFIType() && |
26938 | "Invalid call instruction for a KCFI check" ); |
26939 | |
26940 | switch (MBBI->getOpcode()) { |
26941 | case AArch64::BLR: |
26942 | case AArch64::BLRNoIP: |
26943 | case AArch64::TCRETURNri: |
26944 | case AArch64::TCRETURNrix16x17: |
26945 | case AArch64::TCRETURNrix17: |
26946 | case AArch64::TCRETURNrinotx16: |
26947 | break; |
26948 | default: |
26949 | llvm_unreachable("Unexpected CFI call opcode" ); |
26950 | } |
26951 | |
26952 | MachineOperand &Target = MBBI->getOperand(i: 0); |
26953 | assert(Target.isReg() && "Invalid target operand for an indirect call" ); |
26954 | Target.setIsRenamable(false); |
26955 | |
26956 | return BuildMI(BB&: MBB, I: MBBI, MIMD: MBBI->getDebugLoc(), MCID: TII->get(Opcode: AArch64::KCFI_CHECK)) |
26957 | .addReg(RegNo: Target.getReg()) |
26958 | .addImm(Val: MBBI->getCFIType()) |
26959 | .getInstr(); |
26960 | } |
26961 | |
26962 | bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { |
26963 | return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); |
26964 | } |
26965 | |
26966 | unsigned |
26967 | AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { |
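  // On AAPCS64 targets va_list is { ptr __stack, ptr __gr_top, ptr __vr_top,
  // i32 __gr_offs, i32 __vr_offs }: three pointers plus two 32-bit offsets.
  // Darwin and Windows use a single pointer instead.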
26968 | if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) |
26969 | return getPointerTy(DL).getSizeInBits(); |
26970 | |
26971 | return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; |
26972 | } |
26973 | |
26974 | void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { |
26975 | MachineFrameInfo &MFI = MF.getFrameInfo(); |
26976 | // If we have any vulnerable SVE stack objects then the stack protector |
26977 | // needs to be placed at the top of the SVE stack area, as the SVE locals |
26978 | // are placed above the other locals, so we allocate it as if it were a |
26979 | // scalable vector. |
26980 | // FIXME: It may be worthwhile having a specific interface for this rather |
26981 | // than doing it here in finalizeLowering. |
26982 | if (MFI.hasStackProtectorIndex()) { |
26983 | for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { |
26984 | if (MFI.getStackID(ObjectIdx: i) == TargetStackID::ScalableVector && |
26985 | MFI.getObjectSSPLayout(ObjectIdx: i) != MachineFrameInfo::SSPLK_None) { |
26986 | MFI.setStackID(ObjectIdx: MFI.getStackProtectorIndex(), |
26987 | ID: TargetStackID::ScalableVector); |
26988 | MFI.setObjectAlignment(ObjectIdx: MFI.getStackProtectorIndex(), Alignment: Align(16)); |
26989 | break; |
26990 | } |
26991 | } |
26992 | } |
26993 | MFI.computeMaxCallFrameSize(MF); |
26994 | TargetLoweringBase::finalizeLowering(MF); |
26995 | } |
26996 | |
26997 | // Unlike X86, we let frame lowering assign offsets to all catch objects. |
26998 | bool AArch64TargetLowering::needsFixedCatchObjects() const { |
26999 | return false; |
27000 | } |
27001 | |
27002 | bool AArch64TargetLowering::shouldLocalize( |
27003 | const MachineInstr &MI, const TargetTransformInfo *TTI) const { |
27004 | auto &MF = *MI.getMF(); |
27005 | auto &MRI = MF.getRegInfo(); |
27006 | auto maxUses = [](unsigned RematCost) { |
27007 | // A cost of 1 means remats are basically free. |
27008 | if (RematCost == 1) |
27009 | return std::numeric_limits<unsigned>::max(); |
27010 | if (RematCost == 2) |
27011 | return 2U; |
27012 | |
27013 | // Remat is too expensive, only sink if there's one user. |
27014 | if (RematCost > 2) |
27015 | return 1U; |
27016 | llvm_unreachable("Unexpected remat cost" ); |
27017 | }; |
27018 | |
27019 | unsigned Opc = MI.getOpcode(); |
27020 | switch (Opc) { |
27021 | case TargetOpcode::G_GLOBAL_VALUE: { |
// On Darwin, TLS global vars get selected into function calls, which
// we don't want localized, as they can get moved into the middle of
// another call sequence.
27025 | const GlobalValue &GV = *MI.getOperand(i: 1).getGlobal(); |
27026 | if (GV.isThreadLocal() && Subtarget->isTargetMachO()) |
27027 | return false; |
27028 | return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure. |
27029 | } |
27030 | case TargetOpcode::G_FCONSTANT: |
27031 | case TargetOpcode::G_CONSTANT: { |
27032 | const ConstantInt *CI; |
27033 | unsigned AdditionalCost = 0; |
27034 | |
27035 | if (Opc == TargetOpcode::G_CONSTANT) |
27036 | CI = MI.getOperand(i: 1).getCImm(); |
27037 | else { |
27038 | LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg()); |
// We try to estimate the cost of 32/64-bit fp immediates, as they'll likely
// be materialized as integers.
27041 | if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64) |
27042 | break; |
27043 | auto APF = MI.getOperand(i: 1).getFPImm()->getValueAPF(); |
27044 | bool OptForSize = |
27045 | MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize(); |
27046 | if (isFPImmLegal(Imm: APF, VT: EVT::getFloatingPointVT(BitWidth: Ty.getScalarSizeInBits()), |
27047 | OptForSize)) |
27048 | return true; // Constant should be cheap. |
27049 | CI = |
27050 | ConstantInt::get(Context&: MF.getFunction().getContext(), V: APF.bitcastToAPInt()); |
27051 | // FP materialization also costs an extra move, from gpr to fpr. |
27052 | AdditionalCost = 1; |
27053 | } |
27054 | APInt Imm = CI->getValue(); |
27055 | InstructionCost Cost = TTI->getIntImmCost( |
27056 | Imm, Ty: CI->getType(), CostKind: TargetTransformInfo::TCK_CodeSize); |
27057 | assert(Cost.isValid() && "Expected a valid imm cost" ); |
27058 | |
27059 | unsigned RematCost = *Cost.getValue(); |
27060 | RematCost += AdditionalCost; |
27061 | Register Reg = MI.getOperand(i: 0).getReg(); |
27062 | unsigned MaxUses = maxUses(RematCost); |
27063 | // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs(). |
27064 | if (MaxUses == std::numeric_limits<unsigned>::max()) |
27065 | --MaxUses; |
27066 | return MRI.hasAtMostUserInstrs(Reg, MaxUsers: MaxUses); |
27067 | } |
27068 | // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being |
27069 | // localizable. |
27070 | case AArch64::ADRP: |
27071 | case AArch64::G_ADD_LOW: |
27072 | // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too. |
27073 | case TargetOpcode::G_PTR_ADD: |
27074 | return true; |
27075 | default: |
27076 | break; |
27077 | } |
27078 | return TargetLoweringBase::shouldLocalize(MI, TTI); |
27079 | } |
27080 | |
27081 | bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { |
27082 | // Fallback for scalable vectors. |
27083 | // Note that if EnableSVEGISel is true, we allow scalable vector types for |
27084 | // all instructions, regardless of whether they are actually supported. |
27085 | if (!EnableSVEGISel) { |
27086 | if (Inst.getType()->isScalableTy()) { |
27087 | return true; |
27088 | } |
27089 | |
27090 | for (unsigned i = 0; i < Inst.getNumOperands(); ++i) |
27091 | if (Inst.getOperand(i)->getType()->isScalableTy()) |
27092 | return true; |
27093 | |
27094 | if (const AllocaInst *AI = dyn_cast<AllocaInst>(Val: &Inst)) { |
27095 | if (AI->getAllocatedType()->isScalableTy()) |
27096 | return true; |
27097 | } |
27098 | } |
27099 | |
27100 | // Checks to allow the use of SME instructions |
27101 | if (auto *Base = dyn_cast<CallBase>(Val: &Inst)) { |
27102 | auto CallerAttrs = SMEAttrs(*Inst.getFunction()); |
27103 | auto CalleeAttrs = SMEAttrs(*Base); |
27104 | if (CallerAttrs.requiresSMChange(Callee: CalleeAttrs) || |
27105 | CallerAttrs.requiresLazySave(Callee: CalleeAttrs) || |
27106 | CallerAttrs.requiresPreservingZT0(Callee: CalleeAttrs)) |
27107 | return true; |
27108 | } |
27109 | return false; |
27110 | } |
27111 | |
27112 | // Return the largest legal scalable vector type that matches VT's element type. |
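// For example, v8i16 and a wider fixed-length type such as v16i16 (when made
// legal via SVE) both map to nxv8i16; only the element type matters, not the
// element count.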
27113 | static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) { |
27114 | assert(VT.isFixedLengthVector() && |
27115 | DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
27116 | "Expected legal fixed length vector!" ); |
27117 | switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { |
27118 | default: |
27119 | llvm_unreachable("unexpected element type for SVE container" ); |
27120 | case MVT::i8: |
27121 | return EVT(MVT::nxv16i8); |
27122 | case MVT::i16: |
27123 | return EVT(MVT::nxv8i16); |
27124 | case MVT::i32: |
27125 | return EVT(MVT::nxv4i32); |
27126 | case MVT::i64: |
27127 | return EVT(MVT::nxv2i64); |
27128 | case MVT::bf16: |
27129 | return EVT(MVT::nxv8bf16); |
27130 | case MVT::f16: |
27131 | return EVT(MVT::nxv8f16); |
27132 | case MVT::f32: |
27133 | return EVT(MVT::nxv4f32); |
27134 | case MVT::f64: |
27135 | return EVT(MVT::nxv2f64); |
27136 | } |
27137 | } |
27138 | |
27139 | // Return a PTRUE with active lanes corresponding to the extent of VT. |
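// For example, for v8i16 this returns an nxv8i1 PTRUE with pattern VL8, or
// the "all" pattern when the SVE register size is known to be exactly the
// size of VT.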
27140 | static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, |
27141 | EVT VT) { |
27142 | assert(VT.isFixedLengthVector() && |
27143 | DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
27144 | "Expected legal fixed length vector!" ); |
27145 | |
27146 | std::optional<unsigned> PgPattern = |
27147 | getSVEPredPatternFromNumElements(MinNumElts: VT.getVectorNumElements()); |
27148 | assert(PgPattern && "Unexpected element count for SVE predicate" ); |
27149 | |
// For vectors that are exactly getMaxSVEVectorSizeInBits in size, we can use
27151 | // AArch64SVEPredPattern::all, which can enable the use of unpredicated |
27152 | // variants of instructions when available. |
27153 | const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); |
27154 | unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); |
27155 | unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); |
27156 | if (MaxSVESize && MinSVESize == MaxSVESize && |
27157 | MaxSVESize == VT.getSizeInBits()) |
27158 | PgPattern = AArch64SVEPredPattern::all; |
27159 | |
27160 | MVT MaskVT; |
27161 | switch (VT.getVectorElementType().getSimpleVT().SimpleTy) { |
27162 | default: |
27163 | llvm_unreachable("unexpected element type for SVE predicate" ); |
27164 | case MVT::i8: |
27165 | MaskVT = MVT::nxv16i1; |
27166 | break; |
27167 | case MVT::i16: |
27168 | case MVT::f16: |
27169 | case MVT::bf16: |
27170 | MaskVT = MVT::nxv8i1; |
27171 | break; |
27172 | case MVT::i32: |
27173 | case MVT::f32: |
27174 | MaskVT = MVT::nxv4i1; |
27175 | break; |
27176 | case MVT::i64: |
27177 | case MVT::f64: |
27178 | MaskVT = MVT::nxv2i1; |
27179 | break; |
27180 | } |
27181 | |
27182 | return getPTrue(DAG, DL, VT: MaskVT, Pattern: *PgPattern); |
27183 | } |
27184 | |
27185 | static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, |
27186 | EVT VT) { |
27187 | assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) && |
27188 | "Expected legal scalable vector!" ); |
27189 | auto PredTy = VT.changeVectorElementType(EltVT: MVT::i1); |
27190 | return getPTrue(DAG, DL, VT: PredTy, Pattern: AArch64SVEPredPattern::all); |
27191 | } |
27192 | |
27193 | static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) { |
27194 | if (VT.isFixedLengthVector()) |
27195 | return getPredicateForFixedLengthVector(DAG, DL, VT); |
27196 | |
27197 | return getPredicateForScalableVector(DAG, DL, VT); |
27198 | } |
27199 | |
27200 | // Grow V to consume an entire SVE register. |
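// e.g. a v4i32 value V becomes (insert_subvector (undef nxv4i32), V, 0);
// lanes beyond the original four are undefined.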
27201 | static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { |
27202 | assert(VT.isScalableVector() && |
27203 | "Expected to convert into a scalable vector!" ); |
27204 | assert(V.getValueType().isFixedLengthVector() && |
27205 | "Expected a fixed length vector operand!" ); |
27206 | SDLoc DL(V); |
27207 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
27208 | return DAG.getNode(Opcode: ISD::INSERT_SUBVECTOR, DL, VT, N1: DAG.getUNDEF(VT), N2: V, N3: Zero); |
27209 | } |
27210 | |
27211 | // Shrink V so it's just big enough to maintain a VT's worth of data. |
27212 | static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) { |
27213 | assert(VT.isFixedLengthVector() && |
27214 | "Expected to convert into a fixed length vector!" ); |
27215 | assert(V.getValueType().isScalableVector() && |
27216 | "Expected a scalable vector operand!" ); |
27217 | SDLoc DL(V); |
27218 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
27219 | return DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL, VT, N1: V, N2: Zero); |
27220 | } |
27221 | |
27222 | // Convert all fixed length vector loads larger than NEON to masked_loads. |
27223 | SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE( |
27224 | SDValue Op, SelectionDAG &DAG) const { |
27225 | auto Load = cast<LoadSDNode>(Val&: Op); |
27226 | |
27227 | SDLoc DL(Op); |
27228 | EVT VT = Op.getValueType(); |
27229 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27230 | EVT LoadVT = ContainerVT; |
27231 | EVT MemVT = Load->getMemoryVT(); |
27232 | |
27233 | auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); |
27234 | |
27235 | if (VT.isFloatingPoint()) { |
27236 | LoadVT = ContainerVT.changeTypeToInteger(); |
27237 | MemVT = MemVT.changeTypeToInteger(); |
27238 | } |
27239 | |
27240 | SDValue NewLoad = DAG.getMaskedLoad( |
27241 | VT: LoadVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), Mask: Pg, |
27242 | Src0: DAG.getUNDEF(VT: LoadVT), MemVT, MMO: Load->getMemOperand(), |
27243 | AM: Load->getAddressingMode(), Load->getExtensionType()); |
27244 | |
27245 | SDValue Result = NewLoad; |
27246 | if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) { |
27247 | EVT ExtendVT = ContainerVT.changeVectorElementType( |
27248 | EltVT: Load->getMemoryVT().getVectorElementType()); |
27249 | |
27250 | Result = getSVESafeBitCast(VT: ExtendVT, Op: Result, DAG); |
27251 | Result = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT, |
27252 | N1: Pg, N2: Result, N3: DAG.getUNDEF(VT: ContainerVT)); |
27253 | } else if (VT.isFloatingPoint()) { |
27254 | Result = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Result); |
27255 | } |
27256 | |
27257 | Result = convertFromScalableVector(DAG, VT, V: Result); |
27258 | SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)}; |
27259 | return DAG.getMergeValues(Ops: MergedValues, dl: DL); |
27260 | } |
27261 | |
27262 | static SDValue convertFixedMaskToScalableVector(SDValue Mask, |
27263 | SelectionDAG &DAG) { |
27264 | SDLoc DL(Mask); |
27265 | EVT InVT = Mask.getValueType(); |
27266 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT); |
27267 | |
27268 | auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT); |
27269 | |
27270 | if (ISD::isBuildVectorAllOnes(N: Mask.getNode())) |
27271 | return Pg; |
27272 | |
27273 | auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Mask); |
27274 | auto Op2 = DAG.getConstant(Val: 0, DL, VT: ContainerVT); |
27275 | |
27276 | return DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: Pg.getValueType(), |
27277 | Ops: {Pg, Op1, Op2, DAG.getCondCode(Cond: ISD::SETNE)}); |
27278 | } |
27279 | |
// Convert all fixed length vector masked loads larger than NEON to SVE masked
// loads.
27281 | SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE( |
27282 | SDValue Op, SelectionDAG &DAG) const { |
27283 | auto Load = cast<MaskedLoadSDNode>(Val&: Op); |
27284 | |
27285 | SDLoc DL(Op); |
27286 | EVT VT = Op.getValueType(); |
27287 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27288 | |
27289 | SDValue Mask = Load->getMask(); |
// If this is an extending load and the mask type is not the same as the
// load's type, then we have to extend the mask type.
27292 | if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) { |
27293 | assert(Load->getExtensionType() != ISD::NON_EXTLOAD && |
27294 | "Incorrect mask type" ); |
27295 | Mask = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Mask); |
27296 | } |
27297 | Mask = convertFixedMaskToScalableVector(Mask, DAG); |
27298 | |
27299 | SDValue PassThru; |
27300 | bool IsPassThruZeroOrUndef = false; |
27301 | |
27302 | if (Load->getPassThru()->isUndef()) { |
27303 | PassThru = DAG.getUNDEF(VT: ContainerVT); |
27304 | IsPassThruZeroOrUndef = true; |
27305 | } else { |
27306 | if (ContainerVT.isInteger()) |
27307 | PassThru = DAG.getConstant(Val: 0, DL, VT: ContainerVT); |
27308 | else |
27309 | PassThru = DAG.getConstantFP(Val: 0, DL, VT: ContainerVT); |
27310 | if (isZerosVector(N: Load->getPassThru().getNode())) |
27311 | IsPassThruZeroOrUndef = true; |
27312 | } |
27313 | |
27314 | SDValue NewLoad = DAG.getMaskedLoad( |
27315 | VT: ContainerVT, dl: DL, Chain: Load->getChain(), Base: Load->getBasePtr(), Offset: Load->getOffset(), |
27316 | Mask, Src0: PassThru, MemVT: Load->getMemoryVT(), MMO: Load->getMemOperand(), |
27317 | AM: Load->getAddressingMode(), Load->getExtensionType()); |
27318 | |
27319 | SDValue Result = NewLoad; |
27320 | if (!IsPassThruZeroOrUndef) { |
27321 | SDValue OldPassThru = |
27322 | convertToScalableVector(DAG, VT: ContainerVT, V: Load->getPassThru()); |
27323 | Result = DAG.getSelect(DL, VT: ContainerVT, Cond: Mask, LHS: Result, RHS: OldPassThru); |
27324 | } |
27325 | |
27326 | Result = convertFromScalableVector(DAG, VT, V: Result); |
27327 | SDValue MergedValues[2] = {Result, NewLoad.getValue(R: 1)}; |
27328 | return DAG.getMergeValues(Ops: MergedValues, dl: DL); |
27329 | } |
27330 | |
27331 | // Convert all fixed length vector stores larger than NEON to masked_stores. |
27332 | SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( |
27333 | SDValue Op, SelectionDAG &DAG) const { |
27334 | auto Store = cast<StoreSDNode>(Val&: Op); |
27335 | |
27336 | SDLoc DL(Op); |
27337 | EVT VT = Store->getValue().getValueType(); |
27338 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27339 | EVT MemVT = Store->getMemoryVT(); |
27340 | |
27341 | auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT); |
27342 | auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue()); |
27343 | |
27344 | if (VT.isFloatingPoint() && Store->isTruncatingStore()) { |
27345 | EVT TruncVT = ContainerVT.changeVectorElementType( |
27346 | EltVT: Store->getMemoryVT().getVectorElementType()); |
27347 | MemVT = MemVT.changeTypeToInteger(); |
27348 | NewValue = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: TruncVT, N1: Pg, |
27349 | N2: NewValue, N3: DAG.getTargetConstant(Val: 0, DL, VT: MVT::i64), |
27350 | N4: DAG.getUNDEF(VT: TruncVT)); |
27351 | NewValue = |
27352 | getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG); |
27353 | } else if (VT.isFloatingPoint()) { |
27354 | MemVT = MemVT.changeTypeToInteger(); |
27355 | NewValue = |
27356 | getSVESafeBitCast(VT: ContainerVT.changeTypeToInteger(), Op: NewValue, DAG); |
27357 | } |
27358 | |
27359 | return DAG.getMaskedStore(Chain: Store->getChain(), dl: DL, Val: NewValue, |
27360 | Base: Store->getBasePtr(), Offset: Store->getOffset(), Mask: Pg, MemVT, |
27361 | MMO: Store->getMemOperand(), AM: Store->getAddressingMode(), |
27362 | IsTruncating: Store->isTruncatingStore()); |
27363 | } |
27364 | |
27365 | SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( |
27366 | SDValue Op, SelectionDAG &DAG) const { |
27367 | auto *Store = cast<MaskedStoreSDNode>(Val&: Op); |
27368 | |
27369 | SDLoc DL(Op); |
27370 | EVT VT = Store->getValue().getValueType(); |
27371 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27372 | |
27373 | auto NewValue = convertToScalableVector(DAG, VT: ContainerVT, V: Store->getValue()); |
27374 | SDValue Mask = convertFixedMaskToScalableVector(Mask: Store->getMask(), DAG); |
27375 | |
27376 | return DAG.getMaskedStore( |
27377 | Chain: Store->getChain(), dl: DL, Val: NewValue, Base: Store->getBasePtr(), Offset: Store->getOffset(), |
27378 | Mask, MemVT: Store->getMemoryVT(), MMO: Store->getMemOperand(), |
27379 | AM: Store->getAddressingMode(), IsTruncating: Store->isTruncatingStore()); |
27380 | } |
27381 | |
27382 | SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE( |
27383 | SDValue Op, SelectionDAG &DAG) const { |
27384 | SDLoc dl(Op); |
27385 | EVT VT = Op.getValueType(); |
27386 | EVT EltVT = VT.getVectorElementType(); |
27387 | |
27388 | bool Signed = Op.getOpcode() == ISD::SDIV; |
27389 | unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED; |
27390 | |
27391 | bool Negated; |
27392 | uint64_t SplatVal; |
27393 | if (Signed && isPow2Splat(Op: Op.getOperand(i: 1), SplatVal, Negated)) { |
27394 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27395 | SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0)); |
27396 | SDValue Op2 = DAG.getTargetConstant(Val: Log2_64(Value: SplatVal), DL: dl, VT: MVT::i32); |
27397 | |
27398 | SDValue Pg = getPredicateForFixedLengthVector(DAG, DL&: dl, VT); |
27399 | SDValue Res = |
27400 | DAG.getNode(Opcode: AArch64ISD::SRAD_MERGE_OP1, DL: dl, VT: ContainerVT, N1: Pg, N2: Op1, N3: Op2); |
27401 | if (Negated) |
27402 | Res = DAG.getNode(Opcode: ISD::SUB, DL: dl, VT: ContainerVT, |
27403 | N1: DAG.getConstant(Val: 0, DL: dl, VT: ContainerVT), N2: Res); |
27404 | |
27405 | return convertFromScalableVector(DAG, VT, V: Res); |
27406 | } |
27407 | |
27408 | // Scalable vector i32/i64 DIV is supported. |
27409 | if (EltVT == MVT::i32 || EltVT == MVT::i64) |
27410 | return LowerToPredicatedOp(Op, DAG, NewOp: PredOpcode); |
27411 | |
27412 | // Scalable vector i8/i16 DIV is not supported. Promote it to i32. |
27413 | EVT HalfVT = VT.getHalfNumVectorElementsVT(Context&: *DAG.getContext()); |
27414 | EVT PromVT = HalfVT.widenIntegerVectorElementType(Context&: *DAG.getContext()); |
27415 | unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
27416 | |
27417 | // If the wider type is legal: extend, op, and truncate. |
27418 | EVT WideVT = VT.widenIntegerVectorElementType(Context&: *DAG.getContext()); |
27419 | if (DAG.getTargetLoweringInfo().isTypeLegal(VT: WideVT)) { |
27420 | SDValue Op0 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 0)); |
27421 | SDValue Op1 = DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: WideVT, Operand: Op.getOperand(i: 1)); |
27422 | SDValue Div = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: WideVT, N1: Op0, N2: Op1); |
27423 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT, Operand: Div); |
27424 | } |
27425 | |
27426 | auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT, |
27427 | &ExtendOpcode](SDValue Op) { |
27428 | SDValue IdxZero = DAG.getConstant(Val: 0, DL: dl, VT: MVT::i64); |
27429 | SDValue IdxHalf = |
27430 | DAG.getConstant(Val: HalfVT.getVectorNumElements(), DL: dl, VT: MVT::i64); |
27431 | SDValue Lo = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxZero); |
27432 | SDValue Hi = DAG.getNode(Opcode: ISD::EXTRACT_SUBVECTOR, DL: dl, VT: HalfVT, N1: Op, N2: IdxHalf); |
27433 | return std::pair<SDValue, SDValue>( |
27434 | {DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Lo), |
27435 | DAG.getNode(Opcode: ExtendOpcode, DL: dl, VT: PromVT, Operand: Hi)}); |
27436 | }; |
27437 | |
27438 | // If wider type is not legal: split, extend, op, trunc and concat. |
27439 | auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(i: 0)); |
27440 | auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(i: 1)); |
27441 | SDValue Lo = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0LoExt, N2: Op1LoExt); |
27442 | SDValue Hi = DAG.getNode(Opcode: Op.getOpcode(), DL: dl, VT: PromVT, N1: Op0HiExt, N2: Op1HiExt); |
27443 | SDValue LoTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Lo); |
27444 | SDValue HiTrunc = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: HalfVT, Operand: Hi); |
27445 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL: dl, VT, Ops: {LoTrunc, HiTrunc}); |
27446 | } |
27447 | |
27448 | SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE( |
27449 | SDValue Op, SelectionDAG &DAG) const { |
27450 | EVT VT = Op.getValueType(); |
27451 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27452 | |
27453 | SDLoc DL(Op); |
27454 | SDValue Val = Op.getOperand(i: 0); |
27455 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType()); |
27456 | Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val); |
27457 | |
27458 | bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND; |
27459 | unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO; |
27460 | |
27461 | // Repeatedly unpack Val until the result is of the desired element type. |
27462 | switch (ContainerVT.getSimpleVT().SimpleTy) { |
27463 | default: |
27464 | llvm_unreachable("unimplemented container type" ); |
27465 | case MVT::nxv16i8: |
27466 | Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv8i16, Operand: Val); |
27467 | if (VT.getVectorElementType() == MVT::i16) |
27468 | break; |
27469 | [[fallthrough]]; |
27470 | case MVT::nxv8i16: |
27471 | Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv4i32, Operand: Val); |
27472 | if (VT.getVectorElementType() == MVT::i32) |
27473 | break; |
27474 | [[fallthrough]]; |
27475 | case MVT::nxv4i32: |
27476 | Val = DAG.getNode(Opcode: ExtendOpc, DL, VT: MVT::nxv2i64, Operand: Val); |
27477 | assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!" ); |
27478 | break; |
27479 | } |
27480 | |
27481 | return convertFromScalableVector(DAG, VT, V: Val); |
27482 | } |
27483 | |
27484 | SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE( |
27485 | SDValue Op, SelectionDAG &DAG) const { |
27486 | EVT VT = Op.getValueType(); |
27487 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27488 | |
27489 | SDLoc DL(Op); |
27490 | SDValue Val = Op.getOperand(i: 0); |
27491 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: Val.getValueType()); |
27492 | Val = convertToScalableVector(DAG, VT: ContainerVT, V: Val); |
27493 | |
27494 | // Repeatedly truncate Val until the result is of the desired element type. |
27495 | switch (ContainerVT.getSimpleVT().SimpleTy) { |
27496 | default: |
27497 | llvm_unreachable("unimplemented container type" ); |
27498 | case MVT::nxv2i64: |
27499 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv4i32, Operand: Val); |
27500 | Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv4i32, N1: Val, N2: Val); |
27501 | if (VT.getVectorElementType() == MVT::i32) |
27502 | break; |
27503 | [[fallthrough]]; |
27504 | case MVT::nxv4i32: |
27505 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv8i16, Operand: Val); |
27506 | Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv8i16, N1: Val, N2: Val); |
27507 | if (VT.getVectorElementType() == MVT::i16) |
27508 | break; |
27509 | [[fallthrough]]; |
27510 | case MVT::nxv8i16: |
27511 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: MVT::nxv16i8, Operand: Val); |
27512 | Val = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: MVT::nxv16i8, N1: Val, N2: Val); |
27513 | assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!" ); |
27514 | break; |
27515 | } |
27516 | |
27517 | return convertFromScalableVector(DAG, VT, V: Val); |
27518 | } |
27519 | |
SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
27521 | SDValue Op, SelectionDAG &DAG) const { |
27522 | EVT VT = Op.getValueType(); |
27523 | EVT InVT = Op.getOperand(i: 0).getValueType(); |
27524 | assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27525 | |
27526 | SDLoc DL(Op); |
27527 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT); |
27528 | SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0)); |
27529 | |
27530 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT, N1: Op0, N2: Op.getOperand(i: 1)); |
27531 | } |
27532 | |
27533 | SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt( |
27534 | SDValue Op, SelectionDAG &DAG) const { |
27535 | EVT VT = Op.getValueType(); |
27536 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27537 | |
27538 | SDLoc DL(Op); |
27539 | EVT InVT = Op.getOperand(i: 0).getValueType(); |
27540 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT); |
27541 | SDValue Op0 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 0)); |
27542 | |
27543 | auto ScalableRes = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, N1: Op0, |
27544 | N2: Op.getOperand(i: 1), N3: Op.getOperand(i: 2)); |
27545 | |
27546 | return convertFromScalableVector(DAG, VT, V: ScalableRes); |
27547 | } |
27548 | |
27549 | // Convert vector operation 'Op' to an equivalent predicated operation whereby |
27550 | // the original operation's type is used to construct a suitable predicate. |
27551 | // NOTE: The results for inactive lanes are undefined. |
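// For example (sketch; the opcode is only an illustration), a fixed-length
// v4f32 fadd lowered with NewOp == AArch64ISD::FADD_PRED roughly becomes:
// convert both operands to nxv4f32, apply FADD_PRED under a PTRUE covering
// four lanes, then extract the low v4f32 of the scalable result.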
27552 | SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, |
27553 | SelectionDAG &DAG, |
27554 | unsigned NewOp) const { |
27555 | EVT VT = Op.getValueType(); |
27556 | SDLoc DL(Op); |
27557 | auto Pg = getPredicateForVector(DAG, DL, VT); |
27558 | |
27559 | if (VT.isFixedLengthVector()) { |
27560 | assert(isTypeLegal(VT) && "Expected only legal fixed-width types" ); |
27561 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27562 | |
27563 | // Create list of operands by converting existing ones to scalable types. |
27564 | SmallVector<SDValue, 4> Operands = {Pg}; |
27565 | for (const SDValue &V : Op->op_values()) { |
27566 | if (isa<CondCodeSDNode>(Val: V)) { |
27567 | Operands.push_back(Elt: V); |
27568 | continue; |
27569 | } |
27570 | |
27571 | if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(Val: V)) { |
27572 | EVT VTArg = VTNode->getVT().getVectorElementType(); |
27573 | EVT NewVTArg = ContainerVT.changeVectorElementType(EltVT: VTArg); |
27574 | Operands.push_back(Elt: DAG.getValueType(NewVTArg)); |
27575 | continue; |
27576 | } |
27577 | |
27578 | assert(isTypeLegal(V.getValueType()) && |
27579 | "Expected only legal fixed-width types" ); |
27580 | Operands.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V)); |
27581 | } |
27582 | |
27583 | if (isMergePassthruOpcode(Opc: NewOp)) |
27584 | Operands.push_back(Elt: DAG.getUNDEF(VT: ContainerVT)); |
27585 | |
27586 | auto ScalableRes = DAG.getNode(Opcode: NewOp, DL, VT: ContainerVT, Ops: Operands); |
27587 | return convertFromScalableVector(DAG, VT, V: ScalableRes); |
27588 | } |
27589 | |
27590 | assert(VT.isScalableVector() && "Only expect to lower scalable vector op!" ); |
27591 | |
27592 | SmallVector<SDValue, 4> Operands = {Pg}; |
27593 | for (const SDValue &V : Op->op_values()) { |
27594 | assert((!V.getValueType().isVector() || |
27595 | V.getValueType().isScalableVector()) && |
27596 | "Only scalable vectors are supported!" ); |
27597 | Operands.push_back(Elt: V); |
27598 | } |
27599 | |
27600 | if (isMergePassthruOpcode(Opc: NewOp)) |
27601 | Operands.push_back(Elt: DAG.getUNDEF(VT)); |
27602 | |
27603 | return DAG.getNode(Opcode: NewOp, DL, VT, Ops: Operands, Flags: Op->getFlags()); |
27604 | } |
27605 | |
27606 | // If a fixed length vector operation has no side effects when applied to |
27607 | // undefined elements, we can safely use scalable vectors to perform the same |
27608 | // operation without needing to worry about predication. |
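// For example, a legal v4i32 xor can simply be widened to an unpredicated
// nxv4i32 xor: whatever the extra lanes compute is never observed once the
// result is narrowed back to v4i32 (illustrative).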
27609 | SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op, |
27610 | SelectionDAG &DAG) const { |
27611 | EVT VT = Op.getValueType(); |
27612 | assert(VT.isFixedLengthVector() && isTypeLegal(VT) && |
27613 | "Only expected to lower fixed length vector operation!" ); |
27614 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27615 | |
27616 | // Create list of operands by converting existing ones to scalable types. |
27617 | SmallVector<SDValue, 4> Ops; |
27618 | for (const SDValue &V : Op->op_values()) { |
27619 | assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!" ); |
27620 | |
27621 | // Pass through non-vector operands. |
27622 | if (!V.getValueType().isVector()) { |
27623 | Ops.push_back(Elt: V); |
27624 | continue; |
27625 | } |
27626 | |
27627 | // "cast" fixed length vector to a scalable vector. |
27628 | assert(V.getValueType().isFixedLengthVector() && |
27629 | isTypeLegal(V.getValueType()) && |
27630 | "Only fixed length vectors are supported!" ); |
27631 | Ops.push_back(Elt: convertToScalableVector(DAG, VT: ContainerVT, V)); |
27632 | } |
27633 | |
27634 | auto ScalableRes = DAG.getNode(Opcode: Op.getOpcode(), DL: SDLoc(Op), VT: ContainerVT, Ops); |
27635 | return convertFromScalableVector(DAG, VT, V: ScalableRes); |
27636 | } |
27637 | |
27638 | SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, |
27639 | SelectionDAG &DAG) const { |
27640 | SDLoc DL(ScalarOp); |
27641 | SDValue AccOp = ScalarOp.getOperand(i: 0); |
27642 | SDValue VecOp = ScalarOp.getOperand(i: 1); |
27643 | EVT SrcVT = VecOp.getValueType(); |
27644 | EVT ResVT = SrcVT.getVectorElementType(); |
27645 | |
27646 | EVT ContainerVT = SrcVT; |
27647 | if (SrcVT.isFixedLengthVector()) { |
27648 | ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT); |
27649 | VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp); |
27650 | } |
27651 | |
27652 | SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT); |
27653 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
27654 | |
27655 | // Convert operands to Scalable. |
27656 | AccOp = DAG.getNode(Opcode: ISD::INSERT_VECTOR_ELT, DL, VT: ContainerVT, |
27657 | N1: DAG.getUNDEF(VT: ContainerVT), N2: AccOp, N3: Zero); |
27658 | |
27659 | // Perform reduction. |
27660 | SDValue Rdx = DAG.getNode(Opcode: AArch64ISD::FADDA_PRED, DL, VT: ContainerVT, |
27661 | N1: Pg, N2: AccOp, N3: VecOp); |
27662 | |
27663 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, N1: Rdx, N2: Zero); |
27664 | } |
27665 | |
27666 | SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp, |
27667 | SelectionDAG &DAG) const { |
27668 | SDLoc DL(ReduceOp); |
27669 | SDValue Op = ReduceOp.getOperand(i: 0); |
27670 | EVT OpVT = Op.getValueType(); |
27671 | EVT VT = ReduceOp.getValueType(); |
27672 | |
27673 | if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1) |
27674 | return SDValue(); |
27675 | |
27676 | SDValue Pg = getPredicateForVector(DAG, DL, VT: OpVT); |
27677 | |
27678 | switch (ReduceOp.getOpcode()) { |
27679 | default: |
27680 | return SDValue(); |
27681 | case ISD::VECREDUCE_OR: |
27682 | if (isAllActivePredicate(DAG, N: Pg) && OpVT == MVT::nxv16i1) |
27683 | // The predicate can be 'Op' because |
27684 | // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op). |
27685 | return getPTest(DAG, VT, Pg: Op, Op, Cond: AArch64CC::ANY_ACTIVE); |
27686 | else |
27687 | return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::ANY_ACTIVE); |
27688 | case ISD::VECREDUCE_AND: { |
27689 | Op = DAG.getNode(Opcode: ISD::XOR, DL, VT: OpVT, N1: Op, N2: Pg); |
27690 | return getPTest(DAG, VT, Pg, Op, Cond: AArch64CC::NONE_ACTIVE); |
27691 | } |
27692 | case ISD::VECREDUCE_XOR: { |
27693 | SDValue ID = |
27694 | DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_cntp, DL, VT: MVT::i64); |
27695 | if (OpVT == MVT::nxv1i1) { |
27696 | // Emulate a CNTP on .Q using .D and a different governing predicate. |
27697 | Pg = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Pg); |
27698 | Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: MVT::nxv2i1, Operand: Op); |
27699 | } |
27700 | SDValue Cntp = |
27701 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: MVT::i64, N1: ID, N2: Pg, N3: Op); |
27702 | return DAG.getAnyExtOrTrunc(Op: Cntp, DL, VT); |
27703 | } |
27704 | } |
27705 | |
27706 | return SDValue(); |
27707 | } |
27708 | |
27709 | SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode, |
27710 | SDValue ScalarOp, |
27711 | SelectionDAG &DAG) const { |
27712 | SDLoc DL(ScalarOp); |
27713 | SDValue VecOp = ScalarOp.getOperand(i: 0); |
27714 | EVT SrcVT = VecOp.getValueType(); |
27715 | |
27716 | if (useSVEForFixedLengthVectorVT( |
27717 | VT: SrcVT, |
27718 | /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) { |
27719 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: SrcVT); |
27720 | VecOp = convertToScalableVector(DAG, VT: ContainerVT, V: VecOp); |
27721 | } |
27722 | |
27723 | // UADDV always returns an i64 result. |
27724 | EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 : |
27725 | SrcVT.getVectorElementType(); |
27726 | EVT RdxVT = SrcVT; |
27727 | if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED) |
27728 | RdxVT = getPackedSVEVectorVT(VT: ResVT); |
27729 | |
27730 | SDValue Pg = getPredicateForVector(DAG, DL, VT: SrcVT); |
27731 | SDValue Rdx = DAG.getNode(Opcode, DL, VT: RdxVT, N1: Pg, N2: VecOp); |
27732 | SDValue Res = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ResVT, |
27733 | N1: Rdx, N2: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
27734 | |
// The VEC_REDUCE nodes expect an element-sized result.
27736 | if (ResVT != ScalarOp.getValueType()) |
27737 | Res = DAG.getAnyExtOrTrunc(Op: Res, DL, VT: ScalarOp.getValueType()); |
27738 | |
27739 | return Res; |
27740 | } |
27741 | |
27742 | SDValue |
27743 | AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op, |
27744 | SelectionDAG &DAG) const { |
27745 | EVT VT = Op.getValueType(); |
27746 | SDLoc DL(Op); |
27747 | |
27748 | EVT InVT = Op.getOperand(i: 1).getValueType(); |
27749 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT); |
27750 | SDValue Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 1)); |
27751 | SDValue Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op->getOperand(Num: 2)); |
27752 | |
// Convert the mask to a predicate (NOTE: We don't need to worry about
// inactive lanes since VSELECT is safe when given undefined elements).
27755 | EVT MaskVT = Op.getOperand(i: 0).getValueType(); |
27756 | EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskVT); |
27757 | auto Mask = convertToScalableVector(DAG, VT: MaskContainerVT, V: Op.getOperand(i: 0)); |
27758 | Mask = DAG.getNode(Opcode: ISD::TRUNCATE, DL, |
27759 | VT: MaskContainerVT.changeVectorElementType(EltVT: MVT::i1), Operand: Mask); |
27760 | |
27761 | auto ScalableRes = DAG.getNode(Opcode: ISD::VSELECT, DL, VT: ContainerVT, |
27762 | N1: Mask, N2: Op1, N3: Op2); |
27763 | |
27764 | return convertFromScalableVector(DAG, VT, V: ScalableRes); |
27765 | } |
27766 | |
27767 | SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE( |
27768 | SDValue Op, SelectionDAG &DAG) const { |
27769 | SDLoc DL(Op); |
27770 | EVT InVT = Op.getOperand(i: 0).getValueType(); |
27771 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT: InVT); |
27772 | |
27773 | assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) && |
27774 | "Only expected to lower fixed length vector operation!" ); |
27775 | assert(Op.getValueType() == InVT.changeTypeToInteger() && |
27776 | "Expected integer result of the same bit length as the inputs!" ); |
27777 | |
27778 | auto Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 0)); |
27779 | auto Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op.getOperand(i: 1)); |
27780 | auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT: InVT); |
27781 | |
27782 | EVT CmpVT = Pg.getValueType(); |
27783 | auto Cmp = DAG.getNode(Opcode: AArch64ISD::SETCC_MERGE_ZERO, DL, VT: CmpVT, |
27784 | Ops: {Pg, Op1, Op2, Op.getOperand(i: 2)}); |
27785 | |
27786 | EVT PromoteVT = ContainerVT.changeTypeToInteger(); |
27787 | auto Promote = DAG.getBoolExtOrTrunc(Op: Cmp, SL: DL, VT: PromoteVT, OpVT: InVT); |
27788 | return convertFromScalableVector(DAG, VT: Op.getValueType(), V: Promote); |
27789 | } |
27790 | |
27791 | SDValue |
27792 | AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op, |
27793 | SelectionDAG &DAG) const { |
27794 | SDLoc DL(Op); |
27795 | auto SrcOp = Op.getOperand(i: 0); |
27796 | EVT VT = Op.getValueType(); |
27797 | EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); |
27798 | EVT ContainerSrcVT = |
27799 | getContainerForFixedLengthVector(DAG, VT: SrcOp.getValueType()); |
27800 | |
27801 | SrcOp = convertToScalableVector(DAG, VT: ContainerSrcVT, V: SrcOp); |
27802 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerDstVT, Operand: SrcOp); |
27803 | return convertFromScalableVector(DAG, VT, V: Op); |
27804 | } |
27805 | |
27806 | SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE( |
27807 | SDValue Op, SelectionDAG &DAG) const { |
27808 | SDLoc DL(Op); |
27809 | unsigned NumOperands = Op->getNumOperands(); |
27810 | |
27811 | assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && |
27812 | "Unexpected number of operands in CONCAT_VECTORS" ); |
27813 | |
27814 | auto SrcOp1 = Op.getOperand(i: 0); |
27815 | auto SrcOp2 = Op.getOperand(i: 1); |
27816 | EVT VT = Op.getValueType(); |
27817 | EVT SrcVT = SrcOp1.getValueType(); |
27818 | |
27819 | if (NumOperands > 2) { |
27820 | SmallVector<SDValue, 4> Ops; |
27821 | EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(Context&: *DAG.getContext()); |
27822 | for (unsigned I = 0; I < NumOperands; I += 2) |
27823 | Ops.push_back(Elt: DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT: PairVT, |
27824 | N1: Op->getOperand(Num: I), N2: Op->getOperand(Num: I + 1))); |
27825 | |
27826 | return DAG.getNode(Opcode: ISD::CONCAT_VECTORS, DL, VT, Ops); |
27827 | } |
27828 | |
27829 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27830 | |
27831 | SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT); |
27832 | SrcOp1 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp1); |
27833 | SrcOp2 = convertToScalableVector(DAG, VT: ContainerVT, V: SrcOp2); |
27834 | |
27835 | Op = DAG.getNode(Opcode: AArch64ISD::SPLICE, DL, VT: ContainerVT, N1: Pg, N2: SrcOp1, N3: SrcOp2); |
27836 | |
27837 | return convertFromScalableVector(DAG, VT, V: Op); |
27838 | } |
27839 | |
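// Lower a fixed-length FP_EXTEND to SVE. The source is bitcast to integers and
// any-extended so each element occupies a destination-sized lane, reinterpreted
// as the unpacked source floating-point type within the destination container,
// and then widened with a predicated FP_EXTEND_MERGE_PASSTHRU.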
27840 | SDValue |
27841 | AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op, |
27842 | SelectionDAG &DAG) const { |
27843 | EVT VT = Op.getValueType(); |
27844 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27845 | |
27846 | SDLoc DL(Op); |
27847 | SDValue Val = Op.getOperand(i: 0); |
27848 | SDValue Pg = getPredicateForVector(DAG, DL, VT); |
27849 | EVT SrcVT = Val.getValueType(); |
27850 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
27851 | EVT ExtendVT = ContainerVT.changeVectorElementType( |
27852 | EltVT: SrcVT.getVectorElementType()); |
27853 | |
27854 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val); |
27855 | Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: VT.changeTypeToInteger(), Operand: Val); |
27856 | |
27857 | Val = convertToScalableVector(DAG, VT: ContainerVT.changeTypeToInteger(), V: Val); |
27858 | Val = getSVESafeBitCast(VT: ExtendVT, Op: Val, DAG); |
27859 | Val = DAG.getNode(Opcode: AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, VT: ContainerVT, |
27860 | N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: ContainerVT)); |
27861 | |
27862 | return convertFromScalableVector(DAG, VT, V: Val); |
27863 | } |
27864 | |
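// Lower a fixed-length FP_ROUND to SVE. The value is rounded in the source
// container with a predicated FP_ROUND_MERGE_PASSTHRU, then reinterpreted as
// integers and truncated down to the destination element width before being
// bitcast back to the floating-point result type.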
27865 | SDValue |
27866 | AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op, |
27867 | SelectionDAG &DAG) const { |
27868 | EVT VT = Op.getValueType(); |
27869 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27870 | |
27871 | SDLoc DL(Op); |
27872 | SDValue Val = Op.getOperand(i: 0); |
27873 | EVT SrcVT = Val.getValueType(); |
27874 | EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT); |
27875 | EVT RoundVT = ContainerSrcVT.changeVectorElementType( |
27876 | EltVT: VT.getVectorElementType()); |
27877 | SDValue Pg = getPredicateForVector(DAG, DL, VT: RoundVT); |
27878 | |
27879 | Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val); |
27880 | Val = DAG.getNode(Opcode: AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, VT: RoundVT, N1: Pg, N2: Val, |
27881 | N3: Op.getOperand(i: 1), N4: DAG.getUNDEF(VT: RoundVT)); |
27882 | Val = getSVESafeBitCast(VT: ContainerSrcVT.changeTypeToInteger(), Op: Val, DAG); |
27883 | Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val); |
27884 | |
27885 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val); |
27886 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val); |
27887 | } |
27888 | |
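// Lower a fixed-length [SU]INT_TO_FP to SVE. When the result is at least as
// wide as the source, the input is sign/zero-extended to the result's integer
// type and converted in the destination container. Otherwise the conversion is
// performed in the source container, producing an unpacked result whose lanes
// are then narrowed to the destination element width via truncation of the
// integer representation.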
27889 | SDValue |
27890 | AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op, |
27891 | SelectionDAG &DAG) const { |
27892 | EVT VT = Op.getValueType(); |
27893 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
27894 | |
27895 | bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP; |
27896 | unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU |
27897 | : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU; |
27898 | |
27899 | SDLoc DL(Op); |
27900 | SDValue Val = Op.getOperand(i: 0); |
27901 | EVT SrcVT = Val.getValueType(); |
27902 | EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); |
27903 | EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT); |
27904 | |
27905 | if (VT.bitsGE(VT: SrcVT)) { |
27906 | SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT); |
27907 | |
27908 | Val = DAG.getNode(Opcode: IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, |
27909 | VT: VT.changeTypeToInteger(), Operand: Val); |
27910 | |
27911 |     // It is safe to use an operand larger than specified because promoting
27912 |     // the value changes nothing from an arithmetic point of view.
27913 | Val = |
27914 | convertToScalableVector(DAG, VT: ContainerDstVT.changeTypeToInteger(), V: Val); |
27915 | Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val, |
27916 | N3: DAG.getUNDEF(VT: ContainerDstVT)); |
27917 | return convertFromScalableVector(DAG, VT, V: Val); |
27918 | } else { |
27919 | EVT CvtVT = ContainerSrcVT.changeVectorElementType( |
27920 | EltVT: ContainerDstVT.getVectorElementType()); |
27921 | SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT); |
27922 | |
27923 | Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val); |
27924 | Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT)); |
27925 | Val = getSVESafeBitCast(VT: ContainerSrcVT, Op: Val, DAG); |
27926 | Val = convertFromScalableVector(DAG, VT: SrcVT, V: Val); |
27927 | |
27928 | Val = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: VT.changeTypeToInteger(), Operand: Val); |
27929 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: Val); |
27930 | } |
27931 | } |
27932 | |
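// Lower a two-operand VECTOR_DEINTERLEAVE of scalable vectors to UZP1/UZP2,
// which extract the even and odd elements respectively.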
27933 | SDValue |
27934 | AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op, |
27935 | SelectionDAG &DAG) const { |
27936 | SDLoc DL(Op); |
27937 | EVT OpVT = Op.getValueType(); |
27938 | assert(OpVT.isScalableVector() && |
27939 | "Expected scalable vector in LowerVECTOR_DEINTERLEAVE." ); |
27940 | SDValue Even = DAG.getNode(Opcode: AArch64ISD::UZP1, DL, VT: OpVT, N1: Op.getOperand(i: 0), |
27941 | N2: Op.getOperand(i: 1)); |
27942 | SDValue Odd = DAG.getNode(Opcode: AArch64ISD::UZP2, DL, VT: OpVT, N1: Op.getOperand(i: 0), |
27943 | N2: Op.getOperand(i: 1)); |
27944 | return DAG.getMergeValues(Ops: {Even, Odd}, dl: DL); |
27945 | } |
27946 | |
27947 | SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op, |
27948 | SelectionDAG &DAG) const { |
27949 | SDLoc DL(Op); |
27950 | EVT OpVT = Op.getValueType(); |
27951 | assert(OpVT.isScalableVector() && |
27952 | "Expected scalable vector in LowerVECTOR_INTERLEAVE." ); |
27953 | |
27954 | SDValue Lo = DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: OpVT, N1: Op.getOperand(i: 0), |
27955 | N2: Op.getOperand(i: 1)); |
27956 | SDValue Hi = DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: OpVT, N1: Op.getOperand(i: 0), |
27957 | N2: Op.getOperand(i: 1)); |
27958 | return DAG.getMergeValues(Ops: {Lo, Hi}, dl: DL); |
27959 | } |
27960 | |
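// Lower a histogram update to a gather/update/scatter sequence: gather the
// current bucket values, count matching indices per lane with the
// aarch64_sve_histcnt intrinsic, multiply the counts by the increment, add the
// products to the gathered values, and scatter the sums back to memory.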
27961 | SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op, |
27962 | SelectionDAG &DAG) const { |
27963 | // FIXME: Maybe share some code with LowerMGather/Scatter? |
27964 | MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Val&: Op); |
27965 | SDLoc DL(HG); |
27966 | SDValue Chain = HG->getChain(); |
27967 | SDValue Inc = HG->getInc(); |
27968 | SDValue Mask = HG->getMask(); |
27969 | SDValue Ptr = HG->getBasePtr(); |
27970 | SDValue Index = HG->getIndex(); |
27971 | SDValue Scale = HG->getScale(); |
27972 | SDValue IntID = HG->getIntID(); |
27973 | |
27974 | // The Intrinsic ID determines the type of update operation. |
27975 | [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(Val: IntID.getNode()); |
27976 | // Right now, we only support 'add' as an update. |
27977 | assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add && |
27978 | "Unexpected histogram update operation" ); |
27979 | |
27980 | EVT IncVT = Inc.getValueType(); |
27981 | EVT IndexVT = Index.getValueType(); |
27982 | EVT MemVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: IncVT, |
27983 | EC: IndexVT.getVectorElementCount()); |
27984 | SDValue Zero = DAG.getConstant(Val: 0, DL, VT: MVT::i64); |
27985 | SDValue PassThru = DAG.getSplatVector(VT: MemVT, DL, Op: Zero); |
27986 | SDValue IncSplat = DAG.getSplatVector(VT: MemVT, DL, Op: Inc); |
27987 | SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale}; |
27988 | |
27989 | MachineMemOperand *MMO = HG->getMemOperand(); |
27990 |   // Create an MMO for the gather, carrying only the MOLoad flag.
27991 | MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand( |
27992 | PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOLoad, Size: MMO->getSize(), |
27993 | BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo()); |
27994 | ISD::MemIndexType IndexType = HG->getIndexType(); |
27995 | SDValue Gather = |
27996 | DAG.getMaskedGather(VTs: DAG.getVTList(VT1: MemVT, VT2: MVT::Other), MemVT, dl: DL, Ops, |
27997 | MMO: GMMO, IndexType, ExtTy: ISD::NON_EXTLOAD); |
27998 | |
27999 | SDValue GChain = Gather.getValue(R: 1); |
28000 | |
28001 | // Perform the histcnt, multiply by inc, add to bucket data. |
28002 | SDValue ID = DAG.getTargetConstant(Val: Intrinsic::aarch64_sve_histcnt, DL, VT: IncVT); |
28003 | SDValue HistCnt = |
28004 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: IndexVT, N1: ID, N2: Mask, N3: Index, N4: Index); |
28005 | SDValue Mul = DAG.getNode(Opcode: ISD::MUL, DL, VT: MemVT, N1: HistCnt, N2: IncSplat); |
28006 | SDValue Add = DAG.getNode(Opcode: ISD::ADD, DL, VT: MemVT, N1: Gather, N2: Mul); |
28007 | |
28008 |   // Create an MMO for the scatter, carrying only the MOStore flag.
28009 | MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand( |
28010 | PtrInfo: MMO->getPointerInfo(), F: MachineMemOperand::MOStore, Size: MMO->getSize(), |
28011 | BaseAlignment: MMO->getAlign(), AAInfo: MMO->getAAInfo()); |
28012 | |
28013 | SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale}; |
28014 | SDValue Scatter = DAG.getMaskedScatter(VTs: DAG.getVTList(VT: MVT::Other), MemVT, dl: DL, |
28015 | Ops: ScatterOps, MMO: SMMO, IndexType, IsTruncating: false); |
28016 | return Scatter; |
28017 | } |
28018 | |
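// Lower a fixed-length FP_TO_[SU]INT to SVE. When the result is wider than the
// source, the input is widened (via an integer any-extend and reinterpret) so
// the conversion can run in the destination container. Otherwise the
// conversion runs in the source container at the wider integer element type
// and the result is truncated to the requested width, which is safe because an
// fp_to_int whose result does not fit is undefined.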
28019 | SDValue |
28020 | AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, |
28021 | SelectionDAG &DAG) const { |
28022 | EVT VT = Op.getValueType(); |
28023 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
28024 | |
28025 | bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; |
28026 | unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU |
28027 | : AArch64ISD::FCVTZU_MERGE_PASSTHRU; |
28028 | |
28029 | SDLoc DL(Op); |
28030 | SDValue Val = Op.getOperand(i: 0); |
28031 | EVT SrcVT = Val.getValueType(); |
28032 | EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT); |
28033 | EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, VT: SrcVT); |
28034 | |
28035 | if (VT.bitsGT(VT: SrcVT)) { |
28036 | EVT CvtVT = ContainerDstVT.changeVectorElementType( |
28037 | EltVT: ContainerSrcVT.getVectorElementType()); |
28038 | SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT); |
28039 | |
28040 | Val = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: SrcVT.changeTypeToInteger(), Operand: Val); |
28041 | Val = DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT, Operand: Val); |
28042 | |
28043 | Val = convertToScalableVector(DAG, VT: ContainerDstVT, V: Val); |
28044 | Val = getSVESafeBitCast(VT: CvtVT, Op: Val, DAG); |
28045 | Val = DAG.getNode(Opcode, DL, VT: ContainerDstVT, N1: Pg, N2: Val, |
28046 | N3: DAG.getUNDEF(VT: ContainerDstVT)); |
28047 | return convertFromScalableVector(DAG, VT, V: Val); |
28048 | } else { |
28049 | EVT CvtVT = ContainerSrcVT.changeTypeToInteger(); |
28050 | SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT: SrcVT); |
28051 | |
28052 |     // It is safe to use a result larger than specified since an fp_to_int
28053 |     // whose result doesn't fit into the destination is undefined.
28054 | Val = convertToScalableVector(DAG, VT: ContainerSrcVT, V: Val); |
28055 | Val = DAG.getNode(Opcode, DL, VT: CvtVT, N1: Pg, N2: Val, N3: DAG.getUNDEF(VT: CvtVT)); |
28056 | Val = convertFromScalableVector(DAG, VT: SrcVT.changeTypeToInteger(), V: Val); |
28057 | |
28058 | return DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT, Operand: Val); |
28059 | } |
28060 | } |
28061 | |
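// Lower a fixed-length VECTOR_SHUFFLE to an SVE TBL (or, for two distinct
// sources with SVE2, TBL2) by materialising the shuffle mask as a vector of
// indices. Indices selecting from the second source are rebased onto the
// concatenated register pair; when the exact register size is unknown at
// compile time the rebasing is done at runtime by adding a multiple of the
// vector length. For example, with a known 256-bit register a v4i32 shuffle
// index of 5 (element 1 of the second source) becomes 5 + (8 - 4) = 9, and
// trailing mask lanes are filled with an out-of-range index so those lanes
// read as zero.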
28062 | static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, |
28063 | ArrayRef<int> ShuffleMask, EVT VT, |
28064 | EVT ContainerVT, SelectionDAG &DAG) { |
28065 | auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>(); |
28066 | SDLoc DL(Op); |
28067 | unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits(); |
28068 | unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits(); |
28069 | bool IsSingleOp = |
28070 | ShuffleVectorInst::isSingleSourceMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()); |
28071 | |
28072 | if (!Subtarget.isNeonAvailable() && !MinSVESize) |
28073 | MinSVESize = 128; |
28074 | |
28075 |   // Bail out when there are two source operands but SVE2 is unavailable, or
28076 |   // when an index value cannot be represented in the mask element type.
28077 | if (!IsSingleOp && !Subtarget.hasSVE2()) |
28078 | return SDValue(); |
28079 | |
28080 | EVT VTOp1 = Op.getOperand(i: 0).getValueType(); |
28081 | unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits(); |
28082 | unsigned IndexLen = MinSVESize / BitsPerElt; |
28083 | unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements(); |
28084 | uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue(); |
28085 | EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger(); |
28086 | EVT MaskType = EVT::getVectorVT(Context&: *DAG.getContext(), VT: MaskEltType, NumElements: IndexLen); |
28087 | bool MinMaxEqual = (MinSVESize == MaxSVESize); |
28088 | assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen && |
28089 | "Incorrectly legalised shuffle operation" ); |
28090 | |
28091 | SmallVector<SDValue, 8> TBLMask; |
28092 | // If MinSVESize is not equal to MaxSVESize then we need to know which |
28093 | // TBL mask element needs adjustment. |
28094 | SmallVector<SDValue, 8> AddRuntimeVLMask; |
28095 | |
28096 |   // Bail out for 8-bit element types, because with a 2048-bit SVE register
28097 |   // size, 8 bits are only sufficient to index into the first source vector.
28098 | if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8) |
28099 | return SDValue(); |
28100 | |
28101 | for (int Index : ShuffleMask) { |
28102 |     // Handle poison index values.
28103 | if (Index < 0) |
28104 | Index = 0; |
28105 |     // If the mask refers to elements in the second operand, then we have to
28106 |     // offset the index by the number of elements in a vector. If this number
28107 |     // is not known at compile time, we need to maintain a mask with 'VL' values
28108 |     // to add at runtime.
28109 | if ((unsigned)Index >= ElementsPerVectorReg) { |
28110 | if (MinMaxEqual) { |
28111 | Index += IndexLen - ElementsPerVectorReg; |
28112 | } else { |
28113 | Index = Index - ElementsPerVectorReg; |
28114 | AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 1, DL, VT: MVT::i64)); |
28115 | } |
28116 | } else if (!MinMaxEqual) |
28117 | AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
28118 |     // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
28119 |     // 255, this might point to the last element of the second operand
28120 |     // of the shufflevector, so we reject this transform.
28121 | if ((unsigned)Index >= MaxOffset) |
28122 | return SDValue(); |
28123 | TBLMask.push_back(Elt: DAG.getConstant(Val: Index, DL, VT: MVT::i64)); |
28124 | } |
28125 | |
28126 |   // Choosing an out-of-range index leads to the lane being zeroed, whereas a
28127 |   // zero index value would instead duplicate the first lane for the
28128 |   // out-of-range elements. For i8 elements an out-of-range index could still
28129 |   // be a valid index for a 2048-bit vector register size.
28130 | for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) { |
28131 | TBLMask.push_back(Elt: DAG.getConstant(Val: (int)MaxOffset, DL, VT: MVT::i64)); |
28132 | if (!MinMaxEqual) |
28133 | AddRuntimeVLMask.push_back(Elt: DAG.getConstant(Val: 0, DL, VT: MVT::i64)); |
28134 | } |
28135 | |
28136 | EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, VT: MaskType); |
28137 | SDValue VecMask = |
28138 | DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)); |
28139 | SDValue SVEMask = convertToScalableVector(DAG, VT: MaskContainerVT, V: VecMask); |
28140 | |
28141 | SDValue Shuffle; |
28142 | if (IsSingleOp) |
28143 | Shuffle = |
28144 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT, |
28145 | N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl, DL, VT: MVT::i32), |
28146 | N2: Op1, N3: SVEMask); |
28147 | else if (Subtarget.hasSVE2()) { |
28148 | if (!MinMaxEqual) { |
28149 | unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt; |
28150 | SDValue VScale = (BitsPerElt == 64) |
28151 | ? DAG.getVScale(DL, VT: MVT::i64, MulImm: APInt(64, MinNumElts)) |
28152 | : DAG.getVScale(DL, VT: MVT::i32, MulImm: APInt(32, MinNumElts)); |
28153 | SDValue VecMask = |
28154 | DAG.getBuildVector(VT: MaskType, DL, Ops: ArrayRef(TBLMask.data(), IndexLen)); |
28155 | SDValue MulByMask = DAG.getNode( |
28156 | Opcode: ISD::MUL, DL, VT: MaskType, |
28157 | N1: DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: MaskType, Operand: VScale), |
28158 | N2: DAG.getBuildVector(VT: MaskType, DL, |
28159 | Ops: ArrayRef(AddRuntimeVLMask.data(), IndexLen))); |
28160 | SDValue UpdatedVecMask = |
28161 | DAG.getNode(Opcode: ISD::ADD, DL, VT: MaskType, N1: VecMask, N2: MulByMask); |
28162 | SVEMask = convertToScalableVector( |
28163 | DAG, VT: getContainerForFixedLengthVector(DAG, VT: MaskType), V: UpdatedVecMask); |
28164 | } |
28165 | Shuffle = |
28166 | DAG.getNode(Opcode: ISD::INTRINSIC_WO_CHAIN, DL, VT: ContainerVT, |
28167 | N1: DAG.getConstant(Val: Intrinsic::aarch64_sve_tbl2, DL, VT: MVT::i32), |
28168 | N2: Op1, N3: Op2, N4: SVEMask); |
28169 | } |
28170 | Shuffle = convertFromScalableVector(DAG, VT, V: Shuffle); |
28171 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op.getValueType(), Operand: Shuffle); |
28172 | } |
28173 | |
28174 | SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( |
28175 | SDValue Op, SelectionDAG &DAG) const { |
28176 | EVT VT = Op.getValueType(); |
28177 | assert(VT.isFixedLengthVector() && "Expected fixed length vector type!" ); |
28178 | |
28179 | auto *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode()); |
28180 | auto ShuffleMask = SVN->getMask(); |
28181 | |
28182 | SDLoc DL(Op); |
28183 | SDValue Op1 = Op.getOperand(i: 0); |
28184 | SDValue Op2 = Op.getOperand(i: 1); |
28185 | |
28186 | EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); |
28187 | Op1 = convertToScalableVector(DAG, VT: ContainerVT, V: Op1); |
28188 | Op2 = convertToScalableVector(DAG, VT: ContainerVT, V: Op2); |
28189 | |
28190 |   auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
28191 | if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16) |
28192 | return MVT::i32; |
28193 | return ScalarTy; |
28194 | }; |
28195 | |
28196 | if (SVN->isSplat()) { |
28197 | unsigned Lane = std::max(a: 0, b: SVN->getSplatIndex()); |
28198 | EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType()); |
28199 | SDValue SplatEl = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1, |
28200 | N2: DAG.getConstant(Val: Lane, DL, VT: MVT::i64)); |
28201 | Op = DAG.getNode(Opcode: ISD::SPLAT_VECTOR, DL, VT: ContainerVT, Operand: SplatEl); |
28202 | return convertFromScalableVector(DAG, VT, V: Op); |
28203 | } |
28204 | |
28205 | bool ReverseEXT = false; |
28206 | unsigned Imm; |
28207 | if (isEXTMask(M: ShuffleMask, VT, ReverseEXT, Imm) && |
28208 | Imm == VT.getVectorNumElements() - 1) { |
28209 | if (ReverseEXT) |
28210 | std::swap(a&: Op1, b&: Op2); |
28211 | EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType()); |
28212 | SDValue Scalar = DAG.getNode( |
28213 | Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: ScalarTy, N1: Op1, |
28214 | N2: DAG.getConstant(Val: VT.getVectorNumElements() - 1, DL, VT: MVT::i64)); |
28215 | Op = DAG.getNode(Opcode: AArch64ISD::INSR, DL, VT: ContainerVT, N1: Op2, N2: Scalar); |
28216 | return convertFromScalableVector(DAG, VT, V: Op); |
28217 | } |
28218 | |
28219 | unsigned EltSize = VT.getScalarSizeInBits(); |
28220 | for (unsigned LaneSize : {64U, 32U, 16U}) { |
28221 | if (isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize: LaneSize)) { |
28222 | EVT NewVT = |
28223 | getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: LaneSize)); |
28224 | unsigned RevOp; |
28225 | if (EltSize == 8) |
28226 | RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU; |
28227 | else if (EltSize == 16) |
28228 | RevOp = AArch64ISD::REVH_MERGE_PASSTHRU; |
28229 | else |
28230 | RevOp = AArch64ISD::REVW_MERGE_PASSTHRU; |
28231 | |
28232 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1); |
28233 | Op = LowerToPredicatedOp(Op, DAG, NewOp: RevOp); |
28234 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op); |
28235 | return convertFromScalableVector(DAG, VT, V: Op); |
28236 | } |
28237 | } |
28238 | |
28239 | if (Subtarget->hasSVE2p1() && EltSize == 64 && |
28240 | isREVMask(M: ShuffleMask, EltSize, NumElts: VT.getVectorNumElements(), BlockSize: 128)) { |
28241 | if (!VT.isFloatingPoint()) |
28242 | return LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU); |
28243 | |
28244 | EVT NewVT = getPackedSVEVectorVT(VT: EVT::getIntegerVT(Context&: *DAG.getContext(), BitWidth: 64)); |
28245 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: NewVT, Operand: Op1); |
28246 | Op = LowerToPredicatedOp(Op, DAG, NewOp: AArch64ISD::REVD_MERGE_PASSTHRU); |
28247 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: ContainerVT, Operand: Op); |
28248 | return convertFromScalableVector(DAG, VT, V: Op); |
28249 | } |
28250 | |
28251 | unsigned WhichResult; |
28252 | if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) && |
28253 | WhichResult == 0) |
28254 | return convertFromScalableVector( |
28255 | DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op2)); |
28256 | |
28257 | if (isTRNMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResult)) { |
28258 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
28259 | return convertFromScalableVector( |
28260 | DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2)); |
28261 | } |
28262 | |
28263 | if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult == 0) |
28264 | return convertFromScalableVector( |
28265 | DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP1, DL, VT: ContainerVT, N1: Op1, N2: Op1)); |
28266 | |
28267 | if (isTRN_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) { |
28268 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; |
28269 | return convertFromScalableVector( |
28270 | DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1)); |
28271 | } |
28272 | |
28273 |   // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
28274 | // represents the same logical operation as performed by a ZIP instruction. In |
28275 | // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly |
28276 | // equivalent to an AArch64 instruction. There's the extra component of |
28277 | // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions |
28278 |   // only operated on 64/128-bit vector types that have a direct mapping to a
28279 | // target register and so an exact mapping is implied. |
28280 | // However, when using SVE for fixed length vectors, most legal vector types |
28281 | // are actually sub-vectors of a larger SVE register. When mapping |
28282 | // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider |
28283 | // how the mask's indices translate. Specifically, when the mapping requires |
28284 | // an exact meaning for a specific vector index (e.g. Index X is the last |
28285 | // vector element in the register) then such mappings are often only safe when |
28286 |   // the exact SVE register size is known. The main exception to this is when
28287 | // indices are logically relative to the first element of either |
28288 | // ISD::VECTOR_SHUFFLE operand because these relative indices don't change |
28289 | // when converting from fixed-length to scalable vector types (i.e. the start |
28290 | // of a fixed length vector is always the start of a scalable vector). |
28291 | unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); |
28292 | unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits(); |
28293 | if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) { |
28294 | if (ShuffleVectorInst::isReverseMask(Mask: ShuffleMask, NumSrcElts: ShuffleMask.size()) && |
28295 | Op2.isUndef()) { |
28296 | Op = DAG.getNode(Opcode: ISD::VECTOR_REVERSE, DL, VT: ContainerVT, Operand: Op1); |
28297 | return convertFromScalableVector(DAG, VT, V: Op); |
28298 | } |
28299 | |
28300 | if (isZIPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult) && |
28301 | WhichResult != 0) |
28302 | return convertFromScalableVector( |
28303 | DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op2)); |
28304 | |
28305 | if (isUZPMask(M: ShuffleMask, NumElts: VT.getVectorNumElements(), WhichResultOut&: WhichResult)) { |
28306 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
28307 | return convertFromScalableVector( |
28308 | DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op2)); |
28309 | } |
28310 | |
28311 | if (isZIP_v_undef_Mask(M: ShuffleMask, VT, WhichResult) && WhichResult != 0) |
28312 | return convertFromScalableVector( |
28313 | DAG, VT, V: DAG.getNode(Opcode: AArch64ISD::ZIP2, DL, VT: ContainerVT, N1: Op1, N2: Op1)); |
28314 | |
28315 | if (isUZP_v_undef_Mask(M: ShuffleMask, VT, WhichResult)) { |
28316 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; |
28317 | return convertFromScalableVector( |
28318 | DAG, VT, V: DAG.getNode(Opcode: Opc, DL, VT: ContainerVT, N1: Op1, N2: Op1)); |
28319 | } |
28320 | } |
28321 | |
28322 |   // Avoid producing a TBL instruction if we don't know the minimum SVE register
28323 |   // size, unless NEON is not available and we can assume the minimum SVE
28324 |   // register size is 128 bits.
28325 | if (MinSVESize || !Subtarget->isNeonAvailable()) |
28326 | return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT, |
28327 | DAG); |
28328 | |
28329 | return SDValue(); |
28330 | } |
28331 | |
28332 | SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op, |
28333 | SelectionDAG &DAG) const { |
28334 | SDLoc DL(Op); |
28335 | EVT InVT = Op.getValueType(); |
28336 | |
28337 | assert(VT.isScalableVector() && isTypeLegal(VT) && |
28338 | InVT.isScalableVector() && isTypeLegal(InVT) && |
28339 | "Only expect to cast between legal scalable vector types!" ); |
28340 | assert(VT.getVectorElementType() != MVT::i1 && |
28341 | InVT.getVectorElementType() != MVT::i1 && |
28342 | "For predicate bitcasts, use getSVEPredicateBitCast" ); |
28343 | |
28344 | if (InVT == VT) |
28345 | return Op; |
28346 | |
28347 | EVT PackedVT = getPackedSVEVectorVT(VT: VT.getVectorElementType()); |
28348 | EVT PackedInVT = getPackedSVEVectorVT(VT: InVT.getVectorElementType()); |
28349 | |
28350 | // Safe bitcasting between unpacked vector types of different element counts |
28351 | // is currently unsupported because the following is missing the necessary |
28352 | // work to ensure the result's elements live where they're supposed to within |
28353 | // an SVE register. |
28354 | // 01234567 |
28355 | // e.g. nxv2i32 = XX??XX?? |
28356 | // nxv4f16 = X?X?X?X? |
28357 | assert((VT.getVectorElementCount() == InVT.getVectorElementCount() || |
28358 | VT == PackedVT || InVT == PackedInVT) && |
28359 | "Unexpected bitcast!" ); |
28360 | |
28361 | // Pack input if required. |
28362 | if (InVT != PackedInVT) |
28363 | Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT: PackedInVT, Operand: Op); |
28364 | |
28365 | Op = DAG.getNode(Opcode: ISD::BITCAST, DL, VT: PackedVT, Operand: Op); |
28366 | |
28367 | // Unpack result if required. |
28368 | if (VT != PackedVT) |
28369 | Op = DAG.getNode(Opcode: AArch64ISD::REINTERPRET_CAST, DL, VT, Operand: Op); |
28370 | |
28371 | return Op; |
28372 | } |
28373 | |
28374 | bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG, |
28375 | SDValue N) const { |
28376 | return ::isAllActivePredicate(DAG, N); |
28377 | } |
28378 | |
28379 | EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const { |
28380 | return ::getPromotedVTForPredicate(VT); |
28381 | } |
28382 | |
28383 | bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( |
28384 | SDValue Op, const APInt &OriginalDemandedBits, |
28385 | const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, |
28386 | unsigned Depth) const { |
28387 | |
28388 | unsigned Opc = Op.getOpcode(); |
28389 | switch (Opc) { |
28390 | case AArch64ISD::VSHL: { |
28391 | // Match (VSHL (VLSHR Val X) X) |
28392 | SDValue ShiftL = Op; |
28393 | SDValue ShiftR = Op->getOperand(Num: 0); |
28394 | if (ShiftR->getOpcode() != AArch64ISD::VLSHR) |
28395 | return false; |
28396 | |
28397 | if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse()) |
28398 | return false; |
28399 | |
28400 | unsigned ShiftLBits = ShiftL->getConstantOperandVal(Num: 1); |
28401 | unsigned ShiftRBits = ShiftR->getConstantOperandVal(Num: 1); |
28402 | |
28403 | // Other cases can be handled as well, but this is not |
28404 | // implemented. |
28405 | if (ShiftRBits != ShiftLBits) |
28406 | return false; |
28407 | |
28408 | unsigned ScalarSize = Op.getScalarValueSizeInBits(); |
28409 | assert(ScalarSize > ShiftLBits && "Invalid shift imm" ); |
28410 | |
28411 | APInt ZeroBits = APInt::getLowBitsSet(numBits: ScalarSize, loBitsSet: ShiftLBits); |
28412 | APInt UnusedBits = ~OriginalDemandedBits; |
28413 | |
28414 | if ((ZeroBits & UnusedBits) != ZeroBits) |
28415 | return false; |
28416 | |
28417 | // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not |
28418 | // used - simplify to just Val. |
28419 | return TLO.CombineTo(O: Op, N: ShiftR->getOperand(Num: 0)); |
28420 | } |
28421 | case AArch64ISD::BICi: { |
28422 | // Fold BICi if all destination bits already known to be zeroed |
28423 | SDValue Op0 = Op.getOperand(i: 0); |
28424 | KnownBits KnownOp0 = |
28425 | TLO.DAG.computeKnownBits(Op: Op0, DemandedElts: OriginalDemandedElts, Depth: Depth + 1); |
28426 | // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2)) |
28427 | uint64_t BitsToClear = Op->getConstantOperandVal(Num: 1) |
28428 | << Op->getConstantOperandVal(Num: 2); |
28429 | APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero; |
28430 | if (APInt(Known.getBitWidth(), BitsToClear) |
28431 | .isSubsetOf(RHS: AlreadyZeroedBitsToClear)) |
28432 | return TLO.CombineTo(O: Op, N: Op0); |
28433 | |
28434 | Known = KnownOp0 & |
28435 | KnownBits::makeConstant(C: APInt(Known.getBitWidth(), ~BitsToClear)); |
28436 | |
28437 | return false; |
28438 | } |
28439 | case ISD::INTRINSIC_WO_CHAIN: { |
28440 | if (auto ElementSize = IsSVECntIntrinsic(S: Op)) { |
28441 | unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits(); |
28442 | if (!MaxSVEVectorSizeInBits) |
28443 | MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector; |
28444 | unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize; |
28445 | // The SVE count intrinsics don't support the multiplier immediate so we |
28446 | // don't have to account for that here. The value returned may be slightly |
28447 | // over the true required bits, as this is based on the "ALL" pattern. The |
28448 | // other patterns are also exposed by these intrinsics, but they all |
28449 | // return a value that's strictly less than "ALL". |
28450 | unsigned RequiredBits = llvm::bit_width(Value: MaxElements); |
28451 | unsigned BitWidth = Known.Zero.getBitWidth(); |
28452 | if (RequiredBits < BitWidth) |
28453 | Known.Zero.setHighBits(BitWidth - RequiredBits); |
28454 | return false; |
28455 | } |
28456 | } |
28457 | } |
28458 | |
28459 | return TargetLowering::SimplifyDemandedBitsForTargetNode( |
28460 | Op, DemandedBits: OriginalDemandedBits, DemandedElts: OriginalDemandedElts, Known, TLO, Depth); |
28461 | } |
28462 | |
28463 | bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { |
28464 | return Op.getOpcode() == AArch64ISD::DUP || |
28465 | Op.getOpcode() == AArch64ISD::MOVI || |
28466 | (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && |
28467 | Op.getOperand(i: 0).getOpcode() == AArch64ISD::DUP) || |
28468 | TargetLowering::isTargetCanonicalConstantNode(Op); |
28469 | } |
28470 | |
28471 | bool AArch64TargetLowering::isComplexDeinterleavingSupported() const { |
28472 | return Subtarget->hasSVE() || Subtarget->hasSVE2() || |
28473 | Subtarget->hasComplxNum(); |
28474 | } |
28475 | |
28476 | bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported( |
28477 | ComplexDeinterleavingOperation Operation, Type *Ty) const { |
28478 | auto *VTy = dyn_cast<VectorType>(Val: Ty); |
28479 | if (!VTy) |
28480 | return false; |
28481 | |
28482 | // If the vector is scalable, SVE is enabled, implying support for complex |
28483 |   // numbers. Otherwise, we need to ensure complex number support is available.
28484 | if (!VTy->isScalableTy() && !Subtarget->hasComplxNum()) |
28485 | return false; |
28486 | |
28487 | auto *ScalarTy = VTy->getScalarType(); |
28488 | unsigned NumElements = VTy->getElementCount().getKnownMinValue(); |
28489 | |
28490 | // We can only process vectors that have a bit size of 128 or higher (with an |
28491 | // additional 64 bits for Neon). Additionally, these vectors must have a |
28492 | // power-of-2 size, as we later split them into the smallest supported size |
28493 |   // and merge them back together after applying the complex operation.
28494 | unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; |
28495 | if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) || |
28496 | !llvm::isPowerOf2_32(Value: VTyWidth)) |
28497 | return false; |
28498 | |
28499 | if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) { |
28500 | unsigned ScalarWidth = ScalarTy->getScalarSizeInBits(); |
28501 | return 8 <= ScalarWidth && ScalarWidth <= 64; |
28502 | } |
28503 | |
28504 | return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) || |
28505 | ScalarTy->isFloatTy() || ScalarTy->isDoubleTy(); |
28506 | } |
28507 | |
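// Emit the target intrinsics implementing a complex deinterleaving operation.
// Vectors wider than 128 bits are split in half, processed recursively and
// reassembled. Partial complex multiplies map to the SVE cmla/fcmla or NEON
// vcmla intrinsics for the requested rotation, and complex additions map to
// SVE cadd/fcadd or NEON vcadd for 90/270 degree rotations.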
28508 | Value *AArch64TargetLowering::createComplexDeinterleavingIR( |
28509 | IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, |
28510 | ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, |
28511 | Value *Accumulator) const { |
28512 | VectorType *Ty = cast<VectorType>(Val: InputA->getType()); |
28513 | bool IsScalable = Ty->isScalableTy(); |
28514 | bool IsInt = Ty->getElementType()->isIntegerTy(); |
28515 | |
28516 | unsigned TyWidth = |
28517 | Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue(); |
28518 | |
28519 | assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) && |
28520 | "Vector type must be either 64 or a power of 2 that is at least 128" ); |
28521 | |
28522 | if (TyWidth > 128) { |
28523 | int Stride = Ty->getElementCount().getKnownMinValue() / 2; |
28524 | auto *HalfTy = VectorType::getHalfElementsVectorType(VTy: Ty); |
28525 | auto *LowerSplitA = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: 0)); |
28526 | auto *LowerSplitB = B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: 0)); |
28527 | auto *UpperSplitA = |
28528 | B.CreateExtractVector(DstType: HalfTy, SrcVec: InputA, Idx: B.getInt64(C: Stride)); |
28529 | auto *UpperSplitB = |
28530 | B.CreateExtractVector(DstType: HalfTy, SrcVec: InputB, Idx: B.getInt64(C: Stride)); |
28531 | Value *LowerSplitAcc = nullptr; |
28532 | Value *UpperSplitAcc = nullptr; |
28533 | if (Accumulator) { |
28534 | LowerSplitAcc = B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: 0)); |
28535 | UpperSplitAcc = |
28536 | B.CreateExtractVector(DstType: HalfTy, SrcVec: Accumulator, Idx: B.getInt64(C: Stride)); |
28537 | } |
28538 | auto *LowerSplitInt = createComplexDeinterleavingIR( |
28539 | B, OperationType, Rotation, InputA: LowerSplitA, InputB: LowerSplitB, Accumulator: LowerSplitAcc); |
28540 | auto *UpperSplitInt = createComplexDeinterleavingIR( |
28541 | B, OperationType, Rotation, InputA: UpperSplitA, InputB: UpperSplitB, Accumulator: UpperSplitAcc); |
28542 | |
28543 | auto *Result = B.CreateInsertVector(DstType: Ty, SrcVec: PoisonValue::get(T: Ty), SubVec: LowerSplitInt, |
28544 | Idx: B.getInt64(C: 0)); |
28545 | return B.CreateInsertVector(DstType: Ty, SrcVec: Result, SubVec: UpperSplitInt, Idx: B.getInt64(C: Stride)); |
28546 | } |
28547 | |
28548 | if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { |
28549 | if (Accumulator == nullptr) |
28550 | Accumulator = Constant::getNullValue(Ty); |
28551 | |
28552 | if (IsScalable) { |
28553 | if (IsInt) |
28554 | return B.CreateIntrinsic( |
28555 | ID: Intrinsic::aarch64_sve_cmla_x, Types: Ty, |
28556 | Args: {Accumulator, InputA, InputB, B.getInt32(C: (int)Rotation * 90)}); |
28557 | |
28558 | auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount()); |
28559 | return B.CreateIntrinsic( |
28560 | ID: Intrinsic::aarch64_sve_fcmla, Types: Ty, |
28561 | Args: {Mask, Accumulator, InputA, InputB, B.getInt32(C: (int)Rotation * 90)}); |
28562 | } |
28563 | |
28564 | Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0, |
28565 | Intrinsic::aarch64_neon_vcmla_rot90, |
28566 | Intrinsic::aarch64_neon_vcmla_rot180, |
28567 | Intrinsic::aarch64_neon_vcmla_rot270}; |
28568 | |
28570 | return B.CreateIntrinsic(ID: IdMap[(int)Rotation], Types: Ty, |
28571 | Args: {Accumulator, InputA, InputB}); |
28572 | } |
28573 | |
28574 | if (OperationType == ComplexDeinterleavingOperation::CAdd) { |
28575 | if (IsScalable) { |
28576 | if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || |
28577 | Rotation == ComplexDeinterleavingRotation::Rotation_270) { |
28578 | if (IsInt) |
28579 | return B.CreateIntrinsic( |
28580 | ID: Intrinsic::aarch64_sve_cadd_x, Types: Ty, |
28581 | Args: {InputA, InputB, B.getInt32(C: (int)Rotation * 90)}); |
28582 | |
28583 | auto *Mask = B.getAllOnesMask(NumElts: Ty->getElementCount()); |
28584 | return B.CreateIntrinsic( |
28585 | ID: Intrinsic::aarch64_sve_fcadd, Types: Ty, |
28586 | Args: {Mask, InputA, InputB, B.getInt32(C: (int)Rotation * 90)}); |
28587 | } |
28588 | return nullptr; |
28589 | } |
28590 | |
28591 | Intrinsic::ID IntId = Intrinsic::not_intrinsic; |
28592 | if (Rotation == ComplexDeinterleavingRotation::Rotation_90) |
28593 | IntId = Intrinsic::aarch64_neon_vcadd_rot90; |
28594 | else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) |
28595 | IntId = Intrinsic::aarch64_neon_vcadd_rot270; |
28596 | |
28597 | if (IntId == Intrinsic::not_intrinsic) |
28598 | return nullptr; |
28599 | |
28600 | return B.CreateIntrinsic(ID: IntId, Types: Ty, Args: {InputA, InputB}); |
28601 | } |
28602 | |
28603 | return nullptr; |
28604 | } |
28605 | |
28606 | bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { |
28607 | unsigned Opc = N->getOpcode(); |
28608 | if (ISD::isExtOpcode(Opcode: Opc)) { |
28609 | if (any_of(Range: N->uses(), |
28610 | P: [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; })) |
28611 | return false; |
28612 | } |
28613 | return true; |
28614 | } |
28615 | |
28616 | unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const { |
28617 | return Subtarget->getMinimumJumpTableEntries(); |
28618 | } |
28619 | |
28620 | MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, |
28621 | CallingConv::ID CC, |
28622 | EVT VT) const { |
28623 | bool NonUnitFixedLengthVector = |
28624 | VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar(); |
28625 | if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors()) |
28626 | return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); |
28627 | |
28628 | EVT VT1; |
28629 | MVT RegisterVT; |
28630 | unsigned NumIntermediates; |
28631 | getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1, NumIntermediates, |
28632 | RegisterVT); |
28633 | return RegisterVT; |
28634 | } |
28635 | |
28636 | unsigned AArch64TargetLowering::getNumRegistersForCallingConv( |
28637 | LLVMContext &Context, CallingConv::ID CC, EVT VT) const { |
28638 | bool NonUnitFixedLengthVector = |
28639 | VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar(); |
28640 | if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors()) |
28641 | return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); |
28642 | |
28643 | EVT VT1; |
28644 | MVT VT2; |
28645 | unsigned NumIntermediates; |
28646 | return getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT&: VT1, |
28647 | NumIntermediates, RegisterVT&: VT2); |
28648 | } |
28649 | |
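// For fixed-length vectors that would otherwise be assigned to wide SVE
// registers, rewrite the calling-convention breakdown in terms of 128-bit
// NEON-sized registers (or scalarise when there is a size mismatch from
// promotion or widening), so that using SVE for fixed-length vectors does not
// change the ABI.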
28650 | unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv( |
28651 | LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, |
28652 | unsigned &NumIntermediates, MVT &RegisterVT) const { |
28653 | int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv( |
28654 | Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); |
28655 | if (!RegisterVT.isFixedLengthVector() || |
28656 | RegisterVT.getFixedSizeInBits() <= 128) |
28657 | return NumRegs; |
28658 | |
28659 | assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!" ); |
28660 | assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!" ); |
28661 | assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!" ); |
28662 | |
28663 | // A size mismatch here implies either type promotion or widening and would |
28664 |   // have resulted in scalarisation if larger vectors had not been available.
28665 | if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) { |
28666 | EVT EltTy = VT.getVectorElementType(); |
28667 | EVT NewVT = EVT::getVectorVT(Context, VT: EltTy, EC: ElementCount::getFixed(MinVal: 1)); |
28668 | if (!isTypeLegal(VT: NewVT)) |
28669 | NewVT = EltTy; |
28670 | |
28671 | IntermediateVT = NewVT; |
28672 | NumIntermediates = VT.getVectorNumElements(); |
28673 | RegisterVT = getRegisterType(Context, VT: NewVT); |
28674 | return NumIntermediates; |
28675 | } |
28676 | |
28677 | // SVE VLS support does not introduce a new ABI so we should use NEON sized |
28678 | // types for vector arguments and returns. |
28679 | |
28680 | unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128; |
28681 | NumIntermediates *= NumSubRegs; |
28682 | NumRegs *= NumSubRegs; |
28683 | |
28684 | switch (RegisterVT.getVectorElementType().SimpleTy) { |
28685 | default: |
28686 | llvm_unreachable("unexpected element type for vector" ); |
28687 | case MVT::i8: |
28688 | IntermediateVT = RegisterVT = MVT::v16i8; |
28689 | break; |
28690 | case MVT::i16: |
28691 | IntermediateVT = RegisterVT = MVT::v8i16; |
28692 | break; |
28693 | case MVT::i32: |
28694 | IntermediateVT = RegisterVT = MVT::v4i32; |
28695 | break; |
28696 | case MVT::i64: |
28697 | IntermediateVT = RegisterVT = MVT::v2i64; |
28698 | break; |
28699 | case MVT::f16: |
28700 | IntermediateVT = RegisterVT = MVT::v8f16; |
28701 | break; |
28702 | case MVT::f32: |
28703 | IntermediateVT = RegisterVT = MVT::v4f32; |
28704 | break; |
28705 | case MVT::f64: |
28706 | IntermediateVT = RegisterVT = MVT::v2f64; |
28707 | break; |
28708 | case MVT::bf16: |
28709 | IntermediateVT = RegisterVT = MVT::v8bf16; |
28710 | break; |
28711 | } |
28712 | |
28713 | return NumRegs; |
28714 | } |
28715 | |
28716 | bool AArch64TargetLowering::hasInlineStackProbe( |
28717 | const MachineFunction &MF) const { |
28718 | return !Subtarget->isTargetWindows() && |
28719 | MF.getInfo<AArch64FunctionInfo>()->hasStackProbing(); |
28720 | } |
28721 | |
28722 | #ifndef NDEBUG |
28723 | void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const { |
28724 | switch (N->getOpcode()) { |
28725 | default: |
28726 | break; |
28727 | case AArch64ISD::SUNPKLO: |
28728 | case AArch64ISD::SUNPKHI: |
28729 | case AArch64ISD::UUNPKLO: |
28730 | case AArch64ISD::UUNPKHI: { |
28731 | assert(N->getNumValues() == 1 && "Expected one result!" ); |
28732 | assert(N->getNumOperands() == 1 && "Expected one operand!" ); |
28733 | EVT VT = N->getValueType(0); |
28734 | EVT OpVT = N->getOperand(0).getValueType(); |
28735 | assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() && |
28736 | VT.isInteger() && "Expected integer vectors!" ); |
28737 | assert(OpVT.getSizeInBits() == VT.getSizeInBits() && |
28738 | "Expected vectors of equal size!" ); |
28739 | // TODO: Enable assert once bogus creations have been fixed. |
28740 | // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 && |
28741 | // "Expected result vector with half the lanes of its input!"); |
28742 | break; |
28743 | } |
28744 | case AArch64ISD::TRN1: |
28745 | case AArch64ISD::TRN2: |
28746 | case AArch64ISD::UZP1: |
28747 | case AArch64ISD::UZP2: |
28748 | case AArch64ISD::ZIP1: |
28749 | case AArch64ISD::ZIP2: { |
28750 | assert(N->getNumValues() == 1 && "Expected one result!" ); |
28751 | assert(N->getNumOperands() == 2 && "Expected two operands!" ); |
28752 | EVT VT = N->getValueType(0); |
28753 | EVT Op0VT = N->getOperand(0).getValueType(); |
28754 | EVT Op1VT = N->getOperand(1).getValueType(); |
28755 | assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() && |
28756 | "Expected vectors!" ); |
28757 | // TODO: Enable assert once bogus creations have been fixed. |
28758 | // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!"); |
28759 | break; |
28760 | } |
28761 | } |
28762 | } |
28763 | #endif |
28764 | |